diff options
Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- | arch/x86/xen/Kconfig | 11 | ||||
-rw-r--r-- | arch/x86/xen/Makefile | 4 | ||||
-rw-r--r-- | arch/x86/xen/enlighten.c | 1146 | ||||
-rw-r--r-- | arch/x86/xen/events.c | 591 | ||||
-rw-r--r-- | arch/x86/xen/features.c | 29 | ||||
-rw-r--r-- | arch/x86/xen/manage.c | 143 | ||||
-rw-r--r-- | arch/x86/xen/mmu.c | 567 | ||||
-rw-r--r-- | arch/x86/xen/mmu.h | 60 | ||||
-rw-r--r-- | arch/x86/xen/multicalls.c | 90 | ||||
-rw-r--r-- | arch/x86/xen/multicalls.h | 45 | ||||
-rw-r--r-- | arch/x86/xen/setup.c | 111 | ||||
-rw-r--r-- | arch/x86/xen/smp.c | 404 | ||||
-rw-r--r-- | arch/x86/xen/time.c | 593 | ||||
-rw-r--r-- | arch/x86/xen/vdso.h | 4 | ||||
-rw-r--r-- | arch/x86/xen/xen-asm.S | 291 | ||||
-rw-r--r-- | arch/x86/xen/xen-head.S | 38 | ||||
-rw-r--r-- | arch/x86/xen/xen-ops.h | 71 |
17 files changed, 4198 insertions, 0 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig new file mode 100644 index 000000000000..9df99e1885a4 --- /dev/null +++ b/arch/x86/xen/Kconfig | |||
@@ -0,0 +1,11 @@ | |||
1 | # | ||
2 | # This Kconfig describes xen options | ||
3 | # | ||
4 | |||
config XEN
	bool "Enable support for Xen hypervisor"
	depends on PARAVIRT
	depends on X86_CMPXCHG
	depends on X86_TSC
	depends on !NEED_MULTIPLE_NODES
	help
	  This is the Linux Xen port. Enabling this will allow the
	  kernel to boot in a paravirtualized environment under the
	  Xen hypervisor.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile new file mode 100644 index 000000000000..343df246bd3e --- /dev/null +++ b/arch/x86/xen/Makefile | |||
@@ -0,0 +1,4 @@ | |||
# Objects that are always built for the Xen port.
obj-y := enlighten.o setup.o features.o multicalls.o mmu.o
obj-y += events.o time.o manage.o xen-asm.o

# SMP support is only built when CONFIG_SMP is enabled.
obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c new file mode 100644 index 000000000000..f01bfcd4bdee --- /dev/null +++ b/arch/x86/xen/enlighten.c | |||
@@ -0,0 +1,1146 @@ | |||
1 | /* | ||
2 | * Core of Xen paravirt_ops implementation. | ||
3 | * | ||
4 | * This file contains the xen_paravirt_ops structure itself, and the | ||
5 | * implementations for: | ||
6 | * - privileged instructions | ||
7 | * - interrupt flags | ||
8 | * - segment operations | ||
9 | * - booting and setup | ||
10 | * | ||
11 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | ||
12 | */ | ||
13 | |||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/init.h> | ||
16 | #include <linux/smp.h> | ||
17 | #include <linux/preempt.h> | ||
18 | #include <linux/hardirq.h> | ||
19 | #include <linux/percpu.h> | ||
20 | #include <linux/delay.h> | ||
21 | #include <linux/start_kernel.h> | ||
22 | #include <linux/sched.h> | ||
23 | #include <linux/bootmem.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/mm.h> | ||
26 | #include <linux/page-flags.h> | ||
27 | #include <linux/highmem.h> | ||
28 | #include <linux/smp.h> | ||
29 | |||
30 | #include <xen/interface/xen.h> | ||
31 | #include <xen/interface/physdev.h> | ||
32 | #include <xen/interface/vcpu.h> | ||
33 | #include <xen/interface/sched.h> | ||
34 | #include <xen/features.h> | ||
35 | #include <xen/page.h> | ||
36 | |||
37 | #include <asm/paravirt.h> | ||
38 | #include <asm/page.h> | ||
39 | #include <asm/xen/hypercall.h> | ||
40 | #include <asm/xen/hypervisor.h> | ||
41 | #include <asm/fixmap.h> | ||
42 | #include <asm/processor.h> | ||
43 | #include <asm/setup.h> | ||
44 | #include <asm/desc.h> | ||
45 | #include <asm/pgtable.h> | ||
46 | #include <asm/tlbflush.h> | ||
47 | #include <asm/reboot.h> | ||
48 | |||
49 | #include "xen-ops.h" | ||
50 | #include "mmu.h" | ||
51 | #include "multicalls.h" | ||
52 | |||
53 | EXPORT_SYMBOL_GPL(hypercall_page); | ||
54 | |||
55 | DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); | ||
56 | |||
57 | DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); | ||
58 | DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); | ||
59 | DEFINE_PER_CPU(unsigned long, xen_cr3); | ||
60 | |||
61 | struct start_info *xen_start_info; | ||
62 | EXPORT_SYMBOL_GPL(xen_start_info); | ||
63 | |||
64 | static /* __initdata */ struct shared_info dummy_shared_info; | ||
65 | |||
66 | /* | ||
67 | * Point at some empty memory to start with. We map the real shared_info | ||
68 | * page as soon as fixmap is up and running. | ||
69 | */ | ||
70 | struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; | ||
71 | |||
72 | /* | ||
73 | * Flag to determine whether vcpu info placement is available on all | ||
74 | * VCPUs. We assume it is to start with, and then set it to zero on | ||
75 | * the first failure. This is because it can succeed on some VCPUs | ||
76 | * and not others, since it can involve hypervisor memory allocation, | ||
77 | * or because the guest failed to guarantee all the appropriate | ||
78 | * constraints on all VCPUs (ie buffer can't cross a page boundary). | ||
79 | * | ||
80 | * Note that any particular CPU may be using a placed vcpu structure, | ||
81 | * but we can only optimise if the all are. | ||
82 | * | ||
83 | * 0: not available, 1: available | ||
84 | */ | ||
85 | static int have_vcpu_info_placement = 1; | ||
86 | |||
87 | static void __init xen_vcpu_setup(int cpu) | ||
88 | { | ||
89 | struct vcpu_register_vcpu_info info; | ||
90 | int err; | ||
91 | struct vcpu_info *vcpup; | ||
92 | |||
93 | per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; | ||
94 | |||
95 | if (!have_vcpu_info_placement) | ||
96 | return; /* already tested, not available */ | ||
97 | |||
98 | vcpup = &per_cpu(xen_vcpu_info, cpu); | ||
99 | |||
100 | info.mfn = virt_to_mfn(vcpup); | ||
101 | info.offset = offset_in_page(vcpup); | ||
102 | |||
103 | printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", | ||
104 | cpu, vcpup, info.mfn, info.offset); | ||
105 | |||
106 | /* Check to see if the hypervisor will put the vcpu_info | ||
107 | structure where we want it, which allows direct access via | ||
108 | a percpu-variable. */ | ||
109 | err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info); | ||
110 | |||
111 | if (err) { | ||
112 | printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); | ||
113 | have_vcpu_info_placement = 0; | ||
114 | } else { | ||
115 | /* This cpu is using the registered vcpu info, even if | ||
116 | later ones fail to. */ | ||
117 | per_cpu(xen_vcpu, cpu) = vcpup; | ||
118 | |||
119 | printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", | ||
120 | cpu, vcpup); | ||
121 | } | ||
122 | } | ||
123 | |||
124 | static void __init xen_banner(void) | ||
125 | { | ||
126 | printk(KERN_INFO "Booting paravirtualized kernel on %s\n", | ||
127 | paravirt_ops.name); | ||
128 | printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); | ||
129 | } | ||
130 | |||
131 | static void xen_cpuid(unsigned int *eax, unsigned int *ebx, | ||
132 | unsigned int *ecx, unsigned int *edx) | ||
133 | { | ||
134 | unsigned maskedx = ~0; | ||
135 | |||
136 | /* | ||
137 | * Mask out inconvenient features, to try and disable as many | ||
138 | * unsupported kernel subsystems as possible. | ||
139 | */ | ||
140 | if (*eax == 1) | ||
141 | maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ | ||
142 | (1 << X86_FEATURE_ACPI) | /* disable ACPI */ | ||
143 | (1 << X86_FEATURE_ACC)); /* thermal monitoring */ | ||
144 | |||
145 | asm(XEN_EMULATE_PREFIX "cpuid" | ||
146 | : "=a" (*eax), | ||
147 | "=b" (*ebx), | ||
148 | "=c" (*ecx), | ||
149 | "=d" (*edx) | ||
150 | : "0" (*eax), "2" (*ecx)); | ||
151 | *edx &= maskedx; | ||
152 | } | ||
153 | |||
/* Write @val to debug register @reg via HYPERVISOR_set_debugreg(). */
static void xen_set_debugreg(int reg, unsigned long val)
{
	/* The hypercall's result is not examined. */
	HYPERVISOR_set_debugreg(reg, val);
}
158 | |||
/* Return the value HYPERVISOR_get_debugreg() reports for register @reg. */
static unsigned long xen_get_debugreg(int reg)
{
	unsigned long val = HYPERVISOR_get_debugreg(reg);

	return val;
}
163 | |||
164 | static unsigned long xen_save_fl(void) | ||
165 | { | ||
166 | struct vcpu_info *vcpu; | ||
167 | unsigned long flags; | ||
168 | |||
169 | vcpu = x86_read_percpu(xen_vcpu); | ||
170 | |||
171 | /* flag has opposite sense of mask */ | ||
172 | flags = !vcpu->evtchn_upcall_mask; | ||
173 | |||
174 | /* convert to IF type flag | ||
175 | -0 -> 0x00000000 | ||
176 | -1 -> 0xffffffff | ||
177 | */ | ||
178 | return (-flags) & X86_EFLAGS_IF; | ||
179 | } | ||
180 | |||
181 | static void xen_restore_fl(unsigned long flags) | ||
182 | { | ||
183 | struct vcpu_info *vcpu; | ||
184 | |||
185 | /* convert from IF type flag */ | ||
186 | flags = !(flags & X86_EFLAGS_IF); | ||
187 | |||
188 | /* There's a one instruction preempt window here. We need to | ||
189 | make sure we're don't switch CPUs between getting the vcpu | ||
190 | pointer and updating the mask. */ | ||
191 | preempt_disable(); | ||
192 | vcpu = x86_read_percpu(xen_vcpu); | ||
193 | vcpu->evtchn_upcall_mask = flags; | ||
194 | preempt_enable_no_resched(); | ||
195 | |||
196 | /* Doesn't matter if we get preempted here, because any | ||
197 | pending event will get dealt with anyway. */ | ||
198 | |||
199 | if (flags == 0) { | ||
200 | preempt_check_resched(); | ||
201 | barrier(); /* unmask then check (avoid races) */ | ||
202 | if (unlikely(vcpu->evtchn_upcall_pending)) | ||
203 | force_evtchn_callback(); | ||
204 | } | ||
205 | } | ||
206 | |||
207 | static void xen_irq_disable(void) | ||
208 | { | ||
209 | /* There's a one instruction preempt window here. We need to | ||
210 | make sure we're don't switch CPUs between getting the vcpu | ||
211 | pointer and updating the mask. */ | ||
212 | preempt_disable(); | ||
213 | x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; | ||
214 | preempt_enable_no_resched(); | ||
215 | } | ||
216 | |||
217 | static void xen_irq_enable(void) | ||
218 | { | ||
219 | struct vcpu_info *vcpu; | ||
220 | |||
221 | /* There's a one instruction preempt window here. We need to | ||
222 | make sure we're don't switch CPUs between getting the vcpu | ||
223 | pointer and updating the mask. */ | ||
224 | preempt_disable(); | ||
225 | vcpu = x86_read_percpu(xen_vcpu); | ||
226 | vcpu->evtchn_upcall_mask = 0; | ||
227 | preempt_enable_no_resched(); | ||
228 | |||
229 | /* Doesn't matter if we get preempted here, because any | ||
230 | pending event will get dealt with anyway. */ | ||
231 | |||
232 | barrier(); /* unmask then check (avoid races) */ | ||
233 | if (unlikely(vcpu->evtchn_upcall_pending)) | ||
234 | force_evtchn_callback(); | ||
235 | } | ||
236 | |||
237 | static void xen_safe_halt(void) | ||
238 | { | ||
239 | /* Blocking includes an implicit local_irq_enable(). */ | ||
240 | if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0) | ||
241 | BUG(); | ||
242 | } | ||
243 | |||
244 | static void xen_halt(void) | ||
245 | { | ||
246 | if (irqs_disabled()) | ||
247 | HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); | ||
248 | else | ||
249 | xen_safe_halt(); | ||
250 | } | ||
251 | |||
252 | static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) | ||
253 | { | ||
254 | BUG_ON(preemptible()); | ||
255 | |||
256 | switch (mode) { | ||
257 | case PARAVIRT_LAZY_NONE: | ||
258 | BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE); | ||
259 | break; | ||
260 | |||
261 | case PARAVIRT_LAZY_MMU: | ||
262 | case PARAVIRT_LAZY_CPU: | ||
263 | BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE); | ||
264 | break; | ||
265 | |||
266 | case PARAVIRT_LAZY_FLUSH: | ||
267 | /* flush if necessary, but don't change state */ | ||
268 | if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE) | ||
269 | xen_mc_flush(); | ||
270 | return; | ||
271 | } | ||
272 | |||
273 | xen_mc_flush(); | ||
274 | x86_write_percpu(xen_lazy_mode, mode); | ||
275 | } | ||
276 | |||
/* Task-register read for Xen guests: always reports 0. */
static unsigned long xen_store_tr(void)
{
	const unsigned long tr = 0;

	return tr;
}
281 | |||
282 | static void xen_set_ldt(const void *addr, unsigned entries) | ||
283 | { | ||
284 | unsigned long linear_addr = (unsigned long)addr; | ||
285 | struct mmuext_op *op; | ||
286 | struct multicall_space mcs = xen_mc_entry(sizeof(*op)); | ||
287 | |||
288 | op = mcs.args; | ||
289 | op->cmd = MMUEXT_SET_LDT; | ||
290 | if (linear_addr) { | ||
291 | /* ldt my be vmalloced, use arbitrary_virt_to_machine */ | ||
292 | xmaddr_t maddr; | ||
293 | maddr = arbitrary_virt_to_machine((unsigned long)addr); | ||
294 | linear_addr = (unsigned long)maddr.maddr; | ||
295 | } | ||
296 | op->arg1.linear_addr = linear_addr; | ||
297 | op->arg2.nr_ents = entries; | ||
298 | |||
299 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | ||
300 | |||
301 | xen_mc_issue(PARAVIRT_LAZY_CPU); | ||
302 | } | ||
303 | |||
304 | static void xen_load_gdt(const struct Xgt_desc_struct *dtr) | ||
305 | { | ||
306 | unsigned long *frames; | ||
307 | unsigned long va = dtr->address; | ||
308 | unsigned int size = dtr->size + 1; | ||
309 | unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; | ||
310 | int f; | ||
311 | struct multicall_space mcs; | ||
312 | |||
313 | /* A GDT can be up to 64k in size, which corresponds to 8192 | ||
314 | 8-byte entries, or 16 4k pages.. */ | ||
315 | |||
316 | BUG_ON(size > 65536); | ||
317 | BUG_ON(va & ~PAGE_MASK); | ||
318 | |||
319 | mcs = xen_mc_entry(sizeof(*frames) * pages); | ||
320 | frames = mcs.args; | ||
321 | |||
322 | for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { | ||
323 | frames[f] = virt_to_mfn(va); | ||
324 | make_lowmem_page_readonly((void *)va); | ||
325 | } | ||
326 | |||
327 | MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); | ||
328 | |||
329 | xen_mc_issue(PARAVIRT_LAZY_CPU); | ||
330 | } | ||
331 | |||
332 | static void load_TLS_descriptor(struct thread_struct *t, | ||
333 | unsigned int cpu, unsigned int i) | ||
334 | { | ||
335 | struct desc_struct *gdt = get_cpu_gdt_table(cpu); | ||
336 | xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); | ||
337 | struct multicall_space mc = __xen_mc_entry(0); | ||
338 | |||
339 | MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); | ||
340 | } | ||
341 | |||
342 | static void xen_load_tls(struct thread_struct *t, unsigned int cpu) | ||
343 | { | ||
344 | xen_mc_batch(); | ||
345 | |||
346 | load_TLS_descriptor(t, cpu, 0); | ||
347 | load_TLS_descriptor(t, cpu, 1); | ||
348 | load_TLS_descriptor(t, cpu, 2); | ||
349 | |||
350 | xen_mc_issue(PARAVIRT_LAZY_CPU); | ||
351 | |||
352 | /* | ||
353 | * XXX sleazy hack: If we're being called in a lazy-cpu zone, | ||
354 | * it means we're in a context switch, and %gs has just been | ||
355 | * saved. This means we can zero it out to prevent faults on | ||
356 | * exit from the hypervisor if the next process has no %gs. | ||
357 | * Either way, it has been saved, and the new value will get | ||
358 | * loaded properly. This will go away as soon as Xen has been | ||
359 | * modified to not save/restore %gs for normal hypercalls. | ||
360 | */ | ||
361 | if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) | ||
362 | loadsegment(gs, 0); | ||
363 | } | ||
364 | |||
365 | static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, | ||
366 | u32 low, u32 high) | ||
367 | { | ||
368 | unsigned long lp = (unsigned long)&dt[entrynum]; | ||
369 | xmaddr_t mach_lp = virt_to_machine(lp); | ||
370 | u64 entry = (u64)high << 32 | low; | ||
371 | |||
372 | preempt_disable(); | ||
373 | |||
374 | xen_mc_flush(); | ||
375 | if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) | ||
376 | BUG(); | ||
377 | |||
378 | preempt_enable(); | ||
379 | } | ||
380 | |||
381 | static int cvt_gate_to_trap(int vector, u32 low, u32 high, | ||
382 | struct trap_info *info) | ||
383 | { | ||
384 | u8 type, dpl; | ||
385 | |||
386 | type = (high >> 8) & 0x1f; | ||
387 | dpl = (high >> 13) & 3; | ||
388 | |||
389 | if (type != 0xf && type != 0xe) | ||
390 | return 0; | ||
391 | |||
392 | info->vector = vector; | ||
393 | info->address = (high & 0xffff0000) | (low & 0x0000ffff); | ||
394 | info->cs = low >> 16; | ||
395 | info->flags = dpl; | ||
396 | /* interrupt gates clear IF */ | ||
397 | if (type == 0xe) | ||
398 | info->flags |= 4; | ||
399 | |||
400 | return 1; | ||
401 | } | ||
402 | |||
403 | /* Locations of each CPU's IDT */ | ||
404 | static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc); | ||
405 | |||
406 | /* Set an IDT entry. If the entry is part of the current IDT, then | ||
407 | also update Xen. */ | ||
408 | static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, | ||
409 | u32 low, u32 high) | ||
410 | { | ||
411 | unsigned long p = (unsigned long)&dt[entrynum]; | ||
412 | unsigned long start, end; | ||
413 | |||
414 | preempt_disable(); | ||
415 | |||
416 | start = __get_cpu_var(idt_desc).address; | ||
417 | end = start + __get_cpu_var(idt_desc).size + 1; | ||
418 | |||
419 | xen_mc_flush(); | ||
420 | |||
421 | write_dt_entry(dt, entrynum, low, high); | ||
422 | |||
423 | if (p >= start && (p + 8) <= end) { | ||
424 | struct trap_info info[2]; | ||
425 | |||
426 | info[1].address = 0; | ||
427 | |||
428 | if (cvt_gate_to_trap(entrynum, low, high, &info[0])) | ||
429 | if (HYPERVISOR_set_trap_table(info)) | ||
430 | BUG(); | ||
431 | } | ||
432 | |||
433 | preempt_enable(); | ||
434 | } | ||
435 | |||
436 | static void xen_convert_trap_info(const struct Xgt_desc_struct *desc, | ||
437 | struct trap_info *traps) | ||
438 | { | ||
439 | unsigned in, out, count; | ||
440 | |||
441 | count = (desc->size+1) / 8; | ||
442 | BUG_ON(count > 256); | ||
443 | |||
444 | for (in = out = 0; in < count; in++) { | ||
445 | const u32 *entry = (u32 *)(desc->address + in * 8); | ||
446 | |||
447 | if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) | ||
448 | out++; | ||
449 | } | ||
450 | traps[out].address = 0; | ||
451 | } | ||
452 | |||
453 | void xen_copy_trap_info(struct trap_info *traps) | ||
454 | { | ||
455 | const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc); | ||
456 | |||
457 | xen_convert_trap_info(desc, traps); | ||
458 | } | ||
459 | |||
460 | /* Load a new IDT into Xen. In principle this can be per-CPU, so we | ||
461 | hold a spinlock to protect the static traps[] array (static because | ||
462 | it avoids allocation, and saves stack space). */ | ||
463 | static void xen_load_idt(const struct Xgt_desc_struct *desc) | ||
464 | { | ||
465 | static DEFINE_SPINLOCK(lock); | ||
466 | static struct trap_info traps[257]; | ||
467 | |||
468 | spin_lock(&lock); | ||
469 | |||
470 | __get_cpu_var(idt_desc) = *desc; | ||
471 | |||
472 | xen_convert_trap_info(desc, traps); | ||
473 | |||
474 | xen_mc_flush(); | ||
475 | if (HYPERVISOR_set_trap_table(traps)) | ||
476 | BUG(); | ||
477 | |||
478 | spin_unlock(&lock); | ||
479 | } | ||
480 | |||
481 | /* Write a GDT descriptor entry. Ignore LDT descriptors, since | ||
482 | they're handled differently. */ | ||
483 | static void xen_write_gdt_entry(struct desc_struct *dt, int entry, | ||
484 | u32 low, u32 high) | ||
485 | { | ||
486 | preempt_disable(); | ||
487 | |||
488 | switch ((high >> 8) & 0xff) { | ||
489 | case DESCTYPE_LDT: | ||
490 | case DESCTYPE_TSS: | ||
491 | /* ignore */ | ||
492 | break; | ||
493 | |||
494 | default: { | ||
495 | xmaddr_t maddr = virt_to_machine(&dt[entry]); | ||
496 | u64 desc = (u64)high << 32 | low; | ||
497 | |||
498 | xen_mc_flush(); | ||
499 | if (HYPERVISOR_update_descriptor(maddr.maddr, desc)) | ||
500 | BUG(); | ||
501 | } | ||
502 | |||
503 | } | ||
504 | |||
505 | preempt_enable(); | ||
506 | } | ||
507 | |||
508 | static void xen_load_esp0(struct tss_struct *tss, | ||
509 | struct thread_struct *thread) | ||
510 | { | ||
511 | struct multicall_space mcs = xen_mc_entry(0); | ||
512 | MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0); | ||
513 | xen_mc_issue(PARAVIRT_LAZY_CPU); | ||
514 | } | ||
515 | |||
516 | static void xen_set_iopl_mask(unsigned mask) | ||
517 | { | ||
518 | struct physdev_set_iopl set_iopl; | ||
519 | |||
520 | /* Force the change at ring 0. */ | ||
521 | set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; | ||
522 | HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); | ||
523 | } | ||
524 | |||
/* I/O delay hook: intentionally does nothing. */
static void xen_io_delay(void)
{
	return;
}
528 | |||
529 | #ifdef CONFIG_X86_LOCAL_APIC | ||
/* APIC register read stub: @reg is ignored and 0 is always returned. */
static unsigned long xen_apic_read(unsigned long reg)
{
	const unsigned long value = 0;

	return value;
}
534 | |||
/* APIC register write stub: nothing is written, the call only warns. */
static void xen_apic_write(unsigned long reg, unsigned long val)
{
	/* Warn so that any stray references show up */
	WARN_ON(1);
}
540 | #endif | ||
541 | |||
542 | static void xen_flush_tlb(void) | ||
543 | { | ||
544 | struct mmuext_op *op; | ||
545 | struct multicall_space mcs = xen_mc_entry(sizeof(*op)); | ||
546 | |||
547 | op = mcs.args; | ||
548 | op->cmd = MMUEXT_TLB_FLUSH_LOCAL; | ||
549 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | ||
550 | |||
551 | xen_mc_issue(PARAVIRT_LAZY_MMU); | ||
552 | } | ||
553 | |||
554 | static void xen_flush_tlb_single(unsigned long addr) | ||
555 | { | ||
556 | struct mmuext_op *op; | ||
557 | struct multicall_space mcs = xen_mc_entry(sizeof(*op)); | ||
558 | |||
559 | op = mcs.args; | ||
560 | op->cmd = MMUEXT_INVLPG_LOCAL; | ||
561 | op->arg1.linear_addr = addr & PAGE_MASK; | ||
562 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | ||
563 | |||
564 | xen_mc_issue(PARAVIRT_LAZY_MMU); | ||
565 | } | ||
566 | |||
567 | static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, | ||
568 | unsigned long va) | ||
569 | { | ||
570 | struct { | ||
571 | struct mmuext_op op; | ||
572 | cpumask_t mask; | ||
573 | } *args; | ||
574 | cpumask_t cpumask = *cpus; | ||
575 | struct multicall_space mcs; | ||
576 | |||
577 | /* | ||
578 | * A couple of (to be removed) sanity checks: | ||
579 | * | ||
580 | * - current CPU must not be in mask | ||
581 | * - mask must exist :) | ||
582 | */ | ||
583 | BUG_ON(cpus_empty(cpumask)); | ||
584 | BUG_ON(cpu_isset(smp_processor_id(), cpumask)); | ||
585 | BUG_ON(!mm); | ||
586 | |||
587 | /* If a CPU which we ran on has gone down, OK. */ | ||
588 | cpus_and(cpumask, cpumask, cpu_online_map); | ||
589 | if (cpus_empty(cpumask)) | ||
590 | return; | ||
591 | |||
592 | mcs = xen_mc_entry(sizeof(*args)); | ||
593 | args = mcs.args; | ||
594 | args->mask = cpumask; | ||
595 | args->op.arg2.vcpumask = &args->mask; | ||
596 | |||
597 | if (va == TLB_FLUSH_ALL) { | ||
598 | args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; | ||
599 | } else { | ||
600 | args->op.cmd = MMUEXT_INVLPG_MULTI; | ||
601 | args->op.arg1.linear_addr = va; | ||
602 | } | ||
603 | |||
604 | MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); | ||
605 | |||
606 | xen_mc_issue(PARAVIRT_LAZY_MMU); | ||
607 | } | ||
608 | |||
609 | static void xen_write_cr2(unsigned long cr2) | ||
610 | { | ||
611 | x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; | ||
612 | } | ||
613 | |||
614 | static unsigned long xen_read_cr2(void) | ||
615 | { | ||
616 | return x86_read_percpu(xen_vcpu)->arch.cr2; | ||
617 | } | ||
618 | |||
619 | static unsigned long xen_read_cr2_direct(void) | ||
620 | { | ||
621 | return x86_read_percpu(xen_vcpu_info.arch.cr2); | ||
622 | } | ||
623 | |||
/*
 * cr4 writes are simply dropped; Xen doesn't allow us to do anything
 * anyway.
 */
static void xen_write_cr4(unsigned long cr4)
{
	(void)cr4;
}
629 | |||
630 | static unsigned long xen_read_cr3(void) | ||
631 | { | ||
632 | return x86_read_percpu(xen_cr3); | ||
633 | } | ||
634 | |||
635 | static void xen_write_cr3(unsigned long cr3) | ||
636 | { | ||
637 | BUG_ON(preemptible()); | ||
638 | |||
639 | if (cr3 == x86_read_percpu(xen_cr3)) { | ||
640 | /* just a simple tlb flush */ | ||
641 | xen_flush_tlb(); | ||
642 | return; | ||
643 | } | ||
644 | |||
645 | x86_write_percpu(xen_cr3, cr3); | ||
646 | |||
647 | |||
648 | { | ||
649 | struct mmuext_op *op; | ||
650 | struct multicall_space mcs = xen_mc_entry(sizeof(*op)); | ||
651 | unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); | ||
652 | |||
653 | op = mcs.args; | ||
654 | op->cmd = MMUEXT_NEW_BASEPTR; | ||
655 | op->arg1.mfn = mfn; | ||
656 | |||
657 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | ||
658 | |||
659 | xen_mc_issue(PARAVIRT_LAZY_CPU); | ||
660 | } | ||
661 | } | ||
662 | |||
663 | /* Early in boot, while setting up the initial pagetable, assume | ||
664 | everything is pinned. */ | ||
665 | static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) | ||
666 | { | ||
667 | BUG_ON(mem_map); /* should only be used early */ | ||
668 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); | ||
669 | } | ||
670 | |||
671 | /* This needs to make sure the new pte page is pinned iff its being | ||
672 | attached to a pinned pagetable. */ | ||
673 | static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) | ||
674 | { | ||
675 | struct page *page = pfn_to_page(pfn); | ||
676 | |||
677 | if (PagePinned(virt_to_page(mm->pgd))) { | ||
678 | SetPagePinned(page); | ||
679 | |||
680 | if (!PageHighMem(page)) | ||
681 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); | ||
682 | else | ||
683 | /* make sure there are no stray mappings of | ||
684 | this page */ | ||
685 | kmap_flush_unused(); | ||
686 | } | ||
687 | } | ||
688 | |||
689 | /* This should never happen until we're OK to use struct page */ | ||
690 | static void xen_release_pt(u32 pfn) | ||
691 | { | ||
692 | struct page *page = pfn_to_page(pfn); | ||
693 | |||
694 | if (PagePinned(page)) { | ||
695 | if (!PageHighMem(page)) | ||
696 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); | ||
697 | } | ||
698 | } | ||
699 | |||
700 | #ifdef CONFIG_HIGHPTE | ||
701 | static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) | ||
702 | { | ||
703 | pgprot_t prot = PAGE_KERNEL; | ||
704 | |||
705 | if (PagePinned(page)) | ||
706 | prot = PAGE_KERNEL_RO; | ||
707 | |||
708 | if (0 && PageHighMem(page)) | ||
709 | printk("mapping highpte %lx type %d prot %s\n", | ||
710 | page_to_pfn(page), type, | ||
711 | (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); | ||
712 | |||
713 | return kmap_atomic_prot(page, type, prot); | ||
714 | } | ||
715 | #endif | ||
716 | |||
717 | static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) | ||
718 | { | ||
719 | /* If there's an existing pte, then don't allow _PAGE_RW to be set */ | ||
720 | if (pte_val_ma(*ptep) & _PAGE_PRESENT) | ||
721 | pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & | ||
722 | pte_val_ma(pte)); | ||
723 | |||
724 | return pte; | ||
725 | } | ||
726 | |||
727 | /* Init-time set_pte while constructing initial pagetables, which | ||
728 | doesn't allow RO pagetable pages to be remapped RW */ | ||
729 | static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) | ||
730 | { | ||
731 | pte = mask_rw_pte(ptep, pte); | ||
732 | |||
733 | xen_set_pte(ptep, pte); | ||
734 | } | ||
735 | |||
736 | static __init void xen_pagetable_setup_start(pgd_t *base) | ||
737 | { | ||
738 | pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; | ||
739 | |||
740 | /* special set_pte for pagetable initialization */ | ||
741 | paravirt_ops.set_pte = xen_set_pte_init; | ||
742 | |||
743 | init_mm.pgd = base; | ||
744 | /* | ||
745 | * copy top-level of Xen-supplied pagetable into place. For | ||
746 | * !PAE we can use this as-is, but for PAE it is a stand-in | ||
747 | * while we copy the pmd pages. | ||
748 | */ | ||
749 | memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); | ||
750 | |||
751 | if (PTRS_PER_PMD > 1) { | ||
752 | int i; | ||
753 | /* | ||
754 | * For PAE, need to allocate new pmds, rather than | ||
755 | * share Xen's, since Xen doesn't like pmd's being | ||
756 | * shared between address spaces. | ||
757 | */ | ||
758 | for (i = 0; i < PTRS_PER_PGD; i++) { | ||
759 | if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { | ||
760 | pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); | ||
761 | |||
762 | memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), | ||
763 | PAGE_SIZE); | ||
764 | |||
765 | make_lowmem_page_readonly(pmd); | ||
766 | |||
767 | set_pgd(&base[i], __pgd(1 + __pa(pmd))); | ||
768 | } else | ||
769 | pgd_clear(&base[i]); | ||
770 | } | ||
771 | } | ||
772 | |||
773 | /* make sure zero_page is mapped RO so we can use it in pagetables */ | ||
774 | make_lowmem_page_readonly(empty_zero_page); | ||
775 | make_lowmem_page_readonly(base); | ||
776 | /* | ||
777 | * Switch to new pagetable. This is done before | ||
778 | * pagetable_init has done anything so that the new pages | ||
779 | * added to the table can be prepared properly for Xen. | ||
780 | */ | ||
781 | xen_write_cr3(__pa(base)); | ||
782 | } | ||
783 | |||
784 | static __init void xen_pagetable_setup_done(pgd_t *base) | ||
785 | { | ||
786 | /* This will work as long as patching hasn't happened yet | ||
787 | (which it hasn't) */ | ||
788 | paravirt_ops.alloc_pt = xen_alloc_pt; | ||
789 | paravirt_ops.set_pte = xen_set_pte; | ||
790 | |||
791 | if (!xen_feature(XENFEAT_auto_translated_physmap)) { | ||
792 | /* | ||
793 | * Create a mapping for the shared info page. | ||
794 | * Should be set_fixmap(), but shared_info is a machine | ||
795 | * address with no corresponding pseudo-phys address. | ||
796 | */ | ||
797 | set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP), | ||
798 | PFN_DOWN(xen_start_info->shared_info), | ||
799 | PAGE_KERNEL); | ||
800 | |||
801 | HYPERVISOR_shared_info = | ||
802 | (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); | ||
803 | |||
804 | } else | ||
805 | HYPERVISOR_shared_info = | ||
806 | (struct shared_info *)__va(xen_start_info->shared_info); | ||
807 | |||
808 | /* Actually pin the pagetable down, but we can't set PG_pinned | ||
809 | yet because the page structures don't exist yet. */ | ||
810 | { | ||
811 | struct mmuext_op op; | ||
812 | #ifdef CONFIG_X86_PAE | ||
813 | op.cmd = MMUEXT_PIN_L3_TABLE; | ||
814 | #else | ||
815 | op.cmd = MMUEXT_PIN_L3_TABLE; | ||
816 | #endif | ||
817 | op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); | ||
818 | if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) | ||
819 | BUG(); | ||
820 | } | ||
821 | } | ||
822 | |||
823 | /* This is called once we have the cpu_possible_map */ | ||
824 | void __init xen_setup_vcpu_info_placement(void) | ||
825 | { | ||
826 | int cpu; | ||
827 | |||
828 | for_each_possible_cpu(cpu) | ||
829 | xen_vcpu_setup(cpu); | ||
830 | |||
831 | /* xen_vcpu_setup managed to place the vcpu_info within the | ||
832 | percpu area for all cpus, so make use of it */ | ||
833 | if (have_vcpu_info_placement) { | ||
834 | printk(KERN_INFO "Xen: using vcpu_info placement\n"); | ||
835 | |||
836 | paravirt_ops.save_fl = xen_save_fl_direct; | ||
837 | paravirt_ops.restore_fl = xen_restore_fl_direct; | ||
838 | paravirt_ops.irq_disable = xen_irq_disable_direct; | ||
839 | paravirt_ops.irq_enable = xen_irq_enable_direct; | ||
840 | paravirt_ops.read_cr2 = xen_read_cr2_direct; | ||
841 | paravirt_ops.iret = xen_iret_direct; | ||
842 | } | ||
843 | } | ||
844 | |||
845 | static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, | ||
846 | unsigned long addr, unsigned len) | ||
847 | { | ||
848 | char *start, *end, *reloc; | ||
849 | unsigned ret; | ||
850 | |||
851 | start = end = reloc = NULL; | ||
852 | |||
853 | #define SITE(x) \ | ||
854 | case PARAVIRT_PATCH(x): \ | ||
855 | if (have_vcpu_info_placement) { \ | ||
856 | start = (char *)xen_##x##_direct; \ | ||
857 | end = xen_##x##_direct_end; \ | ||
858 | reloc = xen_##x##_direct_reloc; \ | ||
859 | } \ | ||
860 | goto patch_site | ||
861 | |||
862 | switch (type) { | ||
863 | SITE(irq_enable); | ||
864 | SITE(irq_disable); | ||
865 | SITE(save_fl); | ||
866 | SITE(restore_fl); | ||
867 | #undef SITE | ||
868 | |||
869 | patch_site: | ||
870 | if (start == NULL || (end-start) > len) | ||
871 | goto default_patch; | ||
872 | |||
873 | ret = paravirt_patch_insns(insnbuf, len, start, end); | ||
874 | |||
875 | /* Note: because reloc is assigned from something that | ||
876 | appears to be an array, gcc assumes it's non-null, | ||
877 | but doesn't know its relationship with start and | ||
878 | end. */ | ||
879 | if (reloc > start && reloc < end) { | ||
880 | int reloc_off = reloc - start; | ||
881 | long *relocp = (long *)(insnbuf + reloc_off); | ||
882 | long delta = start - (char *)addr; | ||
883 | |||
884 | *relocp += delta; | ||
885 | } | ||
886 | break; | ||
887 | |||
888 | default_patch: | ||
889 | default: | ||
890 | ret = paravirt_patch_default(type, clobbers, insnbuf, | ||
891 | addr, len); | ||
892 | break; | ||
893 | } | ||
894 | |||
895 | return ret; | ||
896 | } | ||
897 | |||
898 | static const struct paravirt_ops xen_paravirt_ops __initdata = { | ||
899 | .paravirt_enabled = 1, | ||
900 | .shared_kernel_pmd = 0, | ||
901 | |||
902 | .name = "Xen", | ||
903 | .banner = xen_banner, | ||
904 | |||
905 | .patch = xen_patch, | ||
906 | |||
907 | .memory_setup = xen_memory_setup, | ||
908 | .arch_setup = xen_arch_setup, | ||
909 | .init_IRQ = xen_init_IRQ, | ||
910 | .post_allocator_init = xen_mark_init_mm_pinned, | ||
911 | |||
912 | .time_init = xen_time_init, | ||
913 | .set_wallclock = xen_set_wallclock, | ||
914 | .get_wallclock = xen_get_wallclock, | ||
915 | .get_cpu_khz = xen_cpu_khz, | ||
916 | .sched_clock = xen_sched_clock, | ||
917 | |||
918 | .cpuid = xen_cpuid, | ||
919 | |||
920 | .set_debugreg = xen_set_debugreg, | ||
921 | .get_debugreg = xen_get_debugreg, | ||
922 | |||
923 | .clts = native_clts, | ||
924 | |||
925 | .read_cr0 = native_read_cr0, | ||
926 | .write_cr0 = native_write_cr0, | ||
927 | |||
928 | .read_cr2 = xen_read_cr2, | ||
929 | .write_cr2 = xen_write_cr2, | ||
930 | |||
931 | .read_cr3 = xen_read_cr3, | ||
932 | .write_cr3 = xen_write_cr3, | ||
933 | |||
934 | .read_cr4 = native_read_cr4, | ||
935 | .read_cr4_safe = native_read_cr4_safe, | ||
936 | .write_cr4 = xen_write_cr4, | ||
937 | |||
938 | .save_fl = xen_save_fl, | ||
939 | .restore_fl = xen_restore_fl, | ||
940 | .irq_disable = xen_irq_disable, | ||
941 | .irq_enable = xen_irq_enable, | ||
942 | .safe_halt = xen_safe_halt, | ||
943 | .halt = xen_halt, | ||
944 | .wbinvd = native_wbinvd, | ||
945 | |||
946 | .read_msr = native_read_msr_safe, | ||
947 | .write_msr = native_write_msr_safe, | ||
948 | .read_tsc = native_read_tsc, | ||
949 | .read_pmc = native_read_pmc, | ||
950 | |||
951 | .iret = (void *)&hypercall_page[__HYPERVISOR_iret], | ||
952 | .irq_enable_sysexit = NULL, /* never called */ | ||
953 | |||
954 | .load_tr_desc = paravirt_nop, | ||
955 | .set_ldt = xen_set_ldt, | ||
956 | .load_gdt = xen_load_gdt, | ||
957 | .load_idt = xen_load_idt, | ||
958 | .load_tls = xen_load_tls, | ||
959 | |||
960 | .store_gdt = native_store_gdt, | ||
961 | .store_idt = native_store_idt, | ||
962 | .store_tr = xen_store_tr, | ||
963 | |||
964 | .write_ldt_entry = xen_write_ldt_entry, | ||
965 | .write_gdt_entry = xen_write_gdt_entry, | ||
966 | .write_idt_entry = xen_write_idt_entry, | ||
967 | .load_esp0 = xen_load_esp0, | ||
968 | |||
969 | .set_iopl_mask = xen_set_iopl_mask, | ||
970 | .io_delay = xen_io_delay, | ||
971 | |||
972 | #ifdef CONFIG_X86_LOCAL_APIC | ||
973 | .apic_write = xen_apic_write, | ||
974 | .apic_write_atomic = xen_apic_write, | ||
975 | .apic_read = xen_apic_read, | ||
976 | .setup_boot_clock = paravirt_nop, | ||
977 | .setup_secondary_clock = paravirt_nop, | ||
978 | .startup_ipi_hook = paravirt_nop, | ||
979 | #endif | ||
980 | |||
981 | .flush_tlb_user = xen_flush_tlb, | ||
982 | .flush_tlb_kernel = xen_flush_tlb, | ||
983 | .flush_tlb_single = xen_flush_tlb_single, | ||
984 | .flush_tlb_others = xen_flush_tlb_others, | ||
985 | |||
986 | .pte_update = paravirt_nop, | ||
987 | .pte_update_defer = paravirt_nop, | ||
988 | |||
989 | .pagetable_setup_start = xen_pagetable_setup_start, | ||
990 | .pagetable_setup_done = xen_pagetable_setup_done, | ||
991 | |||
992 | .alloc_pt = xen_alloc_pt_init, | ||
993 | .release_pt = xen_release_pt, | ||
994 | .alloc_pd = paravirt_nop, | ||
995 | .alloc_pd_clone = paravirt_nop, | ||
996 | .release_pd = paravirt_nop, | ||
997 | |||
998 | #ifdef CONFIG_HIGHPTE | ||
999 | .kmap_atomic_pte = xen_kmap_atomic_pte, | ||
1000 | #endif | ||
1001 | |||
1002 | .set_pte = NULL, /* see xen_pagetable_setup_* */ | ||
1003 | .set_pte_at = xen_set_pte_at, | ||
1004 | .set_pmd = xen_set_pmd, | ||
1005 | |||
1006 | .pte_val = xen_pte_val, | ||
1007 | .pgd_val = xen_pgd_val, | ||
1008 | |||
1009 | .make_pte = xen_make_pte, | ||
1010 | .make_pgd = xen_make_pgd, | ||
1011 | |||
1012 | #ifdef CONFIG_X86_PAE | ||
1013 | .set_pte_atomic = xen_set_pte_atomic, | ||
1014 | .set_pte_present = xen_set_pte_at, | ||
1015 | .set_pud = xen_set_pud, | ||
1016 | .pte_clear = xen_pte_clear, | ||
1017 | .pmd_clear = xen_pmd_clear, | ||
1018 | |||
1019 | .make_pmd = xen_make_pmd, | ||
1020 | .pmd_val = xen_pmd_val, | ||
1021 | #endif /* PAE */ | ||
1022 | |||
1023 | .activate_mm = xen_activate_mm, | ||
1024 | .dup_mmap = xen_dup_mmap, | ||
1025 | .exit_mmap = xen_exit_mmap, | ||
1026 | |||
1027 | .set_lazy_mode = xen_set_lazy_mode, | ||
1028 | }; | ||
1029 | |||
1030 | #ifdef CONFIG_SMP | ||
1031 | static const struct smp_ops xen_smp_ops __initdata = { | ||
1032 | .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, | ||
1033 | .smp_prepare_cpus = xen_smp_prepare_cpus, | ||
1034 | .cpu_up = xen_cpu_up, | ||
1035 | .smp_cpus_done = xen_smp_cpus_done, | ||
1036 | |||
1037 | .smp_send_stop = xen_smp_send_stop, | ||
1038 | .smp_send_reschedule = xen_smp_send_reschedule, | ||
1039 | .smp_call_function_mask = xen_smp_call_function_mask, | ||
1040 | }; | ||
1041 | #endif /* CONFIG_SMP */ | ||
1042 | |||
1043 | static void xen_reboot(int reason) | ||
1044 | { | ||
1045 | #ifdef CONFIG_SMP | ||
1046 | smp_send_stop(); | ||
1047 | #endif | ||
1048 | |||
1049 | if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) | ||
1050 | BUG(); | ||
1051 | } | ||
1052 | |||
1053 | static void xen_restart(char *msg) | ||
1054 | { | ||
1055 | xen_reboot(SHUTDOWN_reboot); | ||
1056 | } | ||
1057 | |||
1058 | static void xen_emergency_restart(void) | ||
1059 | { | ||
1060 | xen_reboot(SHUTDOWN_reboot); | ||
1061 | } | ||
1062 | |||
1063 | static void xen_machine_halt(void) | ||
1064 | { | ||
1065 | xen_reboot(SHUTDOWN_poweroff); | ||
1066 | } | ||
1067 | |||
1068 | static void xen_crash_shutdown(struct pt_regs *regs) | ||
1069 | { | ||
1070 | xen_reboot(SHUTDOWN_crash); | ||
1071 | } | ||
1072 | |||
1073 | static const struct machine_ops __initdata xen_machine_ops = { | ||
1074 | .restart = xen_restart, | ||
1075 | .halt = xen_machine_halt, | ||
1076 | .power_off = xen_machine_halt, | ||
1077 | .shutdown = xen_machine_halt, | ||
1078 | .crash_shutdown = xen_crash_shutdown, | ||
1079 | .emergency_restart = xen_emergency_restart, | ||
1080 | }; | ||
1081 | |||
1082 | |||
1083 | /* First C function to be called on Xen boot */ | ||
1084 | asmlinkage void __init xen_start_kernel(void) | ||
1085 | { | ||
1086 | pgd_t *pgd; | ||
1087 | |||
1088 | if (!xen_start_info) | ||
1089 | return; | ||
1090 | |||
1091 | BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); | ||
1092 | |||
1093 | /* Install Xen paravirt ops */ | ||
1094 | paravirt_ops = xen_paravirt_ops; | ||
1095 | machine_ops = xen_machine_ops; | ||
1096 | |||
1097 | #ifdef CONFIG_SMP | ||
1098 | smp_ops = xen_smp_ops; | ||
1099 | #endif | ||
1100 | |||
1101 | xen_setup_features(); | ||
1102 | |||
1103 | /* Get mfn list */ | ||
1104 | if (!xen_feature(XENFEAT_auto_translated_physmap)) | ||
1105 | phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; | ||
1106 | |||
1107 | pgd = (pgd_t *)xen_start_info->pt_base; | ||
1108 | |||
1109 | init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; | ||
1110 | |||
1111 | init_mm.pgd = pgd; /* use the Xen pagetables to start */ | ||
1112 | |||
1113 | /* keep using Xen gdt for now; no urgent need to change it */ | ||
1114 | |||
1115 | x86_write_percpu(xen_cr3, __pa(pgd)); | ||
1116 | |||
1117 | #ifdef CONFIG_SMP | ||
1118 | /* Don't do the full vcpu_info placement stuff until we have a | ||
1119 | possible map. */ | ||
1120 | per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; | ||
1121 | #else | ||
1122 | /* May as well do it now, since there's no good time to call | ||
1123 | it later on UP. */ | ||
1124 | xen_setup_vcpu_info_placement(); | ||
1125 | #endif | ||
1126 | |||
1127 | paravirt_ops.kernel_rpl = 1; | ||
1128 | if (xen_feature(XENFEAT_supervisor_mode_kernel)) | ||
1129 | paravirt_ops.kernel_rpl = 0; | ||
1130 | |||
1131 | /* set the limit of our address space */ | ||
1132 | reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); | ||
1133 | |||
1134 | /* set up basic CPUID stuff */ | ||
1135 | cpu_detect(&new_cpu_data); | ||
1136 | new_cpu_data.hard_math = 1; | ||
1137 | new_cpu_data.x86_capability[0] = cpuid_edx(1); | ||
1138 | |||
1139 | /* Poke various useful things into boot_params */ | ||
1140 | LOADER_TYPE = (9 << 4) | 0; | ||
1141 | INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; | ||
1142 | INITRD_SIZE = xen_start_info->mod_len; | ||
1143 | |||
1144 | /* Start the world */ | ||
1145 | start_kernel(); | ||
1146 | } | ||
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c new file mode 100644 index 000000000000..da1b173547a1 --- /dev/null +++ b/arch/x86/xen/events.c | |||
@@ -0,0 +1,591 @@ | |||
1 | /* | ||
2 | * Xen event channels | ||
3 | * | ||
4 | * Xen models interrupts with abstract event channels. Because each | ||
5 | * domain gets 1024 event channels, but NR_IRQS is not that large, we | ||
6 | * must dynamically map irqs<->event channels. The event channels | ||
7 | * interface with the rest of the kernel by defining a xen interrupt | ||
8 | * chip. When an event is received, it is mapped to an irq and sent | ||
9 | * through the normal interrupt processing path. | ||
10 | * | ||
11 | * There are four kinds of events which can be mapped to an event | ||
12 | * channel: | ||
13 | * | ||
14 | * 1. Inter-domain notifications. This includes all the virtual | ||
15 | * device events, since they're driven by back-ends in another domain | ||
16 | * (typically dom0). | ||
17 | * 2. VIRQs, typically used for timers. These are per-cpu events. | ||
18 | * 3. IPIs. | ||
19 | * 4. Hardware interrupts. Not supported at present. | ||
20 | * | ||
21 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | ||
22 | */ | ||
23 | |||
24 | #include <linux/linkage.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/irq.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/string.h> | ||
29 | |||
30 | #include <asm/ptrace.h> | ||
31 | #include <asm/irq.h> | ||
32 | #include <asm/sync_bitops.h> | ||
33 | #include <asm/xen/hypercall.h> | ||
34 | #include <asm/xen/hypervisor.h> | ||
35 | |||
36 | #include <xen/events.h> | ||
37 | #include <xen/interface/xen.h> | ||
38 | #include <xen/interface/event_channel.h> | ||
39 | |||
40 | #include "xen-ops.h" | ||
41 | |||
42 | /* | ||
43 | * This lock protects updates to the following mapping and reference-count | ||
44 | * arrays. The lock does not need to be acquired to read the mapping tables. | ||
45 | */ | ||
46 | static DEFINE_SPINLOCK(irq_mapping_update_lock); | ||
47 | |||
48 | /* IRQ <-> VIRQ mapping. */ | ||
49 | static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; | ||
50 | |||
51 | /* IRQ <-> IPI mapping */ | ||
52 | static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; | ||
53 | |||
54 | /* Packed IRQ information: binding type, sub-type index, and event channel. */ | ||
55 | struct packed_irq | ||
56 | { | ||
57 | unsigned short evtchn; | ||
58 | unsigned char index; | ||
59 | unsigned char type; | ||
60 | }; | ||
61 | |||
62 | static struct packed_irq irq_info[NR_IRQS]; | ||
63 | |||
64 | /* Binding types. */ | ||
65 | enum { | ||
66 | IRQT_UNBOUND, | ||
67 | IRQT_PIRQ, | ||
68 | IRQT_VIRQ, | ||
69 | IRQT_IPI, | ||
70 | IRQT_EVTCHN | ||
71 | }; | ||
72 | |||
73 | /* Convenient shorthand for packed representation of an unbound IRQ. */ | ||
74 | #define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) | ||
75 | |||
76 | static int evtchn_to_irq[NR_EVENT_CHANNELS] = { | ||
77 | [0 ... NR_EVENT_CHANNELS-1] = -1 | ||
78 | }; | ||
79 | static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; | ||
80 | static u8 cpu_evtchn[NR_EVENT_CHANNELS]; | ||
81 | |||
82 | /* Reference counts for bindings to IRQs. */ | ||
83 | static int irq_bindcount[NR_IRQS]; | ||
84 | |||
85 | /* Xen will never allocate port zero for any purpose. */ | ||
86 | #define VALID_EVTCHN(chn) ((chn) != 0) | ||
87 | |||
88 | /* | ||
89 | * Force a proper event-channel callback from Xen after clearing the | ||
90 | * callback mask. We do this in a very simple manner, by making a call | ||
91 | * down into Xen. The pending flag will be checked by Xen on return. | ||
92 | */ | ||
93 | void force_evtchn_callback(void) | ||
94 | { | ||
95 | (void)HYPERVISOR_xen_version(0, NULL); | ||
96 | } | ||
97 | EXPORT_SYMBOL_GPL(force_evtchn_callback); | ||
98 | |||
99 | static struct irq_chip xen_dynamic_chip; | ||
100 | |||
101 | /* Constructor for packed IRQ information. */ | ||
102 | static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) | ||
103 | { | ||
104 | return (struct packed_irq) { evtchn, index, type }; | ||
105 | } | ||
106 | |||
107 | /* | ||
108 | * Accessors for packed IRQ information. | ||
109 | */ | ||
110 | static inline unsigned int evtchn_from_irq(int irq) | ||
111 | { | ||
112 | return irq_info[irq].evtchn; | ||
113 | } | ||
114 | |||
115 | static inline unsigned int index_from_irq(int irq) | ||
116 | { | ||
117 | return irq_info[irq].index; | ||
118 | } | ||
119 | |||
120 | static inline unsigned int type_from_irq(int irq) | ||
121 | { | ||
122 | return irq_info[irq].type; | ||
123 | } | ||
124 | |||
125 | static inline unsigned long active_evtchns(unsigned int cpu, | ||
126 | struct shared_info *sh, | ||
127 | unsigned int idx) | ||
128 | { | ||
129 | return (sh->evtchn_pending[idx] & | ||
130 | cpu_evtchn_mask[cpu][idx] & | ||
131 | ~sh->evtchn_mask[idx]); | ||
132 | } | ||
133 | |||
134 | static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) | ||
135 | { | ||
136 | int irq = evtchn_to_irq[chn]; | ||
137 | |||
138 | BUG_ON(irq == -1); | ||
139 | #ifdef CONFIG_SMP | ||
140 | irq_desc[irq].affinity = cpumask_of_cpu(cpu); | ||
141 | #endif | ||
142 | |||
143 | __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); | ||
144 | __set_bit(chn, cpu_evtchn_mask[cpu]); | ||
145 | |||
146 | cpu_evtchn[chn] = cpu; | ||
147 | } | ||
148 | |||
149 | static void init_evtchn_cpu_bindings(void) | ||
150 | { | ||
151 | #ifdef CONFIG_SMP | ||
152 | int i; | ||
153 | /* By default all event channels notify CPU#0. */ | ||
154 | for (i = 0; i < NR_IRQS; i++) | ||
155 | irq_desc[i].affinity = cpumask_of_cpu(0); | ||
156 | #endif | ||
157 | |||
158 | memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); | ||
159 | memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); | ||
160 | } | ||
161 | |||
162 | static inline unsigned int cpu_from_evtchn(unsigned int evtchn) | ||
163 | { | ||
164 | return cpu_evtchn[evtchn]; | ||
165 | } | ||
166 | |||
167 | static inline void clear_evtchn(int port) | ||
168 | { | ||
169 | struct shared_info *s = HYPERVISOR_shared_info; | ||
170 | sync_clear_bit(port, &s->evtchn_pending[0]); | ||
171 | } | ||
172 | |||
173 | static inline void set_evtchn(int port) | ||
174 | { | ||
175 | struct shared_info *s = HYPERVISOR_shared_info; | ||
176 | sync_set_bit(port, &s->evtchn_pending[0]); | ||
177 | } | ||
178 | |||
179 | |||
180 | /** | ||
181 | * notify_remote_via_irq - send event to remote end of event channel via irq | ||
182 | * @irq: irq of event channel to send event to | ||
183 | * | ||
184 | * Unlike notify_remote_via_evtchn(), this is safe to use across | ||
185 | * save/restore. Notifications on a broken connection are silently | ||
186 | * dropped. | ||
187 | */ | ||
188 | void notify_remote_via_irq(int irq) | ||
189 | { | ||
190 | int evtchn = evtchn_from_irq(irq); | ||
191 | |||
192 | if (VALID_EVTCHN(evtchn)) | ||
193 | notify_remote_via_evtchn(evtchn); | ||
194 | } | ||
195 | EXPORT_SYMBOL_GPL(notify_remote_via_irq); | ||
196 | |||
197 | static void mask_evtchn(int port) | ||
198 | { | ||
199 | struct shared_info *s = HYPERVISOR_shared_info; | ||
200 | sync_set_bit(port, &s->evtchn_mask[0]); | ||
201 | } | ||
202 | |||
203 | static void unmask_evtchn(int port) | ||
204 | { | ||
205 | struct shared_info *s = HYPERVISOR_shared_info; | ||
206 | unsigned int cpu = get_cpu(); | ||
207 | |||
208 | BUG_ON(!irqs_disabled()); | ||
209 | |||
210 | /* Slow path (hypercall) if this is a non-local port. */ | ||
211 | if (unlikely(cpu != cpu_from_evtchn(port))) { | ||
212 | struct evtchn_unmask unmask = { .port = port }; | ||
213 | (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); | ||
214 | } else { | ||
215 | struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); | ||
216 | |||
217 | sync_clear_bit(port, &s->evtchn_mask[0]); | ||
218 | |||
219 | /* | ||
220 | * The following is basically the equivalent of | ||
221 | * 'hw_resend_irq'. Just like a real IO-APIC we 'lose | ||
222 | * the interrupt edge' if the channel is masked. | ||
223 | */ | ||
224 | if (sync_test_bit(port, &s->evtchn_pending[0]) && | ||
225 | !sync_test_and_set_bit(port / BITS_PER_LONG, | ||
226 | &vcpu_info->evtchn_pending_sel)) | ||
227 | vcpu_info->evtchn_upcall_pending = 1; | ||
228 | } | ||
229 | |||
230 | put_cpu(); | ||
231 | } | ||
232 | |||
233 | static int find_unbound_irq(void) | ||
234 | { | ||
235 | int irq; | ||
236 | |||
237 | /* Scan all of 0..NR_IRQS-1 for the first irq with no bindings */ | ||
238 | for (irq = 0; irq < NR_IRQS; irq++) | ||
239 | if (irq_bindcount[irq] == 0) | ||
240 | break; | ||
241 | |||
242 | if (irq == NR_IRQS) | ||
243 | panic("No available IRQ to bind to: increase NR_IRQS!\n"); | ||
244 | |||
245 | return irq; | ||
246 | } | ||
247 | |||
248 | int bind_evtchn_to_irq(unsigned int evtchn) | ||
249 | { | ||
250 | int irq; | ||
251 | |||
252 | spin_lock(&irq_mapping_update_lock); | ||
253 | |||
254 | irq = evtchn_to_irq[evtchn]; | ||
255 | |||
256 | if (irq == -1) { | ||
257 | irq = find_unbound_irq(); | ||
258 | |||
259 | dynamic_irq_init(irq); | ||
260 | set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, | ||
261 | handle_level_irq, "event"); | ||
262 | |||
263 | evtchn_to_irq[evtchn] = irq; | ||
264 | irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); | ||
265 | } | ||
266 | |||
267 | irq_bindcount[irq]++; | ||
268 | |||
269 | spin_unlock(&irq_mapping_update_lock); | ||
270 | |||
271 | return irq; | ||
272 | } | ||
273 | EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); | ||
274 | |||
275 | static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) | ||
276 | { | ||
277 | struct evtchn_bind_ipi bind_ipi; | ||
278 | int evtchn, irq; | ||
279 | |||
280 | spin_lock(&irq_mapping_update_lock); | ||
281 | |||
282 | irq = per_cpu(ipi_to_irq, cpu)[ipi]; | ||
283 | if (irq == -1) { | ||
284 | irq = find_unbound_irq(); | ||
285 | if (irq < 0) | ||
286 | goto out; | ||
287 | |||
288 | dynamic_irq_init(irq); | ||
289 | set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, | ||
290 | handle_level_irq, "ipi"); | ||
291 | |||
292 | bind_ipi.vcpu = cpu; | ||
293 | if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, | ||
294 | &bind_ipi) != 0) | ||
295 | BUG(); | ||
296 | evtchn = bind_ipi.port; | ||
297 | |||
298 | evtchn_to_irq[evtchn] = irq; | ||
299 | irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); | ||
300 | |||
301 | per_cpu(ipi_to_irq, cpu)[ipi] = irq; | ||
302 | |||
303 | bind_evtchn_to_cpu(evtchn, cpu); | ||
304 | } | ||
305 | |||
306 | irq_bindcount[irq]++; | ||
307 | |||
308 | out: | ||
309 | spin_unlock(&irq_mapping_update_lock); | ||
310 | return irq; | ||
311 | } | ||
312 | |||
313 | |||
314 | static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) | ||
315 | { | ||
316 | struct evtchn_bind_virq bind_virq; | ||
317 | int evtchn, irq; | ||
318 | |||
319 | spin_lock(&irq_mapping_update_lock); | ||
320 | |||
321 | irq = per_cpu(virq_to_irq, cpu)[virq]; | ||
322 | |||
323 | if (irq == -1) { | ||
324 | bind_virq.virq = virq; | ||
325 | bind_virq.vcpu = cpu; | ||
326 | if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, | ||
327 | &bind_virq) != 0) | ||
328 | BUG(); | ||
329 | evtchn = bind_virq.port; | ||
330 | |||
331 | irq = find_unbound_irq(); | ||
332 | |||
333 | dynamic_irq_init(irq); | ||
334 | set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, | ||
335 | handle_level_irq, "virq"); | ||
336 | |||
337 | evtchn_to_irq[evtchn] = irq; | ||
338 | irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); | ||
339 | |||
340 | per_cpu(virq_to_irq, cpu)[virq] = irq; | ||
341 | |||
342 | bind_evtchn_to_cpu(evtchn, cpu); | ||
343 | } | ||
344 | |||
345 | irq_bindcount[irq]++; | ||
346 | |||
347 | spin_unlock(&irq_mapping_update_lock); | ||
348 | |||
349 | return irq; | ||
350 | } | ||
351 | |||
352 | static void unbind_from_irq(unsigned int irq) | ||
353 | { | ||
354 | struct evtchn_close close; | ||
355 | int evtchn = evtchn_from_irq(irq); | ||
356 | |||
357 | spin_lock(&irq_mapping_update_lock); | ||
358 | |||
359 | if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { /* last binding released */ | ||
360 | close.port = evtchn; | ||
361 | if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) | ||
362 | BUG(); | ||
363 | switch (type_from_irq(irq)) { /* drop the per-cpu reverse mapping */ | ||
364 | case IRQT_VIRQ: | ||
365 | per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))[index_from_irq(irq)] = -1; | ||
366 | break; | ||
367 | case IRQT_IPI: /* else bind_ipi_to_irq() would reuse this stale irq */ | ||
368 | per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))[index_from_irq(irq)] = -1; | ||
369 | break; | ||
370 | default: | ||
371 | break; | ||
372 | } | ||
373 | /* Closed ports are implicitly re-bound to VCPU0. */ | ||
374 | bind_evtchn_to_cpu(evtchn, 0); | ||
375 | |||
376 | evtchn_to_irq[evtchn] = -1; | ||
377 | irq_info[irq] = IRQ_UNBOUND; | ||
378 | |||
379 | dynamic_irq_init(irq); | ||
380 | } | ||
381 | |||
382 | spin_unlock(&irq_mapping_update_lock); | ||
383 | } | ||
384 | |||
385 | int bind_evtchn_to_irqhandler(unsigned int evtchn, | ||
386 | irqreturn_t (*handler)(int, void *), | ||
387 | unsigned long irqflags, | ||
388 | const char *devname, void *dev_id) | ||
389 | { | ||
390 | unsigned int irq; | ||
391 | int retval; | ||
392 | |||
393 | irq = bind_evtchn_to_irq(evtchn); | ||
394 | retval = request_irq(irq, handler, irqflags, devname, dev_id); | ||
395 | if (retval != 0) { | ||
396 | unbind_from_irq(irq); | ||
397 | return retval; | ||
398 | } | ||
399 | |||
400 | return irq; | ||
401 | } | ||
402 | EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); | ||
403 | |||
404 | int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, | ||
405 | irqreturn_t (*handler)(int, void *), | ||
406 | unsigned long irqflags, const char *devname, void *dev_id) | ||
407 | { | ||
408 | unsigned int irq; | ||
409 | int retval; | ||
410 | |||
411 | irq = bind_virq_to_irq(virq, cpu); | ||
412 | retval = request_irq(irq, handler, irqflags, devname, dev_id); | ||
413 | if (retval != 0) { | ||
414 | unbind_from_irq(irq); | ||
415 | return retval; | ||
416 | } | ||
417 | |||
418 | return irq; | ||
419 | } | ||
420 | EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); | ||
421 | |||
422 | int bind_ipi_to_irqhandler(enum ipi_vector ipi, | ||
423 | unsigned int cpu, | ||
424 | irq_handler_t handler, | ||
425 | unsigned long irqflags, | ||
426 | const char *devname, | ||
427 | void *dev_id) | ||
428 | { | ||
429 | int irq, retval; | ||
430 | |||
431 | irq = bind_ipi_to_irq(ipi, cpu); | ||
432 | if (irq < 0) | ||
433 | return irq; | ||
434 | |||
435 | retval = request_irq(irq, handler, irqflags, devname, dev_id); | ||
436 | if (retval != 0) { | ||
437 | unbind_from_irq(irq); | ||
438 | return retval; | ||
439 | } | ||
440 | |||
441 | return irq; | ||
442 | } | ||
443 | |||
444 | void unbind_from_irqhandler(unsigned int irq, void *dev_id) | ||
445 | { | ||
446 | free_irq(irq, dev_id); | ||
447 | unbind_from_irq(irq); | ||
448 | } | ||
449 | EXPORT_SYMBOL_GPL(unbind_from_irqhandler); | ||
450 | |||
451 | void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) | ||
452 | { | ||
453 | int irq = per_cpu(ipi_to_irq, cpu)[vector]; | ||
454 | BUG_ON(irq < 0); | ||
455 | notify_remote_via_irq(irq); | ||
456 | } | ||
457 | |||
458 | |||
459 | /* | ||
460 | * Search the CPU's pending-event bitmasks. For each one found, map | ||
461 | * the event number to an irq, and feed it into do_IRQ() for | ||
462 | * handling. | ||
463 | * | ||
464 | * Xen uses a two-level bitmap to speed searching. The first level is | ||
465 | * a bitset of words which contain pending event bits. The second | ||
466 | * level is a bitset of pending events themselves. | ||
467 | */ | ||
468 | fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) | ||
469 | { | ||
470 | int cpu = get_cpu(); | ||
471 | struct shared_info *s = HYPERVISOR_shared_info; | ||
472 | struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); | ||
473 | unsigned long pending_words; | ||
474 | |||
475 | vcpu_info->evtchn_upcall_pending = 0; | ||
476 | |||
477 | /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ | ||
478 | pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); | ||
479 | while (pending_words != 0) { | ||
480 | unsigned long pending_bits; | ||
481 | int word_idx = __ffs(pending_words); | ||
482 | pending_words &= ~(1UL << word_idx); | ||
483 | |||
484 | while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { | ||
485 | int bit_idx = __ffs(pending_bits); | ||
486 | int port = (word_idx * BITS_PER_LONG) + bit_idx; | ||
487 | int irq = evtchn_to_irq[port]; | ||
488 | |||
489 | if (irq != -1) { | ||
490 | regs->orig_eax = ~irq; | ||
491 | do_IRQ(regs); | ||
492 | } | ||
493 | } | ||
494 | } | ||
495 | |||
496 | put_cpu(); | ||
497 | } | ||
498 | |||
499 | /* Rebind an evtchn so that it gets delivered to a specific cpu */ | ||
500 | static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) | ||
501 | { | ||
502 | struct evtchn_bind_vcpu bind_vcpu; | ||
503 | int evtchn = evtchn_from_irq(irq); | ||
504 | |||
505 | if (!VALID_EVTCHN(evtchn)) | ||
506 | return; | ||
507 | |||
508 | /* Send future instances of this interrupt to other vcpu. */ | ||
509 | bind_vcpu.port = evtchn; | ||
510 | bind_vcpu.vcpu = tcpu; | ||
511 | |||
512 | /* | ||
513 | * If this fails, it usually just indicates that we're dealing with a | ||
514 | * virq or IPI channel, which don't actually need to be rebound. Ignore | ||
515 | * it, but don't do the xenlinux-level rebind in that case. | ||
516 | */ | ||
517 | if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) | ||
518 | bind_evtchn_to_cpu(evtchn, tcpu); | ||
519 | } | ||
520 | |||
521 | |||
522 | static void set_affinity_irq(unsigned irq, cpumask_t dest) | ||
523 | { | ||
524 | unsigned tcpu = first_cpu(dest); | ||
525 | rebind_irq_to_cpu(irq, tcpu); | ||
526 | } | ||
527 | |||
528 | static void enable_dynirq(unsigned int irq) | ||
529 | { | ||
530 | int evtchn = evtchn_from_irq(irq); | ||
531 | |||
532 | if (VALID_EVTCHN(evtchn)) | ||
533 | unmask_evtchn(evtchn); | ||
534 | } | ||
535 | |||
536 | static void disable_dynirq(unsigned int irq) | ||
537 | { | ||
538 | int evtchn = evtchn_from_irq(irq); | ||
539 | |||
540 | if (VALID_EVTCHN(evtchn)) | ||
541 | mask_evtchn(evtchn); | ||
542 | } | ||
543 | |||
544 | static void ack_dynirq(unsigned int irq) | ||
545 | { | ||
546 | int evtchn = evtchn_from_irq(irq); | ||
547 | |||
548 | move_native_irq(irq); | ||
549 | |||
550 | if (VALID_EVTCHN(evtchn)) | ||
551 | clear_evtchn(evtchn); | ||
552 | } | ||
553 | |||
554 | static int retrigger_dynirq(unsigned int irq) | ||
555 | { | ||
556 | int evtchn = evtchn_from_irq(irq); | ||
557 | int ret = 0; | ||
558 | |||
559 | if (VALID_EVTCHN(evtchn)) { | ||
560 | set_evtchn(evtchn); | ||
561 | ret = 1; | ||
562 | } | ||
563 | |||
564 | return ret; | ||
565 | } | ||
566 | |||
567 | static struct irq_chip xen_dynamic_chip __read_mostly = { | ||
568 | .name = "xen-dyn", | ||
569 | .mask = disable_dynirq, | ||
570 | .unmask = enable_dynirq, | ||
571 | .ack = ack_dynirq, | ||
572 | .set_affinity = set_affinity_irq, | ||
573 | .retrigger = retrigger_dynirq, | ||
574 | }; | ||
575 | |||
576 | void __init xen_init_IRQ(void) | ||
577 | { | ||
578 | int i; | ||
579 | |||
580 | init_evtchn_cpu_bindings(); | ||
581 | |||
582 | /* No event channels are 'live' right now. */ | ||
583 | for (i = 0; i < NR_EVENT_CHANNELS; i++) | ||
584 | mask_evtchn(i); | ||
585 | |||
586 | /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ | ||
587 | for (i = 0; i < NR_IRQS; i++) | ||
588 | irq_bindcount[i] = 0; | ||
589 | |||
590 | irq_ctx_init(smp_processor_id()); | ||
591 | } | ||
diff --git a/arch/x86/xen/features.c b/arch/x86/xen/features.c new file mode 100644 index 000000000000..0707714e40d6 --- /dev/null +++ b/arch/x86/xen/features.c | |||
@@ -0,0 +1,29 @@ | |||
1 | /****************************************************************************** | ||
2 | * features.c | ||
3 | * | ||
4 | * Xen feature flags. | ||
5 | * | ||
6 | * Copyright (c) 2006, Ian Campbell, XenSource Inc. | ||
7 | */ | ||
8 | #include <linux/types.h> | ||
9 | #include <linux/cache.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <asm/xen/hypervisor.h> | ||
12 | #include <xen/features.h> | ||
13 | |||
14 | u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; | ||
15 | EXPORT_SYMBOL_GPL(xen_features); | ||
16 | |||
17 | void xen_setup_features(void) | ||
18 | { | ||
19 | struct xen_feature_info fi; | ||
20 | int i, j; | ||
21 | |||
22 | for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { | ||
23 | fi.submap_idx = i; | ||
24 | if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) | ||
25 | break; | ||
26 | for (j = 0; j < 32; j++) /* 1U: shifting a signed 1 by 31 is undefined */ | ||
27 | xen_features[i * 32 + j] = !!(fi.submap & (1U << j)); | ||
28 | } | ||
29 | } | ||
diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c new file mode 100644 index 000000000000..aa7af9e6abc0 --- /dev/null +++ b/arch/x86/xen/manage.c | |||
@@ -0,0 +1,143 @@ | |||
1 | /* | ||
2 | * Handle external requests for shutdown, reboot and sysrq | ||
3 | */ | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/err.h> | ||
6 | #include <linux/reboot.h> | ||
7 | #include <linux/sysrq.h> | ||
8 | |||
9 | #include <xen/xenbus.h> | ||
10 | |||
11 | #define SHUTDOWN_INVALID -1 | ||
12 | #define SHUTDOWN_POWEROFF 0 | ||
13 | #define SHUTDOWN_SUSPEND 2 | ||
14 | /* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only | ||
15 | * report a crash, not be instructed to crash! | ||
16 | * HALT is the same as POWEROFF, as far as we're concerned. The tools use | ||
17 | * the distinction when we return the reason code to them. | ||
18 | */ | ||
19 | #define SHUTDOWN_HALT 4 | ||
20 | |||
21 | /* Ignore multiple shutdown requests. */ | ||
22 | static int shutting_down = SHUTDOWN_INVALID; | ||
23 | |||
24 | static void shutdown_handler(struct xenbus_watch *watch, | ||
25 | const char **vec, unsigned int len) | ||
26 | { | ||
27 | char *str; | ||
28 | struct xenbus_transaction xbt; | ||
29 | int err; | ||
30 | |||
31 | if (shutting_down != SHUTDOWN_INVALID) | ||
32 | return; | ||
33 | |||
34 | again: | ||
35 | err = xenbus_transaction_start(&xbt); | ||
36 | if (err) | ||
37 | return; | ||
38 | |||
39 | str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); | ||
40 | /* Ignore read errors and empty reads. */ | ||
41 | if (XENBUS_IS_ERR_READ(str)) { | ||
42 | xenbus_transaction_end(xbt, 1); | ||
43 | return; | ||
44 | } | ||
45 | |||
46 | xenbus_write(xbt, "control", "shutdown", ""); | ||
47 | |||
48 | err = xenbus_transaction_end(xbt, 0); | ||
49 | if (err == -EAGAIN) { | ||
50 | kfree(str); | ||
51 | goto again; | ||
52 | } | ||
53 | |||
54 | if (strcmp(str, "poweroff") == 0 || | ||
55 | strcmp(str, "halt") == 0) | ||
56 | orderly_poweroff(false); | ||
57 | else if (strcmp(str, "reboot") == 0) | ||
58 | ctrl_alt_del(); | ||
59 | else { | ||
60 | printk(KERN_INFO "Ignoring shutdown request: %s\n", str); | ||
61 | shutting_down = SHUTDOWN_INVALID; | ||
62 | } | ||
63 | |||
64 | kfree(str); | ||
65 | } | ||
66 | |||
67 | static void sysrq_handler(struct xenbus_watch *watch, const char **vec, | ||
68 | unsigned int len) | ||
69 | { | ||
70 | char sysrq_key = '\0'; | ||
71 | struct xenbus_transaction xbt; | ||
72 | int err; | ||
73 | |||
74 | again: | ||
75 | err = xenbus_transaction_start(&xbt); | ||
76 | if (err) | ||
77 | return; | ||
78 | if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { | ||
79 | printk(KERN_ERR "Unable to read sysrq code in " | ||
80 | "control/sysrq\n"); | ||
81 | xenbus_transaction_end(xbt, 1); | ||
82 | return; | ||
83 | } | ||
84 | |||
85 | if (sysrq_key != '\0') | ||
86 | xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); | ||
87 | |||
88 | err = xenbus_transaction_end(xbt, 0); | ||
89 | if (err == -EAGAIN) | ||
90 | goto again; | ||
91 | |||
92 | if (sysrq_key != '\0') | ||
93 | handle_sysrq(sysrq_key, NULL); | ||
94 | } | ||
95 | |||
96 | static struct xenbus_watch shutdown_watch = { | ||
97 | .node = "control/shutdown", | ||
98 | .callback = shutdown_handler | ||
99 | }; | ||
100 | |||
101 | static struct xenbus_watch sysrq_watch = { | ||
102 | .node = "control/sysrq", | ||
103 | .callback = sysrq_handler | ||
104 | }; | ||
105 | |||
106 | static int setup_shutdown_watcher(void) | ||
107 | { | ||
108 | int err; | ||
109 | |||
110 | err = register_xenbus_watch(&shutdown_watch); | ||
111 | if (err) { | ||
112 | printk(KERN_ERR "Failed to set shutdown watcher\n"); | ||
113 | return err; | ||
114 | } | ||
115 | |||
116 | err = register_xenbus_watch(&sysrq_watch); | ||
117 | if (err) { | ||
118 | printk(KERN_ERR "Failed to set sysrq watcher\n"); | ||
119 | return err; | ||
120 | } | ||
121 | |||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | static int shutdown_event(struct notifier_block *notifier, | ||
126 | unsigned long event, | ||
127 | void *data) | ||
128 | { | ||
129 | setup_shutdown_watcher(); | ||
130 | return NOTIFY_DONE; | ||
131 | } | ||
132 | |||
133 | static int __init setup_shutdown_event(void) | ||
134 | { | ||
135 | static struct notifier_block xenstore_notifier = { | ||
136 | .notifier_call = shutdown_event | ||
137 | }; | ||
138 | register_xenstore_notifier(&xenstore_notifier); | ||
139 | |||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | subsys_initcall(setup_shutdown_event); | ||
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c new file mode 100644 index 000000000000..874db0cd1d2a --- /dev/null +++ b/arch/x86/xen/mmu.c | |||
@@ -0,0 +1,567 @@ | |||
1 | /* | ||
2 | * Xen mmu operations | ||
3 | * | ||
4 | * This file contains the various mmu fetch and update operations. | ||
5 | * The most important job they must perform is the mapping between the | ||
6 | * domain's pfn and the overall machine mfns. | ||
7 | * | ||
8 | * Xen allows guests to directly update the pagetable, in a controlled | ||
9 | * fashion. In other words, the guest modifies the same pagetable | ||
10 | * that the CPU actually uses, which eliminates the overhead of having | ||
11 | * a separate shadow pagetable. | ||
12 | * | ||
13 | * In order to allow this, it falls on the guest domain to map its | ||
14 | * notion of a "physical" pfn - which is just a domain-local linear | ||
15 | * address - into a real "machine address" which the CPU's MMU can | ||
16 | * use. | ||
17 | * | ||
18 | * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be | ||
19 | * inserted directly into the pagetable. When creating a new | ||
20 | * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, | ||
21 | * when reading the content back with __(pgd|pmd|pte)_val, it converts | ||
22 | * the mfn back into a pfn. | ||
23 | * | ||
24 | * The other constraint is that all pages which make up a pagetable | ||
25 | * must be mapped read-only in the guest. This prevents uncontrolled | ||
26 | * guest updates to the pagetable. Xen strictly enforces this, and | ||
27 | * will disallow any pagetable update which will end up mapping a | ||
28 | * pagetable page RW, and will disallow using any writable page as a | ||
29 | * pagetable. | ||
30 | * | ||
31 | * Naively, when loading %cr3 with the base of a new pagetable, Xen | ||
32 | * would need to validate the whole pagetable before going on. | ||
33 | * Naturally, this is quite slow. The solution is to "pin" a | ||
34 | * pagetable, which enforces all the constraints on the pagetable even | ||
35 | * when it is not actively in use. This means that Xen can be assured | ||
36 | * that it is still valid when you do load it into %cr3, and doesn't | ||
37 | * need to revalidate it. | ||
38 | * | ||
39 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | ||
40 | */ | ||
41 | #include <linux/sched.h> | ||
42 | #include <linux/highmem.h> | ||
43 | #include <linux/bug.h> | ||
44 | #include <linux/sched.h> | ||
45 | |||
46 | #include <asm/pgtable.h> | ||
47 | #include <asm/tlbflush.h> | ||
48 | #include <asm/mmu_context.h> | ||
49 | #include <asm/paravirt.h> | ||
50 | |||
51 | #include <asm/xen/hypercall.h> | ||
52 | #include <asm/xen/hypervisor.h> | ||
53 | |||
54 | #include <xen/page.h> | ||
55 | #include <xen/interface/xen.h> | ||
56 | |||
57 | #include "multicalls.h" | ||
58 | #include "mmu.h" | ||
59 | |||
60 | xmaddr_t arbitrary_virt_to_machine(unsigned long address) | ||
61 | { | ||
62 | pte_t *pte = lookup_address(address); | ||
63 | unsigned offset = address & PAGE_MASK; | ||
64 | |||
65 | BUG_ON(pte == NULL); | ||
66 | |||
67 | return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); | ||
68 | } | ||
69 | |||
70 | void make_lowmem_page_readonly(void *vaddr) | ||
71 | { | ||
72 | pte_t *pte, ptev; | ||
73 | unsigned long address = (unsigned long)vaddr; | ||
74 | |||
75 | pte = lookup_address(address); | ||
76 | BUG_ON(pte == NULL); | ||
77 | |||
78 | ptev = pte_wrprotect(*pte); | ||
79 | |||
80 | if (HYPERVISOR_update_va_mapping(address, ptev, 0)) | ||
81 | BUG(); | ||
82 | } | ||
83 | |||
84 | void make_lowmem_page_readwrite(void *vaddr) | ||
85 | { | ||
86 | pte_t *pte, ptev; | ||
87 | unsigned long address = (unsigned long)vaddr; | ||
88 | |||
89 | pte = lookup_address(address); | ||
90 | BUG_ON(pte == NULL); | ||
91 | |||
92 | ptev = pte_mkwrite(*pte); | ||
93 | |||
94 | if (HYPERVISOR_update_va_mapping(address, ptev, 0)) | ||
95 | BUG(); | ||
96 | } | ||
97 | |||
98 | |||
99 | void xen_set_pmd(pmd_t *ptr, pmd_t val) | ||
100 | { | ||
101 | struct multicall_space mcs; | ||
102 | struct mmu_update *u; | ||
103 | |||
104 | preempt_disable(); | ||
105 | |||
106 | mcs = xen_mc_entry(sizeof(*u)); | ||
107 | u = mcs.args; | ||
108 | u->ptr = virt_to_machine(ptr).maddr; | ||
109 | u->val = pmd_val_ma(val); | ||
110 | MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); | ||
111 | |||
112 | xen_mc_issue(PARAVIRT_LAZY_MMU); | ||
113 | |||
114 | preempt_enable(); | ||
115 | } | ||
116 | |||
117 | /* | ||
118 | * Associate a virtual page frame with a given physical page frame | ||
119 | * and protection flags for that frame. | ||
120 | */ | ||
121 | void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) | ||
122 | { | ||
123 | pgd_t *pgd; | ||
124 | pud_t *pud; | ||
125 | pmd_t *pmd; | ||
126 | pte_t *pte; | ||
127 | |||
128 | pgd = swapper_pg_dir + pgd_index(vaddr); | ||
129 | if (pgd_none(*pgd)) { | ||
130 | BUG(); | ||
131 | return; | ||
132 | } | ||
133 | pud = pud_offset(pgd, vaddr); | ||
134 | if (pud_none(*pud)) { | ||
135 | BUG(); | ||
136 | return; | ||
137 | } | ||
138 | pmd = pmd_offset(pud, vaddr); | ||
139 | if (pmd_none(*pmd)) { | ||
140 | BUG(); | ||
141 | return; | ||
142 | } | ||
143 | pte = pte_offset_kernel(pmd, vaddr); | ||
144 | /* <mfn,flags> stored as-is, to permit clearing entries */ | ||
145 | xen_set_pte(pte, mfn_pte(mfn, flags)); | ||
146 | |||
147 | /* | ||
148 | * It's enough to flush this one mapping. | ||
149 | * (PGE mappings get flushed as well) | ||
150 | */ | ||
151 | __flush_tlb_one(vaddr); | ||
152 | } | ||
153 | |||
154 | void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | ||
155 | pte_t *ptep, pte_t pteval) | ||
156 | { | ||
157 | if (mm == current->mm || mm == &init_mm) { | ||
158 | if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { | ||
159 | struct multicall_space mcs; | ||
160 | mcs = xen_mc_entry(0); | ||
161 | |||
162 | MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); | ||
163 | xen_mc_issue(PARAVIRT_LAZY_MMU); | ||
164 | return; | ||
165 | } else | ||
166 | if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) | ||
167 | return; | ||
168 | } | ||
169 | xen_set_pte(ptep, pteval); | ||
170 | } | ||
171 | |||
172 | #ifdef CONFIG_X86_PAE | ||
173 | void xen_set_pud(pud_t *ptr, pud_t val) | ||
174 | { | ||
175 | struct multicall_space mcs; | ||
176 | struct mmu_update *u; | ||
177 | |||
178 | preempt_disable(); | ||
179 | |||
180 | mcs = xen_mc_entry(sizeof(*u)); | ||
181 | u = mcs.args; | ||
182 | u->ptr = virt_to_machine(ptr).maddr; | ||
183 | u->val = pud_val_ma(val); | ||
184 | MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); | ||
185 | |||
186 | xen_mc_issue(PARAVIRT_LAZY_MMU); | ||
187 | |||
188 | preempt_enable(); | ||
189 | } | ||
190 | |||
191 | void xen_set_pte(pte_t *ptep, pte_t pte) | ||
192 | { | ||
193 | ptep->pte_high = pte.pte_high; | ||
194 | smp_wmb(); | ||
195 | ptep->pte_low = pte.pte_low; | ||
196 | } | ||
197 | |||
198 | void xen_set_pte_atomic(pte_t *ptep, pte_t pte) | ||
199 | { | ||
200 | set_64bit((u64 *)ptep, pte_val_ma(pte)); | ||
201 | } | ||
202 | |||
203 | void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | ||
204 | { | ||
205 | ptep->pte_low = 0; | ||
206 | smp_wmb(); /* make sure low gets written first */ | ||
207 | ptep->pte_high = 0; | ||
208 | } | ||
209 | |||
210 | void xen_pmd_clear(pmd_t *pmdp) | ||
211 | { | ||
212 | xen_set_pmd(pmdp, __pmd(0)); | ||
213 | } | ||
214 | |||
215 | unsigned long long xen_pte_val(pte_t pte) | ||
216 | { | ||
217 | unsigned long long ret = 0; | ||
218 | |||
219 | if (pte.pte_low) { | ||
220 | ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; | ||
221 | ret = machine_to_phys(XMADDR(ret)).paddr | 1; | ||
222 | } | ||
223 | |||
224 | return ret; | ||
225 | } | ||
226 | |||
227 | unsigned long long xen_pmd_val(pmd_t pmd) | ||
228 | { | ||
229 | unsigned long long ret = pmd.pmd; | ||
230 | if (ret) | ||
231 | ret = machine_to_phys(XMADDR(ret)).paddr | 1; | ||
232 | return ret; | ||
233 | } | ||
234 | |||
235 | unsigned long long xen_pgd_val(pgd_t pgd) | ||
236 | { | ||
237 | unsigned long long ret = pgd.pgd; | ||
238 | if (ret) | ||
239 | ret = machine_to_phys(XMADDR(ret)).paddr | 1; | ||
240 | return ret; | ||
241 | } | ||
242 | |||
243 | pte_t xen_make_pte(unsigned long long pte) | ||
244 | { | ||
245 | if (pte & 1) | ||
246 | pte = phys_to_machine(XPADDR(pte)).maddr; | ||
247 | |||
248 | return (pte_t){ pte, pte >> 32 }; | ||
249 | } | ||
250 | |||
251 | pmd_t xen_make_pmd(unsigned long long pmd) | ||
252 | { | ||
253 | if (pmd & 1) | ||
254 | pmd = phys_to_machine(XPADDR(pmd)).maddr; | ||
255 | |||
256 | return (pmd_t){ pmd }; | ||
257 | } | ||
258 | |||
259 | pgd_t xen_make_pgd(unsigned long long pgd) | ||
260 | { | ||
261 | if (pgd & _PAGE_PRESENT) | ||
262 | pgd = phys_to_machine(XPADDR(pgd)).maddr; | ||
263 | |||
264 | return (pgd_t){ pgd }; | ||
265 | } | ||
266 | #else /* !PAE */ | ||
267 | void xen_set_pte(pte_t *ptep, pte_t pte) | ||
268 | { | ||
269 | *ptep = pte; | ||
270 | } | ||
271 | |||
272 | unsigned long xen_pte_val(pte_t pte) | ||
273 | { | ||
274 | unsigned long ret = pte.pte_low; | ||
275 | |||
276 | if (ret & _PAGE_PRESENT) | ||
277 | ret = machine_to_phys(XMADDR(ret)).paddr; | ||
278 | |||
279 | return ret; | ||
280 | } | ||
281 | |||
282 | unsigned long xen_pgd_val(pgd_t pgd) | ||
283 | { | ||
284 | unsigned long ret = pgd.pgd; | ||
285 | if (ret) | ||
286 | ret = machine_to_phys(XMADDR(ret)).paddr | 1; | ||
287 | return ret; | ||
288 | } | ||
289 | |||
290 | pte_t xen_make_pte(unsigned long pte) | ||
291 | { | ||
292 | if (pte & _PAGE_PRESENT) | ||
293 | pte = phys_to_machine(XPADDR(pte)).maddr; | ||
294 | |||
295 | return (pte_t){ pte }; | ||
296 | } | ||
297 | |||
298 | pgd_t xen_make_pgd(unsigned long pgd) | ||
299 | { | ||
300 | if (pgd & _PAGE_PRESENT) | ||
301 | pgd = phys_to_machine(XPADDR(pgd)).maddr; | ||
302 | |||
303 | return (pgd_t){ pgd }; | ||
304 | } | ||
305 | #endif /* CONFIG_X86_PAE */ | ||
306 | |||
307 | |||
308 | |||
309 | /* | ||
310 | (Yet another) pagetable walker. This one is intended for pinning a | ||
311 | pagetable. This means that it walks a pagetable and calls the | ||
312 | callback function on each page it finds making up the page table, | ||
313 | at every level. It walks the entire pagetable, but it only bothers | ||
314 | pinning pte pages which are below pte_limit. In the normal case | ||
315 | this will be TASK_SIZE, but at boot we need to pin up to | ||
316 | FIXADDR_TOP. But the important bit is that we don't pin beyond | ||
317 | there, because then we start getting into Xen's ptes. | ||
318 | */ | ||
319 | static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), | ||
320 | unsigned long limit) | ||
321 | { | ||
322 | pgd_t *pgd = pgd_base; | ||
323 | int flush = 0; | ||
324 | unsigned long addr = 0; | ||
325 | unsigned long pgd_next; | ||
326 | |||
327 | BUG_ON(limit > FIXADDR_TOP); | ||
328 | |||
329 | if (xen_feature(XENFEAT_auto_translated_physmap)) | ||
330 | return 0; | ||
331 | |||
332 | for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { | ||
333 | pud_t *pud; | ||
334 | unsigned long pud_limit, pud_next; | ||
335 | |||
336 | pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); | ||
337 | |||
338 | if (!pgd_val(*pgd)) | ||
339 | continue; | ||
340 | |||
341 | pud = pud_offset(pgd, 0); | ||
342 | |||
343 | if (PTRS_PER_PUD > 1) /* not folded */ | ||
344 | flush |= (*func)(virt_to_page(pud), 0); | ||
345 | |||
346 | for (; addr != pud_limit; pud++, addr = pud_next) { | ||
347 | pmd_t *pmd; | ||
348 | unsigned long pmd_limit; | ||
349 | |||
350 | pud_next = pud_addr_end(addr, pud_limit); | ||
351 | |||
352 | if (pud_next < limit) | ||
353 | pmd_limit = pud_next; | ||
354 | else | ||
355 | pmd_limit = limit; | ||
356 | |||
357 | if (pud_none(*pud)) | ||
358 | continue; | ||
359 | |||
360 | pmd = pmd_offset(pud, 0); | ||
361 | |||
362 | if (PTRS_PER_PMD > 1) /* not folded */ | ||
363 | flush |= (*func)(virt_to_page(pmd), 0); | ||
364 | |||
365 | for (; addr != pmd_limit; pmd++) { | ||
366 | addr += (PAGE_SIZE * PTRS_PER_PTE); | ||
367 | if ((pmd_limit-1) < (addr-1)) { | ||
368 | addr = pmd_limit; | ||
369 | break; | ||
370 | } | ||
371 | |||
372 | if (pmd_none(*pmd)) | ||
373 | continue; | ||
374 | |||
375 | flush |= (*func)(pmd_page(*pmd), 0); | ||
376 | } | ||
377 | } | ||
378 | } | ||
379 | |||
380 | flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); | ||
381 | |||
382 | return flush; | ||
383 | } | ||
384 | |||
385 | static int pin_page(struct page *page, unsigned flags) | ||
386 | { | ||
387 | unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); | ||
388 | int flush; | ||
389 | |||
390 | if (pgfl) | ||
391 | flush = 0; /* already pinned */ | ||
392 | else if (PageHighMem(page)) | ||
393 | /* kmaps need flushing if we found an unpinned | ||
394 | highpage */ | ||
395 | flush = 1; | ||
396 | else { | ||
397 | void *pt = lowmem_page_address(page); | ||
398 | unsigned long pfn = page_to_pfn(page); | ||
399 | struct multicall_space mcs = __xen_mc_entry(0); | ||
400 | |||
401 | flush = 0; | ||
402 | |||
403 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | ||
404 | pfn_pte(pfn, PAGE_KERNEL_RO), | ||
405 | flags); | ||
406 | } | ||
407 | |||
408 | return flush; | ||
409 | } | ||
410 | |||
411 | /* This is called just after a mm has been created, but it has not | ||
412 | been used yet. We need to make sure that its pagetable is all | ||
413 | read-only, and can be pinned. */ | ||
414 | void xen_pgd_pin(pgd_t *pgd) | ||
415 | { | ||
416 | struct multicall_space mcs; | ||
417 | struct mmuext_op *op; | ||
418 | |||
419 | xen_mc_batch(); | ||
420 | |||
421 | if (pgd_walk(pgd, pin_page, TASK_SIZE)) { | ||
422 | /* re-enable interrupts for kmap_flush_unused */ | ||
423 | xen_mc_issue(0); | ||
424 | kmap_flush_unused(); | ||
425 | xen_mc_batch(); | ||
426 | } | ||
427 | |||
428 | mcs = __xen_mc_entry(sizeof(*op)); | ||
429 | op = mcs.args; | ||
430 | |||
431 | #ifdef CONFIG_X86_PAE | ||
432 | op->cmd = MMUEXT_PIN_L3_TABLE; | ||
433 | #else | ||
434 | op->cmd = MMUEXT_PIN_L2_TABLE; | ||
435 | #endif | ||
436 | op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); | ||
437 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | ||
438 | |||
439 | xen_mc_issue(0); | ||
440 | } | ||
441 | |||
442 | /* The init_mm pagetable is really pinned as soon as its created, but | ||
443 | that's before we have page structures to store the bits. So do all | ||
444 | the book-keeping now. */ | ||
445 | static __init int mark_pinned(struct page *page, unsigned flags) | ||
446 | { | ||
447 | SetPagePinned(page); | ||
448 | return 0; | ||
449 | } | ||
450 | |||
451 | void __init xen_mark_init_mm_pinned(void) | ||
452 | { | ||
453 | pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); | ||
454 | } | ||
455 | |||
456 | static int unpin_page(struct page *page, unsigned flags) | ||
457 | { | ||
458 | unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); | ||
459 | |||
460 | if (pgfl && !PageHighMem(page)) { | ||
461 | void *pt = lowmem_page_address(page); | ||
462 | unsigned long pfn = page_to_pfn(page); | ||
463 | struct multicall_space mcs = __xen_mc_entry(0); | ||
464 | |||
465 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | ||
466 | pfn_pte(pfn, PAGE_KERNEL), | ||
467 | flags); | ||
468 | } | ||
469 | |||
470 | return 0; /* never need to flush on unpin */ | ||
471 | } | ||
472 | |||
473 | /* Release a pagetables pages back as normal RW */ | ||
474 | static void xen_pgd_unpin(pgd_t *pgd) | ||
475 | { | ||
476 | struct mmuext_op *op; | ||
477 | struct multicall_space mcs; | ||
478 | |||
479 | xen_mc_batch(); | ||
480 | |||
481 | mcs = __xen_mc_entry(sizeof(*op)); | ||
482 | |||
483 | op = mcs.args; | ||
484 | op->cmd = MMUEXT_UNPIN_TABLE; | ||
485 | op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); | ||
486 | |||
487 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | ||
488 | |||
489 | pgd_walk(pgd, unpin_page, TASK_SIZE); | ||
490 | |||
491 | xen_mc_issue(0); | ||
492 | } | ||
493 | |||
494 | void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) | ||
495 | { | ||
496 | spin_lock(&next->page_table_lock); | ||
497 | xen_pgd_pin(next->pgd); | ||
498 | spin_unlock(&next->page_table_lock); | ||
499 | } | ||
500 | |||
501 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | ||
502 | { | ||
503 | spin_lock(&mm->page_table_lock); | ||
504 | xen_pgd_pin(mm->pgd); | ||
505 | spin_unlock(&mm->page_table_lock); | ||
506 | } | ||
507 | |||
508 | |||
509 | #ifdef CONFIG_SMP | ||
510 | /* Another cpu may still have their %cr3 pointing at the pagetable, so | ||
511 | we need to repoint it somewhere else before we can unpin it. */ | ||
512 | static void drop_other_mm_ref(void *info) | ||
513 | { | ||
514 | struct mm_struct *mm = info; | ||
515 | |||
516 | if (__get_cpu_var(cpu_tlbstate).active_mm == mm) | ||
517 | leave_mm(smp_processor_id()); | ||
518 | } | ||
519 | |||
520 | static void drop_mm_ref(struct mm_struct *mm) | ||
521 | { | ||
522 | if (current->active_mm == mm) { | ||
523 | if (current->mm == mm) | ||
524 | load_cr3(swapper_pg_dir); | ||
525 | else | ||
526 | leave_mm(smp_processor_id()); | ||
527 | } | ||
528 | |||
529 | if (!cpus_empty(mm->cpu_vm_mask)) | ||
530 | xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, | ||
531 | mm, 1); | ||
532 | } | ||
533 | #else | ||
534 | static void drop_mm_ref(struct mm_struct *mm) | ||
535 | { | ||
536 | if (current->active_mm == mm) | ||
537 | load_cr3(swapper_pg_dir); | ||
538 | } | ||
539 | #endif | ||
540 | |||
541 | /* | ||
542 | * While a process runs, Xen pins its pagetables, which means that the | ||
543 | * hypervisor forces it to be read-only, and it controls all updates | ||
544 | * to it. This means that all pagetable updates have to go via the | ||
545 | * hypervisor, which is moderately expensive. | ||
546 | * | ||
547 | * Since we're pulling the pagetable down, we switch to use init_mm, | ||
548 | * unpin old process pagetable and mark it all read-write, which | ||
549 | * allows further operations on it to be simple memory accesses. | ||
550 | * | ||
551 | * The only subtle point is that another CPU may be still using the | ||
552 | * pagetable because of lazy tlb flushing. This means we need need to | ||
553 | * switch all CPUs off this pagetable before we can unpin it. | ||
554 | */ | ||
555 | void xen_exit_mmap(struct mm_struct *mm) | ||
556 | { | ||
557 | get_cpu(); /* make sure we don't move around */ | ||
558 | drop_mm_ref(mm); | ||
559 | put_cpu(); | ||
560 | |||
561 | spin_lock(&mm->page_table_lock); | ||
562 | |||
563 | /* pgd may not be pinned in the error exit path of execve */ | ||
564 | if (PagePinned(virt_to_page(mm->pgd))) | ||
565 | xen_pgd_unpin(mm->pgd); | ||
566 | spin_unlock(&mm->page_table_lock); | ||
567 | } | ||
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h new file mode 100644 index 000000000000..c9ff27f3ac3a --- /dev/null +++ b/arch/x86/xen/mmu.h | |||
@@ -0,0 +1,60 @@ | |||
1 | #ifndef _XEN_MMU_H | ||
2 | |||
3 | #include <linux/linkage.h> | ||
4 | #include <asm/page.h> | ||
5 | |||
6 | /* | ||
7 | * Page-directory addresses above 4GB do not fit into architectural %cr3. | ||
8 | * When accessing %cr3, or equivalent field in vcpu_guest_context, guests | ||
9 | * must use the following accessor macros to pack/unpack valid MFNs. | ||
10 | * | ||
11 | * Note that Xen is using the fact that the pagetable base is always | ||
12 | * page-aligned, and putting the 12 MSB of the address into the 12 LSB | ||
13 | * of cr3. | ||
14 | */ | ||
15 | #define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) | ||
16 | #define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) | ||
17 | |||
18 | |||
19 | void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); | ||
20 | |||
21 | void xen_set_pte(pte_t *ptep, pte_t pteval); | ||
22 | void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | ||
23 | pte_t *ptep, pte_t pteval); | ||
24 | void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); | ||
25 | |||
26 | void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); | ||
27 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); | ||
28 | void xen_exit_mmap(struct mm_struct *mm); | ||
29 | |||
30 | void xen_pgd_pin(pgd_t *pgd); | ||
31 | //void xen_pgd_unpin(pgd_t *pgd); | ||
32 | |||
33 | #ifdef CONFIG_X86_PAE | ||
34 | unsigned long long xen_pte_val(pte_t); | ||
35 | unsigned long long xen_pmd_val(pmd_t); | ||
36 | unsigned long long xen_pgd_val(pgd_t); | ||
37 | |||
38 | pte_t xen_make_pte(unsigned long long); | ||
39 | pmd_t xen_make_pmd(unsigned long long); | ||
40 | pgd_t xen_make_pgd(unsigned long long); | ||
41 | |||
42 | void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | ||
43 | pte_t *ptep, pte_t pteval); | ||
44 | void xen_set_pte_atomic(pte_t *ptep, pte_t pte); | ||
45 | void xen_set_pud(pud_t *ptr, pud_t val); | ||
46 | void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); | ||
47 | void xen_pmd_clear(pmd_t *pmdp); | ||
48 | |||
49 | |||
50 | #else | ||
51 | unsigned long xen_pte_val(pte_t); | ||
52 | unsigned long xen_pmd_val(pmd_t); | ||
53 | unsigned long xen_pgd_val(pgd_t); | ||
54 | |||
55 | pte_t xen_make_pte(unsigned long); | ||
56 | pmd_t xen_make_pmd(unsigned long); | ||
57 | pgd_t xen_make_pgd(unsigned long); | ||
58 | #endif | ||
59 | |||
60 | #endif /* _XEN_MMU_H */ | ||
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c new file mode 100644 index 000000000000..c837e8e463db --- /dev/null +++ b/arch/x86/xen/multicalls.c | |||
@@ -0,0 +1,90 @@ | |||
1 | /* | ||
2 | * Xen hypercall batching. | ||
3 | * | ||
4 | * Xen allows multiple hypercalls to be issued at once, using the | ||
5 | * multicall interface. This allows the cost of trapping into the | ||
6 | * hypervisor to be amortized over several calls. | ||
7 | * | ||
8 | * This file implements a simple interface for multicalls. There's a | ||
9 | * per-cpu buffer of outstanding multicalls. When you want to queue a | ||
10 | * multicall for issuing, you can allocate a multicall slot for the | ||
11 | * call and its arguments, along with storage for space which is | ||
12 | * pointed to by the arguments (for passing pointers to structures, | ||
13 | * etc). When the multicall is actually issued, all the space for the | ||
14 | * commands and allocated memory is freed for reuse. | ||
15 | * | ||
16 | * Multicalls are flushed whenever any of the buffers get full, or | ||
17 | * when explicitly requested. There's no way to get per-multicall | ||
18 | * return results back. It will BUG if any of the multicalls fail. | ||
19 | * | ||
20 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | ||
21 | */ | ||
22 | #include <linux/percpu.h> | ||
23 | #include <linux/hardirq.h> | ||
24 | |||
25 | #include <asm/xen/hypercall.h> | ||
26 | |||
27 | #include "multicalls.h" | ||
28 | |||
29 | #define MC_BATCH 32 | ||
30 | #define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) | ||
31 | |||
32 | struct mc_buffer { | ||
33 | struct multicall_entry entries[MC_BATCH]; | ||
34 | u64 args[MC_ARGS]; | ||
35 | unsigned mcidx, argidx; | ||
36 | }; | ||
37 | |||
38 | static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); | ||
39 | DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); | ||
40 | |||
41 | void xen_mc_flush(void) | ||
42 | { | ||
43 | struct mc_buffer *b = &__get_cpu_var(mc_buffer); | ||
44 | int ret = 0; | ||
45 | unsigned long flags; | ||
46 | |||
47 | BUG_ON(preemptible()); | ||
48 | |||
49 | /* Disable interrupts in case someone comes in and queues | ||
50 | something in the middle */ | ||
51 | local_irq_save(flags); | ||
52 | |||
53 | if (b->mcidx) { | ||
54 | int i; | ||
55 | |||
56 | if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) | ||
57 | BUG(); | ||
58 | for (i = 0; i < b->mcidx; i++) | ||
59 | if (b->entries[i].result < 0) | ||
60 | ret++; | ||
61 | b->mcidx = 0; | ||
62 | b->argidx = 0; | ||
63 | } else | ||
64 | BUG_ON(b->argidx != 0); | ||
65 | |||
66 | local_irq_restore(flags); | ||
67 | |||
68 | BUG_ON(ret); | ||
69 | } | ||
70 | |||
71 | struct multicall_space __xen_mc_entry(size_t args) | ||
72 | { | ||
73 | struct mc_buffer *b = &__get_cpu_var(mc_buffer); | ||
74 | struct multicall_space ret; | ||
75 | unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); | ||
76 | |||
77 | BUG_ON(preemptible()); | ||
78 | BUG_ON(argspace > MC_ARGS); | ||
79 | |||
80 | if (b->mcidx == MC_BATCH || | ||
81 | (b->argidx + argspace) > MC_ARGS) | ||
82 | xen_mc_flush(); | ||
83 | |||
84 | ret.mc = &b->entries[b->mcidx]; | ||
85 | b->mcidx++; | ||
86 | ret.args = &b->args[b->argidx]; | ||
87 | b->argidx += argspace; | ||
88 | |||
89 | return ret; | ||
90 | } | ||
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h new file mode 100644 index 000000000000..e6f7530b156c --- /dev/null +++ b/arch/x86/xen/multicalls.h | |||
@@ -0,0 +1,45 @@ | |||
1 | #ifndef _XEN_MULTICALLS_H | ||
2 | #define _XEN_MULTICALLS_H | ||
3 | |||
4 | #include "xen-ops.h" | ||
5 | |||
/* A reserved multicall: the call entry plus its argument storage. */
struct multicall_space {
	struct multicall_entry *mc;	/* slot for the call itself */
	void *args;			/* space for data the call points at */
};
12 | |||
13 | /* Allocate room for a multicall and its args */ | ||
14 | struct multicall_space __xen_mc_entry(size_t args); | ||
15 | |||
16 | DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags); | ||
17 | |||
18 | /* Call to start a batch of multiple __xen_mc_entry()s. Must be | ||
19 | paired with xen_mc_issue() */ | ||
20 | static inline void xen_mc_batch(void) | ||
21 | { | ||
22 | /* need to disable interrupts until this entry is complete */ | ||
23 | local_irq_save(__get_cpu_var(xen_mc_irq_flags)); | ||
24 | } | ||
25 | |||
26 | static inline struct multicall_space xen_mc_entry(size_t args) | ||
27 | { | ||
28 | xen_mc_batch(); | ||
29 | return __xen_mc_entry(args); | ||
30 | } | ||
31 | |||
32 | /* Flush all pending multicalls */ | ||
33 | void xen_mc_flush(void); | ||
34 | |||
35 | /* Issue a multicall if we're not in a lazy mode */ | ||
36 | static inline void xen_mc_issue(unsigned mode) | ||
37 | { | ||
38 | if ((xen_get_lazy_mode() & mode) == 0) | ||
39 | xen_mc_flush(); | ||
40 | |||
41 | /* restore flags saved in xen_mc_batch */ | ||
42 | local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); | ||
43 | } | ||
44 | |||
45 | #endif /* _XEN_MULTICALLS_H */ | ||
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c new file mode 100644 index 000000000000..f84e77226646 --- /dev/null +++ b/arch/x86/xen/setup.c | |||
@@ -0,0 +1,111 @@ | |||
1 | /* | ||
2 | * Machine specific setup for xen | ||
3 | * | ||
4 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | ||
5 | */ | ||
6 | |||
7 | #include <linux/module.h> | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/mm.h> | ||
10 | #include <linux/pm.h> | ||
11 | |||
12 | #include <asm/elf.h> | ||
13 | #include <asm/e820.h> | ||
14 | #include <asm/setup.h> | ||
15 | #include <asm/xen/hypervisor.h> | ||
16 | #include <asm/xen/hypercall.h> | ||
17 | |||
18 | #include <xen/interface/physdev.h> | ||
19 | #include <xen/features.h> | ||
20 | |||
21 | #include "xen-ops.h" | ||
22 | #include "vdso.h" | ||
23 | |||
24 | /* These are code, but not functions. Defined in entry.S */ | ||
25 | extern const char xen_hypervisor_callback[]; | ||
26 | extern const char xen_failsafe_callback[]; | ||
27 | |||
28 | unsigned long *phys_to_machine_mapping; | ||
29 | EXPORT_SYMBOL(phys_to_machine_mapping); | ||
30 | |||
31 | /** | ||
32 | * machine_specific_memory_setup - Hook for machine specific memory setup. | ||
33 | **/ | ||
34 | |||
35 | char * __init xen_memory_setup(void) | ||
36 | { | ||
37 | unsigned long max_pfn = xen_start_info->nr_pages; | ||
38 | |||
39 | e820.nr_map = 0; | ||
40 | add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM); | ||
41 | |||
42 | return "Xen"; | ||
43 | } | ||
44 | |||
45 | static void xen_idle(void) | ||
46 | { | ||
47 | local_irq_disable(); | ||
48 | |||
49 | if (need_resched()) | ||
50 | local_irq_enable(); | ||
51 | else { | ||
52 | current_thread_info()->status &= ~TS_POLLING; | ||
53 | smp_mb__after_clear_bit(); | ||
54 | safe_halt(); | ||
55 | current_thread_info()->status |= TS_POLLING; | ||
56 | } | ||
57 | } | ||
58 | |||
59 | /* | ||
60 | * Set the bit indicating "nosegneg" library variants should be used. | ||
61 | */ | ||
62 | static void fiddle_vdso(void) | ||
63 | { | ||
64 | extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */ | ||
65 | extern char vsyscall_int80_start; | ||
66 | u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK + | ||
67 | &vsyscall_int80_start); | ||
68 | *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; | ||
69 | } | ||
70 | |||
71 | void __init xen_arch_setup(void) | ||
72 | { | ||
73 | struct physdev_set_iopl set_iopl; | ||
74 | int rc; | ||
75 | |||
76 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); | ||
77 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); | ||
78 | |||
79 | if (!xen_feature(XENFEAT_auto_translated_physmap)) | ||
80 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); | ||
81 | |||
82 | HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, | ||
83 | __KERNEL_CS, (unsigned long)xen_failsafe_callback); | ||
84 | |||
85 | set_iopl.iopl = 1; | ||
86 | rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); | ||
87 | if (rc != 0) | ||
88 | printk(KERN_INFO "physdev_op failed %d\n", rc); | ||
89 | |||
90 | #ifdef CONFIG_ACPI | ||
91 | if (!(xen_start_info->flags & SIF_INITDOMAIN)) { | ||
92 | printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); | ||
93 | disable_acpi(); | ||
94 | } | ||
95 | #endif | ||
96 | |||
97 | memcpy(boot_command_line, xen_start_info->cmd_line, | ||
98 | MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? | ||
99 | COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); | ||
100 | |||
101 | pm_idle = xen_idle; | ||
102 | |||
103 | #ifdef CONFIG_SMP | ||
104 | /* fill cpus_possible with all available cpus */ | ||
105 | xen_fill_possible_map(); | ||
106 | #endif | ||
107 | |||
108 | paravirt_disable_iospace(); | ||
109 | |||
110 | fiddle_vdso(); | ||
111 | } | ||
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c new file mode 100644 index 000000000000..557b8e24706a --- /dev/null +++ b/arch/x86/xen/smp.c | |||
@@ -0,0 +1,404 @@ | |||
1 | /* | ||
2 | * Xen SMP support | ||
3 | * | ||
4 | * This file implements the Xen versions of smp_ops. SMP under Xen is | ||
5 | * very straightforward. Bringing a CPU up is simply a matter of | ||
6 | * loading its initial context and setting it running. | ||
7 | * | ||
8 | * IPIs are handled through the Xen event mechanism. | ||
9 | * | ||
10 | * Because virtual CPUs can be scheduled onto any real CPU, there's no | ||
11 | * useful topology information for the kernel to make use of. As a | ||
12 | * result, all CPUs are treated as if they're single-core and | ||
13 | * single-threaded. | ||
14 | * | ||
15 | * This does not handle HOTPLUG_CPU yet. | ||
16 | */ | ||
17 | #include <linux/sched.h> | ||
18 | #include <linux/err.h> | ||
19 | #include <linux/smp.h> | ||
20 | |||
21 | #include <asm/paravirt.h> | ||
22 | #include <asm/desc.h> | ||
23 | #include <asm/pgtable.h> | ||
24 | #include <asm/cpu.h> | ||
25 | |||
26 | #include <xen/interface/xen.h> | ||
27 | #include <xen/interface/vcpu.h> | ||
28 | |||
29 | #include <asm/xen/interface.h> | ||
30 | #include <asm/xen/hypercall.h> | ||
31 | |||
32 | #include <xen/page.h> | ||
33 | #include <xen/events.h> | ||
34 | |||
35 | #include "xen-ops.h" | ||
36 | #include "mmu.h" | ||
37 | |||
38 | static cpumask_t cpu_initialized_map; | ||
39 | static DEFINE_PER_CPU(int, resched_irq); | ||
40 | static DEFINE_PER_CPU(int, callfunc_irq); | ||
41 | |||
42 | /* | ||
43 | * Structure and data for smp_call_function(). This is designed to minimise | ||
44 | * static memory requirements. It also looks cleaner. | ||
45 | */ | ||
46 | static DEFINE_SPINLOCK(call_lock); | ||
47 | |||
48 | struct call_data_struct { | ||
49 | void (*func) (void *info); | ||
50 | void *info; | ||
51 | atomic_t started; | ||
52 | atomic_t finished; | ||
53 | int wait; | ||
54 | }; | ||
55 | |||
56 | static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); | ||
57 | |||
58 | static struct call_data_struct *call_data; | ||
59 | |||
60 | /* | ||
61 | * Reschedule call back. Nothing to do, | ||
62 | * all the work is done automatically when | ||
63 | * we return from the interrupt. | ||
64 | */ | ||
65 | static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) | ||
66 | { | ||
67 | return IRQ_HANDLED; | ||
68 | } | ||
69 | |||
70 | static __cpuinit void cpu_bringup_and_idle(void) | ||
71 | { | ||
72 | int cpu = smp_processor_id(); | ||
73 | |||
74 | cpu_init(); | ||
75 | |||
76 | preempt_disable(); | ||
77 | per_cpu(cpu_state, cpu) = CPU_ONLINE; | ||
78 | |||
79 | xen_setup_cpu_clockevents(); | ||
80 | |||
81 | /* We can take interrupts now: we're officially "up". */ | ||
82 | local_irq_enable(); | ||
83 | |||
84 | wmb(); /* make sure everything is out */ | ||
85 | cpu_idle(); | ||
86 | } | ||
87 | |||
88 | static int xen_smp_intr_init(unsigned int cpu) | ||
89 | { | ||
90 | int rc; | ||
91 | const char *resched_name, *callfunc_name; | ||
92 | |||
93 | per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; | ||
94 | |||
95 | resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); | ||
96 | rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, | ||
97 | cpu, | ||
98 | xen_reschedule_interrupt, | ||
99 | IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, | ||
100 | resched_name, | ||
101 | NULL); | ||
102 | if (rc < 0) | ||
103 | goto fail; | ||
104 | per_cpu(resched_irq, cpu) = rc; | ||
105 | |||
106 | callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); | ||
107 | rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, | ||
108 | cpu, | ||
109 | xen_call_function_interrupt, | ||
110 | IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, | ||
111 | callfunc_name, | ||
112 | NULL); | ||
113 | if (rc < 0) | ||
114 | goto fail; | ||
115 | per_cpu(callfunc_irq, cpu) = rc; | ||
116 | |||
117 | return 0; | ||
118 | |||
119 | fail: | ||
120 | if (per_cpu(resched_irq, cpu) >= 0) | ||
121 | unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); | ||
122 | if (per_cpu(callfunc_irq, cpu) >= 0) | ||
123 | unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); | ||
124 | return rc; | ||
125 | } | ||
126 | |||
127 | void __init xen_fill_possible_map(void) | ||
128 | { | ||
129 | int i, rc; | ||
130 | |||
131 | for (i = 0; i < NR_CPUS; i++) { | ||
132 | rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); | ||
133 | if (rc >= 0) | ||
134 | cpu_set(i, cpu_possible_map); | ||
135 | } | ||
136 | } | ||
137 | |||
138 | void __init xen_smp_prepare_boot_cpu(void) | ||
139 | { | ||
140 | int cpu; | ||
141 | |||
142 | BUG_ON(smp_processor_id() != 0); | ||
143 | native_smp_prepare_boot_cpu(); | ||
144 | |||
145 | /* We've switched to the "real" per-cpu gdt, so make sure the | ||
146 | old memory can be recycled */ | ||
147 | make_lowmem_page_readwrite(&per_cpu__gdt_page); | ||
148 | |||
149 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
150 | cpus_clear(cpu_sibling_map[cpu]); | ||
151 | cpus_clear(cpu_core_map[cpu]); | ||
152 | } | ||
153 | |||
154 | xen_setup_vcpu_info_placement(); | ||
155 | } | ||
156 | |||
157 | void __init xen_smp_prepare_cpus(unsigned int max_cpus) | ||
158 | { | ||
159 | unsigned cpu; | ||
160 | |||
161 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
162 | cpus_clear(cpu_sibling_map[cpu]); | ||
163 | cpus_clear(cpu_core_map[cpu]); | ||
164 | } | ||
165 | |||
166 | smp_store_cpu_info(0); | ||
167 | set_cpu_sibling_map(0); | ||
168 | |||
169 | if (xen_smp_intr_init(0)) | ||
170 | BUG(); | ||
171 | |||
172 | cpu_initialized_map = cpumask_of_cpu(0); | ||
173 | |||
174 | /* Restrict the possible_map according to max_cpus. */ | ||
175 | while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { | ||
176 | for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--) | ||
177 | continue; | ||
178 | cpu_clear(cpu, cpu_possible_map); | ||
179 | } | ||
180 | |||
181 | for_each_possible_cpu (cpu) { | ||
182 | struct task_struct *idle; | ||
183 | |||
184 | if (cpu == 0) | ||
185 | continue; | ||
186 | |||
187 | idle = fork_idle(cpu); | ||
188 | if (IS_ERR(idle)) | ||
189 | panic("failed fork for CPU %d", cpu); | ||
190 | |||
191 | cpu_set(cpu, cpu_present_map); | ||
192 | } | ||
193 | |||
194 | //init_xenbus_allowed_cpumask(); | ||
195 | } | ||
196 | |||
197 | static __cpuinit int | ||
198 | cpu_initialize_context(unsigned int cpu, struct task_struct *idle) | ||
199 | { | ||
200 | struct vcpu_guest_context *ctxt; | ||
201 | struct gdt_page *gdt = &per_cpu(gdt_page, cpu); | ||
202 | |||
203 | if (cpu_test_and_set(cpu, cpu_initialized_map)) | ||
204 | return 0; | ||
205 | |||
206 | ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); | ||
207 | if (ctxt == NULL) | ||
208 | return -ENOMEM; | ||
209 | |||
210 | ctxt->flags = VGCF_IN_KERNEL; | ||
211 | ctxt->user_regs.ds = __USER_DS; | ||
212 | ctxt->user_regs.es = __USER_DS; | ||
213 | ctxt->user_regs.fs = __KERNEL_PERCPU; | ||
214 | ctxt->user_regs.gs = 0; | ||
215 | ctxt->user_regs.ss = __KERNEL_DS; | ||
216 | ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; | ||
217 | ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ | ||
218 | |||
219 | memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); | ||
220 | |||
221 | xen_copy_trap_info(ctxt->trap_ctxt); | ||
222 | |||
223 | ctxt->ldt_ents = 0; | ||
224 | |||
225 | BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); | ||
226 | make_lowmem_page_readonly(gdt->gdt); | ||
227 | |||
228 | ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); | ||
229 | ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); | ||
230 | |||
231 | ctxt->user_regs.cs = __KERNEL_CS; | ||
232 | ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); | ||
233 | |||
234 | ctxt->kernel_ss = __KERNEL_DS; | ||
235 | ctxt->kernel_sp = idle->thread.esp0; | ||
236 | |||
237 | ctxt->event_callback_cs = __KERNEL_CS; | ||
238 | ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; | ||
239 | ctxt->failsafe_callback_cs = __KERNEL_CS; | ||
240 | ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; | ||
241 | |||
242 | per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); | ||
243 | ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); | ||
244 | |||
245 | if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) | ||
246 | BUG(); | ||
247 | |||
248 | kfree(ctxt); | ||
249 | return 0; | ||
250 | } | ||
251 | |||
252 | int __cpuinit xen_cpu_up(unsigned int cpu) | ||
253 | { | ||
254 | struct task_struct *idle = idle_task(cpu); | ||
255 | int rc; | ||
256 | |||
257 | #if 0 | ||
258 | rc = cpu_up_check(cpu); | ||
259 | if (rc) | ||
260 | return rc; | ||
261 | #endif | ||
262 | |||
263 | init_gdt(cpu); | ||
264 | per_cpu(current_task, cpu) = idle; | ||
265 | irq_ctx_init(cpu); | ||
266 | xen_setup_timer(cpu); | ||
267 | |||
268 | /* make sure interrupts start blocked */ | ||
269 | per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; | ||
270 | |||
271 | rc = cpu_initialize_context(cpu, idle); | ||
272 | if (rc) | ||
273 | return rc; | ||
274 | |||
275 | if (num_online_cpus() == 1) | ||
276 | alternatives_smp_switch(1); | ||
277 | |||
278 | rc = xen_smp_intr_init(cpu); | ||
279 | if (rc) | ||
280 | return rc; | ||
281 | |||
282 | smp_store_cpu_info(cpu); | ||
283 | set_cpu_sibling_map(cpu); | ||
284 | /* This must be done before setting cpu_online_map */ | ||
285 | wmb(); | ||
286 | |||
287 | cpu_set(cpu, cpu_online_map); | ||
288 | |||
289 | rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); | ||
290 | BUG_ON(rc); | ||
291 | |||
292 | return 0; | ||
293 | } | ||
294 | |||
/* Called with @max_cpus once cpu bring-up is over; nothing to do. */
void xen_smp_cpus_done(unsigned int max_cpus)
{
	/* intentionally empty */
}
298 | |||
299 | static void stop_self(void *v) | ||
300 | { | ||
301 | int cpu = smp_processor_id(); | ||
302 | |||
303 | /* make sure we're not pinning something down */ | ||
304 | load_cr3(swapper_pg_dir); | ||
305 | /* should set up a minimal gdt */ | ||
306 | |||
307 | HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); | ||
308 | BUG(); | ||
309 | } | ||
310 | |||
311 | void xen_smp_send_stop(void) | ||
312 | { | ||
313 | smp_call_function(stop_self, NULL, 0, 0); | ||
314 | } | ||
315 | |||
316 | void xen_smp_send_reschedule(int cpu) | ||
317 | { | ||
318 | xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); | ||
319 | } | ||
320 | |||
321 | |||
322 | static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) | ||
323 | { | ||
324 | unsigned cpu; | ||
325 | |||
326 | cpus_and(mask, mask, cpu_online_map); | ||
327 | |||
328 | for_each_cpu_mask(cpu, mask) | ||
329 | xen_send_IPI_one(cpu, vector); | ||
330 | } | ||
331 | |||
332 | static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) | ||
333 | { | ||
334 | void (*func) (void *info) = call_data->func; | ||
335 | void *info = call_data->info; | ||
336 | int wait = call_data->wait; | ||
337 | |||
338 | /* | ||
339 | * Notify initiating CPU that I've grabbed the data and am | ||
340 | * about to execute the function | ||
341 | */ | ||
342 | mb(); | ||
343 | atomic_inc(&call_data->started); | ||
344 | /* | ||
345 | * At this point the info structure may be out of scope unless wait==1 | ||
346 | */ | ||
347 | irq_enter(); | ||
348 | (*func)(info); | ||
349 | irq_exit(); | ||
350 | |||
351 | if (wait) { | ||
352 | mb(); /* commit everything before setting finished */ | ||
353 | atomic_inc(&call_data->finished); | ||
354 | } | ||
355 | |||
356 | return IRQ_HANDLED; | ||
357 | } | ||
358 | |||
359 | int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), | ||
360 | void *info, int wait) | ||
361 | { | ||
362 | struct call_data_struct data; | ||
363 | int cpus; | ||
364 | |||
365 | /* Holding any lock stops cpus from going down. */ | ||
366 | spin_lock(&call_lock); | ||
367 | |||
368 | cpu_clear(smp_processor_id(), mask); | ||
369 | |||
370 | cpus = cpus_weight(mask); | ||
371 | if (!cpus) { | ||
372 | spin_unlock(&call_lock); | ||
373 | return 0; | ||
374 | } | ||
375 | |||
376 | /* Can deadlock when called with interrupts disabled */ | ||
377 | WARN_ON(irqs_disabled()); | ||
378 | |||
379 | data.func = func; | ||
380 | data.info = info; | ||
381 | atomic_set(&data.started, 0); | ||
382 | data.wait = wait; | ||
383 | if (wait) | ||
384 | atomic_set(&data.finished, 0); | ||
385 | |||
386 | call_data = &data; | ||
387 | mb(); /* write everything before IPI */ | ||
388 | |||
389 | /* Send a message to other CPUs and wait for them to respond */ | ||
390 | xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); | ||
391 | |||
392 | /* Make sure other vcpus get a chance to run. | ||
393 | XXX too severe? Maybe we should check the other CPU's states? */ | ||
394 | HYPERVISOR_sched_op(SCHEDOP_yield, 0); | ||
395 | |||
396 | /* Wait for response */ | ||
397 | while (atomic_read(&data.started) != cpus || | ||
398 | (wait && atomic_read(&data.finished) != cpus)) | ||
399 | cpu_relax(); | ||
400 | |||
401 | spin_unlock(&call_lock); | ||
402 | |||
403 | return 0; | ||
404 | } | ||
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c new file mode 100644 index 000000000000..dfd6db69ead5 --- /dev/null +++ b/arch/x86/xen/time.c | |||
@@ -0,0 +1,593 @@ | |||
1 | /* | ||
2 | * Xen time implementation. | ||
3 | * | ||
4 | * This is implemented in terms of a clocksource driver which uses | ||
5 | * the hypervisor clock as a nanosecond timebase, and a clockevent | ||
6 | * driver which uses the hypervisor's timer mechanism. | ||
7 | * | ||
8 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | ||
9 | */ | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/interrupt.h> | ||
12 | #include <linux/clocksource.h> | ||
13 | #include <linux/clockchips.h> | ||
14 | #include <linux/kernel_stat.h> | ||
15 | |||
16 | #include <asm/xen/hypervisor.h> | ||
17 | #include <asm/xen/hypercall.h> | ||
18 | |||
19 | #include <xen/events.h> | ||
20 | #include <xen/interface/xen.h> | ||
21 | #include <xen/interface/vcpu.h> | ||
22 | |||
23 | #include "xen-ops.h" | ||
24 | |||
25 | #define XEN_SHIFT 22 | ||
26 | |||
27 | /* Xen may fire a timer up to this many ns early */ | ||
28 | #define TIMER_SLOP 100000 | ||
29 | #define NS_PER_TICK (1000000000LL / HZ) | ||
30 | |||
31 | static cycle_t xen_clocksource_read(void); | ||
32 | |||
33 | /* These are perodically updated in shared_info, and then copied here. */ | ||
34 | struct shadow_time_info { | ||
35 | u64 tsc_timestamp; /* TSC at last update of time vals. */ | ||
36 | u64 system_timestamp; /* Time, in nanosecs, since boot. */ | ||
37 | u32 tsc_to_nsec_mul; | ||
38 | int tsc_shift; | ||
39 | u32 version; | ||
40 | }; | ||
41 | |||
42 | static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); | ||
43 | |||
44 | /* runstate info updated by Xen */ | ||
45 | static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); | ||
46 | |||
47 | /* snapshots of runstate info */ | ||
48 | static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); | ||
49 | |||
50 | /* unused ns of stolen and blocked time */ | ||
51 | static DEFINE_PER_CPU(u64, residual_stolen); | ||
52 | static DEFINE_PER_CPU(u64, residual_blocked); | ||
53 | |||
54 | /* return an consistent snapshot of 64-bit time/counter value */ | ||
55 | static u64 get64(const u64 *p) | ||
56 | { | ||
57 | u64 ret; | ||
58 | |||
59 | if (BITS_PER_LONG < 64) { | ||
60 | u32 *p32 = (u32 *)p; | ||
61 | u32 h, l; | ||
62 | |||
63 | /* | ||
64 | * Read high then low, and then make sure high is | ||
65 | * still the same; this will only loop if low wraps | ||
66 | * and carries into high. | ||
67 | * XXX some clean way to make this endian-proof? | ||
68 | */ | ||
69 | do { | ||
70 | h = p32[1]; | ||
71 | barrier(); | ||
72 | l = p32[0]; | ||
73 | barrier(); | ||
74 | } while (p32[1] != h); | ||
75 | |||
76 | ret = (((u64)h) << 32) | l; | ||
77 | } else | ||
78 | ret = *p; | ||
79 | |||
80 | return ret; | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * Runstate accounting | ||
85 | */ | ||
86 | static void get_runstate_snapshot(struct vcpu_runstate_info *res) | ||
87 | { | ||
88 | u64 state_time; | ||
89 | struct vcpu_runstate_info *state; | ||
90 | |||
91 | BUG_ON(preemptible()); | ||
92 | |||
93 | state = &__get_cpu_var(runstate); | ||
94 | |||
95 | /* | ||
96 | * The runstate info is always updated by the hypervisor on | ||
97 | * the current CPU, so there's no need to use anything | ||
98 | * stronger than a compiler barrier when fetching it. | ||
99 | */ | ||
100 | do { | ||
101 | state_time = get64(&state->state_entry_time); | ||
102 | barrier(); | ||
103 | *res = *state; | ||
104 | barrier(); | ||
105 | } while (get64(&state->state_entry_time) != state_time); | ||
106 | } | ||
107 | |||
108 | static void setup_runstate_info(int cpu) | ||
109 | { | ||
110 | struct vcpu_register_runstate_memory_area area; | ||
111 | |||
112 | area.addr.v = &per_cpu(runstate, cpu); | ||
113 | |||
114 | if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, | ||
115 | cpu, &area)) | ||
116 | BUG(); | ||
117 | } | ||
118 | |||
119 | static void do_stolen_accounting(void) | ||
120 | { | ||
121 | struct vcpu_runstate_info state; | ||
122 | struct vcpu_runstate_info *snap; | ||
123 | s64 blocked, runnable, offline, stolen; | ||
124 | cputime_t ticks; | ||
125 | |||
126 | get_runstate_snapshot(&state); | ||
127 | |||
128 | WARN_ON(state.state != RUNSTATE_running); | ||
129 | |||
130 | snap = &__get_cpu_var(runstate_snapshot); | ||
131 | |||
132 | /* work out how much time the VCPU has not been runn*ing* */ | ||
133 | blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; | ||
134 | runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; | ||
135 | offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; | ||
136 | |||
137 | *snap = state; | ||
138 | |||
139 | /* Add the appropriate number of ticks of stolen time, | ||
140 | including any left-overs from last time. Passing NULL to | ||
141 | account_steal_time accounts the time as stolen. */ | ||
142 | stolen = runnable + offline + __get_cpu_var(residual_stolen); | ||
143 | |||
144 | if (stolen < 0) | ||
145 | stolen = 0; | ||
146 | |||
147 | ticks = 0; | ||
148 | while (stolen >= NS_PER_TICK) { | ||
149 | ticks++; | ||
150 | stolen -= NS_PER_TICK; | ||
151 | } | ||
152 | __get_cpu_var(residual_stolen) = stolen; | ||
153 | account_steal_time(NULL, ticks); | ||
154 | |||
155 | /* Add the appropriate number of ticks of blocked time, | ||
156 | including any left-overs from last time. Passing idle to | ||
157 | account_steal_time accounts the time as idle/wait. */ | ||
158 | blocked += __get_cpu_var(residual_blocked); | ||
159 | |||
160 | if (blocked < 0) | ||
161 | blocked = 0; | ||
162 | |||
163 | ticks = 0; | ||
164 | while (blocked >= NS_PER_TICK) { | ||
165 | ticks++; | ||
166 | blocked -= NS_PER_TICK; | ||
167 | } | ||
168 | __get_cpu_var(residual_blocked) = blocked; | ||
169 | account_steal_time(idle_task(smp_processor_id()), ticks); | ||
170 | } | ||
171 | |||
172 | /* | ||
173 | * Xen sched_clock implementation. Returns the number of unstolen | ||
174 | * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED | ||
175 | * states. | ||
176 | */ | ||
177 | unsigned long long xen_sched_clock(void) | ||
178 | { | ||
179 | struct vcpu_runstate_info state; | ||
180 | cycle_t now; | ||
181 | u64 ret; | ||
182 | s64 offset; | ||
183 | |||
184 | /* | ||
185 | * Ideally sched_clock should be called on a per-cpu basis | ||
186 | * anyway, so preempt should already be disabled, but that's | ||
187 | * not current practice at the moment. | ||
188 | */ | ||
189 | preempt_disable(); | ||
190 | |||
191 | now = xen_clocksource_read(); | ||
192 | |||
193 | get_runstate_snapshot(&state); | ||
194 | |||
195 | WARN_ON(state.state != RUNSTATE_running); | ||
196 | |||
197 | offset = now - state.state_entry_time; | ||
198 | if (offset < 0) | ||
199 | offset = 0; | ||
200 | |||
201 | ret = state.time[RUNSTATE_blocked] + | ||
202 | state.time[RUNSTATE_running] + | ||
203 | offset; | ||
204 | |||
205 | preempt_enable(); | ||
206 | |||
207 | return ret; | ||
208 | } | ||
209 | |||
210 | |||
211 | /* Get the CPU speed from Xen */ | ||
212 | unsigned long xen_cpu_khz(void) | ||
213 | { | ||
214 | u64 cpu_khz = 1000000ULL << 32; | ||
215 | const struct vcpu_time_info *info = | ||
216 | &HYPERVISOR_shared_info->vcpu_info[0].time; | ||
217 | |||
218 | do_div(cpu_khz, info->tsc_to_system_mul); | ||
219 | if (info->tsc_shift < 0) | ||
220 | cpu_khz <<= -info->tsc_shift; | ||
221 | else | ||
222 | cpu_khz >>= info->tsc_shift; | ||
223 | |||
224 | return cpu_khz; | ||
225 | } | ||
226 | |||
227 | /* | ||
228 | * Reads a consistent set of time-base values from Xen, into a shadow data | ||
229 | * area. | ||
230 | */ | ||
231 | static unsigned get_time_values_from_xen(void) | ||
232 | { | ||
233 | struct vcpu_time_info *src; | ||
234 | struct shadow_time_info *dst; | ||
235 | |||
236 | /* src is shared memory with the hypervisor, so we need to | ||
237 | make sure we get a consistent snapshot, even in the face of | ||
238 | being preempted. */ | ||
239 | src = &__get_cpu_var(xen_vcpu)->time; | ||
240 | dst = &__get_cpu_var(shadow_time); | ||
241 | |||
242 | do { | ||
243 | dst->version = src->version; | ||
244 | rmb(); /* fetch version before data */ | ||
245 | dst->tsc_timestamp = src->tsc_timestamp; | ||
246 | dst->system_timestamp = src->system_time; | ||
247 | dst->tsc_to_nsec_mul = src->tsc_to_system_mul; | ||
248 | dst->tsc_shift = src->tsc_shift; | ||
249 | rmb(); /* test version after fetching data */ | ||
250 | } while ((src->version & 1) | (dst->version ^ src->version)); | ||
251 | |||
252 | return dst->version; | ||
253 | } | ||
254 | |||
255 | /* | ||
256 | * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, | ||
257 | * yielding a 64-bit result. | ||
258 | */ | ||
259 | static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) | ||
260 | { | ||
261 | u64 product; | ||
262 | #ifdef __i386__ | ||
263 | u32 tmp1, tmp2; | ||
264 | #endif | ||
265 | |||
266 | if (shift < 0) | ||
267 | delta >>= -shift; | ||
268 | else | ||
269 | delta <<= shift; | ||
270 | |||
271 | #ifdef __i386__ | ||
272 | __asm__ ( | ||
273 | "mul %5 ; " | ||
274 | "mov %4,%%eax ; " | ||
275 | "mov %%edx,%4 ; " | ||
276 | "mul %5 ; " | ||
277 | "xor %5,%5 ; " | ||
278 | "add %4,%%eax ; " | ||
279 | "adc %5,%%edx ; " | ||
280 | : "=A" (product), "=r" (tmp1), "=r" (tmp2) | ||
281 | : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); | ||
282 | #elif __x86_64__ | ||
283 | __asm__ ( | ||
284 | "mul %%rdx ; shrd $32,%%rdx,%%rax" | ||
285 | : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); | ||
286 | #else | ||
287 | #error implement me! | ||
288 | #endif | ||
289 | |||
290 | return product; | ||
291 | } | ||
292 | |||
293 | static u64 get_nsec_offset(struct shadow_time_info *shadow) | ||
294 | { | ||
295 | u64 now, delta; | ||
296 | now = native_read_tsc(); | ||
297 | delta = now - shadow->tsc_timestamp; | ||
298 | return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); | ||
299 | } | ||
300 | |||
301 | static cycle_t xen_clocksource_read(void) | ||
302 | { | ||
303 | struct shadow_time_info *shadow = &get_cpu_var(shadow_time); | ||
304 | cycle_t ret; | ||
305 | unsigned version; | ||
306 | |||
307 | do { | ||
308 | version = get_time_values_from_xen(); | ||
309 | barrier(); | ||
310 | ret = shadow->system_timestamp + get_nsec_offset(shadow); | ||
311 | barrier(); | ||
312 | } while (version != __get_cpu_var(xen_vcpu)->time.version); | ||
313 | |||
314 | put_cpu_var(shadow_time); | ||
315 | |||
316 | return ret; | ||
317 | } | ||
318 | |||
319 | static void xen_read_wallclock(struct timespec *ts) | ||
320 | { | ||
321 | const struct shared_info *s = HYPERVISOR_shared_info; | ||
322 | u32 version; | ||
323 | u64 delta; | ||
324 | struct timespec now; | ||
325 | |||
326 | /* get wallclock at system boot */ | ||
327 | do { | ||
328 | version = s->wc_version; | ||
329 | rmb(); /* fetch version before time */ | ||
330 | now.tv_sec = s->wc_sec; | ||
331 | now.tv_nsec = s->wc_nsec; | ||
332 | rmb(); /* fetch time before checking version */ | ||
333 | } while ((s->wc_version & 1) | (version ^ s->wc_version)); | ||
334 | |||
335 | delta = xen_clocksource_read(); /* time since system boot */ | ||
336 | delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; | ||
337 | |||
338 | now.tv_nsec = do_div(delta, NSEC_PER_SEC); | ||
339 | now.tv_sec = delta; | ||
340 | |||
341 | set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); | ||
342 | } | ||
343 | |||
344 | unsigned long xen_get_wallclock(void) | ||
345 | { | ||
346 | struct timespec ts; | ||
347 | |||
348 | xen_read_wallclock(&ts); | ||
349 | |||
350 | return ts.tv_sec; | ||
351 | } | ||
352 | |||
/* Setting the wallclock is not implemented; @now is ignored. */
int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}
358 | |||
359 | static struct clocksource xen_clocksource __read_mostly = { | ||
360 | .name = "xen", | ||
361 | .rating = 400, | ||
362 | .read = xen_clocksource_read, | ||
363 | .mask = ~0, | ||
364 | .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */ | ||
365 | .shift = XEN_SHIFT, | ||
366 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | ||
367 | }; | ||
368 | |||
369 | /* | ||
370 | Xen clockevent implementation | ||
371 | |||
372 | Xen has two clockevent implementations: | ||
373 | |||
374 | The old timer_op one works with all released versions of Xen prior | ||
375 | to version 3.0.4. This version of the hypervisor provides a | ||
376 | single-shot timer with nanosecond resolution. However, sharing the | ||
377 | same event channel is a 100Hz tick which is delivered while the | ||
378 | vcpu is running. We don't care about or use this tick, but it will | ||
379 | cause the core time code to think the timer fired too soon, and | ||
380 | will end up resetting it each time. It could be filtered, but | ||
381 | doing so has complications when the ktime clocksource is not yet | ||
382 | the xen clocksource (ie, at boot time). | ||
383 | |||
384 | The new vcpu_op-based timer interface allows the tick timer period | ||
385 | to be changed or turned off. The tick timer is not useful as a | ||
386 | periodic timer because events are only delivered to running vcpus. | ||
387 | The one-shot timer can report when a timeout is in the past, so | ||
388 | set_next_event is capable of returning -ETIME when appropriate. | ||
389 | This interface is used when available. | ||
390 | */ | ||
391 | |||
392 | |||
393 | /* | ||
394 | Get a hypervisor absolute time. In theory we could maintain an | ||
395 | offset between the kernel's time and the hypervisor's time, and | ||
396 | apply that to a kernel's absolute timeout. Unfortunately the | ||
397 | hypervisor and kernel times can drift even if the kernel is using | ||
398 | the Xen clocksource, because ntp can warp the kernel's clocksource. | ||
399 | */ | ||
400 | static s64 get_abs_timeout(unsigned long delta) | ||
401 | { | ||
402 | return xen_clocksource_read() + delta; | ||
403 | } | ||
404 | |||
405 | static void xen_timerop_set_mode(enum clock_event_mode mode, | ||
406 | struct clock_event_device *evt) | ||
407 | { | ||
408 | switch (mode) { | ||
409 | case CLOCK_EVT_MODE_PERIODIC: | ||
410 | /* unsupported */ | ||
411 | WARN_ON(1); | ||
412 | break; | ||
413 | |||
414 | case CLOCK_EVT_MODE_ONESHOT: | ||
415 | case CLOCK_EVT_MODE_RESUME: | ||
416 | break; | ||
417 | |||
418 | case CLOCK_EVT_MODE_UNUSED: | ||
419 | case CLOCK_EVT_MODE_SHUTDOWN: | ||
420 | HYPERVISOR_set_timer_op(0); /* cancel timeout */ | ||
421 | break; | ||
422 | } | ||
423 | } | ||
424 | |||
425 | static int xen_timerop_set_next_event(unsigned long delta, | ||
426 | struct clock_event_device *evt) | ||
427 | { | ||
428 | WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); | ||
429 | |||
430 | if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0) | ||
431 | BUG(); | ||
432 | |||
433 | /* We may have missed the deadline, but there's no real way of | ||
434 | knowing for sure. If the event was in the past, then we'll | ||
435 | get an immediate interrupt. */ | ||
436 | |||
437 | return 0; | ||
438 | } | ||
439 | |||
440 | static const struct clock_event_device xen_timerop_clockevent = { | ||
441 | .name = "xen", | ||
442 | .features = CLOCK_EVT_FEAT_ONESHOT, | ||
443 | |||
444 | .max_delta_ns = 0xffffffff, | ||
445 | .min_delta_ns = TIMER_SLOP, | ||
446 | |||
447 | .mult = 1, | ||
448 | .shift = 0, | ||
449 | .rating = 500, | ||
450 | |||
451 | .set_mode = xen_timerop_set_mode, | ||
452 | .set_next_event = xen_timerop_set_next_event, | ||
453 | }; | ||
454 | |||
455 | |||
456 | |||
457 | static void xen_vcpuop_set_mode(enum clock_event_mode mode, | ||
458 | struct clock_event_device *evt) | ||
459 | { | ||
460 | int cpu = smp_processor_id(); | ||
461 | |||
462 | switch (mode) { | ||
463 | case CLOCK_EVT_MODE_PERIODIC: | ||
464 | WARN_ON(1); /* unsupported */ | ||
465 | break; | ||
466 | |||
467 | case CLOCK_EVT_MODE_ONESHOT: | ||
468 | if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) | ||
469 | BUG(); | ||
470 | break; | ||
471 | |||
472 | case CLOCK_EVT_MODE_UNUSED: | ||
473 | case CLOCK_EVT_MODE_SHUTDOWN: | ||
474 | if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) || | ||
475 | HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) | ||
476 | BUG(); | ||
477 | break; | ||
478 | case CLOCK_EVT_MODE_RESUME: | ||
479 | break; | ||
480 | } | ||
481 | } | ||
482 | |||
483 | static int xen_vcpuop_set_next_event(unsigned long delta, | ||
484 | struct clock_event_device *evt) | ||
485 | { | ||
486 | int cpu = smp_processor_id(); | ||
487 | struct vcpu_set_singleshot_timer single; | ||
488 | int ret; | ||
489 | |||
490 | WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); | ||
491 | |||
492 | single.timeout_abs_ns = get_abs_timeout(delta); | ||
493 | single.flags = VCPU_SSHOTTMR_future; | ||
494 | |||
495 | ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single); | ||
496 | |||
497 | BUG_ON(ret != 0 && ret != -ETIME); | ||
498 | |||
499 | return ret; | ||
500 | } | ||
501 | |||
502 | static const struct clock_event_device xen_vcpuop_clockevent = { | ||
503 | .name = "xen", | ||
504 | .features = CLOCK_EVT_FEAT_ONESHOT, | ||
505 | |||
506 | .max_delta_ns = 0xffffffff, | ||
507 | .min_delta_ns = TIMER_SLOP, | ||
508 | |||
509 | .mult = 1, | ||
510 | .shift = 0, | ||
511 | .rating = 500, | ||
512 | |||
513 | .set_mode = xen_vcpuop_set_mode, | ||
514 | .set_next_event = xen_vcpuop_set_next_event, | ||
515 | }; | ||
516 | |||
517 | static const struct clock_event_device *xen_clockevent = | ||
518 | &xen_timerop_clockevent; | ||
519 | static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events); | ||
520 | |||
521 | static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) | ||
522 | { | ||
523 | struct clock_event_device *evt = &__get_cpu_var(xen_clock_events); | ||
524 | irqreturn_t ret; | ||
525 | |||
526 | ret = IRQ_NONE; | ||
527 | if (evt->event_handler) { | ||
528 | evt->event_handler(evt); | ||
529 | ret = IRQ_HANDLED; | ||
530 | } | ||
531 | |||
532 | do_stolen_accounting(); | ||
533 | |||
534 | return ret; | ||
535 | } | ||
536 | |||
537 | void xen_setup_timer(int cpu) | ||
538 | { | ||
539 | const char *name; | ||
540 | struct clock_event_device *evt; | ||
541 | int irq; | ||
542 | |||
543 | printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu); | ||
544 | |||
545 | name = kasprintf(GFP_KERNEL, "timer%d", cpu); | ||
546 | if (!name) | ||
547 | name = "<timer kasprintf failed>"; | ||
548 | |||
549 | irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, | ||
550 | IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, | ||
551 | name, NULL); | ||
552 | |||
553 | evt = &per_cpu(xen_clock_events, cpu); | ||
554 | memcpy(evt, xen_clockevent, sizeof(*evt)); | ||
555 | |||
556 | evt->cpumask = cpumask_of_cpu(cpu); | ||
557 | evt->irq = irq; | ||
558 | |||
559 | setup_runstate_info(cpu); | ||
560 | } | ||
561 | |||
562 | void xen_setup_cpu_clockevents(void) | ||
563 | { | ||
564 | BUG_ON(preemptible()); | ||
565 | |||
566 | clockevents_register_device(&__get_cpu_var(xen_clock_events)); | ||
567 | } | ||
568 | |||
569 | __init void xen_time_init(void) | ||
570 | { | ||
571 | int cpu = smp_processor_id(); | ||
572 | |||
573 | get_time_values_from_xen(); | ||
574 | |||
575 | clocksource_register(&xen_clocksource); | ||
576 | |||
577 | if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { | ||
578 | /* Successfully turned off 100Hz tick, so we have the | ||
579 | vcpuop-based timer interface */ | ||
580 | printk(KERN_DEBUG "Xen: using vcpuop timer interface\n"); | ||
581 | xen_clockevent = &xen_vcpuop_clockevent; | ||
582 | } | ||
583 | |||
584 | /* Set initial system time with full resolution */ | ||
585 | xen_read_wallclock(&xtime); | ||
586 | set_normalized_timespec(&wall_to_monotonic, | ||
587 | -xtime.tv_sec, -xtime.tv_nsec); | ||
588 | |||
589 | tsc_disable = 0; | ||
590 | |||
591 | xen_setup_timer(cpu); | ||
592 | xen_setup_cpu_clockevents(); | ||
593 | } | ||
diff --git a/arch/x86/xen/vdso.h b/arch/x86/xen/vdso.h new file mode 100644 index 000000000000..861fedfe5230 --- /dev/null +++ b/arch/x86/xen/vdso.h | |||
@@ -0,0 +1,4 @@ | |||
1 | /* Bit used for the pseudo-hwcap for non-negative segments. We use | ||
2 | bit 1 to avoid bugs in some versions of glibc when bit 0 is | ||
3 | used; the choice is otherwise arbitrary. */ | ||
4 | #define VDSO_NOTE_NONEGSEG_BIT 1 | ||
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S new file mode 100644 index 000000000000..1a43b60c0c62 --- /dev/null +++ b/arch/x86/xen/xen-asm.S | |||
@@ -0,0 +1,291 @@ | |||
1 | /* | ||
2 | Asm versions of Xen pv-ops, suitable for either direct use or inlining. | ||
3 | The inline versions are the same as the direct-use versions, with the | ||
4 | pre- and post-amble chopped off. | ||
5 | |||
6 | This code is encoded for size rather than absolute efficiency, | ||
7 | with a view to being able to inline as much as possible. | ||
8 | |||
9 | We only bother with direct forms (ie, vcpu in pda) of the operations | ||
10 | here; the indirect forms are better handled in C, since they're | ||
11 | generally too large to inline anyway. | ||
12 | */ | ||
13 | |||
14 | #include <linux/linkage.h> | ||
15 | |||
16 | #include <asm/asm-offsets.h> | ||
17 | #include <asm/thread_info.h> | ||
18 | #include <asm/percpu.h> | ||
19 | #include <asm/processor-flags.h> | ||
20 | #include <asm/segment.h> | ||
21 | |||
22 | #include <xen/interface/xen.h> | ||
23 | |||
24 | #define RELOC(x, v) .globl x##_reloc; x##_reloc=v /* export symbol x_reloc with value v */ | ||
25 | #define ENDPATCH(x) .globl x##_end; x##_end=. /* export symbol x_end at the current location */ | ||
26 | |||
27 | /* Pseudo-flag used for virtual NMI, which we don't implement yet; xen_iret_direct tests it in the saved eflags */ | ||
28 | #define XEN_EFLAGS_NMI 0x80000000 | ||
29 | |||
30 | /* | ||
31 | Enable events. This clears the event mask and tests the pending | ||
32 | event status with a single AND operation. If there are pending | ||
33 | events, then enter the hypervisor to get them handled. | ||
34 | */ | ||
35 | ENTRY(xen_irq_enable_direct) | ||
36 | /* Clear mask and test pending: the 16-bit AND keeps the low (pending) byte, zeroes the high (mask) byte and sets ZF when nothing is pending */ | ||
37 | andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending | ||
38 | /* Preempt here doesn't matter because that will deal with | ||
39 | any pending interrupts. The pending check may end up being | ||
40 | run on the wrong CPU, but that doesn't hurt. */ | ||
41 | jz 1f /* nothing pending: skip the event check */ | ||
42 | 2: call check_events | ||
43 | 1: | ||
44 | ENDPATCH(xen_irq_enable_direct) /* xen_irq_enable_direct_end: placed before the ret, so the ret lies outside the marked range */ | ||
45 | ret | ||
46 | ENDPROC(xen_irq_enable_direct) | ||
47 | RELOC(xen_irq_enable_direct, 2b+1) /* one byte into the call at 2: -- presumably its displacement field; confirm against the patching code */ | ||
48 | |||
49 | |||
50 | /* | ||
51 | Disabling events is simply a matter of making the event mask | ||
52 | non-zero. | ||
53 | */ | ||
54 | ENTRY(xen_irq_disable_direct) | ||
55 | movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask /* any non-zero mask byte means events are disabled */ | ||
56 | ENDPATCH(xen_irq_disable_direct) /* xen_irq_disable_direct_end: placed before the ret */ | ||
57 | ret | ||
58 | ENDPROC(xen_irq_disable_direct) | ||
59 | RELOC(xen_irq_disable_direct, 0) /* no call in this body; 0 presumably means "nothing to relocate" -- confirm against the patching code */ | ||
60 | |||
61 | /* | ||
62 | (xen_)save_fl is used to get the current interrupt enable status. | ||
63 | Callers expect the status to be in X86_EFLAGS_IF, and other bits | ||
64 | may be set in the return value. We take advantage of this by | ||
65 | making sure that X86_EFLAGS_IF has the right value (and other bits | ||
66 | in that byte are 0), but other bits in the return value are | ||
67 | undefined. We need to toggle the state of the bit, because | ||
68 | Xen and x86 use opposite senses (mask vs enable). | ||
69 | */ | ||
70 | ENTRY(xen_save_fl_direct) | ||
71 | testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask /* ZF set when the mask byte is 0, i.e. events enabled */ | ||
72 | setz %ah /* %ah = 1 if unmasked, 0 if masked (inverts Xen's sense) */ | ||
73 | addb %ah,%ah /* %ah = 2 or 0: bit 1 of %ah, the bit xen_restore_fl_direct tests as X86_EFLAGS_IF>>8 */ | ||
74 | ENDPATCH(xen_save_fl_direct) /* xen_save_fl_direct_end: placed before the ret */ | ||
75 | ret | ||
76 | ENDPROC(xen_save_fl_direct) | ||
77 | RELOC(xen_save_fl_direct, 0) /* no call in this body; 0 presumably means "nothing to relocate" -- confirm against the patching code */ | ||
78 | |||
79 | |||
80 | /* | ||
81 | In principle the caller should be passing us a value returned | ||
82 | from xen_save_fl_direct, but for robustness' sake we test only | ||
83 | the X86_EFLAGS_IF flag rather than the whole byte. After | ||
84 | setting the interrupt mask state, it checks for unmasked | ||
85 | pending events and enters the hypervisor to get them delivered | ||
86 | if so. | ||
87 | */ | ||
88 | ENTRY(xen_restore_fl_direct) | ||
89 | testb $X86_EFLAGS_IF>>8, %ah /* ZF set when the caller's flags value has IF clear */ | ||
90 | setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask /* mask = 1 if IF clear, 0 if IF set */ | ||
91 | /* Preempt here doesn't matter because that will deal with | ||
92 | any pending interrupts. The pending check may end up being | ||
93 | run on the wrong CPU, but that doesn't hurt. */ | ||
94 | |||
95 | /* check for unmasked and pending: the 16-bit value equals 0x0001 only when pending == 1 and mask == 0 */ | ||
96 | cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending | ||
97 | jnz 1f /* not (pending and unmasked): nothing to deliver, skip the event check */ | ||
98 | 2: call check_events | ||
99 | 1: | ||
100 | ENDPATCH(xen_restore_fl_direct) /* xen_restore_fl_direct_end: placed before the ret */ | ||
101 | ret | ||
102 | ENDPROC(xen_restore_fl_direct) | ||
103 | RELOC(xen_restore_fl_direct, 2b+1) /* one byte into the call at 2: -- presumably its displacement field; confirm against the patching code */ | ||
104 | |||
105 | /* | ||
106 | This is run where a normal iret would be run, with the same stack setup: | ||
107 | 8: eflags | ||
108 | 4: cs | ||
109 | esp-> 0: eip | ||
110 | |||
111 | This attempts to make sure that any pending events are dealt | ||
112 | with on return to usermode, but there is a small window in | ||
113 | which an event can happen just before entering usermode. If | ||
114 | the nested interrupt ends up setting one of the TIF_WORK_MASK | ||
115 | pending work flags, they will not be tested again before | ||
116 | returning to usermode. This means that a process can end up | ||
117 | with pending work, which will be unprocessed until the process | ||
118 | enters and leaves the kernel again, which could be an | ||
119 | unbounded amount of time. This means that a pending signal or | ||
120 | reschedule event could be indefinitely delayed. | ||
121 | |||
122 | The fix is to notice a nested interrupt in the critical | ||
123 | window, and if one occurs, then fold the nested interrupt into | ||
124 | the current interrupt stack frame, and re-process it | ||
125 | iteratively rather than recursively. This means that it will | ||
126 | exit via the normal path, and all pending work will be dealt | ||
127 | with appropriately. | ||
128 | |||
129 | Because the nested interrupt handler needs to deal with the | ||
130 | current stack state in whatever form it's in, we keep things | ||
131 | simple by only using a single register which is pushed/popped | ||
132 | on the stack. | ||
133 | |||
134 | Non-direct iret could be done in the same way, but it would | ||
135 | require an annoying amount of code duplication. We'll assume | ||
136 | that direct mode will be the common case once the hypervisor | ||
137 | support becomes commonplace. | ||
138 | */ | ||
139 | ENTRY(xen_iret_direct) | ||
140 | /* test eflags for special cases */ | ||
141 | testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) | ||
142 | jnz hyper_iret /* VM86 or NMI pseudo-flag set: use the iret hypercall instead */ | ||
143 | |||
144 | push %eax /* the single scratch register; restored by the popl below */ | ||
145 | ESP_OFFSET=4 # bytes pushed onto stack | ||
146 | |||
147 | /* Store vcpu_info pointer for easy access. Do it this | ||
148 | way to avoid having to reload %fs */ | ||
149 | #ifdef CONFIG_SMP | ||
150 | GET_THREAD_INFO(%eax) | ||
151 | movl TI_cpu(%eax),%eax /* %eax = cpu field of the thread info */ | ||
152 | movl __per_cpu_offset(,%eax,4),%eax /* %eax = __per_cpu_offset[cpu] */ | ||
153 | lea per_cpu__xen_vcpu_info(%eax),%eax /* %eax = address of that cpu's xen_vcpu_info */ | ||
154 | #else | ||
155 | movl $per_cpu__xen_vcpu_info, %eax | ||
156 | #endif | ||
157 | |||
158 | /* check IF state we're restoring: byte 1 of the saved eflags, offset by the pushed %eax */ | ||
159 | testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) | ||
160 | |||
161 | /* Maybe enable events. Once this happens we could get a | ||
162 | recursive event, so the critical region starts immediately | ||
163 | afterwards. However, if that happens we don't end up | ||
164 | resuming the code, so we don't have to be worried about | ||
165 | being preempted to another CPU. */ | ||
166 | setz XEN_vcpu_info_mask(%eax) /* mask = 1 if IF is clear in the frame, 0 if it is set */ | ||
167 | xen_iret_start_crit: | ||
168 | |||
169 | /* check for unmasked and pending: ZF set only when pending == 1 and mask == 0 */ | ||
170 | cmpw $0x0001, XEN_vcpu_info_pending(%eax) | ||
171 | |||
172 | /* If there's something pending, mask events again so we | ||
173 | can jump back into xen_hypervisor_callback */ | ||
174 | sete XEN_vcpu_info_mask(%eax) | ||
175 | |||
176 | popl %eax /* pop leaves the flags alone, so ZF from the cmpw is still valid for the je below */ | ||
177 | |||
178 | /* From this point on the registers are restored and the stack | ||
179 | updated, so we don't need to worry about it if we're preempted */ | ||
180 | iret_restore_end: | ||
181 | |||
182 | /* Jump to hypervisor_callback after fixing up the stack. | ||
183 | Events are masked, so jumping out of the critical | ||
184 | region is OK. */ | ||
185 | je xen_hypervisor_callback | ||
186 | |||
187 | iret | ||
188 | xen_iret_end_crit: | ||
189 | |||
190 | hyper_iret: | ||
191 | /* put this out of line since it's very rarely used */ | ||
192 | jmp hypercall_page + __HYPERVISOR_iret * 32 /* hypercall_page entries are addressed at a 32-byte stride */ | ||
193 | |||
194 | .globl xen_iret_start_crit, xen_iret_end_crit | ||
195 | |||
196 | /* | ||
197 | This is called by xen_hypervisor_callback in entry.S when it sees | ||
198 | that the EIP at the time of interrupt was between xen_iret_start_crit | ||
199 | and xen_iret_end_crit. We're passed the EIP in %eax so we can do | ||
200 | a more refined determination of what to do. | ||
201 | |||
202 | The stack format at this point is: | ||
203 | ---------------- | ||
204 | ss : (ss/esp may be present if we came from usermode) | ||
205 | esp : | ||
206 | eflags } outer exception info | ||
207 | cs } | ||
208 | eip } | ||
209 | ---------------- <- edi (copy dest) | ||
210 | eax : outer eax if it hasn't been restored | ||
211 | ---------------- | ||
212 | eflags } nested exception info | ||
213 | cs } (no ss/esp because we're nested | ||
214 | eip } from the same ring) | ||
215 | orig_eax }<- esi (copy src) | ||
216 | - - - - - - - - | ||
217 | fs } | ||
218 | es } | ||
219 | ds } SAVE_ALL state | ||
220 | eax } | ||
221 | : : | ||
222 | ebx } | ||
223 | ---------------- | ||
224 | return addr <- esp | ||
225 | ---------------- | ||
226 | |||
227 | In order to deliver the nested exception properly, we need to shift | ||
228 | everything from the return addr up to the error code so it | ||
229 | sits just under the outer exception info. This means that when we | ||
230 | handle the exception, we do it in the context of the outer exception | ||
231 | rather than starting a new one. | ||
232 | |||
233 | The only caveat is that if the outer eax hasn't been | ||
234 | restored yet (ie, it's still on stack), we need to insert | ||
235 | its value into the SAVE_ALL state before going on, since | ||
236 | it's usermode state which we eventually need to restore. | ||
237 | */ | ||
238 | ENTRY(xen_iret_crit_fixup) | ||
239 | /* offsets +4 for return address */ | ||
240 | |||
241 | /* | ||
242 | Paranoia: Make sure we're really coming from kernel space. | ||
243 | One could imagine a case where userspace jumps into the | ||
244 | critical range address, but just before the CPU delivers a GP, | ||
245 | it decides to deliver an interrupt instead. Unlikely? | ||
246 | Definitely. Easy to avoid? Yes. The Intel documents | ||
247 | explicitly say that the reported EIP for a bad jump is the | ||
248 | jump instruction itself, not the destination, but some virtual | ||
249 | environments get this wrong. | ||
250 | */ | ||
251 | movl PT_CS+4(%esp), %ecx | ||
252 | andl $SEGMENT_RPL_MASK, %ecx | ||
253 | cmpl $USER_RPL, %ecx | ||
254 | je 2f /* interrupted frame has user RPL: no fixup, just return */ | ||
255 | |||
256 | lea PT_ORIG_EAX+4(%esp), %esi /* esi = copy source (orig_eax slot of the nested frame) */ | ||
257 | lea PT_EFLAGS+4(%esp), %edi /* edi = copy destination (top of the nested exception info) */ | ||
258 | |||
259 | /* If eip is before iret_restore_end then stack | ||
260 | hasn't been restored yet. */ | ||
261 | cmp $iret_restore_end, %eax | ||
262 | jae 1f /* %eax already popped by xen_iret_direct: nothing extra to save */ | ||
263 | |||
264 | movl 0+4(%edi),%eax /* copy EAX */ | ||
265 | movl %eax, PT_EAX+4(%esp) /* store it in the eax slot of the saved register state */ | ||
266 | |||
267 | lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ | ||
268 | |||
269 | /* set up the copy: direction flag set so movsl walks from high to low addresses */ | ||
270 | 1: std | ||
271 | mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ | ||
272 | rep movsl | ||
273 | cld /* restore the normal direction flag */ | ||
274 | |||
275 | lea 4(%edi),%esp /* point esp to new frame */ | ||
276 | 2: ret | ||
277 | |||
278 | |||
279 | /* | ||
280 | Force an event check by making a hypercall, | ||
281 | but preserve regs before making the call. | ||
282 | */ | ||
283 | check_events: | ||
284 | push %eax /* save eax/ecx/edx around the call; presumably the registers a C callee may clobber */ | ||
285 | push %ecx | ||
286 | push %edx | ||
287 | call force_evtchn_callback | ||
288 | pop %edx /* restore in reverse order of the pushes */ | ||
289 | pop %ecx | ||
290 | pop %eax | ||
291 | ret | ||
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S new file mode 100644 index 000000000000..f8d6937db2ec --- /dev/null +++ b/arch/x86/xen/xen-head.S | |||
@@ -0,0 +1,38 @@ | |||
1 | /* Xen-specific pieces of head.S, intended to be included in the right | ||
2 | place in head.S */ | ||
3 | |||
4 | #ifdef CONFIG_XEN | ||
5 | |||
6 | #include <linux/elfnote.h> | ||
7 | #include <asm/boot.h> | ||
8 | #include <xen/interface/elfnote.h> | ||
9 | |||
10 | .pushsection .init.text | ||
11 | ENTRY(startup_xen) /* named as the kernel entry point by the XEN_ELFNOTE_ENTRY note below */ | ||
12 | movl %esi,xen_start_info /* save %esi; presumably the start_info pointer handed over by Xen -- confirm */ | ||
13 | cld | ||
14 | movl $(init_thread_union+THREAD_SIZE),%esp /* stack pointer = end of init_thread_union */ | ||
15 | jmp xen_start_kernel | ||
16 | .popsection | ||
17 | |||
18 | .pushsection .bss.page_aligned | ||
19 | .align PAGE_SIZE_asm | ||
20 | ENTRY(hypercall_page) /* advertised through the XEN_ELFNOTE_HYPERCALL_PAGE note below */ | ||
21 | .skip 0x1000 /* reserve 0x1000 bytes; contents presumably supplied by the hypervisor -- confirm */ | ||
22 | .popsection | ||
23 | |||
24 | ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") /* ELF notes under the "Xen" name describing this kernel image */ | ||
25 | ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") | ||
26 | ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") | ||
27 | ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) | ||
28 | ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) /* address of the startup_xen entry point defined above */ | ||
29 | ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) /* address of the page reserved above */ | ||
30 | ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") | ||
31 | #ifdef CONFIG_X86_PAE | ||
32 | ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") /* PAE mode note follows the kernel's CONFIG_X86_PAE setting */ | ||
33 | #else | ||
34 | ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") | ||
35 | #endif | ||
36 | ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") | ||
37 | |||
38 | #endif /*CONFIG_XEN */ | ||
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h new file mode 100644 index 000000000000..b9aaea45f07f --- /dev/null +++ b/arch/x86/xen/xen-ops.h | |||
@@ -0,0 +1,71 @@ | |||
1 | #ifndef XEN_OPS_H | ||
2 | #define XEN_OPS_H | ||
3 | |||
4 | #include <linux/init.h> | ||
5 | |||
6 | /* These are code, but not functions. Defined in entry.S */ | ||
7 | extern const char xen_hypervisor_callback[]; | ||
8 | extern const char xen_failsafe_callback[]; | ||
9 | |||
10 | void xen_copy_trap_info(struct trap_info *traps); | ||
11 | |||
12 | DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); | ||
13 | DECLARE_PER_CPU(unsigned long, xen_cr3); | ||
14 | |||
15 | extern struct start_info *xen_start_info; /* written from %esi by startup_xen in xen-head.S */ | ||
16 | extern struct shared_info *HYPERVISOR_shared_info; | ||
17 | |||
18 | char * __init xen_memory_setup(void); | ||
19 | void __init xen_arch_setup(void); | ||
20 | void __init xen_init_IRQ(void); | ||
21 | |||
22 | void xen_setup_timer(int cpu); /* binds VIRQ_TIMER for cpu and initialises its clockevent device */ | ||
23 | void xen_setup_cpu_clockevents(void); /* registers the current CPU's clockevent device; caller must not be preemptible */ | ||
24 | unsigned long xen_cpu_khz(void); | ||
25 | void __init xen_time_init(void); /* registers the Xen clocksource and sets up the boot CPU's timer */ | ||
26 | unsigned long xen_get_wallclock(void); | ||
27 | int xen_set_wallclock(unsigned long time); | ||
28 | unsigned long long xen_sched_clock(void); | ||
29 | |||
30 | void xen_mark_init_mm_pinned(void); | ||
31 | |||
32 | DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); /* read through xen_get_lazy_mode() below */ | ||
33 | |||
34 | static inline unsigned xen_get_lazy_mode(void) | ||
35 | { | ||
36 | return x86_read_percpu(xen_lazy_mode); | ||
37 | } | ||
38 | |||
39 | void __init xen_fill_possible_map(void); | ||
40 | |||
41 | void __init xen_setup_vcpu_info_placement(void); | ||
42 | void xen_smp_prepare_boot_cpu(void); /* xen_smp_* / xen_cpu_up: presumably implemented in smp.c, which is built only with CONFIG_SMP -- confirm */ | ||
43 | void xen_smp_prepare_cpus(unsigned int max_cpus); | ||
44 | int xen_cpu_up(unsigned int cpu); | ||
45 | void xen_smp_cpus_done(unsigned int max_cpus); | ||
46 | |||
47 | void xen_smp_send_stop(void); | ||
48 | void xen_smp_send_reschedule(int cpu); | ||
49 | int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic, | ||
50 | int wait); | ||
51 | int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, | ||
52 | int nonatomic, int wait); | ||
53 | |||
54 | int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), | ||
55 | void *info, int wait); | ||
56 | |||
57 | |||
58 | /* Declare an asm function, along with the name_end and name_reloc | ||
59 | symbols (emitted by ENDPATCH and RELOC in xen-asm.S) needed to make it inlineable */ | ||
60 | #define DECL_ASM(ret, name, ...) \ | ||
61 | ret name(__VA_ARGS__); \ | ||
62 | extern char name##_end[]; \ | ||
63 | extern char name##_reloc[] \ | ||
64 | |||
65 | DECL_ASM(void, xen_irq_enable_direct, void); | ||
66 | DECL_ASM(void, xen_irq_disable_direct, void); | ||
67 | DECL_ASM(unsigned long, xen_save_fl_direct, void); | ||
68 | DECL_ASM(void, xen_restore_fl_direct, unsigned long); | ||
69 | |||
70 | void xen_iret_direct(void); /* no _end/_reloc symbols exist for this one in xen-asm.S, hence no DECL_ASM */ | ||
71 | #endif /* XEN_OPS_H */ | ||