aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2009-06-10 19:16:27 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2009-06-10 19:16:27 -0400
commit be15f9d63b97da0065187696962331de6cd9de9e (patch)
tree cc85c72e92afccfdcdfa851c4694a93f4ea22b84
parent 595dc54a1da91408a52c4b962f3deeb1109aaca0 (diff)
parent a789ed5fb6d0256c4177c2cc27e06520ddbe4d4c (diff)
Merge branch 'x86-xen-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-xen-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (42 commits)
  xen: cache cr0 value to avoid trap'n'emulate for read_cr0
  xen/x86-64: clean up warnings about IST-using traps
  xen/x86-64: fix breakpoints and hardware watchpoints
  xen: reserve Xen start_info rather than e820 reserving
  xen: add FIX_TEXT_POKE to fixmap
  lguest: update lazy mmu changes to match lguest's use of kvm hypercalls
  xen: honour VCPU availability on boot
  xen: add "capabilities" file
  xen: drop kexec bits from /sys/hypervisor since kexec isn't implemented yet
  xen/sys/hypervisor: change writable_pt to features
  xen: add /sys/hypervisor support
  xen/xenbus: export xenbus_dev_changed
  xen: use device model for suspending xenbus devices
  xen: remove suspend_cancel hook
  xen/dev-evtchn: clean up locking in evtchn
  xen: export ioctl headers to userspace
  xen: add /dev/xen/evtchn driver
  xen: add irq_from_evtchn
  xen: clean up gate trap/interrupt constants
  xen: set _PAGE_NX in __supported_pte_mask before pagetable construction
  ...
-rw-r--r-- arch/x86/include/asm/paravirt.h 22
-rw-r--r-- arch/x86/include/asm/pgtable.h 2
-rw-r--r-- arch/x86/include/asm/required-features.h 8
-rw-r--r-- arch/x86/include/asm/thread_info.h 4
-rw-r--r-- arch/x86/include/asm/traps.h 3
-rw-r--r-- arch/x86/kernel/entry_64.S 5
-rw-r--r-- arch/x86/kernel/kvm.c 2
-rw-r--r-- arch/x86/kernel/paravirt.c 56
-rw-r--r-- arch/x86/kernel/process_32.c 2
-rw-r--r-- arch/x86/kernel/process_64.c 2
-rw-r--r-- arch/x86/kernel/vmi_32.c 20
-rw-r--r-- arch/x86/lguest/boot.c 16
-rw-r--r-- arch/x86/mm/fault.c 6
-rw-r--r-- arch/x86/mm/highmem_32.c 2
-rw-r--r-- arch/x86/mm/iomap_32.c 1
-rw-r--r-- arch/x86/mm/pageattr.c 14
-rw-r--r-- arch/x86/xen/enlighten.c 65
-rw-r--r-- arch/x86/xen/mmu.c 23
-rw-r--r-- arch/x86/xen/setup.c 6
-rw-r--r-- arch/x86/xen/xen-ops.h 1
-rw-r--r-- drivers/xen/Kconfig 20
-rw-r--r-- drivers/xen/Makefile 4
-rw-r--r-- drivers/xen/events.c 6
-rw-r--r-- drivers/xen/evtchn.c 507
-rw-r--r-- drivers/xen/manage.c 9
-rw-r--r-- drivers/xen/sys-hypervisor.c 445
-rw-r--r-- drivers/xen/xenbus/xenbus_probe.c 61
-rw-r--r-- drivers/xen/xenbus/xenbus_xs.c 2
-rw-r--r-- drivers/xen/xenfs/super.c 19
-rw-r--r-- include/Kbuild 1
-rw-r--r-- include/asm-generic/pgtable.h 21
-rw-r--r-- include/xen/Kbuild 1
-rw-r--r-- include/xen/events.h 3
-rw-r--r-- include/xen/evtchn.h 88
-rw-r--r-- include/xen/interface/version.h 3
-rw-r--r-- include/xen/xenbus.h 3
-rw-r--r-- kernel/sched.c 2
37 files changed, 1281 insertions, 174 deletions
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a53da004e08e..4fb37c8a0832 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -56,6 +56,7 @@ struct desc_ptr;
56struct tss_struct; 56struct tss_struct;
57struct mm_struct; 57struct mm_struct;
58struct desc_struct; 58struct desc_struct;
59struct task_struct;
59 60
60/* 61/*
61 * Wrapper type for pointers to code which uses the non-standard 62 * Wrapper type for pointers to code which uses the non-standard
@@ -203,7 +204,8 @@ struct pv_cpu_ops {
203 204
204 void (*swapgs)(void); 205 void (*swapgs)(void);
205 206
206 struct pv_lazy_ops lazy_mode; 207 void (*start_context_switch)(struct task_struct *prev);
208 void (*end_context_switch)(struct task_struct *next);
207}; 209};
208 210
209struct pv_irq_ops { 211struct pv_irq_ops {
@@ -1399,25 +1401,23 @@ enum paravirt_lazy_mode {
1399}; 1401};
1400 1402
1401enum paravirt_lazy_mode paravirt_get_lazy_mode(void); 1403enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
1402void paravirt_enter_lazy_cpu(void); 1404void paravirt_start_context_switch(struct task_struct *prev);
1403void paravirt_leave_lazy_cpu(void); 1405void paravirt_end_context_switch(struct task_struct *next);
1406
1404void paravirt_enter_lazy_mmu(void); 1407void paravirt_enter_lazy_mmu(void);
1405void paravirt_leave_lazy_mmu(void); 1408void paravirt_leave_lazy_mmu(void);
1406void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
1407 1409
1408#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE 1410#define __HAVE_ARCH_START_CONTEXT_SWITCH
1409static inline void arch_enter_lazy_cpu_mode(void) 1411static inline void arch_start_context_switch(struct task_struct *prev)
1410{ 1412{
1411 PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter); 1413 PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
1412} 1414}
1413 1415
1414static inline void arch_leave_lazy_cpu_mode(void) 1416static inline void arch_end_context_switch(struct task_struct *next)
1415{ 1417{
1416 PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); 1418 PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
1417} 1419}
1418 1420
1419void arch_flush_lazy_cpu_mode(void);
1420
1421#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE 1421#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
1422static inline void arch_enter_lazy_mmu_mode(void) 1422static inline void arch_enter_lazy_mmu_mode(void)
1423{ 1423{
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 3f8d09d94eb3..18ef7ebf2631 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -81,6 +81,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
81#define pte_val(x) native_pte_val(x) 81#define pte_val(x) native_pte_val(x)
82#define __pte(x) native_make_pte(x) 82#define __pte(x) native_make_pte(x)
83 83
84#define arch_end_context_switch(prev) do {} while(0)
85
84#endif /* CONFIG_PARAVIRT */ 86#endif /* CONFIG_PARAVIRT */
85 87
86/* 88/*
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index a4737dddfd58..64cf2d24fad1 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -48,9 +48,15 @@
48#endif 48#endif
49 49
50#ifdef CONFIG_X86_64 50#ifdef CONFIG_X86_64
51#ifdef CONFIG_PARAVIRT
52/* Paravirtualized systems may not have PSE or PGE available */
51#define NEED_PSE 0 53#define NEED_PSE 0
52#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
53#define NEED_PGE 0 54#define NEED_PGE 0
55#else
56#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31)
57#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31)
58#endif
59#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
54#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) 60#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
55#define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) 61#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
56#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) 62#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8820a73ae090..602c769fc98c 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -94,7 +94,8 @@ struct thread_info {
94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ 95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ 96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
97#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */ 97#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
98#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */
98 99
99#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 100#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
100#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 101#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -116,6 +117,7 @@ struct thread_info {
116#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 117#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
117#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) 118#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
118#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) 119#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
120#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
119#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) 121#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE)
120 122
121/* work to do in syscall_trace_enter() */ 123/* work to do in syscall_trace_enter() */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index cbfdc26b1460..bfd74c032fca 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -14,6 +14,9 @@ asmlinkage void divide_error(void);
14asmlinkage void debug(void); 14asmlinkage void debug(void);
15asmlinkage void nmi(void); 15asmlinkage void nmi(void);
16asmlinkage void int3(void); 16asmlinkage void int3(void);
17asmlinkage void xen_debug(void);
18asmlinkage void xen_int3(void);
19asmlinkage void xen_stack_segment(void);
17asmlinkage void overflow(void); 20asmlinkage void overflow(void);
18asmlinkage void bounds(void); 21asmlinkage void bounds(void);
19asmlinkage void invalid_op(void); 22asmlinkage void invalid_op(void);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 38946c6e8433..bb01ce080b80 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1379,6 +1379,11 @@ END(xen_failsafe_callback)
1379paranoidzeroentry_ist debug do_debug DEBUG_STACK 1379paranoidzeroentry_ist debug do_debug DEBUG_STACK
1380paranoidzeroentry_ist int3 do_int3 DEBUG_STACK 1380paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1381paranoiderrorentry stack_segment do_stack_segment 1381paranoiderrorentry stack_segment do_stack_segment
1382#ifdef CONFIG_XEN
1383zeroentry xen_debug do_debug
1384zeroentry xen_int3 do_int3
1385errorentry xen_stack_segment do_stack_segment
1386#endif
1382errorentry general_protection do_general_protection 1387errorentry general_protection do_general_protection
1383errorentry page_fault do_page_fault 1388errorentry page_fault do_page_fault
1384#ifdef CONFIG_X86_MCE 1389#ifdef CONFIG_X86_MCE
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33019ddb56b4..6551dedee20c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -195,7 +195,7 @@ static void kvm_leave_lazy_mmu(void)
195 struct kvm_para_state *state = kvm_para_state(); 195 struct kvm_para_state *state = kvm_para_state();
196 196
197 mmu_queue_flush(state); 197 mmu_queue_flush(state);
198 paravirt_leave_lazy(paravirt_get_lazy_mode()); 198 paravirt_leave_lazy_mmu();
199 state->mode = paravirt_get_lazy_mode(); 199 state->mode = paravirt_get_lazy_mode();
200} 200}
201 201
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 9faf43bea336..70ec9b951d76 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -248,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
248 248
249static inline void enter_lazy(enum paravirt_lazy_mode mode) 249static inline void enter_lazy(enum paravirt_lazy_mode mode)
250{ 250{
251 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); 251 BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
252 BUG_ON(preemptible());
253 252
254 __get_cpu_var(paravirt_lazy_mode) = mode; 253 percpu_write(paravirt_lazy_mode, mode);
255} 254}
256 255
257void paravirt_leave_lazy(enum paravirt_lazy_mode mode) 256static void leave_lazy(enum paravirt_lazy_mode mode)
258{ 257{
259 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); 258 BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
260 BUG_ON(preemptible());
261 259
262 __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; 260 percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
263} 261}
264 262
265void paravirt_enter_lazy_mmu(void) 263void paravirt_enter_lazy_mmu(void)
@@ -269,22 +267,36 @@ void paravirt_enter_lazy_mmu(void)
269 267
270void paravirt_leave_lazy_mmu(void) 268void paravirt_leave_lazy_mmu(void)
271{ 269{
272 paravirt_leave_lazy(PARAVIRT_LAZY_MMU); 270 leave_lazy(PARAVIRT_LAZY_MMU);
273} 271}
274 272
275void paravirt_enter_lazy_cpu(void) 273void paravirt_start_context_switch(struct task_struct *prev)
276{ 274{
275 BUG_ON(preemptible());
276
277 if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
278 arch_leave_lazy_mmu_mode();
279 set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
280 }
277 enter_lazy(PARAVIRT_LAZY_CPU); 281 enter_lazy(PARAVIRT_LAZY_CPU);
278} 282}
279 283
280void paravirt_leave_lazy_cpu(void) 284void paravirt_end_context_switch(struct task_struct *next)
281{ 285{
282 paravirt_leave_lazy(PARAVIRT_LAZY_CPU); 286 BUG_ON(preemptible());
287
288 leave_lazy(PARAVIRT_LAZY_CPU);
289
290 if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
291 arch_enter_lazy_mmu_mode();
283} 292}
284 293
285enum paravirt_lazy_mode paravirt_get_lazy_mode(void) 294enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
286{ 295{
287 return __get_cpu_var(paravirt_lazy_mode); 296 if (in_interrupt())
297 return PARAVIRT_LAZY_NONE;
298
299 return percpu_read(paravirt_lazy_mode);
288} 300}
289 301
290void arch_flush_lazy_mmu_mode(void) 302void arch_flush_lazy_mmu_mode(void)
@@ -292,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void)
292 preempt_disable(); 304 preempt_disable();
293 305
294 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 306 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
295 WARN_ON(preempt_count() == 1);
296 arch_leave_lazy_mmu_mode(); 307 arch_leave_lazy_mmu_mode();
297 arch_enter_lazy_mmu_mode(); 308 arch_enter_lazy_mmu_mode();
298 } 309 }
@@ -300,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void)
300 preempt_enable(); 311 preempt_enable();
301} 312}
302 313
303void arch_flush_lazy_cpu_mode(void)
304{
305 preempt_disable();
306
307 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
308 WARN_ON(preempt_count() == 1);
309 arch_leave_lazy_cpu_mode();
310 arch_enter_lazy_cpu_mode();
311 }
312
313 preempt_enable();
314}
315
316struct pv_info pv_info = { 314struct pv_info pv_info = {
317 .name = "bare hardware", 315 .name = "bare hardware",
318 .paravirt_enabled = 0, 316 .paravirt_enabled = 0,
@@ -404,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = {
404 .set_iopl_mask = native_set_iopl_mask, 402 .set_iopl_mask = native_set_iopl_mask,
405 .io_delay = native_io_delay, 403 .io_delay = native_io_delay,
406 404
407 .lazy_mode = { 405 .start_context_switch = paravirt_nop,
408 .enter = paravirt_nop, 406 .end_context_switch = paravirt_nop,
409 .leave = paravirt_nop,
410 },
411}; 407};
412 408
413struct pv_apic_ops pv_apic_ops = { 409struct pv_apic_ops pv_apic_ops = {
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 56d50b7d71df..c60924b5d123 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -404,7 +404,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
404 * done before math_state_restore, so the TS bit is up 404 * done before math_state_restore, so the TS bit is up
405 * to date. 405 * to date.
406 */ 406 */
407 arch_leave_lazy_cpu_mode(); 407 arch_end_context_switch(next_p);
408 408
409 /* If the task has used fpu the last 5 timeslices, just do a full 409 /* If the task has used fpu the last 5 timeslices, just do a full
410 * restore of the math state immediately to avoid the trap; the 410 * restore of the math state immediately to avoid the trap; the
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9d6b20e6cd80..45f010fb2e20 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -425,7 +425,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
425 * done before math_state_restore, so the TS bit is up 425 * done before math_state_restore, so the TS bit is up
426 * to date. 426 * to date.
427 */ 427 */
428 arch_leave_lazy_cpu_mode(); 428 arch_end_context_switch(next_p);
429 429
430 /* 430 /*
431 * Switch FS and GS. 431 * Switch FS and GS.
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95deb9f2211e..b263423fbe2a 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
462} 462}
463#endif 463#endif
464 464
465static void vmi_enter_lazy_cpu(void) 465static void vmi_start_context_switch(struct task_struct *prev)
466{ 466{
467 paravirt_enter_lazy_cpu(); 467 paravirt_start_context_switch(prev);
468 vmi_ops.set_lazy_mode(2); 468 vmi_ops.set_lazy_mode(2);
469} 469}
470 470
471static void vmi_end_context_switch(struct task_struct *next)
472{
473 vmi_ops.set_lazy_mode(0);
474 paravirt_end_context_switch(next);
475}
476
471static void vmi_enter_lazy_mmu(void) 477static void vmi_enter_lazy_mmu(void)
472{ 478{
473 paravirt_enter_lazy_mmu(); 479 paravirt_enter_lazy_mmu();
474 vmi_ops.set_lazy_mode(1); 480 vmi_ops.set_lazy_mode(1);
475} 481}
476 482
477static void vmi_leave_lazy(void) 483static void vmi_leave_lazy_mmu(void)
478{ 484{
479 paravirt_leave_lazy(paravirt_get_lazy_mode());
480 vmi_ops.set_lazy_mode(0); 485 vmi_ops.set_lazy_mode(0);
486 paravirt_leave_lazy_mmu();
481} 487}
482 488
483static inline int __init check_vmi_rom(struct vrom_header *rom) 489static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -711,14 +717,14 @@ static inline int __init activate_vmi(void)
711 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); 717 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
712 para_fill(pv_cpu_ops.io_delay, IODelay); 718 para_fill(pv_cpu_ops.io_delay, IODelay);
713 719
714 para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, 720 para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
715 set_lazy_mode, SetLazyMode); 721 set_lazy_mode, SetLazyMode);
716 para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, 722 para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
717 set_lazy_mode, SetLazyMode); 723 set_lazy_mode, SetLazyMode);
718 724
719 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, 725 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
720 set_lazy_mode, SetLazyMode); 726 set_lazy_mode, SetLazyMode);
721 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, 727 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
722 set_lazy_mode, SetLazyMode); 728 set_lazy_mode, SetLazyMode);
723 729
724 /* user and kernel flush are just handled with different flags to FlushTLB */ 730 /* user and kernel flush are just handled with different flags to FlushTLB */
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ef4205c1a7a5..4e0c26559395 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -167,10 +167,16 @@ static void lazy_hcall3(unsigned long call,
167 167
168/* When lazy mode is turned off reset the per-cpu lazy mode variable and then 168/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
169 * issue the do-nothing hypercall to flush any stored calls. */ 169 * issue the do-nothing hypercall to flush any stored calls. */
170static void lguest_leave_lazy_mode(void) 170static void lguest_leave_lazy_mmu_mode(void)
171{ 171{
172 paravirt_leave_lazy(paravirt_get_lazy_mode());
173 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 172 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
173 paravirt_leave_lazy_mmu();
174}
175
176static void lguest_end_context_switch(struct task_struct *next)
177{
178 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
179 paravirt_end_context_switch(next);
174} 180}
175 181
176/*G:033 182/*G:033
@@ -1054,8 +1060,8 @@ __init void lguest_init(void)
1054 pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; 1060 pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
1055 pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; 1061 pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
1056 pv_cpu_ops.wbinvd = lguest_wbinvd; 1062 pv_cpu_ops.wbinvd = lguest_wbinvd;
1057 pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu; 1063 pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
1058 pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode; 1064 pv_cpu_ops.end_context_switch = lguest_end_context_switch;
1059 1065
1060 /* pagetable management */ 1066 /* pagetable management */
1061 pv_mmu_ops.write_cr3 = lguest_write_cr3; 1067 pv_mmu_ops.write_cr3 = lguest_write_cr3;
@@ -1068,7 +1074,7 @@ __init void lguest_init(void)
1068 pv_mmu_ops.read_cr2 = lguest_read_cr2; 1074 pv_mmu_ops.read_cr2 = lguest_read_cr2;
1069 pv_mmu_ops.read_cr3 = lguest_read_cr3; 1075 pv_mmu_ops.read_cr3 = lguest_read_cr3;
1070 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; 1076 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
1071 pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode; 1077 pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
1072 pv_mmu_ops.pte_update = lguest_pte_update; 1078 pv_mmu_ops.pte_update = lguest_pte_update;
1073 pv_mmu_ops.pte_update_defer = lguest_pte_update; 1079 pv_mmu_ops.pte_update_defer = lguest_pte_update;
1074 1080
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b9ca6d767dbb..5ec7ae366615 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -201,12 +201,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
201 if (!pmd_present(*pmd_k)) 201 if (!pmd_present(*pmd_k))
202 return NULL; 202 return NULL;
203 203
204 if (!pmd_present(*pmd)) { 204 if (!pmd_present(*pmd))
205 set_pmd(pmd, *pmd_k); 205 set_pmd(pmd, *pmd_k);
206 arch_flush_lazy_mmu_mode(); 206 else
207 } else {
208 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); 207 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
209 }
210 208
211 return pmd_k; 209 return pmd_k;
212} 210}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 8126e8d1a2a4..58f621e81919 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -44,7 +44,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
45 BUG_ON(!pte_none(*(kmap_pte-idx))); 45 BUG_ON(!pte_none(*(kmap_pte-idx)));
46 set_pte(kmap_pte-idx, mk_pte(page, prot)); 46 set_pte(kmap_pte-idx, mk_pte(page, prot));
47 arch_flush_lazy_mmu_mode();
48 47
49 return (void *)vaddr; 48 return (void *)vaddr;
50} 49}
@@ -74,7 +73,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
74#endif 73#endif
75 } 74 }
76 75
77 arch_flush_lazy_mmu_mode();
78 pagefault_enable(); 76 pagefault_enable();
79} 77}
80 78
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 8056545e2d39..fe6f84ca121e 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -82,7 +82,6 @@ iounmap_atomic(void *kvaddr, enum km_type type)
82 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) 82 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
83 kpte_clear_flush(kmap_pte-idx, vaddr); 83 kpte_clear_flush(kmap_pte-idx, vaddr);
84 84
85 arch_flush_lazy_mmu_mode();
86 pagefault_enable(); 85 pagefault_enable();
87} 86}
88EXPORT_SYMBOL_GPL(iounmap_atomic); 87EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e17efed088c5..6ce9518fe2ac 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -839,13 +839,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
839 839
840 vm_unmap_aliases(); 840 vm_unmap_aliases();
841 841
842 /*
843 * If we're called with lazy mmu updates enabled, the
844 * in-memory pte state may be stale. Flush pending updates to
845 * bring them up to date.
846 */
847 arch_flush_lazy_mmu_mode();
848
849 cpa.vaddr = addr; 842 cpa.vaddr = addr;
850 cpa.pages = pages; 843 cpa.pages = pages;
851 cpa.numpages = numpages; 844 cpa.numpages = numpages;
@@ -890,13 +883,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
890 } else 883 } else
891 cpa_flush_all(cache); 884 cpa_flush_all(cache);
892 885
893 /*
894 * If we've been called with lazy mmu updates enabled, then
895 * make sure that everything gets flushed out before we
896 * return.
897 */
898 arch_flush_lazy_mmu_mode();
899
900out: 886out:
901 return ret; 887 return ret;
902} 888}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f09e8c36ee80..0a1700a2be9c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -20,6 +20,7 @@
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/start_kernel.h> 21#include <linux/start_kernel.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/kprobes.h>
23#include <linux/bootmem.h> 24#include <linux/bootmem.h>
24#include <linux/module.h> 25#include <linux/module.h>
25#include <linux/mm.h> 26#include <linux/mm.h>
@@ -44,6 +45,7 @@
44#include <asm/processor.h> 45#include <asm/processor.h>
45#include <asm/proto.h> 46#include <asm/proto.h>
46#include <asm/msr-index.h> 47#include <asm/msr-index.h>
48#include <asm/traps.h>
47#include <asm/setup.h> 49#include <asm/setup.h>
48#include <asm/desc.h> 50#include <asm/desc.h>
49#include <asm/pgtable.h> 51#include <asm/pgtable.h>
@@ -240,10 +242,10 @@ static unsigned long xen_get_debugreg(int reg)
240 return HYPERVISOR_get_debugreg(reg); 242 return HYPERVISOR_get_debugreg(reg);
241} 243}
242 244
243void xen_leave_lazy(void) 245static void xen_end_context_switch(struct task_struct *next)
244{ 246{
245 paravirt_leave_lazy(paravirt_get_lazy_mode());
246 xen_mc_flush(); 247 xen_mc_flush();
248 paravirt_end_context_switch(next);
247} 249}
248 250
249static unsigned long xen_store_tr(void) 251static unsigned long xen_store_tr(void)
@@ -428,11 +430,44 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
428static int cvt_gate_to_trap(int vector, const gate_desc *val, 430static int cvt_gate_to_trap(int vector, const gate_desc *val,
429 struct trap_info *info) 431 struct trap_info *info)
430{ 432{
433 unsigned long addr;
434
431 if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) 435 if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
432 return 0; 436 return 0;
433 437
434 info->vector = vector; 438 info->vector = vector;
435 info->address = gate_offset(*val); 439
440 addr = gate_offset(*val);
441#ifdef CONFIG_X86_64
442 /*
443 * Look for known traps using IST, and substitute them
444 * appropriately. The debugger ones are the only ones we care
445 * about. Xen will handle faults like double_fault and
446 * machine_check, so we should never see them. Warn if
447 * there's an unexpected IST-using fault handler.
448 */
449 if (addr == (unsigned long)debug)
450 addr = (unsigned long)xen_debug;
451 else if (addr == (unsigned long)int3)
452 addr = (unsigned long)xen_int3;
453 else if (addr == (unsigned long)stack_segment)
454 addr = (unsigned long)xen_stack_segment;
455 else if (addr == (unsigned long)double_fault ||
456 addr == (unsigned long)nmi) {
457 /* Don't need to handle these */
458 return 0;
459#ifdef CONFIG_X86_MCE
460 } else if (addr == (unsigned long)machine_check) {
461 return 0;
462#endif
463 } else {
464 /* Some other trap using IST? */
465 if (WARN_ON(val->ist != 0))
466 return 0;
467 }
468#endif /* CONFIG_X86_64 */
469 info->address = addr;
470
436 info->cs = gate_segment(*val); 471 info->cs = gate_segment(*val);
437 info->flags = val->dpl; 472 info->flags = val->dpl;
438 /* interrupt gates clear IF */ 473 /* interrupt gates clear IF */
@@ -623,10 +658,26 @@ static void xen_clts(void)
623 xen_mc_issue(PARAVIRT_LAZY_CPU); 658 xen_mc_issue(PARAVIRT_LAZY_CPU);
624} 659}
625 660
661static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
662
663static unsigned long xen_read_cr0(void)
664{
665 unsigned long cr0 = percpu_read(xen_cr0_value);
666
667 if (unlikely(cr0 == 0)) {
668 cr0 = native_read_cr0();
669 percpu_write(xen_cr0_value, cr0);
670 }
671
672 return cr0;
673}
674
626static void xen_write_cr0(unsigned long cr0) 675static void xen_write_cr0(unsigned long cr0)
627{ 676{
628 struct multicall_space mcs; 677 struct multicall_space mcs;
629 678
679 percpu_write(xen_cr0_value, cr0);
680
630 /* Only pay attention to cr0.TS; everything else is 681 /* Only pay attention to cr0.TS; everything else is
631 ignored. */ 682 ignored. */
632 mcs = xen_mc_entry(0); 683 mcs = xen_mc_entry(0);
@@ -812,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
812 863
813 .clts = xen_clts, 864 .clts = xen_clts,
814 865
815 .read_cr0 = native_read_cr0, 866 .read_cr0 = xen_read_cr0,
816 .write_cr0 = xen_write_cr0, 867 .write_cr0 = xen_write_cr0,
817 868
818 .read_cr4 = native_read_cr4, 869 .read_cr4 = native_read_cr4,
@@ -860,10 +911,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
860 /* Xen takes care of %gs when switching to usermode for us */ 911 /* Xen takes care of %gs when switching to usermode for us */
861 .swapgs = paravirt_nop, 912 .swapgs = paravirt_nop,
862 913
863 .lazy_mode = { 914 .start_context_switch = paravirt_start_context_switch,
864 .enter = paravirt_enter_lazy_cpu, 915 .end_context_switch = xen_end_context_switch,
865 .leave = xen_leave_lazy,
866 },
867}; 916};
868 917
869static const struct pv_apic_ops xen_apic_ops __initdata = { 918static const struct pv_apic_ops xen_apic_ops __initdata = {
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index fba55b1a4021..4ceb28581652 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -452,10 +452,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
452void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 452void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
453 pte_t *ptep, pte_t pteval) 453 pte_t *ptep, pte_t pteval)
454{ 454{
455 /* updates to init_mm may be done without lock */
456 if (mm == &init_mm)
457 preempt_disable();
458
459 ADD_STATS(set_pte_at, 1); 455 ADD_STATS(set_pte_at, 1);
460// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); 456// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
461 ADD_STATS(set_pte_at_current, mm == current->mm); 457 ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -476,9 +472,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
476 } 472 }
477 xen_set_pte(ptep, pteval); 473 xen_set_pte(ptep, pteval);
478 474
479out: 475out: return;
480 if (mm == &init_mm)
481 preempt_enable();
482} 476}
483 477
484pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 478pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
@@ -1152,10 +1146,8 @@ static void drop_other_mm_ref(void *info)
1152 1146
1153 /* If this cpu still has a stale cr3 reference, then make sure 1147 /* If this cpu still has a stale cr3 reference, then make sure
1154 it has been flushed. */ 1148 it has been flushed. */
1155 if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) { 1149 if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
1156 load_cr3(swapper_pg_dir); 1150 load_cr3(swapper_pg_dir);
1157 arch_flush_lazy_cpu_mode();
1158 }
1159} 1151}
1160 1152
1161static void xen_drop_mm_ref(struct mm_struct *mm) 1153static void xen_drop_mm_ref(struct mm_struct *mm)
@@ -1168,7 +1160,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1168 load_cr3(swapper_pg_dir); 1160 load_cr3(swapper_pg_dir);
1169 else 1161 else
1170 leave_mm(smp_processor_id()); 1162 leave_mm(smp_processor_id());
1171 arch_flush_lazy_cpu_mode();
1172 } 1163 }
1173 1164
1174 /* Get the "official" set of cpus referring to our pagetable. */ 1165 /* Get the "official" set of cpus referring to our pagetable. */
@@ -1876,6 +1867,14 @@ __init void xen_post_allocator_init(void)
1876 xen_mark_init_mm_pinned(); 1867 xen_mark_init_mm_pinned();
1877} 1868}
1878 1869
1870static void xen_leave_lazy_mmu(void)
1871{
1872 preempt_disable();
1873 xen_mc_flush();
1874 paravirt_leave_lazy_mmu();
1875 preempt_enable();
1876}
1877
1879const struct pv_mmu_ops xen_mmu_ops __initdata = { 1878const struct pv_mmu_ops xen_mmu_ops __initdata = {
1880 .pagetable_setup_start = xen_pagetable_setup_start, 1879 .pagetable_setup_start = xen_pagetable_setup_start,
1881 .pagetable_setup_done = xen_pagetable_setup_done, 1880 .pagetable_setup_done = xen_pagetable_setup_done,
@@ -1949,7 +1948,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
1949 1948
1950 .lazy_mode = { 1949 .lazy_mode = {
1951 .enter = paravirt_enter_lazy_mmu, 1950 .enter = paravirt_enter_lazy_mmu,
1952 .leave = xen_leave_lazy, 1951 .leave = xen_leave_lazy_mmu,
1953 }, 1952 },
1954 1953
1955 .set_fixmap = xen_set_fixmap, 1954 .set_fixmap = xen_set_fixmap,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 15c6c68db6a2..ad0047f47cd4 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -61,9 +61,9 @@ char * __init xen_memory_setup(void)
61 * - xen_start_info 61 * - xen_start_info
62 * See comment above "struct start_info" in <xen/interface/xen.h> 62 * See comment above "struct start_info" in <xen/interface/xen.h>
63 */ 63 */
64 e820_add_region(__pa(xen_start_info->mfn_list), 64 reserve_early(__pa(xen_start_info->mfn_list),
65 xen_start_info->pt_base - xen_start_info->mfn_list, 65 __pa(xen_start_info->pt_base),
66 E820_RESERVED); 66 "XEN START INFO");
67 67
68 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 68 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
69 69
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index ca6596b05d53..22494fd4c9b5 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void); 30void xen_ident_map_ISA(void);
31void xen_reserve_top(void); 31void xen_reserve_top(void);
32 32
33void xen_leave_lazy(void);
34void xen_post_allocator_init(void); 33void xen_post_allocator_init(void);
35 34
36char * __init xen_memory_setup(void); 35char * __init xen_memory_setup(void);
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 8ac9cddac575..cab100acf983 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -18,6 +18,16 @@ config XEN_SCRUB_PAGES
18 secure, but slightly less efficient. 18 secure, but slightly less efficient.
19 If in doubt, say yes. 19 If in doubt, say yes.
20 20
21config XEN_DEV_EVTCHN
22 tristate "Xen /dev/xen/evtchn device"
23 depends on XEN
24 default y
25 help
26 The evtchn driver allows a userspace process to trigger event
27 channels and to receive notification of an event channel
28 firing.
29 If in doubt, say yes.
30
21config XENFS 31config XENFS
22 tristate "Xen filesystem" 32 tristate "Xen filesystem"
23 depends on XEN 33 depends on XEN
@@ -41,3 +51,13 @@ config XEN_COMPAT_XENFS
41 a xen platform. 51 a xen platform.
42 If in doubt, say yes. 52 If in doubt, say yes.
43 53
54config XEN_SYS_HYPERVISOR
55 bool "Create xen entries under /sys/hypervisor"
56 depends on XEN && SYSFS
57 select SYS_HYPERVISOR
58 default y
59 help
60 Create entries under /sys/hypervisor describing the Xen
61 hypervisor environment. When running native or in another
62 virtual environment, /sys/hypervisor will still be present,
63 but will have no xen contents. \ No newline at end of file
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index ff8accc9e103..ec2a39b1e26f 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -4,4 +4,6 @@ obj-y += xenbus/
4obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o 4obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
5obj-$(CONFIG_XEN_XENCOMM) += xencomm.o 5obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
6obj-$(CONFIG_XEN_BALLOON) += balloon.o 6obj-$(CONFIG_XEN_BALLOON) += balloon.o
7obj-$(CONFIG_XENFS) += xenfs/ \ No newline at end of file 7obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o
8obj-$(CONFIG_XENFS) += xenfs/
9obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o \ No newline at end of file
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index be437c2bc942..891d2e90753a 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -151,6 +151,12 @@ static unsigned int evtchn_from_irq(unsigned irq)
151 return info_for_irq(irq)->evtchn; 151 return info_for_irq(irq)->evtchn;
152} 152}
153 153
154unsigned irq_from_evtchn(unsigned int evtchn)
155{
156 return evtchn_to_irq[evtchn];
157}
158EXPORT_SYMBOL_GPL(irq_from_evtchn);
159
154static enum ipi_vector ipi_from_irq(unsigned irq) 160static enum ipi_vector ipi_from_irq(unsigned irq)
155{ 161{
156 struct irq_info *info = info_for_irq(irq); 162 struct irq_info *info = info_for_irq(irq);
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
new file mode 100644
index 000000000000..af031950f9b1
--- /dev/null
+++ b/drivers/xen/evtchn.c
@@ -0,0 +1,507 @@
1/******************************************************************************
2 * evtchn.c
3 *
4 * Driver for receiving and demuxing event-channel signals.
5 *
6 * Copyright (c) 2004-2005, K A Fraser
7 * Multi-process extensions Copyright (c) 2004, Steven Smith
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation; or, when distributed
12 * separately from the Linux kernel or incorporated into other
13 * software packages, subject to the following license:
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining a copy
16 * of this source file (the "Software"), to deal in the Software without
17 * restriction, including without limitation the rights to use, copy, modify,
18 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
19 * and to permit persons to whom the Software is furnished to do so, subject to
20 * the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 * IN THE SOFTWARE.
32 */
33
34#include <linux/module.h>
35#include <linux/kernel.h>
36#include <linux/sched.h>
37#include <linux/slab.h>
38#include <linux/string.h>
39#include <linux/errno.h>
40#include <linux/fs.h>
41#include <linux/errno.h>
42#include <linux/miscdevice.h>
43#include <linux/major.h>
44#include <linux/proc_fs.h>
45#include <linux/stat.h>
46#include <linux/poll.h>
47#include <linux/irq.h>
48#include <linux/init.h>
49#include <linux/gfp.h>
50#include <linux/mutex.h>
51#include <linux/cpu.h>
52#include <xen/events.h>
53#include <xen/evtchn.h>
54#include <asm/xen/hypervisor.h>
55
56struct per_user_data {
57 struct mutex bind_mutex; /* serialize bind/unbind operations */
58
59 /* Notification ring, accessed via /dev/xen/evtchn. */
60#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t))
61#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
62 evtchn_port_t *ring;
63 unsigned int ring_cons, ring_prod, ring_overflow;
64 struct mutex ring_cons_mutex; /* protect against concurrent readers */
65
66 /* Processes wait on this queue when ring is empty. */
67 wait_queue_head_t evtchn_wait;
68 struct fasync_struct *evtchn_async_queue;
69 const char *name;
70};
71
72/* Who's bound to each port? */
73static struct per_user_data *port_user[NR_EVENT_CHANNELS];
74static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
75
76irqreturn_t evtchn_interrupt(int irq, void *data)
77{
78 unsigned int port = (unsigned long)data;
79 struct per_user_data *u;
80
81 spin_lock(&port_user_lock);
82
83 u = port_user[port];
84
85 disable_irq_nosync(irq);
86
87 if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
88 u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
89 wmb(); /* Ensure ring contents visible */
90 if (u->ring_cons == u->ring_prod++) {
91 wake_up_interruptible(&u->evtchn_wait);
92 kill_fasync(&u->evtchn_async_queue,
93 SIGIO, POLL_IN);
94 }
95 } else {
96 u->ring_overflow = 1;
97 }
98
99 spin_unlock(&port_user_lock);
100
101 return IRQ_HANDLED;
102}
103
104static ssize_t evtchn_read(struct file *file, char __user *buf,
105 size_t count, loff_t *ppos)
106{
107 int rc;
108 unsigned int c, p, bytes1 = 0, bytes2 = 0;
109 struct per_user_data *u = file->private_data;
110
111 /* Whole number of ports. */
112 count &= ~(sizeof(evtchn_port_t)-1);
113
114 if (count == 0)
115 return 0;
116
117 if (count > PAGE_SIZE)
118 count = PAGE_SIZE;
119
120 for (;;) {
121 mutex_lock(&u->ring_cons_mutex);
122
123 rc = -EFBIG;
124 if (u->ring_overflow)
125 goto unlock_out;
126
127 c = u->ring_cons;
128 p = u->ring_prod;
129 if (c != p)
130 break;
131
132 mutex_unlock(&u->ring_cons_mutex);
133
134 if (file->f_flags & O_NONBLOCK)
135 return -EAGAIN;
136
137 rc = wait_event_interruptible(u->evtchn_wait,
138 u->ring_cons != u->ring_prod);
139 if (rc)
140 return rc;
141 }
142
143 /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
144 if (((c ^ p) & EVTCHN_RING_SIZE) != 0) {
145 bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) *
146 sizeof(evtchn_port_t);
147 bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t);
148 } else {
149 bytes1 = (p - c) * sizeof(evtchn_port_t);
150 bytes2 = 0;
151 }
152
153 /* Truncate chunks according to caller's maximum byte count. */
154 if (bytes1 > count) {
155 bytes1 = count;
156 bytes2 = 0;
157 } else if ((bytes1 + bytes2) > count) {
158 bytes2 = count - bytes1;
159 }
160
161 rc = -EFAULT;
162 rmb(); /* Ensure that we see the port before we copy it. */
163 if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) ||
164 ((bytes2 != 0) &&
165 copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
166 goto unlock_out;
167
168 u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
169 rc = bytes1 + bytes2;
170
171 unlock_out:
172 mutex_unlock(&u->ring_cons_mutex);
173 return rc;
174}
175
176static ssize_t evtchn_write(struct file *file, const char __user *buf,
177 size_t count, loff_t *ppos)
178{
179 int rc, i;
180 evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
181 struct per_user_data *u = file->private_data;
182
183 if (kbuf == NULL)
184 return -ENOMEM;
185
186 /* Whole number of ports. */
187 count &= ~(sizeof(evtchn_port_t)-1);
188
189 rc = 0;
190 if (count == 0)
191 goto out;
192
193 if (count > PAGE_SIZE)
194 count = PAGE_SIZE;
195
196 rc = -EFAULT;
197 if (copy_from_user(kbuf, buf, count) != 0)
198 goto out;
199
200 spin_lock_irq(&port_user_lock);
201 for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
202 if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
203 enable_irq(irq_from_evtchn(kbuf[i]));
204 spin_unlock_irq(&port_user_lock);
205
206 rc = count;
207
208 out:
209 free_page((unsigned long)kbuf);
210 return rc;
211}
212
213static int evtchn_bind_to_user(struct per_user_data *u, int port)
214{
215 int rc = 0;
216
217 /*
218 * Ports are never reused, so every caller should pass in a
219 * unique port.
220 *
221 * (Locking not necessary because we haven't registered the
222 * interrupt handler yet, and our caller has already
223 * serialized bind operations.)
224 */
225 BUG_ON(port_user[port] != NULL);
226 port_user[port] = u;
227
228 rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
229 u->name, (void *)(unsigned long)port);
230 if (rc >= 0)
231 rc = 0;
232
233 return rc;
234}
235
236static void evtchn_unbind_from_user(struct per_user_data *u, int port)
237{
238 int irq = irq_from_evtchn(port);
239
240 unbind_from_irqhandler(irq, (void *)(unsigned long)port);
241
242 /* make sure we unbind the irq handler before clearing the port */
243 barrier();
244
245 port_user[port] = NULL;
246}
247
248static long evtchn_ioctl(struct file *file,
249 unsigned int cmd, unsigned long arg)
250{
251 int rc;
252 struct per_user_data *u = file->private_data;
253 void __user *uarg = (void __user *) arg;
254
255 /* Prevent bind from racing with unbind */
256 mutex_lock(&u->bind_mutex);
257
258 switch (cmd) {
259 case IOCTL_EVTCHN_BIND_VIRQ: {
260 struct ioctl_evtchn_bind_virq bind;
261 struct evtchn_bind_virq bind_virq;
262
263 rc = -EFAULT;
264 if (copy_from_user(&bind, uarg, sizeof(bind)))
265 break;
266
267 bind_virq.virq = bind.virq;
268 bind_virq.vcpu = 0;
269 rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
270 &bind_virq);
271 if (rc != 0)
272 break;
273
274 rc = evtchn_bind_to_user(u, bind_virq.port);
275 if (rc == 0)
276 rc = bind_virq.port;
277 break;
278 }
279
280 case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
281 struct ioctl_evtchn_bind_interdomain bind;
282 struct evtchn_bind_interdomain bind_interdomain;
283
284 rc = -EFAULT;
285 if (copy_from_user(&bind, uarg, sizeof(bind)))
286 break;
287
288 bind_interdomain.remote_dom = bind.remote_domain;
289 bind_interdomain.remote_port = bind.remote_port;
290 rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
291 &bind_interdomain);
292 if (rc != 0)
293 break;
294
295 rc = evtchn_bind_to_user(u, bind_interdomain.local_port);
296 if (rc == 0)
297 rc = bind_interdomain.local_port;
298 break;
299 }
300
301 case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
302 struct ioctl_evtchn_bind_unbound_port bind;
303 struct evtchn_alloc_unbound alloc_unbound;
304
305 rc = -EFAULT;
306 if (copy_from_user(&bind, uarg, sizeof(bind)))
307 break;
308
309 alloc_unbound.dom = DOMID_SELF;
310 alloc_unbound.remote_dom = bind.remote_domain;
311 rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
312 &alloc_unbound);
313 if (rc != 0)
314 break;
315
316 rc = evtchn_bind_to_user(u, alloc_unbound.port);
317 if (rc == 0)
318 rc = alloc_unbound.port;
319 break;
320 }
321
322 case IOCTL_EVTCHN_UNBIND: {
323 struct ioctl_evtchn_unbind unbind;
324
325 rc = -EFAULT;
326 if (copy_from_user(&unbind, uarg, sizeof(unbind)))
327 break;
328
329 rc = -EINVAL;
330 if (unbind.port >= NR_EVENT_CHANNELS)
331 break;
332
333 spin_lock_irq(&port_user_lock);
334
335 rc = -ENOTCONN;
336 if (port_user[unbind.port] != u) {
337 spin_unlock_irq(&port_user_lock);
338 break;
339 }
340
341 evtchn_unbind_from_user(u, unbind.port);
342
343 spin_unlock_irq(&port_user_lock);
344
345 rc = 0;
346 break;
347 }
348
349 case IOCTL_EVTCHN_NOTIFY: {
350 struct ioctl_evtchn_notify notify;
351
352 rc = -EFAULT;
353 if (copy_from_user(&notify, uarg, sizeof(notify)))
354 break;
355
356 if (notify.port >= NR_EVENT_CHANNELS) {
357 rc = -EINVAL;
358 } else if (port_user[notify.port] != u) {
359 rc = -ENOTCONN;
360 } else {
361 notify_remote_via_evtchn(notify.port);
362 rc = 0;
363 }
364 break;
365 }
366
367 case IOCTL_EVTCHN_RESET: {
368 /* Initialise the ring to empty. Clear errors. */
369 mutex_lock(&u->ring_cons_mutex);
370 spin_lock_irq(&port_user_lock);
371 u->ring_cons = u->ring_prod = u->ring_overflow = 0;
372 spin_unlock_irq(&port_user_lock);
373 mutex_unlock(&u->ring_cons_mutex);
374 rc = 0;
375 break;
376 }
377
378 default:
379 rc = -ENOSYS;
380 break;
381 }
382 mutex_unlock(&u->bind_mutex);
383
384 return rc;
385}
386
387static unsigned int evtchn_poll(struct file *file, poll_table *wait)
388{
389 unsigned int mask = POLLOUT | POLLWRNORM;
390 struct per_user_data *u = file->private_data;
391
392 poll_wait(file, &u->evtchn_wait, wait);
393 if (u->ring_cons != u->ring_prod)
394 mask |= POLLIN | POLLRDNORM;
395 if (u->ring_overflow)
396 mask = POLLERR;
397 return mask;
398}
399
400static int evtchn_fasync(int fd, struct file *filp, int on)
401{
402 struct per_user_data *u = filp->private_data;
403 return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
404}
405
406static int evtchn_open(struct inode *inode, struct file *filp)
407{
408 struct per_user_data *u;
409
410 u = kzalloc(sizeof(*u), GFP_KERNEL);
411 if (u == NULL)
412 return -ENOMEM;
413
414 u->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm);
415 if (u->name == NULL) {
416 kfree(u);
417 return -ENOMEM;
418 }
419
420 init_waitqueue_head(&u->evtchn_wait);
421
422 u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
423 if (u->ring == NULL) {
424 kfree(u->name);
425 kfree(u);
426 return -ENOMEM;
427 }
428
429 mutex_init(&u->bind_mutex);
430 mutex_init(&u->ring_cons_mutex);
431
432 filp->private_data = u;
433
434 return 0;
435}
436
437static int evtchn_release(struct inode *inode, struct file *filp)
438{
439 int i;
440 struct per_user_data *u = filp->private_data;
441
442 spin_lock_irq(&port_user_lock);
443
444 free_page((unsigned long)u->ring);
445
446 for (i = 0; i < NR_EVENT_CHANNELS; i++) {
447 if (port_user[i] != u)
448 continue;
449
450 evtchn_unbind_from_user(port_user[i], i);
451 }
452
453 spin_unlock_irq(&port_user_lock);
454
455 kfree(u->name);
456 kfree(u);
457
458 return 0;
459}
460
461static const struct file_operations evtchn_fops = {
462 .owner = THIS_MODULE,
463 .read = evtchn_read,
464 .write = evtchn_write,
465 .unlocked_ioctl = evtchn_ioctl,
466 .poll = evtchn_poll,
467 .fasync = evtchn_fasync,
468 .open = evtchn_open,
469 .release = evtchn_release,
470};
471
472static struct miscdevice evtchn_miscdev = {
473 .minor = MISC_DYNAMIC_MINOR,
474 .name = "evtchn",
475 .fops = &evtchn_fops,
476};
477static int __init evtchn_init(void)
478{
479 int err;
480
481 if (!xen_domain())
482 return -ENODEV;
483
484 spin_lock_init(&port_user_lock);
485 memset(port_user, 0, sizeof(port_user));
486
487 /* Create '/dev/misc/evtchn'. */
488 err = misc_register(&evtchn_miscdev);
489 if (err != 0) {
490 printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
491 return err;
492 }
493
494 printk(KERN_INFO "Event-channel device installed.\n");
495
496 return 0;
497}
498
499static void __exit evtchn_cleanup(void)
500{
501 misc_deregister(&evtchn_miscdev);
502}
503
504module_init(evtchn_init);
505module_exit(evtchn_cleanup);
506
507MODULE_LICENSE("GPL");
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 4b5b84837ee1..fddc2025dece 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -98,9 +98,8 @@ static void do_suspend(void)
98 goto out; 98 goto out;
99 } 99 }
100 100
101 printk("suspending xenbus...\n"); 101 printk(KERN_DEBUG "suspending xenstore...\n");
102 /* XXX use normal device tree? */ 102 xs_suspend();
103 xenbus_suspend();
104 103
105 err = device_power_down(PMSG_SUSPEND); 104 err = device_power_down(PMSG_SUSPEND);
106 if (err) { 105 if (err) {
@@ -116,9 +115,9 @@ static void do_suspend(void)
116 115
117 if (!cancelled) { 116 if (!cancelled) {
118 xen_arch_resume(); 117 xen_arch_resume();
119 xenbus_resume(); 118 xs_resume();
120 } else 119 } else
121 xenbus_suspend_cancel(); 120 xs_suspend_cancel();
122 121
123 device_power_up(PMSG_RESUME); 122 device_power_up(PMSG_RESUME);
124 123
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
new file mode 100644
index 000000000000..88a60e03ccf0
--- /dev/null
+++ b/drivers/xen/sys-hypervisor.c
@@ -0,0 +1,445 @@
1/*
2 * copyright (c) 2006 IBM Corporation
3 * Authored by: Mike D. Day <ncmike@us.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/kobject.h>
13
14#include <asm/xen/hypervisor.h>
15#include <asm/xen/hypercall.h>
16
17#include <xen/xenbus.h>
18#include <xen/interface/xen.h>
19#include <xen/interface/version.h>
20
21#define HYPERVISOR_ATTR_RO(_name) \
22static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name)
23
24#define HYPERVISOR_ATTR_RW(_name) \
25static struct hyp_sysfs_attr _name##_attr = \
26 __ATTR(_name, 0644, _name##_show, _name##_store)
27
28struct hyp_sysfs_attr {
29 struct attribute attr;
30 ssize_t (*show)(struct hyp_sysfs_attr *, char *);
31 ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
32 void *hyp_attr_data;
33};
34
35static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
36{
37 return sprintf(buffer, "xen\n");
38}
39
40HYPERVISOR_ATTR_RO(type);
41
42static int __init xen_sysfs_type_init(void)
43{
44 return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
45}
46
47static void xen_sysfs_type_destroy(void)
48{
49 sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
50}
51
52/* xen version attributes */
53static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
54{
55 int version = HYPERVISOR_xen_version(XENVER_version, NULL);
56 if (version)
57 return sprintf(buffer, "%d\n", version >> 16);
58 return -ENODEV;
59}
60
61HYPERVISOR_ATTR_RO(major);
62
63static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
64{
65 int version = HYPERVISOR_xen_version(XENVER_version, NULL);
66 if (version)
67 return sprintf(buffer, "%d\n", version & 0xff);
68 return -ENODEV;
69}
70
71HYPERVISOR_ATTR_RO(minor);
72
73static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
74{
75 int ret = -ENOMEM;
76 char *extra;
77
78 extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
79 if (extra) {
80 ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
81 if (!ret)
82 ret = sprintf(buffer, "%s\n", extra);
83 kfree(extra);
84 }
85
86 return ret;
87}
88
89HYPERVISOR_ATTR_RO(extra);
90
91static struct attribute *version_attrs[] = {
92 &major_attr.attr,
93 &minor_attr.attr,
94 &extra_attr.attr,
95 NULL
96};
97
98static struct attribute_group version_group = {
99 .name = "version",
100 .attrs = version_attrs,
101};
102
103static int __init xen_sysfs_version_init(void)
104{
105 return sysfs_create_group(hypervisor_kobj, &version_group);
106}
107
108static void xen_sysfs_version_destroy(void)
109{
110 sysfs_remove_group(hypervisor_kobj, &version_group);
111}
112
113/* UUID */
114
115static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
116{
117 char *vm, *val;
118 int ret;
119 extern int xenstored_ready;
120
121 if (!xenstored_ready)
122 return -EBUSY;
123
124 vm = xenbus_read(XBT_NIL, "vm", "", NULL);
125 if (IS_ERR(vm))
126 return PTR_ERR(vm);
127 val = xenbus_read(XBT_NIL, vm, "uuid", NULL);
128 kfree(vm);
129 if (IS_ERR(val))
130 return PTR_ERR(val);
131 ret = sprintf(buffer, "%s\n", val);
132 kfree(val);
133 return ret;
134}
135
136HYPERVISOR_ATTR_RO(uuid);
137
138static int __init xen_sysfs_uuid_init(void)
139{
140 return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
141}
142
143static void xen_sysfs_uuid_destroy(void)
144{
145 sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
146}
147
148/* xen compilation attributes */
149
150static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
151{
152 int ret = -ENOMEM;
153 struct xen_compile_info *info;
154
155 info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
156 if (info) {
157 ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
158 if (!ret)
159 ret = sprintf(buffer, "%s\n", info->compiler);
160 kfree(info);
161 }
162
163 return ret;
164}
165
166HYPERVISOR_ATTR_RO(compiler);
167
168static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
169{
170 int ret = -ENOMEM;
171 struct xen_compile_info *info;
172
173 info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
174 if (info) {
175 ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
176 if (!ret)
177 ret = sprintf(buffer, "%s\n", info->compile_by);
178 kfree(info);
179 }
180
181 return ret;
182}
183
184HYPERVISOR_ATTR_RO(compiled_by);
185
186static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
187{
188 int ret = -ENOMEM;
189 struct xen_compile_info *info;
190
191 info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
192 if (info) {
193 ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
194 if (!ret)
195 ret = sprintf(buffer, "%s\n", info->compile_date);
196 kfree(info);
197 }
198
199 return ret;
200}
201
202HYPERVISOR_ATTR_RO(compile_date);
203
204static struct attribute *xen_compile_attrs[] = {
205 &compiler_attr.attr,
206 &compiled_by_attr.attr,
207 &compile_date_attr.attr,
208 NULL
209};
210
211static struct attribute_group xen_compilation_group = {
212 .name = "compilation",
213 .attrs = xen_compile_attrs,
214};
215
216int __init static xen_compilation_init(void)
217{
218 return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
219}
220
221static void xen_compilation_destroy(void)
222{
223 sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
224}
225
226/* xen properties info */
227
228static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
229{
230 int ret = -ENOMEM;
231 char *caps;
232
233 caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
234 if (caps) {
235 ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
236 if (!ret)
237 ret = sprintf(buffer, "%s\n", caps);
238 kfree(caps);
239 }
240
241 return ret;
242}
243
244HYPERVISOR_ATTR_RO(capabilities);
245
246static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
247{
248 int ret = -ENOMEM;
249 char *cset;
250
251 cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
252 if (cset) {
253 ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
254 if (!ret)
255 ret = sprintf(buffer, "%s\n", cset);
256 kfree(cset);
257 }
258
259 return ret;
260}
261
262HYPERVISOR_ATTR_RO(changeset);
263
264static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
265{
266 int ret = -ENOMEM;
267 struct xen_platform_parameters *parms;
268
269 parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
270 if (parms) {
271 ret = HYPERVISOR_xen_version(XENVER_platform_parameters,
272 parms);
273 if (!ret)
274 ret = sprintf(buffer, "%lx\n", parms->virt_start);
275 kfree(parms);
276 }
277
278 return ret;
279}
280
281HYPERVISOR_ATTR_RO(virtual_start);
282
283static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
284{
285 int ret;
286
287 ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
288 if (ret > 0)
289 ret = sprintf(buffer, "%x\n", ret);
290
291 return ret;
292}
293
294HYPERVISOR_ATTR_RO(pagesize);
295
296static ssize_t xen_feature_show(int index, char *buffer)
297{
298 ssize_t ret;
299 struct xen_feature_info info;
300
301 info.submap_idx = index;
302 ret = HYPERVISOR_xen_version(XENVER_get_features, &info);
303 if (!ret)
304 ret = sprintf(buffer, "%08x", info.submap);
305
306 return ret;
307}
308
309static ssize_t features_show(struct hyp_sysfs_attr *attr, char *buffer)
310{
311 ssize_t len;
312 int i;
313
314 len = 0;
315 for (i = XENFEAT_NR_SUBMAPS-1; i >= 0; i--) {
316 int ret = xen_feature_show(i, buffer + len);
317 if (ret < 0) {
318 if (len == 0)
319 len = ret;
320 break;
321 }
322 len += ret;
323 }
324 if (len > 0)
325 buffer[len++] = '\n';
326
327 return len;
328}
329
330HYPERVISOR_ATTR_RO(features);
331
332static struct attribute *xen_properties_attrs[] = {
333 &capabilities_attr.attr,
334 &changeset_attr.attr,
335 &virtual_start_attr.attr,
336 &pagesize_attr.attr,
337 &features_attr.attr,
338 NULL
339};
340
341static struct attribute_group xen_properties_group = {
342 .name = "properties",
343 .attrs = xen_properties_attrs,
344};
345
346static int __init xen_properties_init(void)
347{
348 return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
349}
350
351static void xen_properties_destroy(void)
352{
353 sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
354}
355
356static int __init hyper_sysfs_init(void)
357{
358 int ret;
359
360 if (!xen_domain())
361 return -ENODEV;
362
363 ret = xen_sysfs_type_init();
364 if (ret)
365 goto out;
366 ret = xen_sysfs_version_init();
367 if (ret)
368 goto version_out;
369 ret = xen_compilation_init();
370 if (ret)
371 goto comp_out;
372 ret = xen_sysfs_uuid_init();
373 if (ret)
374 goto uuid_out;
375 ret = xen_properties_init();
376 if (ret)
377 goto prop_out;
378
379 goto out;
380
381prop_out:
382 xen_sysfs_uuid_destroy();
383uuid_out:
384 xen_compilation_destroy();
385comp_out:
386 xen_sysfs_version_destroy();
387version_out:
388 xen_sysfs_type_destroy();
389out:
390 return ret;
391}
392
393static void __exit hyper_sysfs_exit(void)
394{
395 xen_properties_destroy();
396 xen_compilation_destroy();
397 xen_sysfs_uuid_destroy();
398 xen_sysfs_version_destroy();
399 xen_sysfs_type_destroy();
400
401}
402module_init(hyper_sysfs_init);
403module_exit(hyper_sysfs_exit);
404
405static ssize_t hyp_sysfs_show(struct kobject *kobj,
406 struct attribute *attr,
407 char *buffer)
408{
409 struct hyp_sysfs_attr *hyp_attr;
410 hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
411 if (hyp_attr->show)
412 return hyp_attr->show(hyp_attr, buffer);
413 return 0;
414}
415
416static ssize_t hyp_sysfs_store(struct kobject *kobj,
417 struct attribute *attr,
418 const char *buffer,
419 size_t len)
420{
421 struct hyp_sysfs_attr *hyp_attr;
422 hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
423 if (hyp_attr->store)
424 return hyp_attr->store(hyp_attr, buffer, len);
425 return 0;
426}
427
428static struct sysfs_ops hyp_sysfs_ops = {
429 .show = hyp_sysfs_show,
430 .store = hyp_sysfs_store,
431};
432
433static struct kobj_type hyp_sysfs_kobj_type = {
434 .sysfs_ops = &hyp_sysfs_ops,
435};
436
437static int __init hypervisor_subsys_init(void)
438{
439 if (!xen_domain())
440 return -ENODEV;
441
442 hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
443 return 0;
444}
445device_initcall(hypervisor_subsys_init);
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 773d1cf23283..d42e25d5968d 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -71,6 +71,9 @@ static int xenbus_probe_frontend(const char *type, const char *name);
71 71
72static void xenbus_dev_shutdown(struct device *_dev); 72static void xenbus_dev_shutdown(struct device *_dev);
73 73
74static int xenbus_dev_suspend(struct device *dev, pm_message_t state);
75static int xenbus_dev_resume(struct device *dev);
76
74/* If something in array of ids matches this device, return it. */ 77/* If something in array of ids matches this device, return it. */
75static const struct xenbus_device_id * 78static const struct xenbus_device_id *
76match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) 79match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
@@ -188,6 +191,9 @@ static struct xen_bus_type xenbus_frontend = {
188 .remove = xenbus_dev_remove, 191 .remove = xenbus_dev_remove,
189 .shutdown = xenbus_dev_shutdown, 192 .shutdown = xenbus_dev_shutdown,
190 .dev_attrs = xenbus_dev_attrs, 193 .dev_attrs = xenbus_dev_attrs,
194
195 .suspend = xenbus_dev_suspend,
196 .resume = xenbus_dev_resume,
191 }, 197 },
192}; 198};
193 199
@@ -654,6 +660,7 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
654 660
655 kfree(root); 661 kfree(root);
656} 662}
663EXPORT_SYMBOL_GPL(xenbus_dev_changed);
657 664
658static void frontend_changed(struct xenbus_watch *watch, 665static void frontend_changed(struct xenbus_watch *watch,
659 const char **vec, unsigned int len) 666 const char **vec, unsigned int len)
@@ -669,7 +676,7 @@ static struct xenbus_watch fe_watch = {
669 .callback = frontend_changed, 676 .callback = frontend_changed,
670}; 677};
671 678
672static int suspend_dev(struct device *dev, void *data) 679static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
673{ 680{
674 int err = 0; 681 int err = 0;
675 struct xenbus_driver *drv; 682 struct xenbus_driver *drv;
@@ -682,35 +689,14 @@ static int suspend_dev(struct device *dev, void *data)
682 drv = to_xenbus_driver(dev->driver); 689 drv = to_xenbus_driver(dev->driver);
683 xdev = container_of(dev, struct xenbus_device, dev); 690 xdev = container_of(dev, struct xenbus_device, dev);
684 if (drv->suspend) 691 if (drv->suspend)
685 err = drv->suspend(xdev); 692 err = drv->suspend(xdev, state);
686 if (err) 693 if (err)
687 printk(KERN_WARNING 694 printk(KERN_WARNING
688 "xenbus: suspend %s failed: %i\n", dev_name(dev), err); 695 "xenbus: suspend %s failed: %i\n", dev_name(dev), err);
689 return 0; 696 return 0;
690} 697}
691 698
692static int suspend_cancel_dev(struct device *dev, void *data) 699static int xenbus_dev_resume(struct device *dev)
693{
694 int err = 0;
695 struct xenbus_driver *drv;
696 struct xenbus_device *xdev;
697
698 DPRINTK("");
699
700 if (dev->driver == NULL)
701 return 0;
702 drv = to_xenbus_driver(dev->driver);
703 xdev = container_of(dev, struct xenbus_device, dev);
704 if (drv->suspend_cancel)
705 err = drv->suspend_cancel(xdev);
706 if (err)
707 printk(KERN_WARNING
708 "xenbus: suspend_cancel %s failed: %i\n",
709 dev_name(dev), err);
710 return 0;
711}
712
713static int resume_dev(struct device *dev, void *data)
714{ 700{
715 int err; 701 int err;
716 struct xenbus_driver *drv; 702 struct xenbus_driver *drv;
@@ -755,33 +741,6 @@ static int resume_dev(struct device *dev, void *data)
755 return 0; 741 return 0;
756} 742}
757 743
758void xenbus_suspend(void)
759{
760 DPRINTK("");
761
762 bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
763 xenbus_backend_suspend(suspend_dev);
764 xs_suspend();
765}
766EXPORT_SYMBOL_GPL(xenbus_suspend);
767
768void xenbus_resume(void)
769{
770 xb_init_comms();
771 xs_resume();
772 bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
773 xenbus_backend_resume(resume_dev);
774}
775EXPORT_SYMBOL_GPL(xenbus_resume);
776
777void xenbus_suspend_cancel(void)
778{
779 xs_suspend_cancel();
780 bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev);
781 xenbus_backend_resume(suspend_cancel_dev);
782}
783EXPORT_SYMBOL_GPL(xenbus_suspend_cancel);
784
785/* A flag to determine if xenstored is 'ready' (i.e. has started) */ 744/* A flag to determine if xenstored is 'ready' (i.e. has started) */
786int xenstored_ready = 0; 745int xenstored_ready = 0;
787 746
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index e325eab4724d..eab33f1dbdf7 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -673,6 +673,8 @@ void xs_resume(void)
673 struct xenbus_watch *watch; 673 struct xenbus_watch *watch;
674 char token[sizeof(watch) * 2 + 1]; 674 char token[sizeof(watch) * 2 + 1];
675 675
676 xb_init_comms();
677
676 mutex_unlock(&xs_state.response_mutex); 678 mutex_unlock(&xs_state.response_mutex);
677 mutex_unlock(&xs_state.request_mutex); 679 mutex_unlock(&xs_state.request_mutex);
678 up_write(&xs_state.transaction_mutex); 680 up_write(&xs_state.transaction_mutex);
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
index 515741a8e6b8..6559e0c752ce 100644
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -20,10 +20,27 @@
20MODULE_DESCRIPTION("Xen filesystem"); 20MODULE_DESCRIPTION("Xen filesystem");
21MODULE_LICENSE("GPL"); 21MODULE_LICENSE("GPL");
22 22
23static ssize_t capabilities_read(struct file *file, char __user *buf,
24 size_t size, loff_t *off)
25{
26 char *tmp = "";
27
28 if (xen_initial_domain())
29 tmp = "control_d\n";
30
31 return simple_read_from_buffer(buf, size, off, tmp, strlen(tmp));
32}
33
34static const struct file_operations capabilities_file_ops = {
35 .read = capabilities_read,
36};
37
23static int xenfs_fill_super(struct super_block *sb, void *data, int silent) 38static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
24{ 39{
25 static struct tree_descr xenfs_files[] = { 40 static struct tree_descr xenfs_files[] = {
26 [2] = {"xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR}, 41 [1] = {},
42 { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
43 { "capabilities", &capabilities_file_ops, S_IRUGO },
27 {""}, 44 {""},
28 }; 45 };
29 46
diff --git a/include/Kbuild b/include/Kbuild
index d8c3e3cbf416..fe36accd4328 100644
--- a/include/Kbuild
+++ b/include/Kbuild
@@ -8,3 +8,4 @@ header-y += mtd/
8header-y += rdma/ 8header-y += rdma/
9header-y += video/ 9header-y += video/
10header-y += drm/ 10header-y += drm/
11header-y += xen/
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 8e6d0ca70aba..e410f602cab1 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -280,17 +280,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
280#endif 280#endif
281 281
282/* 282/*
283 * A facility to provide batching of the reload of page tables with the 283 * A facility to provide batching of the reload of page tables and
284 * actual context switch code for paravirtualized guests. By convention, 284 * other process state with the actual context switch code for
285 * only one of the lazy modes (CPU, MMU) should be active at any given 285 * paravirtualized guests. By convention, only one of the batched
286 * time, entry should never be nested, and entry and exits should always 286 * update (lazy) modes (CPU, MMU) should be active at any given time,
287 * be paired. This is for sanity of maintaining and reasoning about the 287 * entry should never be nested, and entry and exits should always be
288 * kernel code. 288 * paired. This is for sanity of maintaining and reasoning about the
289 * kernel code. In this case, the exit (end of the context switch) is
290 * in architecture-specific code, and so doesn't need a generic
291 * definition.
289 */ 292 */
290#ifndef __HAVE_ARCH_ENTER_LAZY_CPU_MODE 293#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
291#define arch_enter_lazy_cpu_mode() do {} while (0) 294#define arch_start_context_switch(prev) do {} while (0)
292#define arch_leave_lazy_cpu_mode() do {} while (0)
293#define arch_flush_lazy_cpu_mode() do {} while (0)
294#endif 295#endif
295 296
296#ifndef __HAVE_PFNMAP_TRACKING 297#ifndef __HAVE_PFNMAP_TRACKING
diff --git a/include/xen/Kbuild b/include/xen/Kbuild
new file mode 100644
index 000000000000..4e65c16a445b
--- /dev/null
+++ b/include/xen/Kbuild
@@ -0,0 +1 @@
header-y += evtchn.h
diff --git a/include/xen/events.h b/include/xen/events.h
index 0d5f1adc0363..e68d59a90ca8 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -53,4 +53,7 @@ bool xen_test_irq_pending(int irq);
53 irq will be disabled so it won't deliver an interrupt. */ 53 irq will be disabled so it won't deliver an interrupt. */
54void xen_poll_irq(int irq); 54void xen_poll_irq(int irq);
55 55
56/* Determine the IRQ which is bound to an event channel */
57unsigned irq_from_evtchn(unsigned int evtchn);
58
56#endif /* _XEN_EVENTS_H */ 59#endif /* _XEN_EVENTS_H */
diff --git a/include/xen/evtchn.h b/include/xen/evtchn.h
new file mode 100644
index 000000000000..14e833ee4e0b
--- /dev/null
+++ b/include/xen/evtchn.h
@@ -0,0 +1,88 @@
1/******************************************************************************
2 * evtchn.h
3 *
4 * Interface to /dev/xen/evtchn.
5 *
6 * Copyright (c) 2003-2005, K A Fraser
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#ifndef __LINUX_PUBLIC_EVTCHN_H__
34#define __LINUX_PUBLIC_EVTCHN_H__
35
36/*
37 * Bind a fresh port to VIRQ @virq.
38 * Return allocated port.
39 */
40#define IOCTL_EVTCHN_BIND_VIRQ \
41 _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
42struct ioctl_evtchn_bind_virq {
43 unsigned int virq;
44};
45
46/*
47 * Bind a fresh port to remote <@remote_domain, @remote_port>.
48 * Return allocated port.
49 */
50#define IOCTL_EVTCHN_BIND_INTERDOMAIN \
51 _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
52struct ioctl_evtchn_bind_interdomain {
53 unsigned int remote_domain, remote_port;
54};
55
56/*
57 * Allocate a fresh port for binding to @remote_domain.
58 * Return allocated port.
59 */
60#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \
61 _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
62struct ioctl_evtchn_bind_unbound_port {
63 unsigned int remote_domain;
64};
65
66/*
67 * Unbind previously allocated @port.
68 */
69#define IOCTL_EVTCHN_UNBIND \
70 _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
71struct ioctl_evtchn_unbind {
72 unsigned int port;
73};
74
75/*
76 * Notify previously allocated @port.
77 */
78#define IOCTL_EVTCHN_NOTIFY \
79 _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
80struct ioctl_evtchn_notify {
81 unsigned int port;
82};
83
84/* Clear and reinitialise the event buffer. Clear error condition. */
85#define IOCTL_EVTCHN_RESET \
86 _IOC(_IOC_NONE, 'E', 5, 0)
87
88#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
diff --git a/include/xen/interface/version.h b/include/xen/interface/version.h
index 453235e923f0..e8b6519d47e9 100644
--- a/include/xen/interface/version.h
+++ b/include/xen/interface/version.h
@@ -57,4 +57,7 @@ struct xen_feature_info {
57/* Declares the features reported by XENVER_get_features. */ 57/* Declares the features reported by XENVER_get_features. */
58#include "features.h" 58#include "features.h"
59 59
60/* arg == NULL; returns host memory page size. */
61#define XENVER_pagesize 7
62
60#endif /* __XEN_PUBLIC_VERSION_H__ */ 63#endif /* __XEN_PUBLIC_VERSION_H__ */
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index f87f9614844d..b9763badbd77 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -91,8 +91,7 @@ struct xenbus_driver {
91 void (*otherend_changed)(struct xenbus_device *dev, 91 void (*otherend_changed)(struct xenbus_device *dev,
92 enum xenbus_state backend_state); 92 enum xenbus_state backend_state);
93 int (*remove)(struct xenbus_device *dev); 93 int (*remove)(struct xenbus_device *dev);
94 int (*suspend)(struct xenbus_device *dev); 94 int (*suspend)(struct xenbus_device *dev, pm_message_t state);
95 int (*suspend_cancel)(struct xenbus_device *dev);
96 int (*resume)(struct xenbus_device *dev); 95 int (*resume)(struct xenbus_device *dev);
97 int (*uevent)(struct xenbus_device *, char **, int, char *, int); 96 int (*uevent)(struct xenbus_device *, char **, int, char *, int);
98 struct device_driver driver; 97 struct device_driver driver;
diff --git a/kernel/sched.c b/kernel/sched.c
index c3c04e256560..076e403b9c88 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2783,7 +2783,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2783 * combine the page table reload and the switch backend into 2783 * combine the page table reload and the switch backend into
2784 * one hypercall. 2784 * one hypercall.
2785 */ 2785 */
2786 arch_enter_lazy_cpu_mode(); 2786 arch_start_context_switch(prev);
2787 2787
2788 if (unlikely(!mm)) { 2788 if (unlikely(!mm)) {
2789 next->active_mm = oldmm; 2789 next->active_mm = oldmm;