-rw-r--r--  arch/i386/Kconfig | 2
-rw-r--r--  arch/i386/Makefile | 3
-rw-r--r--  arch/i386/boot/compressed/relocs.c | 2
-rw-r--r--  arch/i386/kernel/asm-offsets.c | 9
-rw-r--r--  arch/i386/kernel/entry.S | 85
-rw-r--r--  arch/i386/kernel/head.S | 5
-rw-r--r--  arch/i386/kernel/paravirt.c | 37
-rw-r--r--  arch/i386/kernel/setup.c | 2
-rw-r--r--  arch/i386/kernel/smp.c | 5
-rw-r--r--  arch/i386/kernel/smpboot.c | 8
-rw-r--r--  arch/i386/kernel/tsc.c | 23
-rw-r--r--  arch/i386/kernel/vmi.c | 4
-rw-r--r--  arch/i386/kernel/vmiclock.c | 6
-rw-r--r--  arch/i386/kernel/vmlinux.lds.S | 1
-rw-r--r--  arch/i386/kernel/vsyscall-note.S | 49
-rw-r--r--  arch/i386/mach-voyager/voyager_thread.c | 2
-rw-r--r--  arch/i386/mm/init.c | 3
-rw-r--r--  arch/i386/mm/pageattr.c | 2
-rw-r--r--  arch/i386/xen/Kconfig | 11
-rw-r--r--  arch/i386/xen/Makefile | 4
-rw-r--r--  arch/i386/xen/enlighten.c | 1144
-rw-r--r--  arch/i386/xen/events.c | 590
-rw-r--r--  arch/i386/xen/features.c | 29
-rw-r--r--  arch/i386/xen/manage.c | 143
-rw-r--r--  arch/i386/xen/mmu.c | 564
-rw-r--r--  arch/i386/xen/mmu.h | 60
-rw-r--r--  arch/i386/xen/multicalls.c | 90
-rw-r--r--  arch/i386/xen/multicalls.h | 45
-rw-r--r--  arch/i386/xen/setup.c | 96
-rw-r--r--  arch/i386/xen/smp.c | 404
-rw-r--r--  arch/i386/xen/time.c | 590
-rw-r--r--  arch/i386/xen/xen-asm.S | 291
-rw-r--r--  arch/i386/xen/xen-head.S | 36
-rw-r--r--  arch/i386/xen/xen-ops.h | 71
-rw-r--r--  arch/x86_64/kernel/early_printk.c | 5
-rw-r--r--  arch/x86_64/kernel/mce.c | 2
-rw-r--r--  drivers/Makefile | 2
-rw-r--r--  drivers/acpi/thermal.c | 24
-rw-r--r--  drivers/block/Kconfig | 9
-rw-r--r--  drivers/block/Makefile | 1
-rw-r--r--  drivers/block/xen-blkfront.c | 988
-rw-r--r--  drivers/char/Kconfig | 8
-rw-r--r--  drivers/char/Makefile | 1
-rw-r--r--  drivers/char/hvc_xen.c | 159
-rw-r--r--  drivers/macintosh/therm_pm72.c | 3
-rw-r--r--  drivers/macintosh/windfarm_core.c | 3
-rw-r--r--  drivers/net/Kconfig | 12
-rw-r--r--  drivers/net/Makefile | 2
-rw-r--r--  drivers/net/hamradio/baycom_epp.c | 2
-rw-r--r--  drivers/net/xen-netfront.c | 1863
-rw-r--r--  drivers/pnp/pnpbios/core.c | 2
-rw-r--r--  drivers/sbus/char/bbc_envctrl.c | 5
-rw-r--r--  drivers/sbus/char/envctrl.c | 7
-rw-r--r--  drivers/xen/Makefile | 2
-rw-r--r--  drivers/xen/grant-table.c | 582
-rw-r--r--  drivers/xen/xenbus/Makefile | 7
-rw-r--r--  drivers/xen/xenbus/xenbus_client.c | 569
-rw-r--r--  drivers/xen/xenbus/xenbus_comms.c | 233
-rw-r--r--  drivers/xen/xenbus/xenbus_comms.h | 46
-rw-r--r--  drivers/xen/xenbus/xenbus_probe.c | 935
-rw-r--r--  drivers/xen/xenbus/xenbus_probe.h | 74
-rw-r--r--  drivers/xen/xenbus/xenbus_xs.c | 861
-rw-r--r--  fs/ocfs2/heartbeat.c | 2
-rw-r--r--  include/asm-i386/irq.h | 1
-rw-r--r--  include/asm-i386/mach-default/irq_vectors_limits.h | 2
-rw-r--r--  include/asm-i386/mmu_context.h | 2
-rw-r--r--  include/asm-i386/paravirt.h | 22
-rw-r--r--  include/asm-i386/pgalloc.h | 6
-rw-r--r--  include/asm-i386/setup.h | 4
-rw-r--r--  include/asm-i386/smp.h | 5
-rw-r--r--  include/asm-i386/timer.h | 32
-rw-r--r--  include/asm-i386/vmi_time.h | 2
-rw-r--r--  include/asm-i386/xen/hypercall.h | 413
-rw-r--r--  include/asm-i386/xen/hypervisor.h | 73
-rw-r--r--  include/asm-i386/xen/interface.h | 188
-rw-r--r--  include/linux/elfnote.h | 22
-rw-r--r--  include/linux/kmod.h | 52
-rw-r--r--  include/linux/major.h | 2
-rw-r--r--  include/linux/page-flags.h | 5
-rw-r--r--  include/linux/reboot.h | 5
-rw-r--r--  include/linux/string.h | 4
-rw-r--r--  include/linux/vmalloc.h | 4
-rw-r--r--  include/xen/events.h | 48
-rw-r--r--  include/xen/features.h | 23
-rw-r--r--  include/xen/grant_table.h | 107
-rw-r--r--  include/xen/hvc-console.h | 6
-rw-r--r--  include/xen/interface/elfnote.h | 133
-rw-r--r--  include/xen/interface/event_channel.h | 195
-rw-r--r--  include/xen/interface/features.h | 43
-rw-r--r--  include/xen/interface/grant_table.h | 375
-rw-r--r--  include/xen/interface/io/blkif.h | 94
-rw-r--r--  include/xen/interface/io/console.h | 23
-rw-r--r--  include/xen/interface/io/netif.h | 158
-rw-r--r--  include/xen/interface/io/ring.h | 260
-rw-r--r--  include/xen/interface/io/xenbus.h | 44
-rw-r--r--  include/xen/interface/io/xs_wire.h | 87
-rw-r--r--  include/xen/interface/memory.h | 145
-rw-r--r--  include/xen/interface/physdev.h | 145
-rw-r--r--  include/xen/interface/sched.h | 77
-rw-r--r--  include/xen/interface/vcpu.h | 167
-rw-r--r--  include/xen/interface/version.h | 60
-rw-r--r--  include/xen/interface/xen.h | 447
-rw-r--r--  include/xen/page.h | 179
-rw-r--r--  include/xen/xenbus.h | 234
-rw-r--r--  kernel/cpuset.c | 2
-rw-r--r--  kernel/kmod.c | 216
-rw-r--r--  kernel/sys.c | 58
-rw-r--r--  kernel/sysctl.c | 10
-rw-r--r--  lib/Makefile | 2
-rw-r--r--  lib/argv_split.c | 105
-rw-r--r--  lib/kobject_uevent.c | 2
-rw-r--r--  mm/util.c | 26
-rw-r--r--  mm/vmalloc.c | 53
-rw-r--r--  net/bridge/br_stp_if.c | 2
-rw-r--r--  net/irda/irias_object.c | 43
-rw-r--r--  security/keys/request_key.c | 3
116 files changed, 15031 insertions, 210 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index c7c9c2a15fab..7a11b905ef49 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -222,6 +222,8 @@ config PARAVIRT
 	  However, when run without a hypervisor the kernel is
 	  theoretically slower.  If in doubt, say N.
 
+source "arch/i386/xen/Kconfig"
+
 config VMI
 	bool "VMI Paravirt-ops support"
 	depends on PARAVIRT
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 181cc29a7c4f..01f0ff0daaf4 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000
 mcore-$(CONFIG_X86_ES7000)	:= mach-default
 core-$(CONFIG_X86_ES7000)	:= arch/i386/mach-es7000/
 
+# Xen paravirtualization support
+core-$(CONFIG_XEN)		+= arch/i386/xen/
+
 # default subarch .h files
 mflags-y += -Iinclude/asm-i386/mach-default
 
diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c
index ce4fda261aaf..b0e21c3cee5c 100644
--- a/arch/i386/boot/compressed/relocs.c
+++ b/arch/i386/boot/compressed/relocs.c
@@ -31,6 +31,8 @@ static const char* safe_abs_relocs[] = {
31 "__kernel_rt_sigreturn", 31 "__kernel_rt_sigreturn",
32 "__kernel_sigreturn", 32 "__kernel_sigreturn",
33 "SYSENTER_RETURN", 33 "SYSENTER_RETURN",
34 "xen_irq_disable_direct_reloc",
35 "xen_save_fl_direct_reloc",
34}; 36};
35 37
36static int is_safe_abs_reloc(const char* sym_name) 38static int is_safe_abs_reloc(const char* sym_name)
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 27a776c9044d..25f7eb513928 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -17,6 +17,8 @@
 #include <asm/thread_info.h>
 #include <asm/elf.h>
 
+#include <xen/interface/xen.h>
+
 #define DEFINE(sym, val) \
 	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
 
@@ -59,6 +61,7 @@ void foo(void)
 	OFFSET(TI_addr_limit, thread_info, addr_limit);
 	OFFSET(TI_restart_block, thread_info, restart_block);
 	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+	OFFSET(TI_cpu, thread_info, cpu);
 	BLANK();
 
 	OFFSET(GDS_size, Xgt_desc_struct, size);
@@ -115,4 +118,10 @@ void foo(void)
 	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
 	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
 #endif
+
+#ifdef CONFIG_XEN
+	BLANK();
+	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#endif
 }
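(Side note, not part of the patch: the DEFINE()/OFFSET() macros above work by emitting "->SYM value" marker lines into the compiler's assembly output, which kbuild's sed script then turns into #defines that entry.S can use. A minimal standalone sketch of the same technique, using a stand-in struct rather than the real vcpu_info; compile with "gcc -S" and inspect the generated .s file.)

/* asm-offsets-demo.c: illustration only.  The struct is a stand-in for
 * vcpu_info; the DEFINE/OFFSET macros mirror the ones in asm-offsets.c. */
#include <stddef.h>

struct demo_vcpu_info {
	unsigned char evtchn_upcall_pending;
	unsigned char evtchn_upcall_mask;
};

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
#define OFFSET(sym, str, mem) DEFINE(sym, offsetof(struct str, mem))

void foo(void)
{
	/* "gcc -S" emits "->XEN_vcpu_info_mask $1 ..." into the .s output,
	 * which the build turns into an assembler-visible #define. */
	OFFSET(XEN_vcpu_info_mask, demo_vcpu_info, evtchn_upcall_mask);
	OFFSET(XEN_vcpu_info_pending, demo_vcpu_info, evtchn_upcall_pending);
}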
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 3c3c220488c9..32980b834935 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -1023,6 +1023,91 @@ ENTRY(kernel_thread_helper)
 	CFI_ENDPROC
 ENDPROC(kernel_thread_helper)
 
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+	CFI_STARTPROC
+	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
+	SAVE_ALL
+	TRACE_IRQS_OFF
+
+	/* Check to see if we got the event in the critical
+	   region in xen_iret_direct, after we've reenabled
+	   events and checked for pending events.  This simulates
+	   iret instruction's behaviour where it delivers a
+	   pending interrupt when enabling interrupts. */
+	movl PT_EIP(%esp),%eax
+	cmpl $xen_iret_start_crit,%eax
+	jb   1f
+	cmpl $xen_iret_end_crit,%eax
+	jae  1f
+
+	call xen_iret_crit_fixup
+
+1:	mov %esp, %eax
+	call xen_evtchn_do_upcall
+	jmp  ret_from_intr
+	CFI_ENDPROC
+ENDPROC(xen_hypervisor_callback)
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we fix up by reattempting the load, and zeroing the segment
+# register if the load fails.
+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by maintaining a status value in EAX.
+ENTRY(xen_failsafe_callback)
+	CFI_STARTPROC
+	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	movl $1,%eax
+1:	mov 4(%esp),%ds
+2:	mov 8(%esp),%es
+3:	mov 12(%esp),%fs
+4:	mov 16(%esp),%gs
+	testl %eax,%eax
+	popl %eax
+	CFI_ADJUST_CFA_OFFSET -4
+	lea 16(%esp),%esp
+	CFI_ADJUST_CFA_OFFSET -16
+	jz 5f
+	addl $16,%esp
+	jmp iret_exc		# EAX != 0 => Category 2 (Bad IRET)
+5:	pushl $0		# EAX == 0 => Category 1 (Bad segment)
+	CFI_ADJUST_CFA_OFFSET 4
+	SAVE_ALL
+	jmp ret_from_exception
+	CFI_ENDPROC
+
+.section .fixup,"ax"
+6:	xorl %eax,%eax
+	movl %eax,4(%esp)
+	jmp 1b
+7:	xorl %eax,%eax
+	movl %eax,8(%esp)
+	jmp 2b
+8:	xorl %eax,%eax
+	movl %eax,12(%esp)
+	jmp 3b
+9:	xorl %eax,%eax
+	movl %eax,16(%esp)
+	jmp 4b
+.previous
+.section __ex_table,"a"
+	.align 4
+	.long 1b,6b
+	.long 2b,7b
+	.long 3b,8b
+	.long 4b,9b
+.previous
+ENDPROC(xen_failsafe_callback)
+
+#endif	/* CONFIG_XEN */
+
 .section .rodata,"a"
 #include "syscall_table.S"
 
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 82714668d43b..7c52b222207e 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -510,7 +510,8 @@ ENTRY(_stext)
 /*
  * BSS section
  */
-.section ".bss.page_aligned","w"
+.section ".bss.page_aligned","wa"
+	.align PAGE_SIZE_asm
 ENTRY(swapper_pg_dir)
 	.fill 1024,4,0
 ENTRY(swapper_pg_pmd)
@@ -538,6 +539,8 @@ fault_msg:
 	.ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
 	.asciz "Stack: %p %p %p %p %p %p %p %p\n"
 
+#include "../xen/xen-head.S"
+
 /*
  * The IDT and GDT 'descriptors' are a strange 48-bit object
  * only used by the lidt and lgdt instructions. They are not
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index faab09abca5e..53f07a8275e3 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -228,6 +228,41 @@ static int __init print_banner(void)
 }
 core_initcall(print_banner);
 
+static struct resource reserve_ioports = {
+	.start = 0,
+	.end = IO_SPACE_LIMIT,
+	.name = "paravirt-ioport",
+	.flags = IORESOURCE_IO | IORESOURCE_BUSY,
+};
+
+static struct resource reserve_iomem = {
+	.start = 0,
+	.end = -1,
+	.name = "paravirt-iomem",
+	.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
+/*
+ * Reserve the whole legacy IO space to prevent any legacy drivers
+ * from wasting time probing for their hardware.  This is a fairly
+ * brute-force approach to disabling all non-virtual drivers.
+ *
+ * Note that this must be called very early to have any effect.
+ */
+int paravirt_disable_iospace(void)
+{
+	int ret;
+
+	ret = request_resource(&ioport_resource, &reserve_ioports);
+	if (ret == 0) {
+		ret = request_resource(&iomem_resource, &reserve_iomem);
+		if (ret)
+			release_resource(&reserve_ioports);
+	}
+
+	return ret;
+}
+
 struct paravirt_ops paravirt_ops = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
@@ -267,7 +302,7 @@ struct paravirt_ops paravirt_ops = {
 	.write_msr = native_write_msr_safe,
 	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
-	.get_scheduled_cycles = native_read_tsc,
+	.sched_clock = native_sched_clock,
 	.get_cpu_khz = native_calculate_cpu_khz,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 2d61e65eeb50..74871d066c2b 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -601,6 +601,8 @@ void __init setup_arch(char **cmdline_p)
 	 * NOTE: at this point the bootmem allocator is fully available.
 	 */
 
+	paravirt_post_allocator_init();
+
 	dmi_scan_machine();
 
 #ifdef CONFIG_X86_GENERICARCH
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 6299c080f6e2..2d35d8502029 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -22,6 +22,7 @@
 
 #include <asm/mtrr.h>
 #include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
 #include <mach_apic.h>
 
 /*
@@ -249,13 +250,13 @@ static unsigned long flush_va;
 static DEFINE_SPINLOCK(tlbstate_lock);
 
 /*
  * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
  *
  * We need to reload %cr3 since the page tables may be going
  * away from under us..
  */
-static inline void leave_mm (unsigned long cpu)
+void leave_mm(unsigned long cpu)
 {
 	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
 		BUG();
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 0b2954534b8e..5910d3fac561 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -148,7 +148,7 @@ void __init smp_alloc_memory(void)
  * a given CPU
  */
 
-static void __cpuinit smp_store_cpu_info(int id)
+void __cpuinit smp_store_cpu_info(int id)
 {
 	struct cpuinfo_x86 *c = cpu_data + id;
 
@@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu)
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
-static inline void
-set_cpu_sibling_map(int cpu)
+void set_cpu_sibling_map(int cpu)
 {
 	int i;
 	struct cpuinfo_x86 *c = cpu_data;
@@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void
-remove_siblinginfo(int cpu)
+void remove_siblinginfo(int cpu)
 {
 	int sibling;
 	struct cpuinfo_x86 *c = cpu_data;
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index ea63a30ca3e8..252f9010f283 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -84,7 +84,7 @@ static inline int check_tsc_unstable(void)
  *
  * -johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
-static unsigned long cyc2ns_scale __read_mostly;
+unsigned long cyc2ns_scale __read_mostly;
 
 #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
 
@@ -93,15 +93,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz)
 	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
 }
 
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
-}
-
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
-unsigned long long sched_clock(void)
+unsigned long long native_sched_clock(void)
 {
 	unsigned long long this_offset;
 
@@ -118,12 +113,24 @@ unsigned long long sched_clock(void)
 	return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 
 	/* read the Time Stamp Counter: */
-	get_scheduled_cycles(this_offset);
+	rdtscll(this_offset);
 
 	/* return the value in ns */
 	return cycles_2_ns(this_offset);
 }
 
+/* We need to define a real function for sched_clock, to override the
+   weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+	return paravirt_sched_clock();
+}
+#else
+unsigned long long sched_clock(void)
+	__attribute__((alias("native_sched_clock")));
+#endif
+
 unsigned long native_calculate_cpu_khz(void)
 {
 	unsigned long long start, end;
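(Illustration only, not part of the patch: the cycles-to-nanoseconds conversion that native_sched_clock() still relies on is plain fixed-point arithmetic, mirroring set_cyc2ns_scale() and cycles_2_ns() above. The cpu_khz value below is an arbitrary example.)

/* Standalone sketch of the cyc2ns fixed-point math; the cpu_khz value
 * is an arbitrary example, not taken from the patch. */
#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10	/* 2^10, as in tsc.c */

int main(void)
{
	unsigned long cpu_khz = 2400000;	/* assume a 2.4 GHz TSC */
	unsigned long cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR) / cpu_khz;
	unsigned long long cyc = 2400000000ULL;	/* one second's worth of cycles */

	/* same shift-based scaling as cycles_2_ns() */
	printf("%llu cycles ~= %llu ns\n",
	       cyc, (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR);
	return 0;
}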
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
index c12720d7cbc5..72042bb7ec94 100644
--- a/arch/i386/kernel/vmi.c
+++ b/arch/i386/kernel/vmi.c
@@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
 }
 #endif
 
-static void vmi_allocate_pt(u32 pfn)
+static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
 {
 	vmi_set_page_type(pfn, VMI_PAGE_L1);
 	vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -891,7 +891,7 @@ static inline int __init activate_vmi(void)
 	paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
 	paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
 #endif
-	paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
+	paravirt_ops.sched_clock = vmi_sched_clock;
 	paravirt_ops.get_cpu_khz = vmi_cpu_khz;
 
 	/* We have true wallclock functions; disable CMOS clock sync */
diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c
index 26a37f8a8762..f9b845f4e692 100644
--- a/arch/i386/kernel/vmiclock.c
+++ b/arch/i386/kernel/vmiclock.c
@@ -64,10 +64,10 @@ int vmi_set_wallclock(unsigned long now)
 	return 0;
 }
 
-/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */
-unsigned long long vmi_get_sched_cycles(void)
+/* paravirt_ops.sched_clock = vmi_sched_clock */
+unsigned long long vmi_sched_clock(void)
 {
-	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
+	return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
 }
 
 /* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index aa87b06c7c82..00f1bc47d3a2 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -88,6 +88,7 @@ SECTIONS
 
 	. = ALIGN(4096);
 	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+		*(.data.page_aligned)
 		*(.data.idt)
 	}
 
diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S
index d4b5be4f3d5f..271f16a8ca01 100644
--- a/arch/i386/kernel/vsyscall-note.S
+++ b/arch/i386/kernel/vsyscall-note.S
@@ -3,23 +3,40 @@
  * Here we can supply some information useful to userland.
  */
 
-#include <linux/uts.h>
 #include <linux/version.h>
+#include <linux/elfnote.h>
 
-#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \
-	.section name, flags; \
-	.balign 4; \
-	.long 1f - 0f;	/* name length */ \
-	.long 3f - 2f;	/* data length */ \
-	.long type;	/* note type */ \
-0:	.asciz vendor;	/* vendor name */ \
-1:	.balign 4; \
-2:
+/* Ideally this would use UTS_NAME, but using a quoted string here
+   doesn't work.  Remember to change this when changing the
+   kernel's name. */
+ELFNOTE_START(Linux, 0, "a")
+	.long LINUX_VERSION_CODE
+ELFNOTE_END
 
-#define ASM_ELF_NOTE_END \
-3:	.balign 4;	/* pad out section */ \
-	.previous
+#ifdef CONFIG_XEN
 
-	ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0)
-	.long LINUX_VERSION_CODE
-	ASM_ELF_NOTE_END
+/*
+ * Add a special note telling glibc's dynamic linker a fake hardware
+ * flavor that it will use to choose the search path for libraries in the
+ * same way it uses real hardware capabilities like "mmx".
+ * We supply "nosegneg" as the fake capability, to indicate that we
+ * do not like negative offsets in instructions using segment overrides,
+ * since we implement those inefficiently.  This makes it possible to
+ * install libraries optimized to avoid those access patterns in someplace
+ * like /lib/i686/tls/nosegneg.  Note that an /etc/ld.so.conf.d/file
+ * corresponding to the bits here is needed to make ldconfig work right.
+ * It should contain:
+ *	hwcap 1 nosegneg
+ * to match the mapping of bit to name that we give here.
+ */
+
+/* Bit used for the pseudo-hwcap for non-negative segments.  We use
+   bit 1 to avoid bugs in some versions of glibc when bit 0 is
+   used; the choice is otherwise arbitrary. */
+#define VDSO_NOTE_NONEGSEG_BIT	1
+
+ELFNOTE_START(GNU, 2, "a")
+	.long 1, 1<<VDSO_NOTE_NONEGSEG_BIT	/* ncaps, mask */
+	.byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg"	/* bit, name */
+ELFNOTE_END
+#endif
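(For reference, not part of the patch: both the hand-rolled ASM_ELF_NOTE_BEGIN/END pair being removed here and the new ELFNOTE_START/ELFNOTE_END macros emit the standard ELF note record. A rough sketch of that layout follows; the struct and field names are illustrative, not taken from elfnote.h.)

/* Sketch of an ELF note record; names are illustrative.  Each note is a
 * 4-byte-aligned header followed by the vendor name and the descriptor,
 * each padded out to a 4-byte boundary. */
#include <stdint.h>

struct demo_elf_note {
	uint32_t namesz;	/* length of the vendor name, e.g. sizeof("Linux") */
	uint32_t descsz;	/* length of the descriptor, e.g. 4 for a .long */
	uint32_t type;		/* note type: 0 for the version note, 2 for the GNU hwcap note */
	/* followed by: name bytes (padded to 4), then desc bytes (padded to 4) */
};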
diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c
index b4b24e0e45e1..f9d595338159 100644
--- a/arch/i386/mach-voyager/voyager_thread.c
+++ b/arch/i386/mach-voyager/voyager_thread.c
@@ -52,7 +52,7 @@ execute(const char *string)
 		NULL,
 	};
 
-	if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) {
+	if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
 		printk(KERN_ERR "Voyager failed to run \"%s\": %i\n",
 		       string, ret);
 	}
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 7135946d3663..6a68b1ae061c 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
 		pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
 
-		paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
+		paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
 		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
 		BUG_ON(page_table != pte_offset_kernel(pmd, 0));
 	}
@@ -473,6 +473,7 @@ void zap_low_mappings (void)
 
 static int disable_nx __initdata = 0;
 u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
 /*
  * noexec = on|off
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 2eb14a73be9c..37992ffb1633 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
 	address = __pa(address);
 	addr = address & LARGE_PAGE_MASK;
 	pbase = (pte_t *)page_address(base);
-	paravirt_alloc_pt(page_to_pfn(base));
+	paravirt_alloc_pt(&init_mm, page_to_pfn(base));
 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
 		set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
 					   addr == address ? prot : ref_prot));
diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig
new file mode 100644
index 000000000000..9df99e1885a4
--- /dev/null
+++ b/arch/i386/xen/Kconfig
@@ -0,0 +1,11 @@
+#
+# This Kconfig describes xen options
+#
+
+config XEN
+	bool "Enable support for Xen hypervisor"
+	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
+	help
+	  This is the Linux Xen port.  Enabling this will allow the
+	  kernel to boot in a paravirtualized environment under the
+	  Xen hypervisor.
diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile
new file mode 100644
index 000000000000..343df246bd3e
--- /dev/null
+++ b/arch/i386/xen/Makefile
@@ -0,0 +1,4 @@
+obj-y		:= enlighten.o setup.o features.o multicalls.o mmu.o \
+			events.o time.o manage.o xen-asm.o
+
+obj-$(CONFIG_SMP)	+= smp.o
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
new file mode 100644
index 000000000000..9a8c1181c001
--- /dev/null
+++ b/arch/i386/xen/enlighten.c
@@ -0,0 +1,1144 @@
1/*
2 * Core of Xen paravirt_ops implementation.
3 *
4 * This file contains the xen_paravirt_ops structure itself, and the
5 * implementations for:
6 * - privileged instructions
7 * - interrupt flags
8 * - segment operations
9 * - booting and setup
10 *
11 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
12 */
13
14#include <linux/kernel.h>
15#include <linux/init.h>
16#include <linux/smp.h>
17#include <linux/preempt.h>
18#include <linux/hardirq.h>
19#include <linux/percpu.h>
20#include <linux/delay.h>
21#include <linux/start_kernel.h>
22#include <linux/sched.h>
23#include <linux/bootmem.h>
24#include <linux/module.h>
25#include <linux/mm.h>
26#include <linux/page-flags.h>
27#include <linux/highmem.h>
28#include <linux/smp.h>
29
30#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h>
33#include <xen/interface/sched.h>
34#include <xen/features.h>
35#include <xen/page.h>
36
37#include <asm/paravirt.h>
38#include <asm/page.h>
39#include <asm/xen/hypercall.h>
40#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h>
42#include <asm/processor.h>
43#include <asm/setup.h>
44#include <asm/desc.h>
45#include <asm/pgtable.h>
46#include <asm/tlbflush.h>
47#include <asm/reboot.h>
48
49#include "xen-ops.h"
50#include "mmu.h"
51#include "multicalls.h"
52
53EXPORT_SYMBOL_GPL(hypercall_page);
54
55DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
56
57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
59DEFINE_PER_CPU(unsigned long, xen_cr3);
60
61struct start_info *xen_start_info;
62EXPORT_SYMBOL_GPL(xen_start_info);
63
64static /* __initdata */ struct shared_info dummy_shared_info;
65
66/*
67 * Point at some empty memory to start with. We map the real shared_info
68 * page as soon as fixmap is up and running.
69 */
70struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
71
72/*
73 * Flag to determine whether vcpu info placement is available on all
74 * VCPUs. We assume it is to start with, and then set it to zero on
75 * the first failure. This is because it can succeed on some VCPUs
76 * and not others, since it can involve hypervisor memory allocation,
77 * or because the guest failed to guarantee all the appropriate
78 * constraints on all VCPUs (ie buffer can't cross a page boundary).
79 *
80 * Note that any particular CPU may be using a placed vcpu structure,
81 * but we can only optimise if the all are.
82 *
83 * 0: not available, 1: available
84 */
85static int have_vcpu_info_placement = 1;
86
87static void __init xen_vcpu_setup(int cpu)
88{
89 struct vcpu_register_vcpu_info info;
90 int err;
91 struct vcpu_info *vcpup;
92
93 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
94
95 if (!have_vcpu_info_placement)
96 return; /* already tested, not available */
97
98 vcpup = &per_cpu(xen_vcpu_info, cpu);
99
100 info.mfn = virt_to_mfn(vcpup);
101 info.offset = offset_in_page(vcpup);
102
103 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
104 cpu, vcpup, info.mfn, info.offset);
105
106 /* Check to see if the hypervisor will put the vcpu_info
107 structure where we want it, which allows direct access via
108 a percpu-variable. */
109 err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
110
111 if (err) {
112 printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
113 have_vcpu_info_placement = 0;
114 } else {
115 /* This cpu is using the registered vcpu info, even if
116 later ones fail to. */
117 per_cpu(xen_vcpu, cpu) = vcpup;
118
119 printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
120 cpu, vcpup);
121 }
122}
123
124static void __init xen_banner(void)
125{
126 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
127 paravirt_ops.name);
128 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
129}
130
131static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
132 unsigned int *ecx, unsigned int *edx)
133{
134 unsigned maskedx = ~0;
135
136 /*
137 * Mask out inconvenient features, to try and disable as many
138 * unsupported kernel subsystems as possible.
139 */
140 if (*eax == 1)
141 maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
142 (1 << X86_FEATURE_ACPI) | /* disable ACPI */
143 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
144
145 asm(XEN_EMULATE_PREFIX "cpuid"
146 : "=a" (*eax),
147 "=b" (*ebx),
148 "=c" (*ecx),
149 "=d" (*edx)
150 : "0" (*eax), "2" (*ecx));
151 *edx &= maskedx;
152}
153
154static void xen_set_debugreg(int reg, unsigned long val)
155{
156 HYPERVISOR_set_debugreg(reg, val);
157}
158
159static unsigned long xen_get_debugreg(int reg)
160{
161 return HYPERVISOR_get_debugreg(reg);
162}
163
164static unsigned long xen_save_fl(void)
165{
166 struct vcpu_info *vcpu;
167 unsigned long flags;
168
169 vcpu = x86_read_percpu(xen_vcpu);
170
171 /* flag has opposite sense of mask */
172 flags = !vcpu->evtchn_upcall_mask;
173
174 /* convert to IF type flag
175 -0 -> 0x00000000
176 -1 -> 0xffffffff
177 */
178 return (-flags) & X86_EFLAGS_IF;
179}
180
181static void xen_restore_fl(unsigned long flags)
182{
183 struct vcpu_info *vcpu;
184
185 /* convert from IF type flag */
186 flags = !(flags & X86_EFLAGS_IF);
187
188 /* There's a one instruction preempt window here. We need to
189 make sure we're don't switch CPUs between getting the vcpu
190 pointer and updating the mask. */
191 preempt_disable();
192 vcpu = x86_read_percpu(xen_vcpu);
193 vcpu->evtchn_upcall_mask = flags;
194 preempt_enable_no_resched();
195
196 /* Doesn't matter if we get preempted here, because any
197 pending event will get dealt with anyway. */
198
199 if (flags == 0) {
200 preempt_check_resched();
201 barrier(); /* unmask then check (avoid races) */
202 if (unlikely(vcpu->evtchn_upcall_pending))
203 force_evtchn_callback();
204 }
205}
206
207static void xen_irq_disable(void)
208{
209 /* There's a one instruction preempt window here. We need to
210 make sure we're don't switch CPUs between getting the vcpu
211 pointer and updating the mask. */
212 preempt_disable();
213 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
214 preempt_enable_no_resched();
215}
216
217static void xen_irq_enable(void)
218{
219 struct vcpu_info *vcpu;
220
221 /* There's a one instruction preempt window here. We need to
222 make sure we're don't switch CPUs between getting the vcpu
223 pointer and updating the mask. */
224 preempt_disable();
225 vcpu = x86_read_percpu(xen_vcpu);
226 vcpu->evtchn_upcall_mask = 0;
227 preempt_enable_no_resched();
228
229 /* Doesn't matter if we get preempted here, because any
230 pending event will get dealt with anyway. */
231
232 barrier(); /* unmask then check (avoid races) */
233 if (unlikely(vcpu->evtchn_upcall_pending))
234 force_evtchn_callback();
235}
236
237static void xen_safe_halt(void)
238{
239 /* Blocking includes an implicit local_irq_enable(). */
240 if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
241 BUG();
242}
243
244static void xen_halt(void)
245{
246 if (irqs_disabled())
247 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
248 else
249 xen_safe_halt();
250}
251
252static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
253{
254 BUG_ON(preemptible());
255
256 switch (mode) {
257 case PARAVIRT_LAZY_NONE:
258 BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
259 break;
260
261 case PARAVIRT_LAZY_MMU:
262 case PARAVIRT_LAZY_CPU:
263 BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
264 break;
265
266 case PARAVIRT_LAZY_FLUSH:
267 /* flush if necessary, but don't change state */
268 if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
269 xen_mc_flush();
270 return;
271 }
272
273 xen_mc_flush();
274 x86_write_percpu(xen_lazy_mode, mode);
275}
276
277static unsigned long xen_store_tr(void)
278{
279 return 0;
280}
281
282static void xen_set_ldt(const void *addr, unsigned entries)
283{
284 unsigned long linear_addr = (unsigned long)addr;
285 struct mmuext_op *op;
286 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
287
288 op = mcs.args;
289 op->cmd = MMUEXT_SET_LDT;
290 if (linear_addr) {
291 /* ldt my be vmalloced, use arbitrary_virt_to_machine */
292 xmaddr_t maddr;
293 maddr = arbitrary_virt_to_machine((unsigned long)addr);
294 linear_addr = (unsigned long)maddr.maddr;
295 }
296 op->arg1.linear_addr = linear_addr;
297 op->arg2.nr_ents = entries;
298
299 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
300
301 xen_mc_issue(PARAVIRT_LAZY_CPU);
302}
303
304static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
305{
306 unsigned long *frames;
307 unsigned long va = dtr->address;
308 unsigned int size = dtr->size + 1;
309 unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
310 int f;
311 struct multicall_space mcs;
312
313 /* A GDT can be up to 64k in size, which corresponds to 8192
314 8-byte entries, or 16 4k pages.. */
315
316 BUG_ON(size > 65536);
317 BUG_ON(va & ~PAGE_MASK);
318
319 mcs = xen_mc_entry(sizeof(*frames) * pages);
320 frames = mcs.args;
321
322 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
323 frames[f] = virt_to_mfn(va);
324 make_lowmem_page_readonly((void *)va);
325 }
326
327 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
328
329 xen_mc_issue(PARAVIRT_LAZY_CPU);
330}
331
332static void load_TLS_descriptor(struct thread_struct *t,
333 unsigned int cpu, unsigned int i)
334{
335 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
336 xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
337 struct multicall_space mc = __xen_mc_entry(0);
338
339 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
340}
341
342static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
343{
344 xen_mc_batch();
345
346 load_TLS_descriptor(t, cpu, 0);
347 load_TLS_descriptor(t, cpu, 1);
348 load_TLS_descriptor(t, cpu, 2);
349
350 xen_mc_issue(PARAVIRT_LAZY_CPU);
351
352 /*
353 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
354 * it means we're in a context switch, and %gs has just been
355 * saved. This means we can zero it out to prevent faults on
356 * exit from the hypervisor if the next process has no %gs.
357 * Either way, it has been saved, and the new value will get
358 * loaded properly. This will go away as soon as Xen has been
359 * modified to not save/restore %gs for normal hypercalls.
360 */
361 if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
362 loadsegment(gs, 0);
363}
364
365static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
366 u32 low, u32 high)
367{
368 unsigned long lp = (unsigned long)&dt[entrynum];
369 xmaddr_t mach_lp = virt_to_machine(lp);
370 u64 entry = (u64)high << 32 | low;
371
372 preempt_disable();
373
374 xen_mc_flush();
375 if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
376 BUG();
377
378 preempt_enable();
379}
380
381static int cvt_gate_to_trap(int vector, u32 low, u32 high,
382 struct trap_info *info)
383{
384 u8 type, dpl;
385
386 type = (high >> 8) & 0x1f;
387 dpl = (high >> 13) & 3;
388
389 if (type != 0xf && type != 0xe)
390 return 0;
391
392 info->vector = vector;
393 info->address = (high & 0xffff0000) | (low & 0x0000ffff);
394 info->cs = low >> 16;
395 info->flags = dpl;
396 /* interrupt gates clear IF */
397 if (type == 0xe)
398 info->flags |= 4;
399
400 return 1;
401}
402
403/* Locations of each CPU's IDT */
404static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
405
406/* Set an IDT entry. If the entry is part of the current IDT, then
407 also update Xen. */
408static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
409 u32 low, u32 high)
410{
411 unsigned long p = (unsigned long)&dt[entrynum];
412 unsigned long start, end;
413
414 preempt_disable();
415
416 start = __get_cpu_var(idt_desc).address;
417 end = start + __get_cpu_var(idt_desc).size + 1;
418
419 xen_mc_flush();
420
421 write_dt_entry(dt, entrynum, low, high);
422
423 if (p >= start && (p + 8) <= end) {
424 struct trap_info info[2];
425
426 info[1].address = 0;
427
428 if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
429 if (HYPERVISOR_set_trap_table(info))
430 BUG();
431 }
432
433 preempt_enable();
434}
435
436static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
437 struct trap_info *traps)
438{
439 unsigned in, out, count;
440
441 count = (desc->size+1) / 8;
442 BUG_ON(count > 256);
443
444 for (in = out = 0; in < count; in++) {
445 const u32 *entry = (u32 *)(desc->address + in * 8);
446
447 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
448 out++;
449 }
450 traps[out].address = 0;
451}
452
453void xen_copy_trap_info(struct trap_info *traps)
454{
455 const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
456
457 xen_convert_trap_info(desc, traps);
458}
459
460/* Load a new IDT into Xen. In principle this can be per-CPU, so we
461 hold a spinlock to protect the static traps[] array (static because
462 it avoids allocation, and saves stack space). */
463static void xen_load_idt(const struct Xgt_desc_struct *desc)
464{
465 static DEFINE_SPINLOCK(lock);
466 static struct trap_info traps[257];
467
468 spin_lock(&lock);
469
470 __get_cpu_var(idt_desc) = *desc;
471
472 xen_convert_trap_info(desc, traps);
473
474 xen_mc_flush();
475 if (HYPERVISOR_set_trap_table(traps))
476 BUG();
477
478 spin_unlock(&lock);
479}
480
481/* Write a GDT descriptor entry. Ignore LDT descriptors, since
482 they're handled differently. */
483static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
484 u32 low, u32 high)
485{
486 preempt_disable();
487
488 switch ((high >> 8) & 0xff) {
489 case DESCTYPE_LDT:
490 case DESCTYPE_TSS:
491 /* ignore */
492 break;
493
494 default: {
495 xmaddr_t maddr = virt_to_machine(&dt[entry]);
496 u64 desc = (u64)high << 32 | low;
497
498 xen_mc_flush();
499 if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
500 BUG();
501 }
502
503 }
504
505 preempt_enable();
506}
507
508static void xen_load_esp0(struct tss_struct *tss,
509 struct thread_struct *thread)
510{
511 struct multicall_space mcs = xen_mc_entry(0);
512 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
513 xen_mc_issue(PARAVIRT_LAZY_CPU);
514}
515
516static void xen_set_iopl_mask(unsigned mask)
517{
518 struct physdev_set_iopl set_iopl;
519
520 /* Force the change at ring 0. */
521 set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
522 HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
523}
524
525static void xen_io_delay(void)
526{
527}
528
529#ifdef CONFIG_X86_LOCAL_APIC
530static unsigned long xen_apic_read(unsigned long reg)
531{
532 return 0;
533}
534
535static void xen_apic_write(unsigned long reg, unsigned long val)
536{
537 /* Warn to see if there's any stray references */
538 WARN_ON(1);
539}
540#endif
541
542static void xen_flush_tlb(void)
543{
544 struct mmuext_op *op;
545 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
546
547 op = mcs.args;
548 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
549 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
550
551 xen_mc_issue(PARAVIRT_LAZY_MMU);
552}
553
554static void xen_flush_tlb_single(unsigned long addr)
555{
556 struct mmuext_op *op;
557 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
558
559 op = mcs.args;
560 op->cmd = MMUEXT_INVLPG_LOCAL;
561 op->arg1.linear_addr = addr & PAGE_MASK;
562 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
563
564 xen_mc_issue(PARAVIRT_LAZY_MMU);
565}
566
567static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
568 unsigned long va)
569{
570 struct {
571 struct mmuext_op op;
572 cpumask_t mask;
573 } *args;
574 cpumask_t cpumask = *cpus;
575 struct multicall_space mcs;
576
577 /*
578 * A couple of (to be removed) sanity checks:
579 *
580 * - current CPU must not be in mask
581 * - mask must exist :)
582 */
583 BUG_ON(cpus_empty(cpumask));
584 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
585 BUG_ON(!mm);
586
587 /* If a CPU which we ran on has gone down, OK. */
588 cpus_and(cpumask, cpumask, cpu_online_map);
589 if (cpus_empty(cpumask))
590 return;
591
592 mcs = xen_mc_entry(sizeof(*args));
593 args = mcs.args;
594 args->mask = cpumask;
595 args->op.arg2.vcpumask = &args->mask;
596
597 if (va == TLB_FLUSH_ALL) {
598 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
599 } else {
600 args->op.cmd = MMUEXT_INVLPG_MULTI;
601 args->op.arg1.linear_addr = va;
602 }
603
604 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
605
606 xen_mc_issue(PARAVIRT_LAZY_MMU);
607}
608
609static void xen_write_cr2(unsigned long cr2)
610{
611 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
612}
613
614static unsigned long xen_read_cr2(void)
615{
616 return x86_read_percpu(xen_vcpu)->arch.cr2;
617}
618
619static unsigned long xen_read_cr2_direct(void)
620{
621 return x86_read_percpu(xen_vcpu_info.arch.cr2);
622}
623
624static void xen_write_cr4(unsigned long cr4)
625{
626 /* never allow TSC to be disabled */
627 native_write_cr4(cr4 & ~X86_CR4_TSD);
628}
629
630static unsigned long xen_read_cr3(void)
631{
632 return x86_read_percpu(xen_cr3);
633}
634
635static void xen_write_cr3(unsigned long cr3)
636{
637 BUG_ON(preemptible());
638
639 if (cr3 == x86_read_percpu(xen_cr3)) {
640 /* just a simple tlb flush */
641 xen_flush_tlb();
642 return;
643 }
644
645 x86_write_percpu(xen_cr3, cr3);
646
647
648 {
649 struct mmuext_op *op;
650 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
651 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
652
653 op = mcs.args;
654 op->cmd = MMUEXT_NEW_BASEPTR;
655 op->arg1.mfn = mfn;
656
657 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
658
659 xen_mc_issue(PARAVIRT_LAZY_CPU);
660 }
661}
662
663/* Early in boot, while setting up the initial pagetable, assume
664 everything is pinned. */
665static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
666{
667 BUG_ON(mem_map); /* should only be used early */
668 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
669}
670
671/* This needs to make sure the new pte page is pinned iff its being
672 attached to a pinned pagetable. */
673static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
674{
675 struct page *page = pfn_to_page(pfn);
676
677 if (PagePinned(virt_to_page(mm->pgd))) {
678 SetPagePinned(page);
679
680 if (!PageHighMem(page))
681 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
682 else
683 /* make sure there are no stray mappings of
684 this page */
685 kmap_flush_unused();
686 }
687}
688
689/* This should never happen until we're OK to use struct page */
690static void xen_release_pt(u32 pfn)
691{
692 struct page *page = pfn_to_page(pfn);
693
694 if (PagePinned(page)) {
695 if (!PageHighMem(page))
696 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
697 }
698}
699
700#ifdef CONFIG_HIGHPTE
701static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
702{
703 pgprot_t prot = PAGE_KERNEL;
704
705 if (PagePinned(page))
706 prot = PAGE_KERNEL_RO;
707
708 if (0 && PageHighMem(page))
709 printk("mapping highpte %lx type %d prot %s\n",
710 page_to_pfn(page), type,
711 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
712
713 return kmap_atomic_prot(page, type, prot);
714}
715#endif
716
717static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
718{
719 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
720 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
721 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
722 pte_val_ma(pte));
723
724 return pte;
725}
726
727/* Init-time set_pte while constructing initial pagetables, which
728 doesn't allow RO pagetable pages to be remapped RW */
729static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
730{
731 pte = mask_rw_pte(ptep, pte);
732
733 xen_set_pte(ptep, pte);
734}
735
736static __init void xen_pagetable_setup_start(pgd_t *base)
737{
738 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
739
740 /* special set_pte for pagetable initialization */
741 paravirt_ops.set_pte = xen_set_pte_init;
742
743 init_mm.pgd = base;
744 /*
745 * copy top-level of Xen-supplied pagetable into place. For
746 * !PAE we can use this as-is, but for PAE it is a stand-in
747 * while we copy the pmd pages.
748 */
749 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
750
751 if (PTRS_PER_PMD > 1) {
752 int i;
753 /*
754 * For PAE, need to allocate new pmds, rather than
755 * share Xen's, since Xen doesn't like pmd's being
756 * shared between address spaces.
757 */
758 for (i = 0; i < PTRS_PER_PGD; i++) {
759 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
760 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
761
762 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
763 PAGE_SIZE);
764
765 make_lowmem_page_readonly(pmd);
766
767 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
768 } else
769 pgd_clear(&base[i]);
770 }
771 }
772
773 /* make sure zero_page is mapped RO so we can use it in pagetables */
774 make_lowmem_page_readonly(empty_zero_page);
775 make_lowmem_page_readonly(base);
776 /*
777 * Switch to new pagetable. This is done before
778 * pagetable_init has done anything so that the new pages
779 * added to the table can be prepared properly for Xen.
780 */
781 xen_write_cr3(__pa(base));
782}
783
784static __init void xen_pagetable_setup_done(pgd_t *base)
785{
786 /* This will work as long as patching hasn't happened yet
787 (which it hasn't) */
788 paravirt_ops.alloc_pt = xen_alloc_pt;
789 paravirt_ops.set_pte = xen_set_pte;
790
791 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
792 /*
793 * Create a mapping for the shared info page.
794 * Should be set_fixmap(), but shared_info is a machine
795 * address with no corresponding pseudo-phys address.
796 */
797 set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
798 PFN_DOWN(xen_start_info->shared_info),
799 PAGE_KERNEL);
800
801 HYPERVISOR_shared_info =
802 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
803
804 } else
805 HYPERVISOR_shared_info =
806 (struct shared_info *)__va(xen_start_info->shared_info);
807
808 /* Actually pin the pagetable down, but we can't set PG_pinned
809 yet because the page structures don't exist yet. */
810 {
811 struct mmuext_op op;
812#ifdef CONFIG_X86_PAE
813 op.cmd = MMUEXT_PIN_L3_TABLE;
814#else
815 op.cmd = MMUEXT_PIN_L3_TABLE;
816#endif
817 op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
818 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
819 BUG();
820 }
821}
822
823/* This is called once we have the cpu_possible_map */
824void __init xen_setup_vcpu_info_placement(void)
825{
826 int cpu;
827
828 for_each_possible_cpu(cpu)
829 xen_vcpu_setup(cpu);
830
831 /* xen_vcpu_setup managed to place the vcpu_info within the
832 percpu area for all cpus, so make use of it */
833 if (have_vcpu_info_placement) {
834 printk(KERN_INFO "Xen: using vcpu_info placement\n");
835
836 paravirt_ops.save_fl = xen_save_fl_direct;
837 paravirt_ops.restore_fl = xen_restore_fl_direct;
838 paravirt_ops.irq_disable = xen_irq_disable_direct;
839 paravirt_ops.irq_enable = xen_irq_enable_direct;
840 paravirt_ops.read_cr2 = xen_read_cr2_direct;
841 paravirt_ops.iret = xen_iret_direct;
842 }
843}
844
845static unsigned xen_patch(u8 type, u16 clobbers, void *insns, unsigned len)
846{
847 char *start, *end, *reloc;
848 unsigned ret;
849
850 start = end = reloc = NULL;
851
852#define SITE(x) \
853 case PARAVIRT_PATCH(x): \
854 if (have_vcpu_info_placement) { \
855 start = (char *)xen_##x##_direct; \
856 end = xen_##x##_direct_end; \
857 reloc = xen_##x##_direct_reloc; \
858 } \
859 goto patch_site
860
861 switch (type) {
862 SITE(irq_enable);
863 SITE(irq_disable);
864 SITE(save_fl);
865 SITE(restore_fl);
866#undef SITE
867
868 patch_site:
869 if (start == NULL || (end-start) > len)
870 goto default_patch;
871
872 ret = paravirt_patch_insns(insns, len, start, end);
873
874 /* Note: because reloc is assigned from something that
875 appears to be an array, gcc assumes it's non-null,
876 but doesn't know its relationship with start and
877 end. */
878 if (reloc > start && reloc < end) {
879 int reloc_off = reloc - start;
880 long *relocp = (long *)(insns + reloc_off);
881 long delta = start - (char *)insns;
882
883 *relocp += delta;
884 }
885 break;
886
887 default_patch:
888 default:
889 ret = paravirt_patch_default(type, clobbers, insns, len);
890 break;
891 }
892
893 return ret;
894}
895
896static const struct paravirt_ops xen_paravirt_ops __initdata = {
897 .paravirt_enabled = 1,
898 .shared_kernel_pmd = 0,
899
900 .name = "Xen",
901 .banner = xen_banner,
902
903 .patch = xen_patch,
904
905 .memory_setup = xen_memory_setup,
906 .arch_setup = xen_arch_setup,
907 .init_IRQ = xen_init_IRQ,
908 .post_allocator_init = xen_mark_init_mm_pinned,
909
910 .time_init = xen_time_init,
911 .set_wallclock = xen_set_wallclock,
912 .get_wallclock = xen_get_wallclock,
913 .get_cpu_khz = xen_cpu_khz,
914 .sched_clock = xen_sched_clock,
915
916 .cpuid = xen_cpuid,
917
918 .set_debugreg = xen_set_debugreg,
919 .get_debugreg = xen_get_debugreg,
920
921 .clts = native_clts,
922
923 .read_cr0 = native_read_cr0,
924 .write_cr0 = native_write_cr0,
925
926 .read_cr2 = xen_read_cr2,
927 .write_cr2 = xen_write_cr2,
928
929 .read_cr3 = xen_read_cr3,
930 .write_cr3 = xen_write_cr3,
931
932 .read_cr4 = native_read_cr4,
933 .read_cr4_safe = native_read_cr4_safe,
934 .write_cr4 = xen_write_cr4,
935
936 .save_fl = xen_save_fl,
937 .restore_fl = xen_restore_fl,
938 .irq_disable = xen_irq_disable,
939 .irq_enable = xen_irq_enable,
940 .safe_halt = xen_safe_halt,
941 .halt = xen_halt,
942 .wbinvd = native_wbinvd,
943
944 .read_msr = native_read_msr_safe,
945 .write_msr = native_write_msr_safe,
946 .read_tsc = native_read_tsc,
947 .read_pmc = native_read_pmc,
948
949 .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
950 .irq_enable_sysexit = NULL, /* never called */
951
952 .load_tr_desc = paravirt_nop,
953 .set_ldt = xen_set_ldt,
954 .load_gdt = xen_load_gdt,
955 .load_idt = xen_load_idt,
956 .load_tls = xen_load_tls,
957
958 .store_gdt = native_store_gdt,
959 .store_idt = native_store_idt,
960 .store_tr = xen_store_tr,
961
962 .write_ldt_entry = xen_write_ldt_entry,
963 .write_gdt_entry = xen_write_gdt_entry,
964 .write_idt_entry = xen_write_idt_entry,
965 .load_esp0 = xen_load_esp0,
966
967 .set_iopl_mask = xen_set_iopl_mask,
968 .io_delay = xen_io_delay,
969
970#ifdef CONFIG_X86_LOCAL_APIC
971 .apic_write = xen_apic_write,
972 .apic_write_atomic = xen_apic_write,
973 .apic_read = xen_apic_read,
974 .setup_boot_clock = paravirt_nop,
975 .setup_secondary_clock = paravirt_nop,
976 .startup_ipi_hook = paravirt_nop,
977#endif
978
979 .flush_tlb_user = xen_flush_tlb,
980 .flush_tlb_kernel = xen_flush_tlb,
981 .flush_tlb_single = xen_flush_tlb_single,
982 .flush_tlb_others = xen_flush_tlb_others,
983
984 .pte_update = paravirt_nop,
985 .pte_update_defer = paravirt_nop,
986
987 .pagetable_setup_start = xen_pagetable_setup_start,
988 .pagetable_setup_done = xen_pagetable_setup_done,
989
990 .alloc_pt = xen_alloc_pt_init,
991 .release_pt = xen_release_pt,
992 .alloc_pd = paravirt_nop,
993 .alloc_pd_clone = paravirt_nop,
994 .release_pd = paravirt_nop,
995
996#ifdef CONFIG_HIGHPTE
997 .kmap_atomic_pte = xen_kmap_atomic_pte,
998#endif
999
1000 .set_pte = NULL, /* see xen_pagetable_setup_* */
1001 .set_pte_at = xen_set_pte_at,
1002 .set_pmd = xen_set_pmd,
1003
1004 .pte_val = xen_pte_val,
1005 .pgd_val = xen_pgd_val,
1006
1007 .make_pte = xen_make_pte,
1008 .make_pgd = xen_make_pgd,
1009
1010#ifdef CONFIG_X86_PAE
1011 .set_pte_atomic = xen_set_pte_atomic,
1012 .set_pte_present = xen_set_pte_at,
1013 .set_pud = xen_set_pud,
1014 .pte_clear = xen_pte_clear,
1015 .pmd_clear = xen_pmd_clear,
1016
1017 .make_pmd = xen_make_pmd,
1018 .pmd_val = xen_pmd_val,
1019#endif /* PAE */
1020
1021 .activate_mm = xen_activate_mm,
1022 .dup_mmap = xen_dup_mmap,
1023 .exit_mmap = xen_exit_mmap,
1024
1025 .set_lazy_mode = xen_set_lazy_mode,
1026};
1027
1028#ifdef CONFIG_SMP
1029static const struct smp_ops xen_smp_ops __initdata = {
1030 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1031 .smp_prepare_cpus = xen_smp_prepare_cpus,
1032 .cpu_up = xen_cpu_up,
1033 .smp_cpus_done = xen_smp_cpus_done,
1034
1035 .smp_send_stop = xen_smp_send_stop,
1036 .smp_send_reschedule = xen_smp_send_reschedule,
1037 .smp_call_function_mask = xen_smp_call_function_mask,
1038};
1039#endif /* CONFIG_SMP */
1040
1041static void xen_reboot(int reason)
1042{
1043#ifdef CONFIG_SMP
1044 smp_send_stop();
1045#endif
1046
1047 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
1048 BUG();
1049}
1050
1051static void xen_restart(char *msg)
1052{
1053 xen_reboot(SHUTDOWN_reboot);
1054}
1055
1056static void xen_emergency_restart(void)
1057{
1058 xen_reboot(SHUTDOWN_reboot);
1059}
1060
1061static void xen_machine_halt(void)
1062{
1063 xen_reboot(SHUTDOWN_poweroff);
1064}
1065
1066static void xen_crash_shutdown(struct pt_regs *regs)
1067{
1068 xen_reboot(SHUTDOWN_crash);
1069}
1070
1071static const struct machine_ops __initdata xen_machine_ops = {
1072 .restart = xen_restart,
1073 .halt = xen_machine_halt,
1074 .power_off = xen_machine_halt,
1075 .shutdown = xen_machine_halt,
1076 .crash_shutdown = xen_crash_shutdown,
1077 .emergency_restart = xen_emergency_restart,
1078};
1079
1080
1081/* First C function to be called on Xen boot */
1082asmlinkage void __init xen_start_kernel(void)
1083{
1084 pgd_t *pgd;
1085
1086 if (!xen_start_info)
1087 return;
1088
1089 BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
1090
1091 /* Install Xen paravirt ops */
1092 paravirt_ops = xen_paravirt_ops;
1093 machine_ops = xen_machine_ops;
1094
1095#ifdef CONFIG_SMP
1096 smp_ops = xen_smp_ops;
1097#endif
1098
1099 xen_setup_features();
1100
1101 /* Get mfn list */
1102 if (!xen_feature(XENFEAT_auto_translated_physmap))
1103 phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
1104
1105 pgd = (pgd_t *)xen_start_info->pt_base;
1106
1107 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1108
1109 init_mm.pgd = pgd; /* use the Xen pagetables to start */
1110
1111 /* keep using Xen gdt for now; no urgent need to change it */
1112
1113 x86_write_percpu(xen_cr3, __pa(pgd));
1114
1115#ifdef CONFIG_SMP
1116 /* Don't do the full vcpu_info placement stuff until we have a
1117 possible map. */
1118 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1119#else
1120 /* May as well do it now, since there's no good time to call
1121 it later on UP. */
1122 xen_setup_vcpu_info_placement();
1123#endif
1124
1125 paravirt_ops.kernel_rpl = 1;
1126 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1127 paravirt_ops.kernel_rpl = 0;
1128
1129 /* set the limit of our address space */
1130 reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
1131
1132 /* set up basic CPUID stuff */
1133 cpu_detect(&new_cpu_data);
1134 new_cpu_data.hard_math = 1;
1135 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1136
1137 /* Poke various useful things into boot_params */
1138 LOADER_TYPE = (9 << 4) | 0;
1139 INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
1140 INITRD_SIZE = xen_start_info->mod_len;
1141
1142 /* Start the world */
1143 start_kernel();
1144}
diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c
new file mode 100644
index 000000000000..8904acc20f8c
--- /dev/null
+++ b/arch/i386/xen/events.c
@@ -0,0 +1,590 @@
1/*
2 * Xen event channels
3 *
4 * Xen models interrupts with abstract event channels. Because each
5 * domain gets 1024 event channels, but NR_IRQS is not that large, we
6 * must dynamically map irqs<->event channels. The event channels
7 * interface with the rest of the kernel by defining a xen interrupt
8 * chip. When an event is received, it is mapped to an irq and sent
9 * through the normal interrupt processing path.
10 *
11 * There are four kinds of events which can be mapped to an event
12 * channel:
13 *
14 * 1. Inter-domain notifications. This includes all the virtual
15 * device events, since they're driven by front-ends in another domain
16 * (typically dom0).
17 * 2. VIRQs, typically used for timers. These are per-cpu events.
18 * 3. IPIs.
19 * 4. Hardware interrupts. Not supported at present.
20 *
21 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
22 */
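/*
 * A minimal usage sketch of the binding interface defined below, for a
 * hypothetical frontend driver that has learned its event-channel port
 * from the backend.  The port, handler and device names are illustrative
 * assumptions, not symbols defined in this patch:
 *
 *	static irqreturn_t foo_interrupt(int irq, void *dev_id)
 *	{
 *		struct foo_device *foo = dev_id;
 *		// handle the notification from the remote domain
 *		return IRQ_HANDLED;
 *	}
 *
 *	// returns the bound irq on success, or a negative errno
 *	irq = bind_evtchn_to_irqhandler(evtchn, foo_interrupt, 0, "foo", foo);
 *	...
 *	// tear down: frees the irq and closes the event channel
 *	unbind_from_irqhandler(irq, foo);
 */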
23
24#include <linux/linkage.h>
25#include <linux/interrupt.h>
26#include <linux/irq.h>
27#include <linux/module.h>
28#include <linux/string.h>
29
30#include <asm/ptrace.h>
31#include <asm/irq.h>
32#include <asm/sync_bitops.h>
33#include <asm/xen/hypercall.h>
34
35#include <xen/events.h>
36#include <xen/interface/xen.h>
37#include <xen/interface/event_channel.h>
38
39#include "xen-ops.h"
40
41/*
42 * This lock protects updates to the following mapping and reference-count
43 * arrays. The lock does not need to be acquired to read the mapping tables.
44 */
45static DEFINE_SPINLOCK(irq_mapping_update_lock);
46
47/* IRQ <-> VIRQ mapping. */
48static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
49
50/* IRQ <-> IPI mapping */
51static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
52
53/* Packed IRQ information: binding type, sub-type index, and event channel. */
54struct packed_irq
55{
56 unsigned short evtchn;
57 unsigned char index;
58 unsigned char type;
59};
60
61static struct packed_irq irq_info[NR_IRQS];
62
63/* Binding types. */
64enum {
65 IRQT_UNBOUND,
66 IRQT_PIRQ,
67 IRQT_VIRQ,
68 IRQT_IPI,
69 IRQT_EVTCHN
70};
71
72/* Convenient shorthand for packed representation of an unbound IRQ. */
73#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
74
75static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
76 [0 ... NR_EVENT_CHANNELS-1] = -1
77};
78static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
79static u8 cpu_evtchn[NR_EVENT_CHANNELS];
80
81/* Reference counts for bindings to IRQs. */
82static int irq_bindcount[NR_IRQS];
83
84/* Xen will never allocate port zero for any purpose. */
85#define VALID_EVTCHN(chn) ((chn) != 0)
86
87/*
88 * Force a proper event-channel callback from Xen after clearing the
89 * callback mask. We do this in a very simple manner, by making a call
90 * down into Xen. The pending flag will be checked by Xen on return.
91 */
92void force_evtchn_callback(void)
93{
94 (void)HYPERVISOR_xen_version(0, NULL);
95}
96EXPORT_SYMBOL_GPL(force_evtchn_callback);
97
98static struct irq_chip xen_dynamic_chip;
99
100/* Constructor for packed IRQ information. */
101static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
102{
103 return (struct packed_irq) { evtchn, index, type };
104}
105
106/*
107 * Accessors for packed IRQ information.
108 */
109static inline unsigned int evtchn_from_irq(int irq)
110{
111 return irq_info[irq].evtchn;
112}
113
114static inline unsigned int index_from_irq(int irq)
115{
116 return irq_info[irq].index;
117}
118
119static inline unsigned int type_from_irq(int irq)
120{
121 return irq_info[irq].type;
122}
123
124static inline unsigned long active_evtchns(unsigned int cpu,
125 struct shared_info *sh,
126 unsigned int idx)
127{
128 return (sh->evtchn_pending[idx] &
129 cpu_evtchn_mask[cpu][idx] &
130 ~sh->evtchn_mask[idx]);
131}
132
133static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
134{
135 int irq = evtchn_to_irq[chn];
136
137 BUG_ON(irq == -1);
138#ifdef CONFIG_SMP
139 irq_desc[irq].affinity = cpumask_of_cpu(cpu);
140#endif
141
142 __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
143 __set_bit(chn, cpu_evtchn_mask[cpu]);
144
145 cpu_evtchn[chn] = cpu;
146}
147
148static void init_evtchn_cpu_bindings(void)
149{
150#ifdef CONFIG_SMP
151 int i;
152 /* By default all event channels notify CPU#0. */
153 for (i = 0; i < NR_IRQS; i++)
154 irq_desc[i].affinity = cpumask_of_cpu(0);
155#endif
156
157 memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
158 memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
159}
160
161static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
162{
163 return cpu_evtchn[evtchn];
164}
165
166static inline void clear_evtchn(int port)
167{
168 struct shared_info *s = HYPERVISOR_shared_info;
169 sync_clear_bit(port, &s->evtchn_pending[0]);
170}
171
172static inline void set_evtchn(int port)
173{
174 struct shared_info *s = HYPERVISOR_shared_info;
175 sync_set_bit(port, &s->evtchn_pending[0]);
176}
177
178
179/**
180 * notify_remote_via_irq - send event to remote end of event channel via irq
181 * @irq: irq of event channel to send event to
182 *
183 * Unlike notify_remote_via_evtchn(), this is safe to use across
184 * save/restore. Notifications on a broken connection are silently
185 * dropped.
186 */
187void notify_remote_via_irq(int irq)
188{
189 int evtchn = evtchn_from_irq(irq);
190
191 if (VALID_EVTCHN(evtchn))
192 notify_remote_via_evtchn(evtchn);
193}
194EXPORT_SYMBOL_GPL(notify_remote_via_irq);
195
196static void mask_evtchn(int port)
197{
198 struct shared_info *s = HYPERVISOR_shared_info;
199 sync_set_bit(port, &s->evtchn_mask[0]);
200}
201
202static void unmask_evtchn(int port)
203{
204 struct shared_info *s = HYPERVISOR_shared_info;
205 unsigned int cpu = get_cpu();
206
207 BUG_ON(!irqs_disabled());
208
209 /* Slow path (hypercall) if this is a non-local port. */
210 if (unlikely(cpu != cpu_from_evtchn(port))) {
211 struct evtchn_unmask unmask = { .port = port };
212 (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
213 } else {
214 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
215
216 sync_clear_bit(port, &s->evtchn_mask[0]);
217
218 /*
219 * The following is basically the equivalent of
220 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
221 * the interrupt edge' if the channel is masked.
222 */
223 if (sync_test_bit(port, &s->evtchn_pending[0]) &&
224 !sync_test_and_set_bit(port / BITS_PER_LONG,
225 &vcpu_info->evtchn_pending_sel))
226 vcpu_info->evtchn_upcall_pending = 1;
227 }
228
229 put_cpu();
230}
231
232static int find_unbound_irq(void)
233{
234 int irq;
235
236 /* Only allocate from dynirq range */
237 for (irq = 0; irq < NR_IRQS; irq++)
238 if (irq_bindcount[irq] == 0)
239 break;
240
241 if (irq == NR_IRQS)
242 panic("No available IRQ to bind to: increase NR_IRQS!\n");
243
244 return irq;
245}
246
247int bind_evtchn_to_irq(unsigned int evtchn)
248{
249 int irq;
250
251 spin_lock(&irq_mapping_update_lock);
252
253 irq = evtchn_to_irq[evtchn];
254
255 if (irq == -1) {
256 irq = find_unbound_irq();
257
258 dynamic_irq_init(irq);
259 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
260 handle_level_irq, "event");
261
262 evtchn_to_irq[evtchn] = irq;
263 irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
264 }
265
266 irq_bindcount[irq]++;
267
268 spin_unlock(&irq_mapping_update_lock);
269
270 return irq;
271}
272EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
273
274static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
275{
276 struct evtchn_bind_ipi bind_ipi;
277 int evtchn, irq;
278
279 spin_lock(&irq_mapping_update_lock);
280
281 irq = per_cpu(ipi_to_irq, cpu)[ipi];
282 if (irq == -1) {
283 irq = find_unbound_irq();
284 if (irq < 0)
285 goto out;
286
287 dynamic_irq_init(irq);
288 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
289 handle_level_irq, "ipi");
290
291 bind_ipi.vcpu = cpu;
292 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
293 &bind_ipi) != 0)
294 BUG();
295 evtchn = bind_ipi.port;
296
297 evtchn_to_irq[evtchn] = irq;
298 irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
299
300 per_cpu(ipi_to_irq, cpu)[ipi] = irq;
301
302 bind_evtchn_to_cpu(evtchn, cpu);
303 }
304
305 irq_bindcount[irq]++;
306
307 out:
308 spin_unlock(&irq_mapping_update_lock);
309 return irq;
310}
311
312
313static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
314{
315 struct evtchn_bind_virq bind_virq;
316 int evtchn, irq;
317
318 spin_lock(&irq_mapping_update_lock);
319
320 irq = per_cpu(virq_to_irq, cpu)[virq];
321
322 if (irq == -1) {
323 bind_virq.virq = virq;
324 bind_virq.vcpu = cpu;
325 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
326 &bind_virq) != 0)
327 BUG();
328 evtchn = bind_virq.port;
329
330 irq = find_unbound_irq();
331
332 dynamic_irq_init(irq);
333 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
334 handle_level_irq, "virq");
335
336 evtchn_to_irq[evtchn] = irq;
337 irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
338
339 per_cpu(virq_to_irq, cpu)[virq] = irq;
340
341 bind_evtchn_to_cpu(evtchn, cpu);
342 }
343
344 irq_bindcount[irq]++;
345
346 spin_unlock(&irq_mapping_update_lock);
347
348 return irq;
349}
350
351static void unbind_from_irq(unsigned int irq)
352{
353 struct evtchn_close close;
354 int evtchn = evtchn_from_irq(irq);
355
356 spin_lock(&irq_mapping_update_lock);
357
358 if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
359 close.port = evtchn;
360 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
361 BUG();
362
363 switch (type_from_irq(irq)) {
364 case IRQT_VIRQ:
365 per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
366 [index_from_irq(irq)] = -1;
367 break;
368 default:
369 break;
370 }
371
372 /* Closed ports are implicitly re-bound to VCPU0. */
373 bind_evtchn_to_cpu(evtchn, 0);
374
375 evtchn_to_irq[evtchn] = -1;
376 irq_info[irq] = IRQ_UNBOUND;
377
378 dynamic_irq_init(irq);
379 }
380
381 spin_unlock(&irq_mapping_update_lock);
382}
383
384int bind_evtchn_to_irqhandler(unsigned int evtchn,
385 irqreturn_t (*handler)(int, void *),
386 unsigned long irqflags,
387 const char *devname, void *dev_id)
388{
389 unsigned int irq;
390 int retval;
391
392 irq = bind_evtchn_to_irq(evtchn);
393 retval = request_irq(irq, handler, irqflags, devname, dev_id);
394 if (retval != 0) {
395 unbind_from_irq(irq);
396 return retval;
397 }
398
399 return irq;
400}
401EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
402
403int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
404 irqreturn_t (*handler)(int, void *),
405 unsigned long irqflags, const char *devname, void *dev_id)
406{
407 unsigned int irq;
408 int retval;
409
410 irq = bind_virq_to_irq(virq, cpu);
411 retval = request_irq(irq, handler, irqflags, devname, dev_id);
412 if (retval != 0) {
413 unbind_from_irq(irq);
414 return retval;
415 }
416
417 return irq;
418}
419EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
420
421int bind_ipi_to_irqhandler(enum ipi_vector ipi,
422 unsigned int cpu,
423 irq_handler_t handler,
424 unsigned long irqflags,
425 const char *devname,
426 void *dev_id)
427{
428 int irq, retval;
429
430 irq = bind_ipi_to_irq(ipi, cpu);
431 if (irq < 0)
432 return irq;
433
434 retval = request_irq(irq, handler, irqflags, devname, dev_id);
435 if (retval != 0) {
436 unbind_from_irq(irq);
437 return retval;
438 }
439
440 return irq;
441}
442
443void unbind_from_irqhandler(unsigned int irq, void *dev_id)
444{
445 free_irq(irq, dev_id);
446 unbind_from_irq(irq);
447}
448EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
449
450void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
451{
452 int irq = per_cpu(ipi_to_irq, cpu)[vector];
453 BUG_ON(irq < 0);
454 notify_remote_via_irq(irq);
455}
456
457
458/*
459 * Search the CPU's pending-events bitmasks. For each event found, map
460 * the event number to an irq, and feed it into do_IRQ() for
461 * handling.
462 *
463 * Xen uses a two-level bitmap to speed searching. The first level is
464 * a bitset of words which contain pending event bits. The second
465 * level is a bitset of pending events themselves.
466 */
467fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
468{
469 int cpu = get_cpu();
470 struct shared_info *s = HYPERVISOR_shared_info;
471 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
472 unsigned long pending_words;
473
474 vcpu_info->evtchn_upcall_pending = 0;
475
476 /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
477 pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
478 while (pending_words != 0) {
479 unsigned long pending_bits;
480 int word_idx = __ffs(pending_words);
481 pending_words &= ~(1UL << word_idx);
482
483 while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
484 int bit_idx = __ffs(pending_bits);
485 int port = (word_idx * BITS_PER_LONG) + bit_idx;
486 int irq = evtchn_to_irq[port];
487
488 if (irq != -1) {
489 regs->orig_eax = ~irq;
490 do_IRQ(regs);
491 }
492 }
493 }
494
495 put_cpu();
496}
497
498/* Rebind an evtchn so that it gets delivered to a specific cpu */
499static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
500{
501 struct evtchn_bind_vcpu bind_vcpu;
502 int evtchn = evtchn_from_irq(irq);
503
504 if (!VALID_EVTCHN(evtchn))
505 return;
506
507 /* Send future instances of this interrupt to other vcpu. */
508 bind_vcpu.port = evtchn;
509 bind_vcpu.vcpu = tcpu;
510
511 /*
512 * If this fails, it usually just indicates that we're dealing with a
513 * virq or IPI channel, which don't actually need to be rebound. Ignore
514 * it, but don't do the xenlinux-level rebind in that case.
515 */
516 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
517 bind_evtchn_to_cpu(evtchn, tcpu);
518}
519
520
521static void set_affinity_irq(unsigned irq, cpumask_t dest)
522{
523 unsigned tcpu = first_cpu(dest);
524 rebind_irq_to_cpu(irq, tcpu);
525}
526
527static void enable_dynirq(unsigned int irq)
528{
529 int evtchn = evtchn_from_irq(irq);
530
531 if (VALID_EVTCHN(evtchn))
532 unmask_evtchn(evtchn);
533}
534
535static void disable_dynirq(unsigned int irq)
536{
537 int evtchn = evtchn_from_irq(irq);
538
539 if (VALID_EVTCHN(evtchn))
540 mask_evtchn(evtchn);
541}
542
543static void ack_dynirq(unsigned int irq)
544{
545 int evtchn = evtchn_from_irq(irq);
546
547 move_native_irq(irq);
548
549 if (VALID_EVTCHN(evtchn))
550 clear_evtchn(evtchn);
551}
552
553static int retrigger_dynirq(unsigned int irq)
554{
555 int evtchn = evtchn_from_irq(irq);
556 int ret = 0;
557
558 if (VALID_EVTCHN(evtchn)) {
559 set_evtchn(evtchn);
560 ret = 1;
561 }
562
563 return ret;
564}
565
566static struct irq_chip xen_dynamic_chip __read_mostly = {
567 .name = "xen-dyn",
568 .mask = disable_dynirq,
569 .unmask = enable_dynirq,
570 .ack = ack_dynirq,
571 .set_affinity = set_affinity_irq,
572 .retrigger = retrigger_dynirq,
573};
574
575void __init xen_init_IRQ(void)
576{
577 int i;
578
579 init_evtchn_cpu_bindings();
580
581 /* No event channels are 'live' right now. */
582 for (i = 0; i < NR_EVENT_CHANNELS; i++)
583 mask_evtchn(i);
584
585 /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
586 for (i = 0; i < NR_IRQS; i++)
587 irq_bindcount[i] = 0;
588
589 irq_ctx_init(smp_processor_id());
590}
diff --git a/arch/i386/xen/features.c b/arch/i386/xen/features.c
new file mode 100644
index 000000000000..0707714e40d6
--- /dev/null
+++ b/arch/i386/xen/features.c
@@ -0,0 +1,29 @@
1/******************************************************************************
2 * features.c
3 *
4 * Xen feature flags.
5 *
6 * Copyright (c) 2006, Ian Campbell, XenSource Inc.
7 */
8#include <linux/types.h>
9#include <linux/cache.h>
10#include <linux/module.h>
11#include <asm/xen/hypervisor.h>
12#include <xen/features.h>
13
14u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
15EXPORT_SYMBOL_GPL(xen_features);
16
17void xen_setup_features(void)
18{
19 struct xen_feature_info fi;
20 int i, j;
21
22 for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
23 fi.submap_idx = i;
24 if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
25 break;
26 for (j = 0; j < 32; j++)
27 xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
28 }
29}
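/*
 * A minimal sketch of how callers query the map filled in above, via the
 * xen_feature() accessor (used throughout this patch; declared in
 * <xen/features.h>):
 *
 *	if (xen_feature(XENFEAT_auto_translated_physmap))
 *		// Xen translates pfns for us; no p2m handling needed
 *		return;
 */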
diff --git a/arch/i386/xen/manage.c b/arch/i386/xen/manage.c
new file mode 100644
index 000000000000..aa7af9e6abc0
--- /dev/null
+++ b/arch/i386/xen/manage.c
@@ -0,0 +1,143 @@
1/*
2 * Handle external requests for shutdown, reboot and sysrq
3 */
4#include <linux/kernel.h>
5#include <linux/err.h>
6#include <linux/reboot.h>
7#include <linux/sysrq.h>
8
9#include <xen/xenbus.h>
10
11#define SHUTDOWN_INVALID -1
12#define SHUTDOWN_POWEROFF 0
13#define SHUTDOWN_SUSPEND 2
14/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
15 * report a crash, not be instructed to crash!
16 * HALT is the same as POWEROFF, as far as we're concerned. The tools use
17 * the distinction when we return the reason code to them.
18 */
19#define SHUTDOWN_HALT 4
20
21/* Ignore multiple shutdown requests. */
22static int shutting_down = SHUTDOWN_INVALID;
23
24static void shutdown_handler(struct xenbus_watch *watch,
25 const char **vec, unsigned int len)
26{
27 char *str;
28 struct xenbus_transaction xbt;
29 int err;
30
31 if (shutting_down != SHUTDOWN_INVALID)
32 return;
33
34 again:
35 err = xenbus_transaction_start(&xbt);
36 if (err)
37 return;
38
39 str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
40 /* Ignore read errors and empty reads. */
41 if (XENBUS_IS_ERR_READ(str)) {
42 xenbus_transaction_end(xbt, 1);
43 return;
44 }
45
46 xenbus_write(xbt, "control", "shutdown", "");
47
48 err = xenbus_transaction_end(xbt, 0);
49 if (err == -EAGAIN) {
50 kfree(str);
51 goto again;
52 }
53
54 if (strcmp(str, "poweroff") == 0 ||
55 strcmp(str, "halt") == 0)
56 orderly_poweroff(false);
57 else if (strcmp(str, "reboot") == 0)
58 ctrl_alt_del();
59 else {
60 printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
61 shutting_down = SHUTDOWN_INVALID;
62 }
63
64 kfree(str);
65}
66
67static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
68 unsigned int len)
69{
70 char sysrq_key = '\0';
71 struct xenbus_transaction xbt;
72 int err;
73
74 again:
75 err = xenbus_transaction_start(&xbt);
76 if (err)
77 return;
78 if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
79 printk(KERN_ERR "Unable to read sysrq code in "
80 "control/sysrq\n");
81 xenbus_transaction_end(xbt, 1);
82 return;
83 }
84
85 if (sysrq_key != '\0')
86 xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
87
88 err = xenbus_transaction_end(xbt, 0);
89 if (err == -EAGAIN)
90 goto again;
91
92 if (sysrq_key != '\0')
93 handle_sysrq(sysrq_key, NULL);
94}
95
96static struct xenbus_watch shutdown_watch = {
97 .node = "control/shutdown",
98 .callback = shutdown_handler
99};
100
101static struct xenbus_watch sysrq_watch = {
102 .node = "control/sysrq",
103 .callback = sysrq_handler
104};
105
106static int setup_shutdown_watcher(void)
107{
108 int err;
109
110 err = register_xenbus_watch(&shutdown_watch);
111 if (err) {
112 printk(KERN_ERR "Failed to set shutdown watcher\n");
113 return err;
114 }
115
116 err = register_xenbus_watch(&sysrq_watch);
117 if (err) {
118 printk(KERN_ERR "Failed to set sysrq watcher\n");
119 return err;
120 }
121
122 return 0;
123}
124
125static int shutdown_event(struct notifier_block *notifier,
126 unsigned long event,
127 void *data)
128{
129 setup_shutdown_watcher();
130 return NOTIFY_DONE;
131}
132
133static int __init setup_shutdown_event(void)
134{
135 static struct notifier_block xenstore_notifier = {
136 .notifier_call = shutdown_event
137 };
138 register_xenstore_notifier(&xenstore_notifier);
139
140 return 0;
141}
142
143subsys_initcall(setup_shutdown_event);
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
new file mode 100644
index 000000000000..4ae038aa6c24
--- /dev/null
+++ b/arch/i386/xen/mmu.c
@@ -0,0 +1,564 @@
1/*
2 * Xen mmu operations
3 *
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfns and the machine's mfns.
7 *
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
12 *
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
23 *
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */
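/*
 * A conceptual sketch of the pfn<->mfn conversion described above, for
 * the usual non-auto-translated case where the p2m table handed to us by
 * Xen (phys_to_machine_mapping, taken from the start info's mfn_list) is
 * a flat array indexed by pfn:
 *
 *	unsigned long mfn = pfn_to_mfn(pfn);	// roughly phys_to_machine_mapping[pfn]
 *	pte_t pte = mfn_pte(mfn, PAGE_KERNEL);	// the mfn goes straight into the pte
 *
 * Reading an entry back with xen_pte_val() and friends performs the
 * reverse machine-to-physical lookup, as the helpers below show.
 */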
41#include <linux/sched.h>
42#include <linux/highmem.h>
43#include <linux/bug.h>
44#include <linux/sched.h>
45
46#include <asm/pgtable.h>
47#include <asm/tlbflush.h>
48#include <asm/mmu_context.h>
49#include <asm/paravirt.h>
50
51#include <asm/xen/hypercall.h>
52#include <asm/xen/hypervisor.h>
53
54#include <xen/page.h>
55#include <xen/interface/xen.h>
56
57#include "multicalls.h"
58#include "mmu.h"
59
60xmaddr_t arbitrary_virt_to_machine(unsigned long address)
61{
62 pte_t *pte = lookup_address(address);
63	unsigned offset = address & ~PAGE_MASK;	/* offset within the page */
64
65 BUG_ON(pte == NULL);
66
67 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
68}
69
70void make_lowmem_page_readonly(void *vaddr)
71{
72 pte_t *pte, ptev;
73 unsigned long address = (unsigned long)vaddr;
74
75 pte = lookup_address(address);
76 BUG_ON(pte == NULL);
77
78 ptev = pte_wrprotect(*pte);
79
80 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
81 BUG();
82}
83
84void make_lowmem_page_readwrite(void *vaddr)
85{
86 pte_t *pte, ptev;
87 unsigned long address = (unsigned long)vaddr;
88
89 pte = lookup_address(address);
90 BUG_ON(pte == NULL);
91
92 ptev = pte_mkwrite(*pte);
93
94 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
95 BUG();
96}
97
98
99void xen_set_pmd(pmd_t *ptr, pmd_t val)
100{
101 struct multicall_space mcs;
102 struct mmu_update *u;
103
104 preempt_disable();
105
106 mcs = xen_mc_entry(sizeof(*u));
107 u = mcs.args;
108 u->ptr = virt_to_machine(ptr).maddr;
109 u->val = pmd_val_ma(val);
110 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
111
112 xen_mc_issue(PARAVIRT_LAZY_MMU);
113
114 preempt_enable();
115}
116
117/*
118 * Associate a virtual page frame with a given machine page frame
119 * and protection flags for that frame.
120 */
121void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
122{
123 pgd_t *pgd;
124 pud_t *pud;
125 pmd_t *pmd;
126 pte_t *pte;
127
128 pgd = swapper_pg_dir + pgd_index(vaddr);
129 if (pgd_none(*pgd)) {
130 BUG();
131 return;
132 }
133 pud = pud_offset(pgd, vaddr);
134 if (pud_none(*pud)) {
135 BUG();
136 return;
137 }
138 pmd = pmd_offset(pud, vaddr);
139 if (pmd_none(*pmd)) {
140 BUG();
141 return;
142 }
143 pte = pte_offset_kernel(pmd, vaddr);
144 /* <mfn,flags> stored as-is, to permit clearing entries */
145 xen_set_pte(pte, mfn_pte(mfn, flags));
146
147 /*
148 * It's enough to flush this one mapping.
149 * (PGE mappings get flushed as well)
150 */
151 __flush_tlb_one(vaddr);
152}
153
154void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
155 pte_t *ptep, pte_t pteval)
156{
157 if (mm == current->mm || mm == &init_mm) {
158 if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
159 struct multicall_space mcs;
160 mcs = xen_mc_entry(0);
161
162 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
163 xen_mc_issue(PARAVIRT_LAZY_MMU);
164 return;
165 } else
166 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
167 return;
168 }
169 xen_set_pte(ptep, pteval);
170}
171
172#ifdef CONFIG_X86_PAE
173void xen_set_pud(pud_t *ptr, pud_t val)
174{
175 struct multicall_space mcs;
176 struct mmu_update *u;
177
178 preempt_disable();
179
180 mcs = xen_mc_entry(sizeof(*u));
181 u = mcs.args;
182 u->ptr = virt_to_machine(ptr).maddr;
183 u->val = pud_val_ma(val);
184 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
185
186 xen_mc_issue(PARAVIRT_LAZY_MMU);
187
188 preempt_enable();
189}
190
191void xen_set_pte(pte_t *ptep, pte_t pte)
192{
193 ptep->pte_high = pte.pte_high;
194 smp_wmb();
195 ptep->pte_low = pte.pte_low;
196}
197
198void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
199{
200 set_64bit((u64 *)ptep, pte_val_ma(pte));
201}
202
203void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
204{
205 ptep->pte_low = 0;
206 smp_wmb(); /* make sure low gets written first */
207 ptep->pte_high = 0;
208}
209
210void xen_pmd_clear(pmd_t *pmdp)
211{
212 xen_set_pmd(pmdp, __pmd(0));
213}
214
215unsigned long long xen_pte_val(pte_t pte)
216{
217 unsigned long long ret = 0;
218
219 if (pte.pte_low) {
220 ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
221 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
222 }
223
224 return ret;
225}
226
227unsigned long long xen_pmd_val(pmd_t pmd)
228{
229 unsigned long long ret = pmd.pmd;
230 if (ret)
231 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
232 return ret;
233}
234
235unsigned long long xen_pgd_val(pgd_t pgd)
236{
237 unsigned long long ret = pgd.pgd;
238 if (ret)
239 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
240 return ret;
241}
242
243pte_t xen_make_pte(unsigned long long pte)
244{
245 if (pte & 1)
246 pte = phys_to_machine(XPADDR(pte)).maddr;
247
248 return (pte_t){ pte, pte >> 32 };
249}
250
251pmd_t xen_make_pmd(unsigned long long pmd)
252{
253 if (pmd & 1)
254 pmd = phys_to_machine(XPADDR(pmd)).maddr;
255
256 return (pmd_t){ pmd };
257}
258
259pgd_t xen_make_pgd(unsigned long long pgd)
260{
261 if (pgd & _PAGE_PRESENT)
262 pgd = phys_to_machine(XPADDR(pgd)).maddr;
263
264 return (pgd_t){ pgd };
265}
266#else /* !PAE */
267void xen_set_pte(pte_t *ptep, pte_t pte)
268{
269 *ptep = pte;
270}
271
272unsigned long xen_pte_val(pte_t pte)
273{
274 unsigned long ret = pte.pte_low;
275
276 if (ret & _PAGE_PRESENT)
277 ret = machine_to_phys(XMADDR(ret)).paddr;
278
279 return ret;
280}
281
282unsigned long xen_pgd_val(pgd_t pgd)
283{
284 unsigned long ret = pgd.pgd;
285 if (ret)
286 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
287 return ret;
288}
289
290pte_t xen_make_pte(unsigned long pte)
291{
292 if (pte & _PAGE_PRESENT)
293 pte = phys_to_machine(XPADDR(pte)).maddr;
294
295 return (pte_t){ pte };
296}
297
298pgd_t xen_make_pgd(unsigned long pgd)
299{
300 if (pgd & _PAGE_PRESENT)
301 pgd = phys_to_machine(XPADDR(pgd)).maddr;
302
303 return (pgd_t){ pgd };
304}
305#endif /* CONFIG_X86_PAE */
306
307
308
309/*
310 (Yet another) pagetable walker. This one is intended for pinning a
311 pagetable. This means that it walks a pagetable and calls the
312 callback function on each page it finds making up the page table,
313 at every level. It walks the entire pagetable, but it only bothers
314 pinning pte pages which are below pte_limit. In the normal case
315 this will be TASK_SIZE, but at boot we need to pin up to
316 FIXADDR_TOP. But the important bit is that we don't pin beyond
317 there, because then we start getting into Xen's ptes.
318*/
319static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
320 unsigned long limit)
321{
322 pgd_t *pgd = pgd_base;
323 int flush = 0;
324 unsigned long addr = 0;
325 unsigned long pgd_next;
326
327 BUG_ON(limit > FIXADDR_TOP);
328
329 if (xen_feature(XENFEAT_auto_translated_physmap))
330 return 0;
331
332 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
333 pud_t *pud;
334 unsigned long pud_limit, pud_next;
335
336 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
337
338 if (!pgd_val(*pgd))
339 continue;
340
341 pud = pud_offset(pgd, 0);
342
343 if (PTRS_PER_PUD > 1) /* not folded */
344 flush |= (*func)(virt_to_page(pud), 0);
345
346 for (; addr != pud_limit; pud++, addr = pud_next) {
347 pmd_t *pmd;
348 unsigned long pmd_limit;
349
350 pud_next = pud_addr_end(addr, pud_limit);
351
352 if (pud_next < limit)
353 pmd_limit = pud_next;
354 else
355 pmd_limit = limit;
356
357 if (pud_none(*pud))
358 continue;
359
360 pmd = pmd_offset(pud, 0);
361
362 if (PTRS_PER_PMD > 1) /* not folded */
363 flush |= (*func)(virt_to_page(pmd), 0);
364
365 for (; addr != pmd_limit; pmd++) {
366 addr += (PAGE_SIZE * PTRS_PER_PTE);
367 if ((pmd_limit-1) < (addr-1)) {
368 addr = pmd_limit;
369 break;
370 }
371
372 if (pmd_none(*pmd))
373 continue;
374
375 flush |= (*func)(pmd_page(*pmd), 0);
376 }
377 }
378 }
379
380 flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
381
382 return flush;
383}
384
385static int pin_page(struct page *page, unsigned flags)
386{
387 unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
388 int flush;
389
390 if (pgfl)
391 flush = 0; /* already pinned */
392 else if (PageHighMem(page))
393 /* kmaps need flushing if we found an unpinned
394 highpage */
395 flush = 1;
396 else {
397 void *pt = lowmem_page_address(page);
398 unsigned long pfn = page_to_pfn(page);
399 struct multicall_space mcs = __xen_mc_entry(0);
400
401 flush = 0;
402
403 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
404 pfn_pte(pfn, PAGE_KERNEL_RO),
405 flags);
406 }
407
408 return flush;
409}
410
411/* This is called just after a mm has been created, but it has not
412 been used yet. We need to make sure that its pagetable is all
413 read-only, and can be pinned. */
414void xen_pgd_pin(pgd_t *pgd)
415{
416 struct multicall_space mcs;
417 struct mmuext_op *op;
418
419 xen_mc_batch();
420
421 if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
422 /* re-enable interrupts for kmap_flush_unused */
423 xen_mc_issue(0);
424 kmap_flush_unused();
425 xen_mc_batch();
426 }
427
428 mcs = __xen_mc_entry(sizeof(*op));
429 op = mcs.args;
430
431#ifdef CONFIG_X86_PAE
432 op->cmd = MMUEXT_PIN_L3_TABLE;
433#else
434 op->cmd = MMUEXT_PIN_L2_TABLE;
435#endif
436 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
437 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
438
439 xen_mc_issue(0);
440}
441
442/* The init_mm pagetable is really pinned as soon as it's created, but
443 that's before we have page structures to store the bits. So do all
444 the book-keeping now. */
445static __init int mark_pinned(struct page *page, unsigned flags)
446{
447 SetPagePinned(page);
448 return 0;
449}
450
451void __init xen_mark_init_mm_pinned(void)
452{
453 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
454}
455
456static int unpin_page(struct page *page, unsigned flags)
457{
458 unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
459
460 if (pgfl && !PageHighMem(page)) {
461 void *pt = lowmem_page_address(page);
462 unsigned long pfn = page_to_pfn(page);
463 struct multicall_space mcs = __xen_mc_entry(0);
464
465 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
466 pfn_pte(pfn, PAGE_KERNEL),
467 flags);
468 }
469
470 return 0; /* never need to flush on unpin */
471}
472
473/* Release a pagetable's pages back as normal RW */
474static void xen_pgd_unpin(pgd_t *pgd)
475{
476 struct mmuext_op *op;
477 struct multicall_space mcs;
478
479 xen_mc_batch();
480
481 mcs = __xen_mc_entry(sizeof(*op));
482
483 op = mcs.args;
484 op->cmd = MMUEXT_UNPIN_TABLE;
485 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
486
487 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
488
489 pgd_walk(pgd, unpin_page, TASK_SIZE);
490
491 xen_mc_issue(0);
492}
493
494void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
495{
496 spin_lock(&next->page_table_lock);
497 xen_pgd_pin(next->pgd);
498 spin_unlock(&next->page_table_lock);
499}
500
501void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
502{
503 spin_lock(&mm->page_table_lock);
504 xen_pgd_pin(mm->pgd);
505 spin_unlock(&mm->page_table_lock);
506}
507
508
509#ifdef CONFIG_SMP
510/* Another CPU may still have its %cr3 pointing at the pagetable, so
511 we need to repoint it somewhere else before we can unpin it. */
512static void drop_other_mm_ref(void *info)
513{
514 struct mm_struct *mm = info;
515
516 if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
517 leave_mm(smp_processor_id());
518}
519
520static void drop_mm_ref(struct mm_struct *mm)
521{
522 if (current->active_mm == mm) {
523 if (current->mm == mm)
524 load_cr3(swapper_pg_dir);
525 else
526 leave_mm(smp_processor_id());
527 }
528
529 if (!cpus_empty(mm->cpu_vm_mask))
530 xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
531 mm, 1);
532}
533#else
534static void drop_mm_ref(struct mm_struct *mm)
535{
536 if (current->active_mm == mm)
537 load_cr3(swapper_pg_dir);
538}
539#endif
540
541/*
542 * While a process runs, Xen pins its pagetable, which means that the
543 * hypervisor forces it to be read-only and controls all updates
544 * to it. This means that all pagetable updates have to go via the
545 * hypervisor, which is moderately expensive.
546 *
547 * Since we're pulling the pagetable down, we switch to init_mm,
548 * unpin the old process pagetable and mark it all read-write, which
549 * allows further operations on it to be simple memory accesses.
550 *
551 * The only subtle point is that another CPU may still be using the
552 * pagetable because of lazy tlb flushing. This means we need to
553 * switch all CPUs off this pagetable before we can unpin it.
554 */
555void xen_exit_mmap(struct mm_struct *mm)
556{
557 get_cpu(); /* make sure we don't move around */
558 drop_mm_ref(mm);
559 put_cpu();
560
561 spin_lock(&mm->page_table_lock);
562 xen_pgd_unpin(mm->pgd);
563 spin_unlock(&mm->page_table_lock);
564}
diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h
new file mode 100644
index 000000000000..c9ff27f3ac3a
--- /dev/null
+++ b/arch/i386/xen/mmu.h
@@ -0,0 +1,60 @@
1#ifndef _XEN_MMU_H
2
3#include <linux/linkage.h>
4#include <asm/page.h>
5
6/*
7 * Page-directory addresses above 4GB do not fit into architectural %cr3.
8 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
9 * must use the following accessor macros to pack/unpack valid MFNs.
10 *
11 * Note that Xen is using the fact that the pagetable base is always
12 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
13 * of cr3.
14 */
15#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
16#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
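/*
 * Worked example (32-bit arithmetic): a pagetable base in machine frame
 * 0x180000 -- machine address 6GB, which does not fit in %cr3 directly --
 * packs as
 *
 *	xen_pfn_to_cr3(0x180000) == (0x180000 << 12) | (0x180000 >> 20)
 *				 == 0x80000000 | 0x1 == 0x80000001
 *
 * and xen_cr3_to_pfn(0x80000001) recovers 0x180000.
 */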
17
18
19void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags);
20
21void xen_set_pte(pte_t *ptep, pte_t pteval);
22void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23 pte_t *ptep, pte_t pteval);
24void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
25
26void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
27void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
28void xen_exit_mmap(struct mm_struct *mm);
29
30void xen_pgd_pin(pgd_t *pgd);
31//void xen_pgd_unpin(pgd_t *pgd);
32
33#ifdef CONFIG_X86_PAE
34unsigned long long xen_pte_val(pte_t);
35unsigned long long xen_pmd_val(pmd_t);
36unsigned long long xen_pgd_val(pgd_t);
37
38pte_t xen_make_pte(unsigned long long);
39pmd_t xen_make_pmd(unsigned long long);
40pgd_t xen_make_pgd(unsigned long long);
41
42void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
43 pte_t *ptep, pte_t pteval);
44void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
45void xen_set_pud(pud_t *ptr, pud_t val);
46void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
47void xen_pmd_clear(pmd_t *pmdp);
48
49
50#else
51unsigned long xen_pte_val(pte_t);
52unsigned long xen_pmd_val(pmd_t);
53unsigned long xen_pgd_val(pgd_t);
54
55pte_t xen_make_pte(unsigned long);
56pmd_t xen_make_pmd(unsigned long);
57pgd_t xen_make_pgd(unsigned long);
58#endif
59
60#endif /* _XEN_MMU_H */
diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c
new file mode 100644
index 000000000000..c837e8e463db
--- /dev/null
+++ b/arch/i386/xen/multicalls.c
@@ -0,0 +1,90 @@
1/*
2 * Xen hypercall batching.
3 *
4 * Xen allows multiple hypercalls to be issued at once, using the
5 * multicall interface. This allows the cost of trapping into the
6 * hypervisor to be amortized over several calls.
7 *
8 * This file implements a simple interface for multicalls. There's a
9 * per-cpu buffer of outstanding multicalls. When you want to queue a
10 * multicall for issuing, you can allocate a multicall slot for the
11 * call and its arguments, along with storage for space which is
12 * pointed to by the arguments (for passing pointers to structures,
13 * etc). When the multicall is actually issued, all the space for the
14 * commands and allocated memory is freed for reuse.
15 *
16 * Multicalls are flushed whenever any of the buffers get full, or
17 * when explicitly requested. There's no way to get per-multicall
18 * return results back. It will BUG if any of the multicalls fail.
19 *
20 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
21 */
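/*
 * The typical single-call pattern, mirroring xen_set_pmd() in mmu.c in
 * this patch (preemption must be disabled around it, since the buffer is
 * per-cpu):
 *
 *	struct multicall_space mcs;
 *	struct mmu_update *u;
 *
 *	mcs = xen_mc_entry(sizeof(*u));	// open a batch, claim a slot + arg space
 *	u = mcs.args;
 *	u->ptr = ...;
 *	u->val = ...;
 *	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
 *	xen_mc_issue(PARAVIRT_LAZY_MMU);	// flush now, unless in lazy-MMU mode
 */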
22#include <linux/percpu.h>
23#include <linux/hardirq.h>
24
25#include <asm/xen/hypercall.h>
26
27#include "multicalls.h"
28
29#define MC_BATCH 32
30#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
31
32struct mc_buffer {
33 struct multicall_entry entries[MC_BATCH];
34 u64 args[MC_ARGS];
35 unsigned mcidx, argidx;
36};
37
38static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
39DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
40
41void xen_mc_flush(void)
42{
43 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
44 int ret = 0;
45 unsigned long flags;
46
47 BUG_ON(preemptible());
48
49 /* Disable interrupts in case someone comes in and queues
50 something in the middle */
51 local_irq_save(flags);
52
53 if (b->mcidx) {
54 int i;
55
56 if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
57 BUG();
58 for (i = 0; i < b->mcidx; i++)
59 if (b->entries[i].result < 0)
60 ret++;
61 b->mcidx = 0;
62 b->argidx = 0;
63 } else
64 BUG_ON(b->argidx != 0);
65
66 local_irq_restore(flags);
67
68 BUG_ON(ret);
69}
70
71struct multicall_space __xen_mc_entry(size_t args)
72{
73 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
74 struct multicall_space ret;
75 unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
76
77 BUG_ON(preemptible());
78 BUG_ON(argspace > MC_ARGS);
79
80 if (b->mcidx == MC_BATCH ||
81 (b->argidx + argspace) > MC_ARGS)
82 xen_mc_flush();
83
84 ret.mc = &b->entries[b->mcidx];
85 b->mcidx++;
86 ret.args = &b->args[b->argidx];
87 b->argidx += argspace;
88
89 return ret;
90}
diff --git a/arch/i386/xen/multicalls.h b/arch/i386/xen/multicalls.h
new file mode 100644
index 000000000000..e6f7530b156c
--- /dev/null
+++ b/arch/i386/xen/multicalls.h
@@ -0,0 +1,45 @@
1#ifndef _XEN_MULTICALLS_H
2#define _XEN_MULTICALLS_H
3
4#include "xen-ops.h"
5
6/* Multicalls */
7struct multicall_space
8{
9 struct multicall_entry *mc;
10 void *args;
11};
12
13/* Allocate room for a multicall and its args */
14struct multicall_space __xen_mc_entry(size_t args);
15
16DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
17
18/* Call to start a batch of multiple __xen_mc_entry()s. Must be
19 paired with xen_mc_issue() */
20static inline void xen_mc_batch(void)
21{
22 /* need to disable interrupts until this entry is complete */
23 local_irq_save(__get_cpu_var(xen_mc_irq_flags));
24}
25
26static inline struct multicall_space xen_mc_entry(size_t args)
27{
28 xen_mc_batch();
29 return __xen_mc_entry(args);
30}
31
32/* Flush all pending multicalls */
33void xen_mc_flush(void);
34
35/* Issue a multicall if we're not in a lazy mode */
36static inline void xen_mc_issue(unsigned mode)
37{
38 if ((xen_get_lazy_mode() & mode) == 0)
39 xen_mc_flush();
40
41 /* restore flags saved in xen_mc_batch */
42 local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
43}
44
45#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c
new file mode 100644
index 000000000000..2fe6eac510f0
--- /dev/null
+++ b/arch/i386/xen/setup.c
@@ -0,0 +1,96 @@
1/*
2 * Machine specific setup for xen
3 *
4 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
5 */
6
7#include <linux/module.h>
8#include <linux/sched.h>
9#include <linux/mm.h>
10#include <linux/pm.h>
11
12#include <asm/elf.h>
13#include <asm/e820.h>
14#include <asm/setup.h>
15#include <asm/xen/hypervisor.h>
16#include <asm/xen/hypercall.h>
17
18#include <xen/interface/physdev.h>
19#include <xen/features.h>
20
21#include "xen-ops.h"
22
23/* These are code, but not functions. Defined in entry.S */
24extern const char xen_hypervisor_callback[];
25extern const char xen_failsafe_callback[];
26
27unsigned long *phys_to_machine_mapping;
28EXPORT_SYMBOL(phys_to_machine_mapping);
29
30/**
31 * machine_specific_memory_setup - Hook for machine specific memory setup.
32 **/
33
34char * __init xen_memory_setup(void)
35{
36 unsigned long max_pfn = xen_start_info->nr_pages;
37
38 e820.nr_map = 0;
39 add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
40
41 return "Xen";
42}
43
44static void xen_idle(void)
45{
46 local_irq_disable();
47
48 if (need_resched())
49 local_irq_enable();
50 else {
51 current_thread_info()->status &= ~TS_POLLING;
52 smp_mb__after_clear_bit();
53 safe_halt();
54 current_thread_info()->status |= TS_POLLING;
55 }
56}
57
58void __init xen_arch_setup(void)
59{
60 struct physdev_set_iopl set_iopl;
61 int rc;
62
63 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
64 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
65
66 if (!xen_feature(XENFEAT_auto_translated_physmap))
67 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
68
69 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
70 __KERNEL_CS, (unsigned long)xen_failsafe_callback);
71
72 set_iopl.iopl = 1;
73 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
74 if (rc != 0)
75 printk(KERN_INFO "physdev_op failed %d\n", rc);
76
77#ifdef CONFIG_ACPI
78 if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
79 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
80 disable_acpi();
81 }
82#endif
83
84 memcpy(boot_command_line, xen_start_info->cmd_line,
85 MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
86 COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
87
88 pm_idle = xen_idle;
89
90#ifdef CONFIG_SMP
91 /* fill cpus_possible with all available cpus */
92 xen_fill_possible_map();
93#endif
94
95 paravirt_disable_iospace();
96}
diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c
new file mode 100644
index 000000000000..557b8e24706a
--- /dev/null
+++ b/arch/i386/xen/smp.c
@@ -0,0 +1,404 @@
1/*
2 * Xen SMP support
3 *
4 * This file implements the Xen versions of smp_ops. SMP under Xen is
5 * very straightforward. Bringing a CPU up is simply a matter of
6 * loading its initial context and setting it running.
7 *
8 * IPIs are handled through the Xen event mechanism.
9 *
10 * Because virtual CPUs can be scheduled onto any real CPU, there's no
11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */
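/*
 * Sketch of the cross-CPU call path implemented below: the initiator
 * fills in the shared call_data and kicks each target CPU through its
 * per-cpu call-function event channel; each target runs the function in
 * interrupt context and acknowledges via the counters.
 *
 *	xen_smp_call_function_mask(mask, func, info, wait)
 *	  -> xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR)
 *	       -> xen_send_IPI_one(cpu, vector)	   // notify_remote_via_irq()
 *	... on each target cpu ...
 *	xen_call_function_interrupt()
 *	  -> atomic_inc(&call_data->started); func(info);
 *	     if (wait) atomic_inc(&call_data->finished);
 */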
17#include <linux/sched.h>
18#include <linux/err.h>
19#include <linux/smp.h>
20
21#include <asm/paravirt.h>
22#include <asm/desc.h>
23#include <asm/pgtable.h>
24#include <asm/cpu.h>
25
26#include <xen/interface/xen.h>
27#include <xen/interface/vcpu.h>
28
29#include <asm/xen/interface.h>
30#include <asm/xen/hypercall.h>
31
32#include <xen/page.h>
33#include <xen/events.h>
34
35#include "xen-ops.h"
36#include "mmu.h"
37
38static cpumask_t cpu_initialized_map;
39static DEFINE_PER_CPU(int, resched_irq);
40static DEFINE_PER_CPU(int, callfunc_irq);
41
42/*
43 * Structure and data for smp_call_function(). This is designed to minimise
44 * static memory requirements. It also looks cleaner.
45 */
46static DEFINE_SPINLOCK(call_lock);
47
48struct call_data_struct {
49 void (*func) (void *info);
50 void *info;
51 atomic_t started;
52 atomic_t finished;
53 int wait;
54};
55
56static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
57
58static struct call_data_struct *call_data;
59
60/*
61 * Reschedule call back. Nothing to do,
62 * all the work is done automatically when
63 * we return from the interrupt.
64 */
65static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
66{
67 return IRQ_HANDLED;
68}
69
70static __cpuinit void cpu_bringup_and_idle(void)
71{
72 int cpu = smp_processor_id();
73
74 cpu_init();
75
76 preempt_disable();
77 per_cpu(cpu_state, cpu) = CPU_ONLINE;
78
79 xen_setup_cpu_clockevents();
80
81 /* We can take interrupts now: we're officially "up". */
82 local_irq_enable();
83
84 wmb(); /* make sure everything is out */
85 cpu_idle();
86}
87
88static int xen_smp_intr_init(unsigned int cpu)
89{
90 int rc;
91 const char *resched_name, *callfunc_name;
92
93 per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
94
95 resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
96 rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
97 cpu,
98 xen_reschedule_interrupt,
99 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
100 resched_name,
101 NULL);
102 if (rc < 0)
103 goto fail;
104 per_cpu(resched_irq, cpu) = rc;
105
106 callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
107 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
108 cpu,
109 xen_call_function_interrupt,
110 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
111 callfunc_name,
112 NULL);
113 if (rc < 0)
114 goto fail;
115 per_cpu(callfunc_irq, cpu) = rc;
116
117 return 0;
118
119 fail:
120 if (per_cpu(resched_irq, cpu) >= 0)
121 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
122 if (per_cpu(callfunc_irq, cpu) >= 0)
123 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
124 return rc;
125}
126
127void __init xen_fill_possible_map(void)
128{
129 int i, rc;
130
131 for (i = 0; i < NR_CPUS; i++) {
132 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
133 if (rc >= 0)
134 cpu_set(i, cpu_possible_map);
135 }
136}
137
138void __init xen_smp_prepare_boot_cpu(void)
139{
140 int cpu;
141
142 BUG_ON(smp_processor_id() != 0);
143 native_smp_prepare_boot_cpu();
144
145 /* We've switched to the "real" per-cpu gdt, so make sure the
146 old memory can be recycled */
147 make_lowmem_page_readwrite(&per_cpu__gdt_page);
148
149 for (cpu = 0; cpu < NR_CPUS; cpu++) {
150 cpus_clear(cpu_sibling_map[cpu]);
151 cpus_clear(cpu_core_map[cpu]);
152 }
153
154 xen_setup_vcpu_info_placement();
155}
156
157void __init xen_smp_prepare_cpus(unsigned int max_cpus)
158{
159 unsigned cpu;
160
161 for (cpu = 0; cpu < NR_CPUS; cpu++) {
162 cpus_clear(cpu_sibling_map[cpu]);
163 cpus_clear(cpu_core_map[cpu]);
164 }
165
166 smp_store_cpu_info(0);
167 set_cpu_sibling_map(0);
168
169 if (xen_smp_intr_init(0))
170 BUG();
171
172 cpu_initialized_map = cpumask_of_cpu(0);
173
174 /* Restrict the possible_map according to max_cpus. */
175 while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
176 for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
177 continue;
178 cpu_clear(cpu, cpu_possible_map);
179 }
180
181 for_each_possible_cpu (cpu) {
182 struct task_struct *idle;
183
184 if (cpu == 0)
185 continue;
186
187 idle = fork_idle(cpu);
188 if (IS_ERR(idle))
189 panic("failed fork for CPU %d", cpu);
190
191 cpu_set(cpu, cpu_present_map);
192 }
193
194 //init_xenbus_allowed_cpumask();
195}
196
197static __cpuinit int
198cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
199{
200 struct vcpu_guest_context *ctxt;
201 struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
202
203 if (cpu_test_and_set(cpu, cpu_initialized_map))
204 return 0;
205
206 ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
207 if (ctxt == NULL)
208 return -ENOMEM;
209
210 ctxt->flags = VGCF_IN_KERNEL;
211 ctxt->user_regs.ds = __USER_DS;
212 ctxt->user_regs.es = __USER_DS;
213 ctxt->user_regs.fs = __KERNEL_PERCPU;
214 ctxt->user_regs.gs = 0;
215 ctxt->user_regs.ss = __KERNEL_DS;
216 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
217 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
218
219 memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
220
221 xen_copy_trap_info(ctxt->trap_ctxt);
222
223 ctxt->ldt_ents = 0;
224
225 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
226 make_lowmem_page_readonly(gdt->gdt);
227
228 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
229 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
230
231 ctxt->user_regs.cs = __KERNEL_CS;
232 ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
233
234 ctxt->kernel_ss = __KERNEL_DS;
235 ctxt->kernel_sp = idle->thread.esp0;
236
237 ctxt->event_callback_cs = __KERNEL_CS;
238 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
239 ctxt->failsafe_callback_cs = __KERNEL_CS;
240 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
241
242 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
243 ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
244
245 if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
246 BUG();
247
248 kfree(ctxt);
249 return 0;
250}
251
252int __cpuinit xen_cpu_up(unsigned int cpu)
253{
254 struct task_struct *idle = idle_task(cpu);
255 int rc;
256
257#if 0
258 rc = cpu_up_check(cpu);
259 if (rc)
260 return rc;
261#endif
262
263 init_gdt(cpu);
264 per_cpu(current_task, cpu) = idle;
265 irq_ctx_init(cpu);
266 xen_setup_timer(cpu);
267
268 /* make sure interrupts start blocked */
269 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
270
271 rc = cpu_initialize_context(cpu, idle);
272 if (rc)
273 return rc;
274
275 if (num_online_cpus() == 1)
276 alternatives_smp_switch(1);
277
278 rc = xen_smp_intr_init(cpu);
279 if (rc)
280 return rc;
281
282 smp_store_cpu_info(cpu);
283 set_cpu_sibling_map(cpu);
284 /* This must be done before setting cpu_online_map */
285 wmb();
286
287 cpu_set(cpu, cpu_online_map);
288
289 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
290 BUG_ON(rc);
291
292 return 0;
293}
294
295void xen_smp_cpus_done(unsigned int max_cpus)
296{
297}
298
299static void stop_self(void *v)
300{
301 int cpu = smp_processor_id();
302
303 /* make sure we're not pinning something down */
304 load_cr3(swapper_pg_dir);
305 /* should set up a minimal gdt */
306
307 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
308 BUG();
309}
310
311void xen_smp_send_stop(void)
312{
313 smp_call_function(stop_self, NULL, 0, 0);
314}
315
316void xen_smp_send_reschedule(int cpu)
317{
318 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
319}
320
321
322static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
323{
324 unsigned cpu;
325
326 cpus_and(mask, mask, cpu_online_map);
327
328 for_each_cpu_mask(cpu, mask)
329 xen_send_IPI_one(cpu, vector);
330}
331
332static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
333{
334 void (*func) (void *info) = call_data->func;
335 void *info = call_data->info;
336 int wait = call_data->wait;
337
338 /*
339 * Notify initiating CPU that I've grabbed the data and am
340 * about to execute the function
341 */
342 mb();
343 atomic_inc(&call_data->started);
344 /*
345 * At this point the info structure may be out of scope unless wait==1
346 */
347 irq_enter();
348 (*func)(info);
349 irq_exit();
350
351 if (wait) {
352 mb(); /* commit everything before setting finished */
353 atomic_inc(&call_data->finished);
354 }
355
356 return IRQ_HANDLED;
357}
358
359int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
360 void *info, int wait)
361{
362 struct call_data_struct data;
363 int cpus;
364
365 /* Holding any lock stops cpus from going down. */
366 spin_lock(&call_lock);
367
368 cpu_clear(smp_processor_id(), mask);
369
370 cpus = cpus_weight(mask);
371 if (!cpus) {
372 spin_unlock(&call_lock);
373 return 0;
374 }
375
376 /* Can deadlock when called with interrupts disabled */
377 WARN_ON(irqs_disabled());
378
379 data.func = func;
380 data.info = info;
381 atomic_set(&data.started, 0);
382 data.wait = wait;
383 if (wait)
384 atomic_set(&data.finished, 0);
385
386 call_data = &data;
387 mb(); /* write everything before IPI */
388
389 /* Send a message to other CPUs and wait for them to respond */
390 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
391
392 /* Make sure other vcpus get a chance to run.
393	   XXX too severe? Maybe we should check the other CPUs' states? */
394 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
395
396 /* Wait for response */
397 while (atomic_read(&data.started) != cpus ||
398 (wait && atomic_read(&data.finished) != cpus))
399 cpu_relax();
400
401 spin_unlock(&call_lock);
402
403 return 0;
404}
diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c
new file mode 100644
index 000000000000..51fdabf1fd4d
--- /dev/null
+++ b/arch/i386/xen/time.c
@@ -0,0 +1,590 @@
1/*
2 * Xen time implementation.
3 *
4 * This is implemented in terms of a clocksource driver which uses
5 * the hypervisor clock as a nanosecond timebase, and a clockevent
6 * driver which uses the hypervisor's timer mechanism.
7 *
8 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
9 */
10#include <linux/kernel.h>
11#include <linux/interrupt.h>
12#include <linux/clocksource.h>
13#include <linux/clockchips.h>
14#include <linux/kernel_stat.h>
15
16#include <asm/xen/hypervisor.h>
17#include <asm/xen/hypercall.h>
18
19#include <xen/events.h>
20#include <xen/interface/xen.h>
21#include <xen/interface/vcpu.h>
22
23#include "xen-ops.h"
24
25#define XEN_SHIFT 22
26
27/* Xen may fire a timer up to this many ns early */
28#define TIMER_SLOP 100000
29#define NS_PER_TICK (1000000000LL / HZ)
30
31static cycle_t xen_clocksource_read(void);
32
33/* These are periodically updated in shared_info, and then copied here. */
34struct shadow_time_info {
35 u64 tsc_timestamp; /* TSC at last update of time vals. */
36 u64 system_timestamp; /* Time, in nanosecs, since boot. */
37 u32 tsc_to_nsec_mul;
38 int tsc_shift;
39 u32 version;
40};
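/*
 * A simplified sketch of how such a record is conventionally consumed,
 * assuming tsc_to_nsec_mul is a 32.32 fixed-point multiplier applied
 * after shifting by tsc_shift (a real conversion needs a full
 * 64x32->96-bit multiply to avoid overflow; this form is illustrative
 * only):
 *
 *	u64 tsc, delta;
 *	rdtscll(tsc);
 *	delta = tsc - shadow->tsc_timestamp;
 *	if (shadow->tsc_shift >= 0)
 *		delta <<= shadow->tsc_shift;
 *	else
 *		delta >>= -shadow->tsc_shift;
 *	now_ns = shadow->system_timestamp + ((delta * shadow->tsc_to_nsec_mul) >> 32);
 */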
41
42static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
43
44/* runstate info updated by Xen */
45static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
46
47/* snapshots of runstate info */
48static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
49
50/* unused ns of stolen and blocked time */
51static DEFINE_PER_CPU(u64, residual_stolen);
52static DEFINE_PER_CPU(u64, residual_blocked);
53
54/* return a consistent snapshot of a 64-bit time/counter value */
55static u64 get64(const u64 *p)
56{
57 u64 ret;
58
59 if (BITS_PER_LONG < 64) {
60 u32 *p32 = (u32 *)p;
61 u32 h, l;
62
63 /*
64 * Read high then low, and then make sure high is
65 * still the same; this will only loop if low wraps
66 * and carries into high.
67 * XXX some clean way to make this endian-proof?
68 */
69 do {
70 h = p32[1];
71 barrier();
72 l = p32[0];
73 barrier();
74 } while (p32[1] != h);
75
76 ret = (((u64)h) << 32) | l;
77 } else
78 ret = *p;
79
80 return ret;
81}
82
83/*
84 * Runstate accounting
85 */
86static void get_runstate_snapshot(struct vcpu_runstate_info *res)
87{
88 u64 state_time;
89 struct vcpu_runstate_info *state;
90
91 BUG_ON(preemptible());
92
93 state = &__get_cpu_var(runstate);
94
95 /*
96 * The runstate info is always updated by the hypervisor on
97 * the current CPU, so there's no need to use anything
98 * stronger than a compiler barrier when fetching it.
99 */
100 do {
101 state_time = get64(&state->state_entry_time);
102 barrier();
103 *res = *state;
104 barrier();
105 } while (get64(&state->state_entry_time) != state_time);
106}
107
108static void setup_runstate_info(int cpu)
109{
110 struct vcpu_register_runstate_memory_area area;
111
112 area.addr.v = &per_cpu(runstate, cpu);
113
114 if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
115 cpu, &area))
116 BUG();
117}
118
119static void do_stolen_accounting(void)
120{
121 struct vcpu_runstate_info state;
122 struct vcpu_runstate_info *snap;
123 s64 blocked, runnable, offline, stolen;
124 cputime_t ticks;
125
126 get_runstate_snapshot(&state);
127
128 WARN_ON(state.state != RUNSTATE_running);
129
130 snap = &__get_cpu_var(runstate_snapshot);
131
132 /* work out how much time the VCPU has not been runn*ing* */
133 blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
134 runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
135 offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
136
137 *snap = state;
138
139 /* Add the appropriate number of ticks of stolen time,
140 including any left-overs from last time. Passing NULL to
141 account_steal_time accounts the time as stolen. */
142 stolen = runnable + offline + __get_cpu_var(residual_stolen);
143
144 if (stolen < 0)
145 stolen = 0;
146
147 ticks = 0;
148 while (stolen >= NS_PER_TICK) {
149 ticks++;
150 stolen -= NS_PER_TICK;
151 }
152 __get_cpu_var(residual_stolen) = stolen;
153 account_steal_time(NULL, ticks);
154
155 /* Add the appropriate number of ticks of blocked time,
156 including any left-overs from last time. Passing idle to
157 account_steal_time accounts the time as idle/wait. */
158 blocked += __get_cpu_var(residual_blocked);
159
160 if (blocked < 0)
161 blocked = 0;
162
163 ticks = 0;
164 while (blocked >= NS_PER_TICK) {
165 ticks++;
166 blocked -= NS_PER_TICK;
167 }
168 __get_cpu_var(residual_blocked) = blocked;
169 account_steal_time(idle_task(smp_processor_id()), ticks);
170}
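Worked example (assuming HZ = 250 for illustration, so NS_PER_TICK = 4,000,000): if 9,500,000 ns of runnable+offline time have accumulated since the last snapshot, the loop above accounts 2 stolen ticks and carries the remaining 1,500,000 ns forward in residual_stolen for the next timer interrupt; blocked time is handled the same way via residual_blocked.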
171
172/*
173 * Xen sched_clock implementation. Returns the number of unstolen
174 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
175 * states.
176 */
177unsigned long long xen_sched_clock(void)
178{
179 struct vcpu_runstate_info state;
180 cycle_t now;
181 u64 ret;
182 s64 offset;
183
184 /*
185 * Ideally sched_clock should be called on a per-cpu basis
186 * anyway, so preempt should already be disabled, but that's
187	 * not currently the practice.
188 */
189 preempt_disable();
190
191 now = xen_clocksource_read();
192
193 get_runstate_snapshot(&state);
194
195 WARN_ON(state.state != RUNSTATE_running);
196
197 offset = now - state.state_entry_time;
198 if (offset < 0)
199 offset = 0;
200
201 ret = state.time[RUNSTATE_blocked] +
202 state.time[RUNSTATE_running] +
203 offset;
204
205 preempt_enable();
206
207 return ret;
208}
209
210
211/* Get the CPU speed from Xen */
212unsigned long xen_cpu_khz(void)
213{
214 u64 cpu_khz = 1000000ULL << 32;
215 const struct vcpu_time_info *info =
216 &HYPERVISOR_shared_info->vcpu_info[0].time;
217
218 do_div(cpu_khz, info->tsc_to_system_mul);
219 if (info->tsc_shift < 0)
220 cpu_khz <<= -info->tsc_shift;
221 else
222 cpu_khz >>= info->tsc_shift;
223
224 return cpu_khz;
225}
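For illustration: tsc_to_system_mul is defined such that nanoseconds = (tsc_delta * mul) >> 32 after applying tsc_shift, so a 2 GHz TSC (0.5 ns per cycle, tsc_shift = 0) has mul of roughly 2^31, and the computation above yields 10^6 * 2^32 / 2^31 = 2,000,000 kHz, i.e. 2 GHz.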
226
227/*
228 * Reads a consistent set of time-base values from Xen, into a shadow data
229 * area.
230 */
231static unsigned get_time_values_from_xen(void)
232{
233 struct vcpu_time_info *src;
234 struct shadow_time_info *dst;
235
236 /* src is shared memory with the hypervisor, so we need to
237 make sure we get a consistent snapshot, even in the face of
238 being preempted. */
239 src = &__get_cpu_var(xen_vcpu)->time;
240 dst = &__get_cpu_var(shadow_time);
241
242 do {
243 dst->version = src->version;
244 rmb(); /* fetch version before data */
245 dst->tsc_timestamp = src->tsc_timestamp;
246 dst->system_timestamp = src->system_time;
247 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
248 dst->tsc_shift = src->tsc_shift;
249 rmb(); /* test version after fetching data */
250 } while ((src->version & 1) | (dst->version ^ src->version));
251
252 return dst->version;
253}
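The retry condition encodes Xen's update protocol: the hypervisor bumps version to an odd value before rewriting the time fields and to an even value afterwards, so the copy is retried while an update is in flight (odd version) or if the version changed under us. xen_read_wallclock() below applies the same check to the wc_version/wc_sec/wc_nsec fields.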
254
255/*
256 * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
257 * yielding a 64-bit result.
258 */
259static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
260{
261 u64 product;
262#ifdef __i386__
263 u32 tmp1, tmp2;
264#endif
265
266 if (shift < 0)
267 delta >>= -shift;
268 else
269 delta <<= shift;
270
271#ifdef __i386__
272 __asm__ (
273 "mul %5 ; "
274 "mov %4,%%eax ; "
275 "mov %%edx,%4 ; "
276 "mul %5 ; "
277 "xor %5,%5 ; "
278 "add %4,%%eax ; "
279 "adc %5,%%edx ; "
280 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
281 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
282#elif __x86_64__
283 __asm__ (
284 "mul %%rdx ; shrd $32,%%rdx,%%rax"
285 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
286#else
287#error implement me!
288#endif
289
290 return product;
291}
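For reference, a portable C sketch of the same computation (illustrative only, assuming a compiler that provides unsigned __int128; the patch itself relies on the inline assembly above):

static inline u64 scale_delta_portable(u64 delta, u32 mul_frac, int shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

	/* 64x32-bit multiply, keeping bits 32..95 of the full product */
	return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
}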
292
293static u64 get_nsec_offset(struct shadow_time_info *shadow)
294{
295 u64 now, delta;
296 now = native_read_tsc();
297 delta = now - shadow->tsc_timestamp;
298 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
299}
300
301static cycle_t xen_clocksource_read(void)
302{
303 struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
304 cycle_t ret;
305 unsigned version;
306
307 do {
308 version = get_time_values_from_xen();
309 barrier();
310 ret = shadow->system_timestamp + get_nsec_offset(shadow);
311 barrier();
312 } while (version != __get_cpu_var(xen_vcpu)->time.version);
313
314 put_cpu_var(shadow_time);
315
316 return ret;
317}
318
319static void xen_read_wallclock(struct timespec *ts)
320{
321 const struct shared_info *s = HYPERVISOR_shared_info;
322 u32 version;
323 u64 delta;
324 struct timespec now;
325
326 /* get wallclock at system boot */
327 do {
328 version = s->wc_version;
329 rmb(); /* fetch version before time */
330 now.tv_sec = s->wc_sec;
331 now.tv_nsec = s->wc_nsec;
332 rmb(); /* fetch time before checking version */
333 } while ((s->wc_version & 1) | (version ^ s->wc_version));
334
335 delta = xen_clocksource_read(); /* time since system boot */
336 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
337
338 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
339 now.tv_sec = delta;
340
341 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
342}
343
344unsigned long xen_get_wallclock(void)
345{
346 struct timespec ts;
347
348 xen_read_wallclock(&ts);
349
350 return ts.tv_sec;
351}
352
353int xen_set_wallclock(unsigned long now)
354{
355 /* do nothing for domU */
356 return -1;
357}
358
359static struct clocksource xen_clocksource __read_mostly = {
360 .name = "xen",
361 .rating = 400,
362 .read = xen_clocksource_read,
363 .mask = ~0,
364 .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
365 .shift = XEN_SHIFT,
366 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
367};
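Because mult is 1 << XEN_SHIFT and shift is XEN_SHIFT, the generic clocksource conversion ns = (cycles * mult) >> shift reduces to ns = cycles: xen_clocksource_read() already returns nanoseconds, so the scaling here is an identity.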
368
369/*
370 Xen clockevent implementation
371
372 Xen has two clockevent implementations:
373
374 The old timer_op one works with all released versions of Xen prior
375 to version 3.0.4. This version of the hypervisor provides a
376  single-shot timer with nanosecond resolution.  However, a 100Hz
377  tick sharing the same event channel is delivered while the
378  vcpu is running.  We don't care about or use this tick, but it will
379 cause the core time code to think the timer fired too soon, and
380 will end up resetting it each time. It could be filtered, but
381 doing so has complications when the ktime clocksource is not yet
382 the xen clocksource (ie, at boot time).
383
384 The new vcpu_op-based timer interface allows the tick timer period
385 to be changed or turned off. The tick timer is not useful as a
386 periodic timer because events are only delivered to running vcpus.
387 The one-shot timer can report when a timeout is in the past, so
388 set_next_event is capable of returning -ETIME when appropriate.
389 This interface is used when available.
390*/
391
392
393/*
394 Get a hypervisor absolute time. In theory we could maintain an
395 offset between the kernel's time and the hypervisor's time, and
396  apply that to the kernel's absolute timeout.  Unfortunately the
397 hypervisor and kernel times can drift even if the kernel is using
398 the Xen clocksource, because ntp can warp the kernel's clocksource.
399*/
400static s64 get_abs_timeout(unsigned long delta)
401{
402 return xen_clocksource_read() + delta;
403}
404
405static void xen_timerop_set_mode(enum clock_event_mode mode,
406 struct clock_event_device *evt)
407{
408 switch (mode) {
409 case CLOCK_EVT_MODE_PERIODIC:
410 /* unsupported */
411 WARN_ON(1);
412 break;
413
414 case CLOCK_EVT_MODE_ONESHOT:
415 break;
416
417 case CLOCK_EVT_MODE_UNUSED:
418 case CLOCK_EVT_MODE_SHUTDOWN:
419 HYPERVISOR_set_timer_op(0); /* cancel timeout */
420 break;
421 }
422}
423
424static int xen_timerop_set_next_event(unsigned long delta,
425 struct clock_event_device *evt)
426{
427 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
428
429 if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
430 BUG();
431
432 /* We may have missed the deadline, but there's no real way of
433 knowing for sure. If the event was in the past, then we'll
434 get an immediate interrupt. */
435
436 return 0;
437}
438
439static const struct clock_event_device xen_timerop_clockevent = {
440 .name = "xen",
441 .features = CLOCK_EVT_FEAT_ONESHOT,
442
443 .max_delta_ns = 0xffffffff,
444 .min_delta_ns = TIMER_SLOP,
445
446 .mult = 1,
447 .shift = 0,
448 .rating = 500,
449
450 .set_mode = xen_timerop_set_mode,
451 .set_next_event = xen_timerop_set_next_event,
452};
453
454
455
456static void xen_vcpuop_set_mode(enum clock_event_mode mode,
457 struct clock_event_device *evt)
458{
459 int cpu = smp_processor_id();
460
461 switch (mode) {
462 case CLOCK_EVT_MODE_PERIODIC:
463 WARN_ON(1); /* unsupported */
464 break;
465
466 case CLOCK_EVT_MODE_ONESHOT:
467 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
468 BUG();
469 break;
470
471 case CLOCK_EVT_MODE_UNUSED:
472 case CLOCK_EVT_MODE_SHUTDOWN:
473 if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
474 HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
475 BUG();
476 break;
477 }
478}
479
480static int xen_vcpuop_set_next_event(unsigned long delta,
481 struct clock_event_device *evt)
482{
483 int cpu = smp_processor_id();
484 struct vcpu_set_singleshot_timer single;
485 int ret;
486
487 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
488
489 single.timeout_abs_ns = get_abs_timeout(delta);
490 single.flags = VCPU_SSHOTTMR_future;
491
492 ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
493
494 BUG_ON(ret != 0 && ret != -ETIME);
495
496 return ret;
497}
498
499static const struct clock_event_device xen_vcpuop_clockevent = {
500 .name = "xen",
501 .features = CLOCK_EVT_FEAT_ONESHOT,
502
503 .max_delta_ns = 0xffffffff,
504 .min_delta_ns = TIMER_SLOP,
505
506 .mult = 1,
507 .shift = 0,
508 .rating = 500,
509
510 .set_mode = xen_vcpuop_set_mode,
511 .set_next_event = xen_vcpuop_set_next_event,
512};
513
514static const struct clock_event_device *xen_clockevent =
515 &xen_timerop_clockevent;
516static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
517
518static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
519{
520 struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
521 irqreturn_t ret;
522
523 ret = IRQ_NONE;
524 if (evt->event_handler) {
525 evt->event_handler(evt);
526 ret = IRQ_HANDLED;
527 }
528
529 do_stolen_accounting();
530
531 return ret;
532}
533
534void xen_setup_timer(int cpu)
535{
536 const char *name;
537 struct clock_event_device *evt;
538 int irq;
539
540 printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
541
542 name = kasprintf(GFP_KERNEL, "timer%d", cpu);
543 if (!name)
544 name = "<timer kasprintf failed>";
545
546 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
547 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
548 name, NULL);
549
550 evt = &per_cpu(xen_clock_events, cpu);
551 memcpy(evt, xen_clockevent, sizeof(*evt));
552
553 evt->cpumask = cpumask_of_cpu(cpu);
554 evt->irq = irq;
555
556 setup_runstate_info(cpu);
557}
558
559void xen_setup_cpu_clockevents(void)
560{
561 BUG_ON(preemptible());
562
563 clockevents_register_device(&__get_cpu_var(xen_clock_events));
564}
565
566__init void xen_time_init(void)
567{
568 int cpu = smp_processor_id();
569
570 get_time_values_from_xen();
571
572 clocksource_register(&xen_clocksource);
573
574 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
575 /* Successfully turned off 100Hz tick, so we have the
576 vcpuop-based timer interface */
577 printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
578 xen_clockevent = &xen_vcpuop_clockevent;
579 }
580
581 /* Set initial system time with full resolution */
582 xen_read_wallclock(&xtime);
583 set_normalized_timespec(&wall_to_monotonic,
584 -xtime.tv_sec, -xtime.tv_nsec);
585
586 tsc_disable = 0;
587
588 xen_setup_timer(cpu);
589 xen_setup_cpu_clockevents();
590}
diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S
new file mode 100644
index 000000000000..1a43b60c0c62
--- /dev/null
+++ b/arch/i386/xen/xen-asm.S
@@ -0,0 +1,291 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/thread_info.h>
18#include <asm/percpu.h>
19#include <asm/processor-flags.h>
20#include <asm/segment.h>
21
22#include <xen/interface/xen.h>
23
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
25#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
30/*
31 Enable events. This clears the event mask and tests the pending
32   event status with a single 'and' operation.  If there are pending
33 events, then enter the hypervisor to get them handled.
34 */
35ENTRY(xen_irq_enable_direct)
36 /* Clear mask and test pending */
37 andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
38 /* Preempt here doesn't matter because that will deal with
39 any pending interrupts. The pending check may end up being
40 run on the wrong CPU, but that doesn't hurt. */
41 jz 1f
422: call check_events
431:
44ENDPATCH(xen_irq_enable_direct)
45 ret
46 ENDPROC(xen_irq_enable_direct)
47 RELOC(xen_irq_enable_direct, 2b+1)
48
49
50/*
51 Disabling events is simply a matter of making the event mask
52 non-zero.
53 */
54ENTRY(xen_irq_disable_direct)
55 movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
56ENDPATCH(xen_irq_disable_direct)
57 ret
58 ENDPROC(xen_irq_disable_direct)
59 RELOC(xen_irq_disable_direct, 0)
60
61/*
62 (xen_)save_fl is used to get the current interrupt enable status.
63 Callers expect the status to be in X86_EFLAGS_IF, and other bits
64 may be set in the return value. We take advantage of this by
65 making sure that X86_EFLAGS_IF has the right value (and other bits
66 in that byte are 0), but other bits in the return value are
67 undefined. We need to toggle the state of the bit, because
68 Xen and x86 use opposite senses (mask vs enable).
69 */
70ENTRY(xen_save_fl_direct)
71 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
72 setz %ah
73 addb %ah,%ah
74ENDPATCH(xen_save_fl_direct)
75 ret
76 ENDPROC(xen_save_fl_direct)
77 RELOC(xen_save_fl_direct, 0)
78
79
80/*
81   In principle the caller should be passing us a value returned
82   from xen_save_fl_direct, but for robustness' sake we test only
83 the X86_EFLAGS_IF flag rather than the whole byte. After
84 setting the interrupt mask state, it checks for unmasked
85 pending events and enters the hypervisor to get them delivered
86 if so.
87 */
88ENTRY(xen_restore_fl_direct)
89 testb $X86_EFLAGS_IF>>8, %ah
90 setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
91 /* Preempt here doesn't matter because that will deal with
92 any pending interrupts. The pending check may end up being
93 run on the wrong CPU, but that doesn't hurt. */
94
95 /* check for unmasked and pending */
96 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
97 jz 1f
982: call check_events
991:
100ENDPATCH(xen_restore_fl_direct)
101 ret
102 ENDPROC(xen_restore_fl_direct)
103 RELOC(xen_restore_fl_direct, 2b+1)
104
105/*
106 This is run where a normal iret would be run, with the same stack setup:
107 8: eflags
108 4: cs
109 esp-> 0: eip
110
111 This attempts to make sure that any pending events are dealt
112 with on return to usermode, but there is a small window in
113 which an event can happen just before entering usermode. If
114 the nested interrupt ends up setting one of the TIF_WORK_MASK
115 pending work flags, they will not be tested again before
116 returning to usermode. This means that a process can end up
117 with pending work, which will be unprocessed until the process
118 enters and leaves the kernel again, which could be an
119 unbounded amount of time. This means that a pending signal or
120 reschedule event could be indefinitely delayed.
121
122 The fix is to notice a nested interrupt in the critical
123 window, and if one occurs, then fold the nested interrupt into
124 the current interrupt stack frame, and re-process it
125 iteratively rather than recursively. This means that it will
126 exit via the normal path, and all pending work will be dealt
127 with appropriately.
128
129 Because the nested interrupt handler needs to deal with the
130  current stack state in whatever form it's in, we keep things
131 simple by only using a single register which is pushed/popped
132 on the stack.
133
134 Non-direct iret could be done in the same way, but it would
135 require an annoying amount of code duplication. We'll assume
136 that direct mode will be the common case once the hypervisor
137 support becomes commonplace.
138 */
139ENTRY(xen_iret_direct)
140 /* test eflags for special cases */
141 testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
142 jnz hyper_iret
143
144 push %eax
145 ESP_OFFSET=4 # bytes pushed onto stack
146
147 /* Store vcpu_info pointer for easy access. Do it this
148 way to avoid having to reload %fs */
149#ifdef CONFIG_SMP
150 GET_THREAD_INFO(%eax)
151 movl TI_cpu(%eax),%eax
152 movl __per_cpu_offset(,%eax,4),%eax
153 lea per_cpu__xen_vcpu_info(%eax),%eax
154#else
155 movl $per_cpu__xen_vcpu_info, %eax
156#endif
157
158 /* check IF state we're restoring */
159 testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
160
161 /* Maybe enable events. Once this happens we could get a
162 recursive event, so the critical region starts immediately
163 afterwards. However, if that happens we don't end up
164 resuming the code, so we don't have to be worried about
165 being preempted to another CPU. */
166 setz XEN_vcpu_info_mask(%eax)
167xen_iret_start_crit:
168
169 /* check for unmasked and pending */
170 cmpw $0x0001, XEN_vcpu_info_pending(%eax)
171
172 /* If there's something pending, mask events again so we
173 can jump back into xen_hypervisor_callback */
174 sete XEN_vcpu_info_mask(%eax)
175
176 popl %eax
177
178 /* From this point on the registers are restored and the stack
179 updated, so we don't need to worry about it if we're preempted */
180iret_restore_end:
181
182 /* Jump to hypervisor_callback after fixing up the stack.
183 Events are masked, so jumping out of the critical
184 region is OK. */
185 je xen_hypervisor_callback
186
187 iret
188xen_iret_end_crit:
189
190hyper_iret:
191	/* put this out of line since it's very rarely used */
192 jmp hypercall_page + __HYPERVISOR_iret * 32
193
194 .globl xen_iret_start_crit, xen_iret_end_crit
195
196/*
197 This is called by xen_hypervisor_callback in entry.S when it sees
198 that the EIP at the time of interrupt was between xen_iret_start_crit
199 and xen_iret_end_crit. We're passed the EIP in %eax so we can do
200 a more refined determination of what to do.
201
202 The stack format at this point is:
203 ----------------
204 ss : (ss/esp may be present if we came from usermode)
205 esp :
206 eflags } outer exception info
207 cs }
208 eip }
209 ---------------- <- edi (copy dest)
210 eax : outer eax if it hasn't been restored
211 ----------------
212 eflags } nested exception info
213 cs } (no ss/esp because we're nested
214 eip } from the same ring)
215 orig_eax }<- esi (copy src)
216 - - - - - - - -
217 fs }
218 es }
219 ds } SAVE_ALL state
220 eax }
221 : :
222 ebx }
223 ----------------
224 return addr <- esp
225 ----------------
226
227 In order to deliver the nested exception properly, we need to shift
228 everything from the return addr up to the error code so it
229 sits just under the outer exception info. This means that when we
230 handle the exception, we do it in the context of the outer exception
231 rather than starting a new one.
232
233 The only caveat is that if the outer eax hasn't been
234 restored yet (ie, it's still on stack), we need to insert
235 its value into the SAVE_ALL state before going on, since
236 it's usermode state which we eventually need to restore.
237 */
238ENTRY(xen_iret_crit_fixup)
239 /* offsets +4 for return address */
240
241 /*
242	  Paranoia: Make sure we're really coming from kernel space.
243 One could imagine a case where userspace jumps into the
244 critical range address, but just before the CPU delivers a GP,
245 it decides to deliver an interrupt instead. Unlikely?
246 Definitely. Easy to avoid? Yes. The Intel documents
247 explicitly say that the reported EIP for a bad jump is the
248 jump instruction itself, not the destination, but some virtual
249 environments get this wrong.
250 */
251 movl PT_CS+4(%esp), %ecx
252 andl $SEGMENT_RPL_MASK, %ecx
253 cmpl $USER_RPL, %ecx
254 je 2f
255
256 lea PT_ORIG_EAX+4(%esp), %esi
257 lea PT_EFLAGS+4(%esp), %edi
258
259 /* If eip is before iret_restore_end then stack
260 hasn't been restored yet. */
261 cmp $iret_restore_end, %eax
262 jae 1f
263
264 movl 0+4(%edi),%eax /* copy EAX */
265 movl %eax, PT_EAX+4(%esp)
266
267 lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
268
269 /* set up the copy */
2701: std
271 mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
272 rep movsl
273 cld
274
275 lea 4(%edi),%esp /* point esp to new frame */
2762: ret
277
278
279/*
280 Force an event check by making a hypercall,
281 but preserve regs before making the call.
282 */
283check_events:
284 push %eax
285 push %ecx
286 push %edx
287 call force_evtchn_callback
288 pop %edx
289 pop %ecx
290 pop %eax
291 ret
diff --git a/arch/i386/xen/xen-head.S b/arch/i386/xen/xen-head.S
new file mode 100644
index 000000000000..2998d55a0017
--- /dev/null
+++ b/arch/i386/xen/xen-head.S
@@ -0,0 +1,36 @@
1/* Xen-specific pieces of head.S, intended to be included in the right
2 place in head.S */
3
4#ifdef CONFIG_XEN
5
6#include <linux/elfnote.h>
7#include <asm/boot.h>
8#include <xen/interface/elfnote.h>
9
10ENTRY(startup_xen)
11 movl %esi,xen_start_info
12 cld
13 movl $(init_thread_union+THREAD_SIZE),%esp
14 jmp xen_start_kernel
15
16.pushsection ".bss.page_aligned"
17 .align PAGE_SIZE_asm
18ENTRY(hypercall_page)
19 .skip 0x1000
20.popsection
21
22 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
23 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
24 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
25 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
26 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
27 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
28 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
29#ifdef CONFIG_X86_PAE
30 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
31#else
32 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
33#endif
34 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
35
36#endif /*CONFIG_XEN */
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h
new file mode 100644
index 000000000000..b9aaea45f07f
--- /dev/null
+++ b/arch/i386/xen/xen-ops.h
@@ -0,0 +1,71 @@
1#ifndef XEN_OPS_H
2#define XEN_OPS_H
3
4#include <linux/init.h>
5
6/* These are code, but not functions. Defined in entry.S */
7extern const char xen_hypervisor_callback[];
8extern const char xen_failsafe_callback[];
9
10void xen_copy_trap_info(struct trap_info *traps);
11
12DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
13DECLARE_PER_CPU(unsigned long, xen_cr3);
14
15extern struct start_info *xen_start_info;
16extern struct shared_info *HYPERVISOR_shared_info;
17
18char * __init xen_memory_setup(void);
19void __init xen_arch_setup(void);
20void __init xen_init_IRQ(void);
21
22void xen_setup_timer(int cpu);
23void xen_setup_cpu_clockevents(void);
24unsigned long xen_cpu_khz(void);
25void __init xen_time_init(void);
26unsigned long xen_get_wallclock(void);
27int xen_set_wallclock(unsigned long time);
28unsigned long long xen_sched_clock(void);
29
30void xen_mark_init_mm_pinned(void);
31
32DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
33
34static inline unsigned xen_get_lazy_mode(void)
35{
36 return x86_read_percpu(xen_lazy_mode);
37}
38
39void __init xen_fill_possible_map(void);
40
41void __init xen_setup_vcpu_info_placement(void);
42void xen_smp_prepare_boot_cpu(void);
43void xen_smp_prepare_cpus(unsigned int max_cpus);
44int xen_cpu_up(unsigned int cpu);
45void xen_smp_cpus_done(unsigned int max_cpus);
46
47void xen_smp_send_stop(void);
48void xen_smp_send_reschedule(int cpu);
49int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
50 int wait);
51int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
52 int nonatomic, int wait);
53
54int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
55 void *info, int wait);
56
57
58/* Declare an asm function, along with symbols needed to make it
59 inlineable */
60#define DECL_ASM(ret, name, ...) \
61 ret name(__VA_ARGS__); \
62 extern char name##_end[]; \
63 extern char name##_reloc[] \
64
65DECL_ASM(void, xen_irq_enable_direct, void);
66DECL_ASM(void, xen_irq_disable_direct, void);
67DECL_ASM(unsigned long, xen_save_fl_direct, void);
68DECL_ASM(void, xen_restore_fl_direct, unsigned long);
69
70void xen_iret_direct(void);
71#endif /* XEN_OPS_H */
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index 296d2b0c5d88..fd9aff3f3890 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -6,6 +6,7 @@
6#include <asm/io.h> 6#include <asm/io.h>
7#include <asm/processor.h> 7#include <asm/processor.h>
8#include <asm/fcntl.h> 8#include <asm/fcntl.h>
9#include <xen/hvc-console.h>
9 10
10/* Simple VGA output */ 11/* Simple VGA output */
11 12
@@ -242,6 +243,10 @@ static int __init setup_early_printk(char *buf)
242 simnow_init(buf + 6); 243 simnow_init(buf + 6);
243 early_console = &simnow_console; 244 early_console = &simnow_console;
244 keep_early = 1; 245 keep_early = 1;
246#ifdef CONFIG_HVC_XEN
247 } else if (!strncmp(buf, "xen", 3)) {
248 early_console = &xenboot_console;
249#endif
245 } 250 }
246 251
247 if (keep_early) 252 if (keep_early)
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index aa1d15991794..f3fb8174559e 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -174,7 +174,7 @@ static void do_mce_trigger(void)
174 if (events != atomic_read(&mce_logged) && trigger[0]) { 174 if (events != atomic_read(&mce_logged) && trigger[0]) {
175 /* Small race window, but should be harmless. */ 175 /* Small race window, but should be harmless. */
176 atomic_set(&mce_logged, events); 176 atomic_set(&mce_logged, events);
177 call_usermodehelper(trigger, trigger_argv, NULL, -1); 177 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
178 } 178 }
179} 179}
180 180
diff --git a/drivers/Makefile b/drivers/Makefile
index 503d82569449..6d9d7fab77f5 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -15,6 +15,8 @@ obj-$(CONFIG_ACPI) += acpi/
15obj-$(CONFIG_PNP) += pnp/ 15obj-$(CONFIG_PNP) += pnp/
16obj-$(CONFIG_ARM_AMBA) += amba/ 16obj-$(CONFIG_ARM_AMBA) += amba/
17 17
18obj-$(CONFIG_XEN) += xen/
19
18# char/ comes before serial/ etc so that the VT console is the boot-time 20# char/ comes before serial/ etc so that the VT console is the boot-time
19# default. 21# default.
20obj-y += char/ 22obj-y += char/
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index 88a6fc7fd271..58f1338981bc 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -40,6 +40,7 @@
40#include <linux/jiffies.h> 40#include <linux/jiffies.h>
41#include <linux/kmod.h> 41#include <linux/kmod.h>
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/reboot.h>
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44 45
45#include <acpi/acpi_bus.h> 46#include <acpi/acpi_bus.h>
@@ -59,7 +60,6 @@
59#define ACPI_THERMAL_NOTIFY_CRITICAL 0xF0 60#define ACPI_THERMAL_NOTIFY_CRITICAL 0xF0
60#define ACPI_THERMAL_NOTIFY_HOT 0xF1 61#define ACPI_THERMAL_NOTIFY_HOT 0xF1
61#define ACPI_THERMAL_MODE_ACTIVE 0x00 62#define ACPI_THERMAL_MODE_ACTIVE 0x00
62#define ACPI_THERMAL_PATH_POWEROFF "/sbin/poweroff"
63 63
64#define ACPI_THERMAL_MAX_ACTIVE 10 64#define ACPI_THERMAL_MAX_ACTIVE 10
65#define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65 65#define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65
@@ -419,26 +419,6 @@ static int acpi_thermal_get_devices(struct acpi_thermal *tz)
419 return 0; 419 return 0;
420} 420}
421 421
422static int acpi_thermal_call_usermode(char *path)
423{
424 char *argv[2] = { NULL, NULL };
425 char *envp[3] = { NULL, NULL, NULL };
426
427
428 if (!path)
429 return -EINVAL;
430
431 argv[0] = path;
432
433 /* minimal command environment */
434 envp[0] = "HOME=/";
435 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
436
437 call_usermodehelper(argv[0], argv, envp, 0);
438
439 return 0;
440}
441
442static int acpi_thermal_critical(struct acpi_thermal *tz) 422static int acpi_thermal_critical(struct acpi_thermal *tz)
443{ 423{
444 if (!tz || !tz->trips.critical.flags.valid) 424 if (!tz || !tz->trips.critical.flags.valid)
@@ -456,7 +436,7 @@ static int acpi_thermal_critical(struct acpi_thermal *tz)
456 acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL, 436 acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL,
457 tz->trips.critical.flags.enabled); 437 tz->trips.critical.flags.enabled);
458 438
459 acpi_thermal_call_usermode(ACPI_THERMAL_PATH_POWEROFF); 439 orderly_poweroff(true);
460 440
461 return 0; 441 return 0;
462} 442}
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 8f65b88cf711..a4a311992408 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -427,4 +427,13 @@ config XILINX_SYSACE
427 help 427 help
428 Include support for the Xilinx SystemACE CompactFlash interface 428 Include support for the Xilinx SystemACE CompactFlash interface
429 429
430config XEN_BLKDEV_FRONTEND
431 tristate "Xen virtual block device support"
432 depends on XEN
433 default y
434 help
435 This driver implements the front-end of the Xen virtual
436 block device driver. It communicates with a back-end driver
437 in another domain which drives the actual block device.
438
430endif # BLK_DEV 439endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 9ee08ab4ffa8..3e31532df0ed 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_VIODASD) += viodasd.o
29obj-$(CONFIG_BLK_DEV_SX8) += sx8.o 29obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
30obj-$(CONFIG_BLK_DEV_UB) += ub.o 30obj-$(CONFIG_BLK_DEV_UB) += ub.o
31 31
32obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
new file mode 100644
index 000000000000..6746c29181f8
--- /dev/null
+++ b/drivers/block/xen-blkfront.c
@@ -0,0 +1,988 @@
1/*
2 * blkfront.c
3 *
4 * XenLinux virtual block device driver.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach
9 * Copyright (c) 2004, Andrew Warfield
10 * Copyright (c) 2005, Christopher Clark
11 * Copyright (c) 2005, XenSource Ltd
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License version 2
15 * as published by the Free Software Foundation; or, when distributed
16 * separately from the Linux kernel or incorporated into other
17 * software packages, subject to the following license:
18 *
19 * Permission is hereby granted, free of charge, to any person obtaining a copy
20 * of this source file (the "Software"), to deal in the Software without
21 * restriction, including without limitation the rights to use, copy, modify,
22 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
23 * and to permit persons to whom the Software is furnished to do so, subject to
24 * the following conditions:
25 *
26 * The above copyright notice and this permission notice shall be included in
27 * all copies or substantial portions of the Software.
28 *
29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
34 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
35 * IN THE SOFTWARE.
36 */
37
38#include <linux/interrupt.h>
39#include <linux/blkdev.h>
40#include <linux/module.h>
41
42#include <xen/xenbus.h>
43#include <xen/grant_table.h>
44#include <xen/events.h>
45#include <xen/page.h>
46
47#include <xen/interface/grant_table.h>
48#include <xen/interface/io/blkif.h>
49
50#include <asm/xen/hypervisor.h>
51
52enum blkif_state {
53 BLKIF_STATE_DISCONNECTED,
54 BLKIF_STATE_CONNECTED,
55 BLKIF_STATE_SUSPENDED,
56};
57
58struct blk_shadow {
59 struct blkif_request req;
60 unsigned long request;
61 unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
62};
63
64static struct block_device_operations xlvbd_block_fops;
65
66#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
67
68/*
69 * We have one of these per vbd, whether ide, scsi or 'other'. They
70 * hang in private_data off the gendisk structure. We may end up
71 * putting all kinds of interesting stuff here :-)
72 */
73struct blkfront_info
74{
75 struct xenbus_device *xbdev;
76 dev_t dev;
77 struct gendisk *gd;
78 int vdevice;
79 blkif_vdev_t handle;
80 enum blkif_state connected;
81 int ring_ref;
82 struct blkif_front_ring ring;
83 unsigned int evtchn, irq;
84 struct request_queue *rq;
85 struct work_struct work;
86 struct gnttab_free_callback callback;
87 struct blk_shadow shadow[BLK_RING_SIZE];
88 unsigned long shadow_free;
89 int feature_barrier;
90
91 /**
92 * The number of people holding this device open. We won't allow a
93 * hot-unplug unless this is 0.
94 */
95 int users;
96};
97
98static DEFINE_SPINLOCK(blkif_io_lock);
99
100#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
101 (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
102#define GRANT_INVALID_REF 0
103
104#define PARTS_PER_DISK 16
105
106#define BLKIF_MAJOR(dev) ((dev)>>8)
107#define BLKIF_MINOR(dev) ((dev) & 0xff)
108
109#define DEV_NAME "xvd" /* name in /dev */
110
111/* Information about our VBDs. */
112#define MAX_VBDS 64
113static LIST_HEAD(vbds_list);
114
115static int get_id_from_freelist(struct blkfront_info *info)
116{
117 unsigned long free = info->shadow_free;
118 BUG_ON(free > BLK_RING_SIZE);
119 info->shadow_free = info->shadow[free].req.id;
120 info->shadow[free].req.id = 0x0fffffee; /* debug */
121 return free;
122}
123
124static void add_id_to_freelist(struct blkfront_info *info,
125 unsigned long id)
126{
127 info->shadow[id].req.id = info->shadow_free;
128 info->shadow[id].request = 0;
129 info->shadow_free = id;
130}
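Note that the free list is threaded through the shadow entries themselves: for a free slot, req.id holds the index of the next free slot (blkfront_probe() below initialises the chain as 1, 2, ..., terminated by 0x0fffffff), and 0x0fffffee is just a poison value that makes misuse of a still-allocated id easier to spot.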
131
132static void blkif_restart_queue_callback(void *arg)
133{
134 struct blkfront_info *info = (struct blkfront_info *)arg;
135 schedule_work(&info->work);
136}
137
138/*
139 * blkif_queue_request
140 *
141 * request block io
142 *
143 * id: for guest use only.
144 * operation: BLKIF_OP_{READ,WRITE,PROBE}
145 * buffer: buffer to read/write into. this should be a
146 * virtual address in the guest os.
147 */
148static int blkif_queue_request(struct request *req)
149{
150 struct blkfront_info *info = req->rq_disk->private_data;
151 unsigned long buffer_mfn;
152 struct blkif_request *ring_req;
153 struct bio *bio;
154 struct bio_vec *bvec;
155 int idx;
156 unsigned long id;
157 unsigned int fsect, lsect;
158 int ref;
159 grant_ref_t gref_head;
160
161 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
162 return 1;
163
164 if (gnttab_alloc_grant_references(
165 BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
166 gnttab_request_free_callback(
167 &info->callback,
168 blkif_restart_queue_callback,
169 info,
170 BLKIF_MAX_SEGMENTS_PER_REQUEST);
171 return 1;
172 }
173
174 /* Fill out a communications ring structure. */
175 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
176 id = get_id_from_freelist(info);
177 info->shadow[id].request = (unsigned long)req;
178
179 ring_req->id = id;
180 ring_req->sector_number = (blkif_sector_t)req->sector;
181 ring_req->handle = info->handle;
182
183 ring_req->operation = rq_data_dir(req) ?
184 BLKIF_OP_WRITE : BLKIF_OP_READ;
185 if (blk_barrier_rq(req))
186 ring_req->operation = BLKIF_OP_WRITE_BARRIER;
187
188 ring_req->nr_segments = 0;
189 rq_for_each_bio (bio, req) {
190 bio_for_each_segment (bvec, bio, idx) {
191 BUG_ON(ring_req->nr_segments
192 == BLKIF_MAX_SEGMENTS_PER_REQUEST);
193 buffer_mfn = pfn_to_mfn(page_to_pfn(bvec->bv_page));
194 fsect = bvec->bv_offset >> 9;
195 lsect = fsect + (bvec->bv_len >> 9) - 1;
196 /* install a grant reference. */
197 ref = gnttab_claim_grant_reference(&gref_head);
198 BUG_ON(ref == -ENOSPC);
199
200 gnttab_grant_foreign_access_ref(
201 ref,
202 info->xbdev->otherend_id,
203 buffer_mfn,
204 rq_data_dir(req) );
205
206 info->shadow[id].frame[ring_req->nr_segments] =
207 mfn_to_pfn(buffer_mfn);
208
209 ring_req->seg[ring_req->nr_segments] =
210 (struct blkif_request_segment) {
211 .gref = ref,
212 .first_sect = fsect,
213 .last_sect = lsect };
214
215 ring_req->nr_segments++;
216 }
217 }
218
219 info->ring.req_prod_pvt++;
220
221 /* Keep a private copy so we can reissue requests when recovering. */
222 info->shadow[id].req = *ring_req;
223
224 gnttab_free_grant_references(gref_head);
225
226 return 0;
227}
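Worked example: a bio_vec covering a whole page (bv_offset = 0, bv_len = 4096) yields fsect = 0 and lsect = 7, i.e. all eight 512-byte sectors of the granted frame, while a 1 KB vec starting at offset 512 yields fsect = 1 and lsect = 2.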
228
229
230static inline void flush_requests(struct blkfront_info *info)
231{
232 int notify;
233
234 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
235
236 if (notify)
237 notify_remote_via_irq(info->irq);
238}
239
240/*
241 * do_blkif_request
242 * read a block; request is in a request queue
243 */
244static void do_blkif_request(request_queue_t *rq)
245{
246 struct blkfront_info *info = NULL;
247 struct request *req;
248 int queued;
249
250 pr_debug("Entered do_blkif_request\n");
251
252 queued = 0;
253
254 while ((req = elv_next_request(rq)) != NULL) {
255 info = req->rq_disk->private_data;
256 if (!blk_fs_request(req)) {
257 end_request(req, 0);
258 continue;
259 }
260
261 if (RING_FULL(&info->ring))
262 goto wait;
263
264 pr_debug("do_blk_req %p: cmd %p, sec %lx, "
265 "(%u/%li) buffer:%p [%s]\n",
266 req, req->cmd, (unsigned long)req->sector,
267 req->current_nr_sectors,
268 req->nr_sectors, req->buffer,
269 rq_data_dir(req) ? "write" : "read");
270
271
272 blkdev_dequeue_request(req);
273 if (blkif_queue_request(req)) {
274 blk_requeue_request(rq, req);
275wait:
276 /* Avoid pointless unplugs. */
277 blk_stop_queue(rq);
278 break;
279 }
280
281 queued++;
282 }
283
284 if (queued != 0)
285 flush_requests(info);
286}
287
288static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
289{
290 request_queue_t *rq;
291
292 rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
293 if (rq == NULL)
294 return -1;
295
296 elevator_init(rq, "noop");
297
298 /* Hard sector size and max sectors impersonate the equiv. hardware. */
299 blk_queue_hardsect_size(rq, sector_size);
300 blk_queue_max_sectors(rq, 512);
301
302 /* Each segment in a request is up to an aligned page in size. */
303 blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
304 blk_queue_max_segment_size(rq, PAGE_SIZE);
305
306 /* Ensure a merged request will fit in a single I/O ring slot. */
307 blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
308 blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
309
310 /* Make sure buffer addresses are sector-aligned. */
311 blk_queue_dma_alignment(rq, 511);
312
313 gd->queue = rq;
314
315 return 0;
316}
317
318
319static int xlvbd_barrier(struct blkfront_info *info)
320{
321 int err;
322
323 err = blk_queue_ordered(info->rq,
324 info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
325 NULL);
326
327 if (err)
328 return err;
329
330 printk(KERN_INFO "blkfront: %s: barriers %s\n",
331 info->gd->disk_name,
332 info->feature_barrier ? "enabled" : "disabled");
333 return 0;
334}
335
336
337static int xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity,
338 int vdevice, u16 vdisk_info, u16 sector_size,
339 struct blkfront_info *info)
340{
341 struct gendisk *gd;
342 int nr_minors = 1;
343 int err = -ENODEV;
344
345 BUG_ON(info->gd != NULL);
346 BUG_ON(info->rq != NULL);
347
348 if ((minor % PARTS_PER_DISK) == 0)
349 nr_minors = PARTS_PER_DISK;
350
351 gd = alloc_disk(nr_minors);
352 if (gd == NULL)
353 goto out;
354
355 if (nr_minors > 1)
356 sprintf(gd->disk_name, "%s%c", DEV_NAME,
357 'a' + minor / PARTS_PER_DISK);
358 else
359 sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
360 'a' + minor / PARTS_PER_DISK,
361 minor % PARTS_PER_DISK);
362
363 gd->major = XENVBD_MAJOR;
364 gd->first_minor = minor;
365 gd->fops = &xlvbd_block_fops;
366 gd->private_data = info;
367 gd->driverfs_dev = &(info->xbdev->dev);
368 set_capacity(gd, capacity);
369
370 if (xlvbd_init_blk_queue(gd, sector_size)) {
371 del_gendisk(gd);
372 goto out;
373 }
374
375 info->rq = gd->queue;
376 info->gd = gd;
377
378 if (info->feature_barrier)
379 xlvbd_barrier(info);
380
381 if (vdisk_info & VDISK_READONLY)
382 set_disk_ro(gd, 1);
383
384 if (vdisk_info & VDISK_REMOVABLE)
385 gd->flags |= GENHD_FL_REMOVABLE;
386
387 if (vdisk_info & VDISK_CDROM)
388 gd->flags |= GENHD_FL_CD;
389
390 return 0;
391
392 out:
393 return err;
394}
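For illustration, with PARTS_PER_DISK = 16: minor 0 allocates a 16-minor gendisk named xvda, minor 17 gets the single partition node xvdb1, and minor 16 is again a whole disk, xvdb.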
395
396static void kick_pending_request_queues(struct blkfront_info *info)
397{
398 if (!RING_FULL(&info->ring)) {
399 /* Re-enable calldowns. */
400 blk_start_queue(info->rq);
401 /* Kick things off immediately. */
402 do_blkif_request(info->rq);
403 }
404}
405
406static void blkif_restart_queue(struct work_struct *work)
407{
408 struct blkfront_info *info = container_of(work, struct blkfront_info, work);
409
410 spin_lock_irq(&blkif_io_lock);
411 if (info->connected == BLKIF_STATE_CONNECTED)
412 kick_pending_request_queues(info);
413 spin_unlock_irq(&blkif_io_lock);
414}
415
416static void blkif_free(struct blkfront_info *info, int suspend)
417{
418 /* Prevent new requests being issued until we fix things up. */
419 spin_lock_irq(&blkif_io_lock);
420 info->connected = suspend ?
421 BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
422 /* No more blkif_request(). */
423 if (info->rq)
424 blk_stop_queue(info->rq);
425 /* No more gnttab callback work. */
426 gnttab_cancel_free_callback(&info->callback);
427 spin_unlock_irq(&blkif_io_lock);
428
429 /* Flush gnttab callback work. Must be done with no locks held. */
430 flush_scheduled_work();
431
432 /* Free resources associated with old device channel. */
433 if (info->ring_ref != GRANT_INVALID_REF) {
434 gnttab_end_foreign_access(info->ring_ref, 0,
435 (unsigned long)info->ring.sring);
436 info->ring_ref = GRANT_INVALID_REF;
437 info->ring.sring = NULL;
438 }
439 if (info->irq)
440 unbind_from_irqhandler(info->irq, info);
441 info->evtchn = info->irq = 0;
442
443}
444
445static void blkif_completion(struct blk_shadow *s)
446{
447 int i;
448 for (i = 0; i < s->req.nr_segments; i++)
449 gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
450}
451
452static irqreturn_t blkif_interrupt(int irq, void *dev_id)
453{
454 struct request *req;
455 struct blkif_response *bret;
456 RING_IDX i, rp;
457 unsigned long flags;
458 struct blkfront_info *info = (struct blkfront_info *)dev_id;
459 int uptodate;
460
461 spin_lock_irqsave(&blkif_io_lock, flags);
462
463 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
464 spin_unlock_irqrestore(&blkif_io_lock, flags);
465 return IRQ_HANDLED;
466 }
467
468 again:
469 rp = info->ring.sring->rsp_prod;
470 rmb(); /* Ensure we see queued responses up to 'rp'. */
471
472 for (i = info->ring.rsp_cons; i != rp; i++) {
473 unsigned long id;
474 int ret;
475
476 bret = RING_GET_RESPONSE(&info->ring, i);
477 id = bret->id;
478 req = (struct request *)info->shadow[id].request;
479
480 blkif_completion(&info->shadow[id]);
481
482 add_id_to_freelist(info, id);
483
484 uptodate = (bret->status == BLKIF_RSP_OKAY);
485 switch (bret->operation) {
486 case BLKIF_OP_WRITE_BARRIER:
487 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
488 printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
489 info->gd->disk_name);
490 uptodate = -EOPNOTSUPP;
491 info->feature_barrier = 0;
492 xlvbd_barrier(info);
493 }
494 /* fall through */
495 case BLKIF_OP_READ:
496 case BLKIF_OP_WRITE:
497 if (unlikely(bret->status != BLKIF_RSP_OKAY))
498 dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
499 "request: %x\n", bret->status);
500
501 ret = end_that_request_first(req, uptodate,
502 req->hard_nr_sectors);
503 BUG_ON(ret);
504 end_that_request_last(req, uptodate);
505 break;
506 default:
507 BUG();
508 }
509 }
510
511 info->ring.rsp_cons = i;
512
513 if (i != info->ring.req_prod_pvt) {
514 int more_to_do;
515 RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
516 if (more_to_do)
517 goto again;
518 } else
519 info->ring.sring->rsp_event = i + 1;
520
521 kick_pending_request_queues(info);
522
523 spin_unlock_irqrestore(&blkif_io_lock, flags);
524
525 return IRQ_HANDLED;
526}
527
528
529static int setup_blkring(struct xenbus_device *dev,
530 struct blkfront_info *info)
531{
532 struct blkif_sring *sring;
533 int err;
534
535 info->ring_ref = GRANT_INVALID_REF;
536
537 sring = (struct blkif_sring *)__get_free_page(GFP_KERNEL);
538 if (!sring) {
539 xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
540 return -ENOMEM;
541 }
542 SHARED_RING_INIT(sring);
543 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
544
545 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
546 if (err < 0) {
547 free_page((unsigned long)sring);
548 info->ring.sring = NULL;
549 goto fail;
550 }
551 info->ring_ref = err;
552
553 err = xenbus_alloc_evtchn(dev, &info->evtchn);
554 if (err)
555 goto fail;
556
557 err = bind_evtchn_to_irqhandler(info->evtchn,
558 blkif_interrupt,
559 IRQF_SAMPLE_RANDOM, "blkif", info);
560 if (err <= 0) {
561 xenbus_dev_fatal(dev, err,
562 "bind_evtchn_to_irqhandler failed");
563 goto fail;
564 }
565 info->irq = err;
566
567 return 0;
568fail:
569 blkif_free(info, 0);
570 return err;
571}
572
573
574/* Common code used when first setting up, and when resuming. */
575static int talk_to_backend(struct xenbus_device *dev,
576 struct blkfront_info *info)
577{
578 const char *message = NULL;
579 struct xenbus_transaction xbt;
580 int err;
581
582 /* Create shared ring, alloc event channel. */
583 err = setup_blkring(dev, info);
584 if (err)
585 goto out;
586
587again:
588 err = xenbus_transaction_start(&xbt);
589 if (err) {
590 xenbus_dev_fatal(dev, err, "starting transaction");
591 goto destroy_blkring;
592 }
593
594 err = xenbus_printf(xbt, dev->nodename,
595 "ring-ref", "%u", info->ring_ref);
596 if (err) {
597 message = "writing ring-ref";
598 goto abort_transaction;
599 }
600 err = xenbus_printf(xbt, dev->nodename,
601 "event-channel", "%u", info->evtchn);
602 if (err) {
603 message = "writing event-channel";
604 goto abort_transaction;
605 }
606
607 err = xenbus_transaction_end(xbt, 0);
608 if (err) {
609 if (err == -EAGAIN)
610 goto again;
611 xenbus_dev_fatal(dev, err, "completing transaction");
612 goto destroy_blkring;
613 }
614
615 xenbus_switch_state(dev, XenbusStateInitialised);
616
617 return 0;
618
619 abort_transaction:
620 xenbus_transaction_end(xbt, 1);
621 if (message)
622 xenbus_dev_fatal(dev, err, "%s", message);
623 destroy_blkring:
624 blkif_free(info, 0);
625 out:
626 return err;
627}
628
629
630/**
631 * Entry point to this code when a new device is created. Allocate the basic
632 * structures and the ring buffer for communication with the backend, and
633 * inform the backend of the appropriate details for those. Switch to
634 * Initialised state.
635 */
636static int blkfront_probe(struct xenbus_device *dev,
637 const struct xenbus_device_id *id)
638{
639 int err, vdevice, i;
640 struct blkfront_info *info;
641
642 /* FIXME: Use dynamic device id if this is not set. */
643 err = xenbus_scanf(XBT_NIL, dev->nodename,
644 "virtual-device", "%i", &vdevice);
645 if (err != 1) {
646 xenbus_dev_fatal(dev, err, "reading virtual-device");
647 return err;
648 }
649
650 info = kzalloc(sizeof(*info), GFP_KERNEL);
651 if (!info) {
652 xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
653 return -ENOMEM;
654 }
655
656 info->xbdev = dev;
657 info->vdevice = vdevice;
658 info->connected = BLKIF_STATE_DISCONNECTED;
659 INIT_WORK(&info->work, blkif_restart_queue);
660
661 for (i = 0; i < BLK_RING_SIZE; i++)
662 info->shadow[i].req.id = i+1;
663 info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
664
665 /* Front end dir is a number, which is used as the id. */
666 info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
667 dev->dev.driver_data = info;
668
669 err = talk_to_backend(dev, info);
670 if (err) {
671 kfree(info);
672 dev->dev.driver_data = NULL;
673 return err;
674 }
675
676 return 0;
677}
678
679
680static int blkif_recover(struct blkfront_info *info)
681{
682 int i;
683 struct blkif_request *req;
684 struct blk_shadow *copy;
685 int j;
686
687 /* Stage 1: Make a safe copy of the shadow state. */
688 copy = kmalloc(sizeof(info->shadow), GFP_KERNEL);
689 if (!copy)
690 return -ENOMEM;
691 memcpy(copy, info->shadow, sizeof(info->shadow));
692
693 /* Stage 2: Set up free list. */
694 memset(&info->shadow, 0, sizeof(info->shadow));
695 for (i = 0; i < BLK_RING_SIZE; i++)
696 info->shadow[i].req.id = i+1;
697 info->shadow_free = info->ring.req_prod_pvt;
698 info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
699
700 /* Stage 3: Find pending requests and requeue them. */
701 for (i = 0; i < BLK_RING_SIZE; i++) {
702 /* Not in use? */
703 if (copy[i].request == 0)
704 continue;
705
706 /* Grab a request slot and copy shadow state into it. */
707 req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
708 *req = copy[i].req;
709
710 /* We get a new request id, and must reset the shadow state. */
711 req->id = get_id_from_freelist(info);
712 memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
713
714 /* Rewrite any grant references invalidated by susp/resume. */
715 for (j = 0; j < req->nr_segments; j++)
716 gnttab_grant_foreign_access_ref(
717 req->seg[j].gref,
718 info->xbdev->otherend_id,
719 pfn_to_mfn(info->shadow[req->id].frame[j]),
720 rq_data_dir(
721 (struct request *)
722 info->shadow[req->id].request));
723 info->shadow[req->id].req = *req;
724
725 info->ring.req_prod_pvt++;
726 }
727
728 kfree(copy);
729
730 xenbus_switch_state(info->xbdev, XenbusStateConnected);
731
732 spin_lock_irq(&blkif_io_lock);
733
734 /* Now safe for us to use the shared ring */
735 info->connected = BLKIF_STATE_CONNECTED;
736
737 /* Send off requeued requests */
738 flush_requests(info);
739
740 /* Kick any other new requests queued since we resumed */
741 kick_pending_request_queues(info);
742
743 spin_unlock_irq(&blkif_io_lock);
744
745 return 0;
746}
747
748/**
749 * We are reconnecting to the backend, due to a suspend/resume, or a backend
750 * driver restart. We tear down our blkif structure and recreate it, but
751 * leave the device-layer structures intact so that this is transparent to the
752 * rest of the kernel.
753 */
754static int blkfront_resume(struct xenbus_device *dev)
755{
756 struct blkfront_info *info = dev->dev.driver_data;
757 int err;
758
759 dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
760
761 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
762
763 err = talk_to_backend(dev, info);
764 if (info->connected == BLKIF_STATE_SUSPENDED && !err)
765 err = blkif_recover(info);
766
767 return err;
768}
769
770
771/*
772 * Invoked when the backend is finally 'ready' (and has produced
773 * the details about the physical device - #sectors, size, etc).
774 */
775static void blkfront_connect(struct blkfront_info *info)
776{
777 unsigned long long sectors;
778 unsigned long sector_size;
779 unsigned int binfo;
780 int err;
781
782 if ((info->connected == BLKIF_STATE_CONNECTED) ||
783 (info->connected == BLKIF_STATE_SUSPENDED) )
784 return;
785
786 dev_dbg(&info->xbdev->dev, "%s:%s.\n",
787 __func__, info->xbdev->otherend);
788
789 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
790 "sectors", "%llu", &sectors,
791 "info", "%u", &binfo,
792 "sector-size", "%lu", &sector_size,
793 NULL);
794 if (err) {
795 xenbus_dev_fatal(info->xbdev, err,
796 "reading backend fields at %s",
797 info->xbdev->otherend);
798 return;
799 }
800
801 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
802 "feature-barrier", "%lu", &info->feature_barrier,
803 NULL);
804 if (err)
805 info->feature_barrier = 0;
806
807 err = xlvbd_alloc_gendisk(BLKIF_MINOR(info->vdevice),
808 sectors, info->vdevice,
809 binfo, sector_size, info);
810 if (err) {
811 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
812 info->xbdev->otherend);
813 return;
814 }
815
816 xenbus_switch_state(info->xbdev, XenbusStateConnected);
817
818 /* Kick pending requests. */
819 spin_lock_irq(&blkif_io_lock);
820 info->connected = BLKIF_STATE_CONNECTED;
821 kick_pending_request_queues(info);
822 spin_unlock_irq(&blkif_io_lock);
823
824 add_disk(info->gd);
825}
826
827/**
828 * Handle the change of state of the backend to Closing. We must delete our
829 * device-layer structures now, to ensure that writes are flushed through to
830 * the backend.  Once this is done, we can switch to Closed in
831 * acknowledgement.
832 */
833static void blkfront_closing(struct xenbus_device *dev)
834{
835 struct blkfront_info *info = dev->dev.driver_data;
836 unsigned long flags;
837
838 dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);
839
840 if (info->rq == NULL)
841 goto out;
842
843 spin_lock_irqsave(&blkif_io_lock, flags);
844
845 del_gendisk(info->gd);
846
847 /* No more blkif_request(). */
848 blk_stop_queue(info->rq);
849
850 /* No more gnttab callback work. */
851 gnttab_cancel_free_callback(&info->callback);
852 spin_unlock_irqrestore(&blkif_io_lock, flags);
853
854 /* Flush gnttab callback work. Must be done with no locks held. */
855 flush_scheduled_work();
856
857 blk_cleanup_queue(info->rq);
858 info->rq = NULL;
859
860 out:
861 xenbus_frontend_closed(dev);
862}
863
864/**
865 * Callback received when the backend's state changes.
866 */
867static void backend_changed(struct xenbus_device *dev,
868 enum xenbus_state backend_state)
869{
870 struct blkfront_info *info = dev->dev.driver_data;
871 struct block_device *bd;
872
873 dev_dbg(&dev->dev, "blkfront:backend_changed.\n");
874
875 switch (backend_state) {
876 case XenbusStateInitialising:
877 case XenbusStateInitWait:
878 case XenbusStateInitialised:
879 case XenbusStateUnknown:
880 case XenbusStateClosed:
881 break;
882
883 case XenbusStateConnected:
884 blkfront_connect(info);
885 break;
886
887 case XenbusStateClosing:
888 bd = bdget(info->dev);
889 if (bd == NULL)
890 xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
891
892 mutex_lock(&bd->bd_mutex);
893 if (info->users > 0)
894 xenbus_dev_error(dev, -EBUSY,
895 "Device in use; refusing to close");
896 else
897 blkfront_closing(dev);
898 mutex_unlock(&bd->bd_mutex);
899 bdput(bd);
900 break;
901 }
902}
903
904static int blkfront_remove(struct xenbus_device *dev)
905{
906 struct blkfront_info *info = dev->dev.driver_data;
907
908 dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename);
909
910 blkif_free(info, 0);
911
912 kfree(info);
913
914 return 0;
915}
916
917static int blkif_open(struct inode *inode, struct file *filep)
918{
919 struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
920 info->users++;
921 return 0;
922}
923
924static int blkif_release(struct inode *inode, struct file *filep)
925{
926 struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
927 info->users--;
928 if (info->users == 0) {
929 /* Check whether we have been instructed to close. We will
930 have ignored this request initially, as the device was
931 still mounted. */
932 struct xenbus_device *dev = info->xbdev;
933 enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
934
935 if (state == XenbusStateClosing)
936 blkfront_closing(dev);
937 }
938 return 0;
939}
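
The open/release handlers above implement a deferred close: a Closing request that arrives while the device is still open is refused in backend_changed(), and the teardown is finally run by the last blkif_release(). A minimal user-space sketch of that reference-counting pattern (vbd_open, vbd_release and backend_wants_close are illustrative names, not part of the driver):

#include <stdbool.h>
#include <stdio.h>

static int users;                 /* open handles, like info->users   */
static bool backend_wants_close;  /* set when backend goes to Closing */

static void do_close(void) { puts("tearing down device"); }

static void vbd_open(void)  { users++; }

static void vbd_release(void)
{
	users--;
	/* Last reference gone: honour a close request we deferred earlier. */
	if (users == 0 && backend_wants_close)
		do_close();
}

int main(void)
{
	vbd_open();
	backend_wants_close = true;   /* Closing arrives while device is busy */
	vbd_release();                /* last user goes away -> close now     */
	return 0;
}
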
940
941static struct block_device_operations xlvbd_block_fops =
942{
943 .owner = THIS_MODULE,
944 .open = blkif_open,
945 .release = blkif_release,
946};
947
948
949static struct xenbus_device_id blkfront_ids[] = {
950 { "vbd" },
951 { "" }
952};
953
954static struct xenbus_driver blkfront = {
955 .name = "vbd",
956 .owner = THIS_MODULE,
957 .ids = blkfront_ids,
958 .probe = blkfront_probe,
959 .remove = blkfront_remove,
960 .resume = blkfront_resume,
961 .otherend_changed = backend_changed,
962};
963
964static int __init xlblk_init(void)
965{
966 if (!is_running_on_xen())
967 return -ENODEV;
968
969 if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
970 printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
971 XENVBD_MAJOR, DEV_NAME);
972 return -ENODEV;
973 }
974
975 return xenbus_register_frontend(&blkfront);
976}
977module_init(xlblk_init);
978
979
980static void xlblk_exit(void)
981{
982 return xenbus_unregister_driver(&blkfront);
983}
984module_exit(xlblk_exit);
985
986MODULE_DESCRIPTION("Xen virtual block device frontend");
987MODULE_LICENSE("GPL");
988MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 97bd71bc3aea..9e8f21410d2d 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -604,6 +604,14 @@ config HVC_BEAT
604 help 604 help
605 Toshiba's Cell Reference Set Beat Console device driver 605 Toshiba's Cell Reference Set Beat Console device driver
606 606
607config HVC_XEN
608 bool "Xen Hypervisor Console support"
609 depends on XEN
610 select HVC_DRIVER
611 default y
612 help
613 Xen virtual console device driver
614
607config HVCS 615config HVCS
608 tristate "IBM Hypervisor Virtual Console Server support" 616 tristate "IBM Hypervisor Virtual Console Server support"
609 depends on PPC_PSERIES 617 depends on PPC_PSERIES
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index f2996a95eb07..8852b8d643cf 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o
48obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o 48obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o
49obj-$(CONFIG_HVC_BEAT) += hvc_beat.o 49obj-$(CONFIG_HVC_BEAT) += hvc_beat.o
50obj-$(CONFIG_HVC_DRIVER) += hvc_console.o 50obj-$(CONFIG_HVC_DRIVER) += hvc_console.o
51obj-$(CONFIG_HVC_XEN) += hvc_xen.o
51obj-$(CONFIG_RAW_DRIVER) += raw.o 52obj-$(CONFIG_RAW_DRIVER) += raw.o
52obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o 53obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o
53obj-$(CONFIG_MSPEC) += mspec.o 54obj-$(CONFIG_MSPEC) += mspec.o
diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c
new file mode 100644
index 000000000000..dd68f8541c2d
--- /dev/null
+++ b/drivers/char/hvc_xen.c
@@ -0,0 +1,159 @@
1/*
2 * xen console driver interface to hvc_console.c
3 *
4 * (c) 2007 Gerd Hoffmann <kraxel@suse.de>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include <linux/console.h>
22#include <linux/delay.h>
23#include <linux/err.h>
24#include <linux/init.h>
25#include <linux/types.h>
26
27#include <asm/xen/hypervisor.h>
28#include <xen/page.h>
29#include <xen/events.h>
30#include <xen/interface/io/console.h>
31#include <xen/hvc-console.h>
32
33#include "hvc_console.h"
34
35#define HVC_COOKIE 0x58656e /* "Xen" in hex */
36
37static struct hvc_struct *hvc;
38static int xencons_irq;
39
40/* ------------------------------------------------------------------ */
41
42static inline struct xencons_interface *xencons_interface(void)
43{
44 return mfn_to_virt(xen_start_info->console.domU.mfn);
45}
46
47static inline void notify_daemon(void)
48{
49 /* Use evtchn: this is called early, before irq is set up. */
50 notify_remote_via_evtchn(xen_start_info->console.domU.evtchn);
51}
52
53static int write_console(uint32_t vtermno, const char *data, int len)
54{
55 struct xencons_interface *intf = xencons_interface();
56 XENCONS_RING_IDX cons, prod;
57 int sent = 0;
58
59 cons = intf->out_cons;
60 prod = intf->out_prod;
61 mb(); /* update queue values before going on */
62 BUG_ON((prod - cons) > sizeof(intf->out));
63
64 while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
65 intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
66
67 wmb(); /* write ring before updating pointer */
68 intf->out_prod = prod;
69
70 notify_daemon();
71 return sent;
72}
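
write_console() and read_console() follow the usual Xen console ring convention: out_prod and out_cons are free-running indices, a slot is selected by masking against the power-of-two ring size, and prod - cons gives the fill level even after the indices wrap. A stand-alone sketch of the same indexing scheme, assuming a power-of-two RING_SIZE (the names here are illustrative, not the Xen ring definitions):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define RING_SIZE 8                      /* must be a power of two */
#define MASK(i)   ((i) & (RING_SIZE - 1))

static char ring[RING_SIZE];
static uint32_t prod, cons;              /* free-running, never reset */

static int ring_write(const char *data, int len)
{
	int sent = 0;
	/* prod - cons is the fill level even after the indices wrap. */
	while (sent < len && (prod - cons) < RING_SIZE)
		ring[MASK(prod++)] = data[sent++];
	return sent;
}

static int ring_read(char *buf, int len)
{
	int recv = 0;
	while (cons != prod && recv < len)
		buf[recv++] = ring[MASK(cons++)];
	return recv;
}

int main(void)
{
	char out[16] = "";
	int n;

	ring_write("hello", 5);
	n = ring_read(out, sizeof(out) - 1);
	out[n] = '\0';
	assert(strcmp(out, "hello") == 0);
	printf("%s\n", out);
	return 0;
}
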
73
74static int read_console(uint32_t vtermno, char *buf, int len)
75{
76 struct xencons_interface *intf = xencons_interface();
77 XENCONS_RING_IDX cons, prod;
78 int recv = 0;
79
80 cons = intf->in_cons;
81 prod = intf->in_prod;
82 mb(); /* get pointers before reading ring */
83 BUG_ON((prod - cons) > sizeof(intf->in));
84
85 while (cons != prod && recv < len)
86 buf[recv++] = intf->in[MASK_XENCONS_IDX(cons++, intf->in)];
87
88 mb(); /* read ring before consuming */
89 intf->in_cons = cons;
90
91 notify_daemon();
92 return recv;
93}
94
95static struct hv_ops hvc_ops = {
96 .get_chars = read_console,
97 .put_chars = write_console,
98};
99
100static int __init xen_init(void)
101{
102 struct hvc_struct *hp;
103
104 if (!is_running_on_xen())
105 return 0;
106
107 xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
108 if (xencons_irq < 0)
109 xencons_irq = 0 /* NO_IRQ */;
110 hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256);
111 if (IS_ERR(hp))
112 return PTR_ERR(hp);
113
114 hvc = hp;
115 return 0;
116}
117
118static void __exit xen_fini(void)
119{
120 if (hvc)
121 hvc_remove(hvc);
122}
123
124static int xen_cons_init(void)
125{
126 if (!is_running_on_xen())
127 return 0;
128
129 hvc_instantiate(HVC_COOKIE, 0, &hvc_ops);
130 return 0;
131}
132
133module_init(xen_init);
134module_exit(xen_fini);
135console_initcall(xen_cons_init);
136
137static void xenboot_write_console(struct console *console, const char *string,
138 unsigned len)
139{
140 unsigned int linelen, off = 0;
141 const char *pos;
142
143 while (off < len && NULL != (pos = strchr(string+off, '\n'))) {
144		linelen = pos - (string + off);
145 if (off + linelen > len)
146 break;
147 write_console(0, string+off, linelen);
148 write_console(0, "\r\n", 2);
149 off += linelen + 1;
150 }
151 if (off < len)
152 write_console(0, string+off, len-off);
153}
154
155struct console xenboot_console = {
156 .name = "xenboot",
157 .write = xenboot_write_console,
158 .flags = CON_PRINTBUFFER | CON_BOOT,
159};
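
xenboot_write_console() splits its output at newlines so that a carriage return can be emitted before each line feed. A user-space sketch of that splitting, with write_console() replaced by a hypothetical emit() helper:

#include <stdio.h>
#include <string.h>

/* Stand-in for write_console(): here it just prints to stdout. */
static void emit(const char *data, unsigned len)
{
	fwrite(data, 1, len, stdout);
}

/* Emit "\r\n" in place of every "\n", as the boot console does. */
static void write_with_crlf(const char *string, unsigned len)
{
	unsigned off = 0;
	const char *pos;

	while (off < len && (pos = memchr(string + off, '\n', len - off))) {
		unsigned linelen = pos - (string + off);  /* bytes before '\n' */
		emit(string + off, linelen);
		emit("\r\n", 2);
		off += linelen + 1;
	}
	if (off < len)
		emit(string + off, len - off);
}

int main(void)
{
	const char msg[] = "xen: booting\nconsole ready\n";
	write_with_crlf(msg, sizeof(msg) - 1);
	return 0;
}
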
diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c
index dbb22403979f..3d90fc002097 100644
--- a/drivers/macintosh/therm_pm72.c
+++ b/drivers/macintosh/therm_pm72.c
@@ -1770,7 +1770,8 @@ static int call_critical_overtemp(void)
1770 "PATH=/sbin:/usr/sbin:/bin:/usr/bin", 1770 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
1771 NULL }; 1771 NULL };
1772 1772
1773 return call_usermodehelper(critical_overtemp_path, argv, envp, 0); 1773 return call_usermodehelper(critical_overtemp_path,
1774 argv, envp, UMH_WAIT_EXEC);
1774} 1775}
1775 1776
1776 1777
diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c
index e18d265d5d33..516d943227e2 100644
--- a/drivers/macintosh/windfarm_core.c
+++ b/drivers/macintosh/windfarm_core.c
@@ -80,7 +80,8 @@ int wf_critical_overtemp(void)
80 "PATH=/sbin:/usr/sbin:/bin:/usr/bin", 80 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
81 NULL }; 81 NULL };
82 82
83 return call_usermodehelper(critical_overtemp_path, argv, envp, 0); 83 return call_usermodehelper(critical_overtemp_path,
84 argv, envp, UMH_WAIT_EXEC);
84} 85}
85EXPORT_SYMBOL_GPL(wf_critical_overtemp); 86EXPORT_SYMBOL_GPL(wf_critical_overtemp);
86 87
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 43d03178064d..5fb659f8b20e 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2486,6 +2486,18 @@ source "drivers/atm/Kconfig"
2486 2486
2487source "drivers/s390/net/Kconfig" 2487source "drivers/s390/net/Kconfig"
2488 2488
2489config XEN_NETDEV_FRONTEND
2490 tristate "Xen network device frontend driver"
2491 depends on XEN
2492 default y
2493 help
2494 The network device frontend driver allows the kernel to
2495	  access network devices exported by a virtual
2496 machine containing a physical network device driver. The
2497 frontend driver is intended for unprivileged guest domains;
2498 if you are compiling a kernel for a Xen guest, you almost
2499 certainly want to enable this.
2500
2489config ISERIES_VETH 2501config ISERIES_VETH
2490 tristate "iSeries Virtual Ethernet driver support" 2502 tristate "iSeries Virtual Ethernet driver support"
2491 depends on PPC_ISERIES 2503 depends on PPC_ISERIES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index eb4167622a6a..0e286ab8855a 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -127,6 +127,8 @@ obj-$(CONFIG_PPPOL2TP) += pppox.o pppol2tp.o
127obj-$(CONFIG_SLIP) += slip.o 127obj-$(CONFIG_SLIP) += slip.o
128obj-$(CONFIG_SLHC) += slhc.o 128obj-$(CONFIG_SLHC) += slhc.o
129 129
130obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
131
130obj-$(CONFIG_DUMMY) += dummy.o 132obj-$(CONFIG_DUMMY) += dummy.o
131obj-$(CONFIG_IFB) += ifb.o 133obj-$(CONFIG_IFB) += ifb.o
132obj-$(CONFIG_MACVLAN) += macvlan.o 134obj-$(CONFIG_MACVLAN) += macvlan.o
diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 84aa2117c0ee..355c6cf3d112 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -320,7 +320,7 @@ static int eppconfig(struct baycom_state *bc)
320 sprintf(portarg, "%ld", bc->pdev->port->base); 320 sprintf(portarg, "%ld", bc->pdev->port->base);
321 printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg); 321 printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg);
322 322
323 return call_usermodehelper(eppconfig_path, argv, envp, 1); 323 return call_usermodehelper(eppconfig_path, argv, envp, UMH_WAIT_PROC);
324} 324}
325 325
326/* ---------------------------------------------------------------------- */ 326/* ---------------------------------------------------------------------- */
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
new file mode 100644
index 000000000000..489f69c5d6ca
--- /dev/null
+++ b/drivers/net/xen-netfront.c
@@ -0,0 +1,1863 @@
1/*
2 * Virtual network driver for conversing with remote driver backends.
3 *
4 * Copyright (c) 2002-2005, K A Fraser
5 * Copyright (c) 2005, XenSource Ltd
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License version 2
9 * as published by the Free Software Foundation; or, when distributed
10 * separately from the Linux kernel or incorporated into other
11 * software packages, subject to the following license:
12 *
13 * Permission is hereby granted, free of charge, to any person obtaining a copy
14 * of this source file (the "Software"), to deal in the Software without
15 * restriction, including without limitation the rights to use, copy, modify,
16 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
17 * and to permit persons to whom the Software is furnished to do so, subject to
18 * the following conditions:
19 *
20 * The above copyright notice and this permission notice shall be included in
21 * all copies or substantial portions of the Software.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
29 * IN THE SOFTWARE.
30 */
31
32#include <linux/module.h>
33#include <linux/kernel.h>
34#include <linux/netdevice.h>
35#include <linux/etherdevice.h>
36#include <linux/skbuff.h>
37#include <linux/ethtool.h>
38#include <linux/if_ether.h>
39#include <linux/tcp.h>
40#include <linux/udp.h>
41#include <linux/moduleparam.h>
42#include <linux/mm.h>
43#include <net/ip.h>
44
45#include <xen/xenbus.h>
46#include <xen/events.h>
47#include <xen/page.h>
48#include <xen/grant_table.h>
49
50#include <xen/interface/io/netif.h>
51#include <xen/interface/memory.h>
52#include <xen/interface/grant_table.h>
53
54static struct ethtool_ops xennet_ethtool_ops;
55
56struct netfront_cb {
57 struct page *page;
58 unsigned offset;
59};
60
61#define NETFRONT_SKB_CB(skb) ((struct netfront_cb *)((skb)->cb))
62
63#define RX_COPY_THRESHOLD 256
64
65#define GRANT_INVALID_REF 0
66
67#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
68#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
69#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
70
71struct netfront_info {
72 struct list_head list;
73 struct net_device *netdev;
74
75 struct net_device_stats stats;
76
77 struct xen_netif_tx_front_ring tx;
78 struct xen_netif_rx_front_ring rx;
79
80 spinlock_t tx_lock;
81 spinlock_t rx_lock;
82
83 unsigned int evtchn;
84
85 /* Receive-ring batched refills. */
86#define RX_MIN_TARGET 8
87#define RX_DFL_MIN_TARGET 64
88#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
89 unsigned rx_min_target, rx_max_target, rx_target;
90 struct sk_buff_head rx_batch;
91
92 struct timer_list rx_refill_timer;
93
94 /*
95 * {tx,rx}_skbs store outstanding skbuffs. Free tx_skb entries
96 * are linked from tx_skb_freelist through skb_entry.link.
97 *
98 * NB. Freelist index entries are always going to be less than
99 * PAGE_OFFSET, whereas pointers to skbs will always be equal or
100 * greater than PAGE_OFFSET: we use this property to distinguish
101 * them.
102 */
103 union skb_entry {
104 struct sk_buff *skb;
105 unsigned link;
106 } tx_skbs[NET_TX_RING_SIZE];
107 grant_ref_t gref_tx_head;
108 grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
109 unsigned tx_skb_freelist;
110
111 struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
112 grant_ref_t gref_rx_head;
113 grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
114
115 struct xenbus_device *xbdev;
116 int tx_ring_ref;
117 int rx_ring_ref;
118
119 unsigned long rx_pfn_array[NET_RX_RING_SIZE];
120 struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
121 struct mmu_update rx_mmu[NET_RX_RING_SIZE];
122};
123
124struct netfront_rx_info {
125 struct xen_netif_rx_response rx;
126 struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
127};
128
129/*
130 * Access macros for acquiring and freeing slots in tx_skbs[].
131 */
132
133static void add_id_to_freelist(unsigned *head, union skb_entry *list,
134 unsigned short id)
135{
136 list[id].link = *head;
137 *head = id;
138}
139
140static unsigned short get_id_from_freelist(unsigned *head,
141 union skb_entry *list)
142{
143 unsigned int id = *head;
144 *head = list[id].link;
145 return id;
146}
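
As the comment above tx_skbs explains, the array doubles as a LIFO freelist of ring ids: a free slot stores the index of the next free slot, and ids can be told apart from skb pointers because they always sit below PAGE_OFFSET. A stand-alone sketch of the push/pop behaviour with the pointer/index tagging left out (the helper signatures are simplified relative to the driver's):

#include <assert.h>
#include <stdio.h>

#define RING_SIZE 8

static unsigned links[RING_SIZE];   /* links[i] = next free id */
static unsigned freelist_head;      /* id of first free entry  */

static void add_id_to_freelist(unsigned short id)
{
	links[id] = freelist_head;  /* push: new head points at old head */
	freelist_head = id;
}

static unsigned short get_id_from_freelist(void)
{
	unsigned short id = freelist_head;
	freelist_head = links[id];  /* pop: follow the link */
	return id;
}

int main(void)
{
	unsigned i, a, b;

	/* Initialise as a chain 0 -> 1 -> 2 -> ..., as xennet_create_dev does. */
	for (i = 0; i < RING_SIZE; i++)
		links[i] = i + 1;
	freelist_head = 0;

	a = get_id_from_freelist();   /* 0 */
	b = get_id_from_freelist();   /* 1 */
	add_id_to_freelist(a);        /* 0 becomes the head again */
	assert(get_id_from_freelist() == a);
	printf("allocated %u then %u\n", a, b);
	return 0;
}
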
147
148static int xennet_rxidx(RING_IDX idx)
149{
150 return idx & (NET_RX_RING_SIZE - 1);
151}
152
153static struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
154 RING_IDX ri)
155{
156 int i = xennet_rxidx(ri);
157 struct sk_buff *skb = np->rx_skbs[i];
158 np->rx_skbs[i] = NULL;
159 return skb;
160}
161
162static grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
163 RING_IDX ri)
164{
165 int i = xennet_rxidx(ri);
166 grant_ref_t ref = np->grant_rx_ref[i];
167 np->grant_rx_ref[i] = GRANT_INVALID_REF;
168 return ref;
169}
170
171#ifdef CONFIG_SYSFS
172static int xennet_sysfs_addif(struct net_device *netdev);
173static void xennet_sysfs_delif(struct net_device *netdev);
174#else /* !CONFIG_SYSFS */
175#define xennet_sysfs_addif(dev) (0)
176#define xennet_sysfs_delif(dev) do { } while (0)
177#endif
178
179static int xennet_can_sg(struct net_device *dev)
180{
181 return dev->features & NETIF_F_SG;
182}
183
184
185static void rx_refill_timeout(unsigned long data)
186{
187 struct net_device *dev = (struct net_device *)data;
188 netif_rx_schedule(dev);
189}
190
191static int netfront_tx_slot_available(struct netfront_info *np)
192{
193 return ((np->tx.req_prod_pvt - np->tx.rsp_cons) <
194 (TX_MAX_TARGET - MAX_SKB_FRAGS - 2));
195}
196
197static void xennet_maybe_wake_tx(struct net_device *dev)
198{
199 struct netfront_info *np = netdev_priv(dev);
200
201 if (unlikely(netif_queue_stopped(dev)) &&
202 netfront_tx_slot_available(np) &&
203 likely(netif_running(dev)))
204 netif_wake_queue(dev);
205}
206
207static void xennet_alloc_rx_buffers(struct net_device *dev)
208{
209 unsigned short id;
210 struct netfront_info *np = netdev_priv(dev);
211 struct sk_buff *skb;
212 struct page *page;
213 int i, batch_target, notify;
214 RING_IDX req_prod = np->rx.req_prod_pvt;
215 struct xen_memory_reservation reservation;
216 grant_ref_t ref;
217 unsigned long pfn;
218 void *vaddr;
219 int nr_flips;
220 struct xen_netif_rx_request *req;
221
222 if (unlikely(!netif_carrier_ok(dev)))
223 return;
224
225 /*
226 * Allocate skbuffs greedily, even though we batch updates to the
227 * receive ring. This creates a less bursty demand on the memory
228 * allocator, so should reduce the chance of failed allocation requests
229 * both for ourself and for other kernel subsystems.
230 * both for ourselves and for other kernel subsystems.
231 batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
232 for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
233 skb = __netdev_alloc_skb(dev, RX_COPY_THRESHOLD,
234 GFP_ATOMIC | __GFP_NOWARN);
235 if (unlikely(!skb))
236 goto no_skb;
237
238 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
239 if (!page) {
240 kfree_skb(skb);
241no_skb:
242 /* Any skbuffs queued for refill? Force them out. */
243 if (i != 0)
244 goto refill;
245 /* Could not allocate any skbuffs. Try again later. */
246 mod_timer(&np->rx_refill_timer,
247 jiffies + (HZ/10));
248 break;
249 }
250
251 skb_shinfo(skb)->frags[0].page = page;
252 skb_shinfo(skb)->nr_frags = 1;
253 __skb_queue_tail(&np->rx_batch, skb);
254 }
255
256 /* Is the batch large enough to be worthwhile? */
257 if (i < (np->rx_target/2)) {
258 if (req_prod > np->rx.sring->req_prod)
259 goto push;
260 return;
261 }
262
263 /* Adjust our fill target if we risked running out of buffers. */
264 if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
265 ((np->rx_target *= 2) > np->rx_max_target))
266 np->rx_target = np->rx_max_target;
267
268 refill:
269 for (nr_flips = i = 0; ; i++) {
270 skb = __skb_dequeue(&np->rx_batch);
271 if (skb == NULL)
272 break;
273
274 skb->dev = dev;
275
276 id = xennet_rxidx(req_prod + i);
277
278 BUG_ON(np->rx_skbs[id]);
279 np->rx_skbs[id] = skb;
280
281 ref = gnttab_claim_grant_reference(&np->gref_rx_head);
282 BUG_ON((signed short)ref < 0);
283 np->grant_rx_ref[id] = ref;
284
285 pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
286 vaddr = page_address(skb_shinfo(skb)->frags[0].page);
287
288 req = RING_GET_REQUEST(&np->rx, req_prod + i);
289 gnttab_grant_foreign_access_ref(ref,
290 np->xbdev->otherend_id,
291 pfn_to_mfn(pfn),
292 0);
293
294 req->id = id;
295 req->gref = ref;
296 }
297
298 if (nr_flips != 0) {
299 reservation.extent_start = np->rx_pfn_array;
300 reservation.nr_extents = nr_flips;
301 reservation.extent_order = 0;
302 reservation.address_bits = 0;
303 reservation.domid = DOMID_SELF;
304
305 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
306 /* After all PTEs have been zapped, flush the TLB. */
307 np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
308 UVMF_TLB_FLUSH|UVMF_ALL;
309
310 /* Give away a batch of pages. */
311 np->rx_mcl[i].op = __HYPERVISOR_memory_op;
312 np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
313 np->rx_mcl[i].args[1] = (unsigned long)&reservation;
314
315 /* Zap PTEs and give away pages in one big
316 * multicall. */
317 (void)HYPERVISOR_multicall(np->rx_mcl, i+1);
318
319 /* Check return status of HYPERVISOR_memory_op(). */
320 if (unlikely(np->rx_mcl[i].result != i))
321 panic("Unable to reduce memory reservation\n");
322 } else {
323 if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
324 &reservation) != i)
325 panic("Unable to reduce memory reservation\n");
326 }
327 } else {
328		wmb();		/* barrier so backend sees requests */
329 }
330
331 /* Above is a suitable barrier to ensure backend will see requests. */
332 np->rx.req_prod_pvt = req_prod + i;
333 push:
334 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
335 if (notify)
336 notify_remote_via_irq(np->netdev->irq);
337}
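
xennet_alloc_rx_buffers() doubles rx_target (capped at rx_max_target) whenever the ring came close to running dry, while xennet_poll() later walks it back down one step at a time; the comment there calls this exponential increase, linear decrease. A small sketch of that policy in isolation (constants taken from the driver, function names illustrative):

#include <stdio.h>

#define RX_MIN_TARGET 8
#define RX_MAX_TARGET 256

static int rx_target = 64;              /* RX_DFL_MIN_TARGET */

/* Called when the ring came close to running out of buffers. */
static void grow_target(void)
{
	rx_target *= 2;                 /* exponential increase */
	if (rx_target > RX_MAX_TARGET)
		rx_target = RX_MAX_TARGET;
}

/* Called when a poll saw only a few responses outstanding. */
static void shrink_target(void)
{
	if (--rx_target < RX_MIN_TARGET)  /* linear decrease */
		rx_target = RX_MIN_TARGET;
}

int main(void)
{
	grow_target();                  /* 64 -> 128            */
	grow_target();                  /* 128 -> 256 (clamped) */
	shrink_target();                /* 256 -> 255           */
	printf("rx_target = %d\n", rx_target);
	return 0;
}
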
338
339static int xennet_open(struct net_device *dev)
340{
341 struct netfront_info *np = netdev_priv(dev);
342
343 memset(&np->stats, 0, sizeof(np->stats));
344
345 spin_lock_bh(&np->rx_lock);
346 if (netif_carrier_ok(dev)) {
347 xennet_alloc_rx_buffers(dev);
348 np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
349 if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
350 netif_rx_schedule(dev);
351 }
352 spin_unlock_bh(&np->rx_lock);
353
354 xennet_maybe_wake_tx(dev);
355
356 return 0;
357}
358
359static void xennet_tx_buf_gc(struct net_device *dev)
360{
361 RING_IDX cons, prod;
362 unsigned short id;
363 struct netfront_info *np = netdev_priv(dev);
364 struct sk_buff *skb;
365
366 BUG_ON(!netif_carrier_ok(dev));
367
368 do {
369 prod = np->tx.sring->rsp_prod;
370 rmb(); /* Ensure we see responses up to 'rp'. */
371
372 for (cons = np->tx.rsp_cons; cons != prod; cons++) {
373 struct xen_netif_tx_response *txrsp;
374
375 txrsp = RING_GET_RESPONSE(&np->tx, cons);
376 if (txrsp->status == NETIF_RSP_NULL)
377 continue;
378
379 id = txrsp->id;
380 skb = np->tx_skbs[id].skb;
381 if (unlikely(gnttab_query_foreign_access(
382 np->grant_tx_ref[id]) != 0)) {
383 printk(KERN_ALERT "xennet_tx_buf_gc: warning "
384 "-- grant still in use by backend "
385 "domain.\n");
386 BUG();
387 }
388 gnttab_end_foreign_access_ref(
389 np->grant_tx_ref[id], GNTMAP_readonly);
390 gnttab_release_grant_reference(
391 &np->gref_tx_head, np->grant_tx_ref[id]);
392 np->grant_tx_ref[id] = GRANT_INVALID_REF;
393 add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, id);
394 dev_kfree_skb_irq(skb);
395 }
396
397 np->tx.rsp_cons = prod;
398
399 /*
400 * Set a new event, then check for race with update of tx_cons.
401 * Note that it is essential to schedule a callback, no matter
402 * how few buffers are pending. Even if there is space in the
403 * transmit ring, higher layers may be blocked because too much
404 * data is outstanding: in such cases notification from Xen is
405 * likely to be the only kick that we'll get.
406 */
407 np->tx.sring->rsp_event =
408 prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
409 mb(); /* update shared area */
410 } while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
411
412 xennet_maybe_wake_tx(dev);
413}
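
The rsp_event update in xennet_tx_buf_gc() asks the backend for the next notification only after roughly half of the still-outstanding requests have completed, so the interrupt rate scales with the amount of work in flight. A worked example of the arithmetic (the numbers are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned prod = 100;      /* responses consumed so far             */
	unsigned req_prod = 140;  /* requests posted so far: 40 in flight  */

	/* Same formula as xennet_tx_buf_gc(): request the next event once
	 * about half of the in-flight requests have completed. */
	unsigned rsp_event = prod + ((req_prod - prod) >> 1) + 1;

	printf("next notification at response #%u (%u completions away)\n",
	       rsp_event, rsp_event - prod);   /* 121, i.e. 21 away */
	return 0;
}
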
414
415static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
416 struct xen_netif_tx_request *tx)
417{
418 struct netfront_info *np = netdev_priv(dev);
419 char *data = skb->data;
420 unsigned long mfn;
421 RING_IDX prod = np->tx.req_prod_pvt;
422 int frags = skb_shinfo(skb)->nr_frags;
423 unsigned int offset = offset_in_page(data);
424 unsigned int len = skb_headlen(skb);
425 unsigned int id;
426 grant_ref_t ref;
427 int i;
428
429 /* While the header overlaps a page boundary (including being
430	   larger than a page), split it into page-sized chunks. */
431 while (len > PAGE_SIZE - offset) {
432 tx->size = PAGE_SIZE - offset;
433 tx->flags |= NETTXF_more_data;
434 len -= tx->size;
435 data += tx->size;
436 offset = 0;
437
438 id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
439 np->tx_skbs[id].skb = skb_get(skb);
440 tx = RING_GET_REQUEST(&np->tx, prod++);
441 tx->id = id;
442 ref = gnttab_claim_grant_reference(&np->gref_tx_head);
443 BUG_ON((signed short)ref < 0);
444
445 mfn = virt_to_mfn(data);
446 gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
447 mfn, GNTMAP_readonly);
448
449 tx->gref = np->grant_tx_ref[id] = ref;
450 tx->offset = offset;
451 tx->size = len;
452 tx->flags = 0;
453 }
454
455 /* Grant backend access to each skb fragment page. */
456 for (i = 0; i < frags; i++) {
457 skb_frag_t *frag = skb_shinfo(skb)->frags + i;
458
459 tx->flags |= NETTXF_more_data;
460
461 id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
462 np->tx_skbs[id].skb = skb_get(skb);
463 tx = RING_GET_REQUEST(&np->tx, prod++);
464 tx->id = id;
465 ref = gnttab_claim_grant_reference(&np->gref_tx_head);
466 BUG_ON((signed short)ref < 0);
467
468 mfn = pfn_to_mfn(page_to_pfn(frag->page));
469 gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
470 mfn, GNTMAP_readonly);
471
472 tx->gref = np->grant_tx_ref[id] = ref;
473 tx->offset = frag->page_offset;
474 tx->size = frag->size;
475 tx->flags = 0;
476 }
477
478 np->tx.req_prod_pvt = prod;
479}
480
481static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
482{
483 unsigned short id;
484 struct netfront_info *np = netdev_priv(dev);
485 struct xen_netif_tx_request *tx;
486 struct xen_netif_extra_info *extra;
487 char *data = skb->data;
488 RING_IDX i;
489 grant_ref_t ref;
490 unsigned long mfn;
491 int notify;
492 int frags = skb_shinfo(skb)->nr_frags;
493 unsigned int offset = offset_in_page(data);
494 unsigned int len = skb_headlen(skb);
495
496 frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
497 if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
498 printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
499 frags);
500 dump_stack();
501 goto drop;
502 }
503
504 spin_lock_irq(&np->tx_lock);
505
506 if (unlikely(!netif_carrier_ok(dev) ||
507 (frags > 1 && !xennet_can_sg(dev)) ||
508 netif_needs_gso(dev, skb))) {
509 spin_unlock_irq(&np->tx_lock);
510 goto drop;
511 }
512
513 i = np->tx.req_prod_pvt;
514
515 id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
516 np->tx_skbs[id].skb = skb;
517
518 tx = RING_GET_REQUEST(&np->tx, i);
519
520 tx->id = id;
521 ref = gnttab_claim_grant_reference(&np->gref_tx_head);
522 BUG_ON((signed short)ref < 0);
523 mfn = virt_to_mfn(data);
524 gnttab_grant_foreign_access_ref(
525 ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
526 tx->gref = np->grant_tx_ref[id] = ref;
527 tx->offset = offset;
528 tx->size = len;
529 extra = NULL;
530
531 tx->flags = 0;
532 if (skb->ip_summed == CHECKSUM_PARTIAL)
533 /* local packet? */
534 tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
535 else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
536 /* remote but checksummed. */
537 tx->flags |= NETTXF_data_validated;
538
539 if (skb_shinfo(skb)->gso_size) {
540 struct xen_netif_extra_info *gso;
541
542 gso = (struct xen_netif_extra_info *)
543 RING_GET_REQUEST(&np->tx, ++i);
544
545 if (extra)
546 extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
547 else
548 tx->flags |= NETTXF_extra_info;
549
550 gso->u.gso.size = skb_shinfo(skb)->gso_size;
551 gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
552 gso->u.gso.pad = 0;
553 gso->u.gso.features = 0;
554
555 gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
556 gso->flags = 0;
557 extra = gso;
558 }
559
560 np->tx.req_prod_pvt = i + 1;
561
562 xennet_make_frags(skb, dev, tx);
563 tx->size = skb->len;
564
565 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
566 if (notify)
567 notify_remote_via_irq(np->netdev->irq);
568
569 xennet_tx_buf_gc(dev);
570
571 if (!netfront_tx_slot_available(np))
572 netif_stop_queue(dev);
573
574 spin_unlock_irq(&np->tx_lock);
575
576 np->stats.tx_bytes += skb->len;
577 np->stats.tx_packets++;
578
579 return 0;
580
581 drop:
582 np->stats.tx_dropped++;
583 dev_kfree_skb(skb);
584 return 0;
585}
586
587static int xennet_close(struct net_device *dev)
588{
589 struct netfront_info *np = netdev_priv(dev);
590 netif_stop_queue(np->netdev);
591 return 0;
592}
593
594static struct net_device_stats *xennet_get_stats(struct net_device *dev)
595{
596 struct netfront_info *np = netdev_priv(dev);
597 return &np->stats;
598}
599
600static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
601 grant_ref_t ref)
602{
603 int new = xennet_rxidx(np->rx.req_prod_pvt);
604
605 BUG_ON(np->rx_skbs[new]);
606 np->rx_skbs[new] = skb;
607 np->grant_rx_ref[new] = ref;
608 RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new;
609 RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref;
610 np->rx.req_prod_pvt++;
611}
612
613static int xennet_get_extras(struct netfront_info *np,
614 struct xen_netif_extra_info *extras,
615 RING_IDX rp)
616
617{
618 struct xen_netif_extra_info *extra;
619 struct device *dev = &np->netdev->dev;
620 RING_IDX cons = np->rx.rsp_cons;
621 int err = 0;
622
623 do {
624 struct sk_buff *skb;
625 grant_ref_t ref;
626
627 if (unlikely(cons + 1 == rp)) {
628 if (net_ratelimit())
629 dev_warn(dev, "Missing extra info\n");
630 err = -EBADR;
631 break;
632 }
633
634 extra = (struct xen_netif_extra_info *)
635 RING_GET_RESPONSE(&np->rx, ++cons);
636
637 if (unlikely(!extra->type ||
638 extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
639 if (net_ratelimit())
640 dev_warn(dev, "Invalid extra type: %d\n",
641 extra->type);
642 err = -EINVAL;
643 } else {
644 memcpy(&extras[extra->type - 1], extra,
645 sizeof(*extra));
646 }
647
648 skb = xennet_get_rx_skb(np, cons);
649 ref = xennet_get_rx_ref(np, cons);
650 xennet_move_rx_slot(np, skb, ref);
651 } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
652
653 np->rx.rsp_cons = cons;
654 return err;
655}
656
657static int xennet_get_responses(struct netfront_info *np,
658 struct netfront_rx_info *rinfo, RING_IDX rp,
659 struct sk_buff_head *list)
660{
661 struct xen_netif_rx_response *rx = &rinfo->rx;
662 struct xen_netif_extra_info *extras = rinfo->extras;
663 struct device *dev = &np->netdev->dev;
664 RING_IDX cons = np->rx.rsp_cons;
665 struct sk_buff *skb = xennet_get_rx_skb(np, cons);
666 grant_ref_t ref = xennet_get_rx_ref(np, cons);
667 int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
668 int frags = 1;
669 int err = 0;
670 unsigned long ret;
671
672 if (rx->flags & NETRXF_extra_info) {
673 err = xennet_get_extras(np, extras, rp);
674 cons = np->rx.rsp_cons;
675 }
676
677 for (;;) {
678 if (unlikely(rx->status < 0 ||
679 rx->offset + rx->status > PAGE_SIZE)) {
680 if (net_ratelimit())
681 dev_warn(dev, "rx->offset: %x, size: %u\n",
682 rx->offset, rx->status);
683 xennet_move_rx_slot(np, skb, ref);
684 err = -EINVAL;
685 goto next;
686 }
687
688 /*
689 * This definitely indicates a bug, either in this driver or in
690 * the backend driver. In future this should flag the bad
691		 * situation to the system controller to reboot the backend.
692 */
693 if (ref == GRANT_INVALID_REF) {
694 if (net_ratelimit())
695 dev_warn(dev, "Bad rx response id %d.\n",
696 rx->id);
697 err = -EINVAL;
698 goto next;
699 }
700
701 ret = gnttab_end_foreign_access_ref(ref, 0);
702 BUG_ON(!ret);
703
704 gnttab_release_grant_reference(&np->gref_rx_head, ref);
705
706 __skb_queue_tail(list, skb);
707
708next:
709 if (!(rx->flags & NETRXF_more_data))
710 break;
711
712 if (cons + frags == rp) {
713 if (net_ratelimit())
714 dev_warn(dev, "Need more frags\n");
715 err = -ENOENT;
716 break;
717 }
718
719 rx = RING_GET_RESPONSE(&np->rx, cons + frags);
720 skb = xennet_get_rx_skb(np, cons + frags);
721 ref = xennet_get_rx_ref(np, cons + frags);
722 frags++;
723 }
724
725 if (unlikely(frags > max)) {
726 if (net_ratelimit())
727 dev_warn(dev, "Too many frags\n");
728 err = -E2BIG;
729 }
730
731 if (unlikely(err))
732 np->rx.rsp_cons = cons + frags;
733
734 return err;
735}
736
737static int xennet_set_skb_gso(struct sk_buff *skb,
738 struct xen_netif_extra_info *gso)
739{
740 if (!gso->u.gso.size) {
741 if (net_ratelimit())
742 printk(KERN_WARNING "GSO size must not be zero.\n");
743 return -EINVAL;
744 }
745
746	/* Currently only TCPv4 segmentation offload (S.O.) is supported. */
747 if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
748 if (net_ratelimit())
749 printk(KERN_WARNING "Bad GSO type %d.\n", gso->u.gso.type);
750 return -EINVAL;
751 }
752
753 skb_shinfo(skb)->gso_size = gso->u.gso.size;
754 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
755
756 /* Header must be checked, and gso_segs computed. */
757 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
758 skb_shinfo(skb)->gso_segs = 0;
759
760 return 0;
761}
762
763static RING_IDX xennet_fill_frags(struct netfront_info *np,
764 struct sk_buff *skb,
765 struct sk_buff_head *list)
766{
767 struct skb_shared_info *shinfo = skb_shinfo(skb);
768 int nr_frags = shinfo->nr_frags;
769 RING_IDX cons = np->rx.rsp_cons;
770 skb_frag_t *frag = shinfo->frags + nr_frags;
771 struct sk_buff *nskb;
772
773 while ((nskb = __skb_dequeue(list))) {
774 struct xen_netif_rx_response *rx =
775 RING_GET_RESPONSE(&np->rx, ++cons);
776
777 frag->page = skb_shinfo(nskb)->frags[0].page;
778 frag->page_offset = rx->offset;
779 frag->size = rx->status;
780
781 skb->data_len += rx->status;
782
783 skb_shinfo(nskb)->nr_frags = 0;
784 kfree_skb(nskb);
785
786 frag++;
787 nr_frags++;
788 }
789
790 shinfo->nr_frags = nr_frags;
791 return cons;
792}
793
794static int skb_checksum_setup(struct sk_buff *skb)
795{
796 struct iphdr *iph;
797 unsigned char *th;
798 int err = -EPROTO;
799
800 if (skb->protocol != htons(ETH_P_IP))
801 goto out;
802
803 iph = (void *)skb->data;
804 th = skb->data + 4 * iph->ihl;
805 if (th >= skb_tail_pointer(skb))
806 goto out;
807
808 skb->csum_start = th - skb->head;
809 switch (iph->protocol) {
810 case IPPROTO_TCP:
811 skb->csum_offset = offsetof(struct tcphdr, check);
812 break;
813 case IPPROTO_UDP:
814 skb->csum_offset = offsetof(struct udphdr, check);
815 break;
816 default:
817 if (net_ratelimit())
818 printk(KERN_ERR "Attempting to checksum a non-"
819 "TCP/UDP packet, dropping a protocol"
820 " %d packet", iph->protocol);
821 goto out;
822 }
823
824 if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
825 goto out;
826
827 err = 0;
828
829out:
830 return err;
831}
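
For CHECKSUM_PARTIAL packets, skb_checksum_setup() records where the transport header starts (csum_start) and where the checksum field sits inside it (csum_offset); the two offsets depend only on the IP header length and the L4 protocol. A self-contained sketch of the same arithmetic using a locally defined TCP header layout rather than the kernel's struct tcphdr:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal local TCP header layout, only for the offset arithmetic. */
struct tcp_hdr {
	uint16_t source, dest;
	uint32_t seq, ack_seq;
	uint16_t flags, window, check, urg_ptr;
};

int main(void)
{
	/* ihl is in 32-bit words; 5 means a 20-byte IP header, no options. */
	unsigned ihl = 5;
	size_t csum_start  = 4 * ihl;                         /* transport header offset */
	size_t csum_offset = offsetof(struct tcp_hdr, check); /* 16 for TCP              */

	printf("checksum lives %zu bytes past the IP header start\n",
	       csum_start + csum_offset);                     /* 20 + 16 = 36 */
	return 0;
}
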
832
833static int handle_incoming_queue(struct net_device *dev,
834 struct sk_buff_head *rxq)
835{
836 struct netfront_info *np = netdev_priv(dev);
837 int packets_dropped = 0;
838 struct sk_buff *skb;
839
840 while ((skb = __skb_dequeue(rxq)) != NULL) {
841 struct page *page = NETFRONT_SKB_CB(skb)->page;
842 void *vaddr = page_address(page);
843 unsigned offset = NETFRONT_SKB_CB(skb)->offset;
844
845 memcpy(skb->data, vaddr + offset,
846 skb_headlen(skb));
847
848 if (page != skb_shinfo(skb)->frags[0].page)
849 __free_page(page);
850
851 /* Ethernet work: Delayed to here as it peeks the header. */
852 skb->protocol = eth_type_trans(skb, dev);
853
854 if (skb->ip_summed == CHECKSUM_PARTIAL) {
855 if (skb_checksum_setup(skb)) {
856 kfree_skb(skb);
857 packets_dropped++;
858 np->stats.rx_errors++;
859 continue;
860 }
861 }
862
863 np->stats.rx_packets++;
864 np->stats.rx_bytes += skb->len;
865
866 /* Pass it up. */
867 netif_receive_skb(skb);
868 dev->last_rx = jiffies;
869 }
870
871 return packets_dropped;
872}
873
874static int xennet_poll(struct net_device *dev, int *pbudget)
875{
876 struct netfront_info *np = netdev_priv(dev);
877 struct sk_buff *skb;
878 struct netfront_rx_info rinfo;
879 struct xen_netif_rx_response *rx = &rinfo.rx;
880 struct xen_netif_extra_info *extras = rinfo.extras;
881 RING_IDX i, rp;
882 int work_done, budget, more_to_do = 1;
883 struct sk_buff_head rxq;
884 struct sk_buff_head errq;
885 struct sk_buff_head tmpq;
886 unsigned long flags;
887 unsigned int len;
888 int err;
889
890 spin_lock(&np->rx_lock);
891
892 if (unlikely(!netif_carrier_ok(dev))) {
893 spin_unlock(&np->rx_lock);
894 return 0;
895 }
896
897 skb_queue_head_init(&rxq);
898 skb_queue_head_init(&errq);
899 skb_queue_head_init(&tmpq);
900
901 budget = *pbudget;
902 if (budget > dev->quota)
903 budget = dev->quota;
904 rp = np->rx.sring->rsp_prod;
905 rmb(); /* Ensure we see queued responses up to 'rp'. */
906
907 i = np->rx.rsp_cons;
908 work_done = 0;
909 while ((i != rp) && (work_done < budget)) {
910 memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
911 memset(extras, 0, sizeof(rinfo.extras));
912
913 err = xennet_get_responses(np, &rinfo, rp, &tmpq);
914
915 if (unlikely(err)) {
916err:
917 while ((skb = __skb_dequeue(&tmpq)))
918 __skb_queue_tail(&errq, skb);
919 np->stats.rx_errors++;
920 i = np->rx.rsp_cons;
921 continue;
922 }
923
924 skb = __skb_dequeue(&tmpq);
925
926 if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
927 struct xen_netif_extra_info *gso;
928 gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
929
930 if (unlikely(xennet_set_skb_gso(skb, gso))) {
931 __skb_queue_head(&tmpq, skb);
932 np->rx.rsp_cons += skb_queue_len(&tmpq);
933 goto err;
934 }
935 }
936
937 NETFRONT_SKB_CB(skb)->page = skb_shinfo(skb)->frags[0].page;
938 NETFRONT_SKB_CB(skb)->offset = rx->offset;
939
940 len = rx->status;
941 if (len > RX_COPY_THRESHOLD)
942 len = RX_COPY_THRESHOLD;
943 skb_put(skb, len);
944
945 if (rx->status > len) {
946 skb_shinfo(skb)->frags[0].page_offset =
947 rx->offset + len;
948 skb_shinfo(skb)->frags[0].size = rx->status - len;
949 skb->data_len = rx->status - len;
950 } else {
951 skb_shinfo(skb)->frags[0].page = NULL;
952 skb_shinfo(skb)->nr_frags = 0;
953 }
954
955 i = xennet_fill_frags(np, skb, &tmpq);
956
957 /*
958 * Truesize approximates the size of true data plus
959 * any supervisor overheads. Adding hypervisor
960 * overheads has been shown to significantly reduce
961 * achievable bandwidth with the default receive
962 * buffer size. It is therefore not wise to account
963 * for it here.
964 *
965 * After alloc_skb(RX_COPY_THRESHOLD), truesize is set
966 * to RX_COPY_THRESHOLD + the supervisor
967 * overheads. Here, we add the size of the data pulled
968 * in xennet_fill_frags().
969 *
970 * We also adjust for any unused space in the main
971 * data area by subtracting (RX_COPY_THRESHOLD -
972 * len). This is especially important with drivers
973 * which split incoming packets into header and data,
974 * using only 66 bytes of the main data area (see the
975 * e1000 driver for example.) On such systems,
976		 * without this last adjustment, our achievable
977		 * receive throughput using the standard receive
978 * buffer size was cut by 25%(!!!).
979 */
980 skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
981 skb->len += skb->data_len;
982
983 if (rx->flags & NETRXF_csum_blank)
984 skb->ip_summed = CHECKSUM_PARTIAL;
985 else if (rx->flags & NETRXF_data_validated)
986 skb->ip_summed = CHECKSUM_UNNECESSARY;
987
988 __skb_queue_tail(&rxq, skb);
989
990 np->rx.rsp_cons = ++i;
991 work_done++;
992 }
993
994 while ((skb = __skb_dequeue(&errq)))
995 kfree_skb(skb);
996
997 work_done -= handle_incoming_queue(dev, &rxq);
998
999 /* If we get a callback with very few responses, reduce fill target. */
1000 /* NB. Note exponential increase, linear decrease. */
1001 if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
1002 ((3*np->rx_target) / 4)) &&
1003 (--np->rx_target < np->rx_min_target))
1004 np->rx_target = np->rx_min_target;
1005
1006 xennet_alloc_rx_buffers(dev);
1007
1008 *pbudget -= work_done;
1009 dev->quota -= work_done;
1010
1011 if (work_done < budget) {
1012 local_irq_save(flags);
1013
1014 RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
1015 if (!more_to_do)
1016 __netif_rx_complete(dev);
1017
1018 local_irq_restore(flags);
1019 }
1020
1021 spin_unlock(&np->rx_lock);
1022
1023 return more_to_do;
1024}
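
The truesize comment in xennet_poll() boils down to one adjustment: count the data_len bytes that stay in the fragment page, and give back the unused part of the RX_COPY_THRESHOLD-byte linear area. A quick numeric check (len and data_len are illustrative values):

#include <stdio.h>

#define RX_COPY_THRESHOLD 256

int main(void)
{
	unsigned len = 100;        /* bytes copied into the linear head */
	unsigned data_len = 1200;  /* bytes left in the fragment page   */

	/* Same adjustment as xennet_poll(): add the fragment data, then
	 * subtract the unused part of the 256-byte linear area. */
	int delta = (int)data_len - (RX_COPY_THRESHOLD - (int)len);

	printf("truesize grows by %d bytes\n", delta);  /* 1200 - 156 = 1044 */
	return 0;
}
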
1025
1026static int xennet_change_mtu(struct net_device *dev, int mtu)
1027{
1028 int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
1029
1030 if (mtu > max)
1031 return -EINVAL;
1032 dev->mtu = mtu;
1033 return 0;
1034}
1035
1036static void xennet_release_tx_bufs(struct netfront_info *np)
1037{
1038 struct sk_buff *skb;
1039 int i;
1040
1041 for (i = 0; i < NET_TX_RING_SIZE; i++) {
1042 /* Skip over entries which are actually freelist references */
1043 if ((unsigned long)np->tx_skbs[i].skb < PAGE_OFFSET)
1044 continue;
1045
1046 skb = np->tx_skbs[i].skb;
1047 gnttab_end_foreign_access_ref(np->grant_tx_ref[i],
1048 GNTMAP_readonly);
1049 gnttab_release_grant_reference(&np->gref_tx_head,
1050 np->grant_tx_ref[i]);
1051 np->grant_tx_ref[i] = GRANT_INVALID_REF;
1052 add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, i);
1053 dev_kfree_skb_irq(skb);
1054 }
1055}
1056
1057static void xennet_release_rx_bufs(struct netfront_info *np)
1058{
1059 struct mmu_update *mmu = np->rx_mmu;
1060 struct multicall_entry *mcl = np->rx_mcl;
1061 struct sk_buff_head free_list;
1062 struct sk_buff *skb;
1063 unsigned long mfn;
1064 int xfer = 0, noxfer = 0, unused = 0;
1065 int id, ref;
1066
1067 dev_warn(&np->netdev->dev, "%s: fix me for copying receiver.\n",
1068 __func__);
1069 return;
1070
1071 skb_queue_head_init(&free_list);
1072
1073 spin_lock_bh(&np->rx_lock);
1074
1075 for (id = 0; id < NET_RX_RING_SIZE; id++) {
1076 ref = np->grant_rx_ref[id];
1077 if (ref == GRANT_INVALID_REF) {
1078 unused++;
1079 continue;
1080 }
1081
1082 skb = np->rx_skbs[id];
1083 mfn = gnttab_end_foreign_transfer_ref(ref);
1084 gnttab_release_grant_reference(&np->gref_rx_head, ref);
1085 np->grant_rx_ref[id] = GRANT_INVALID_REF;
1086
1087 if (0 == mfn) {
1088 skb_shinfo(skb)->nr_frags = 0;
1089 dev_kfree_skb(skb);
1090 noxfer++;
1091 continue;
1092 }
1093
1094 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1095 /* Remap the page. */
1096 struct page *page = skb_shinfo(skb)->frags[0].page;
1097 unsigned long pfn = page_to_pfn(page);
1098 void *vaddr = page_address(page);
1099
1100 MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
1101 mfn_pte(mfn, PAGE_KERNEL),
1102 0);
1103 mcl++;
1104 mmu->ptr = ((u64)mfn << PAGE_SHIFT)
1105 | MMU_MACHPHYS_UPDATE;
1106 mmu->val = pfn;
1107 mmu++;
1108
1109 set_phys_to_machine(pfn, mfn);
1110 }
1111 __skb_queue_tail(&free_list, skb);
1112 xfer++;
1113 }
1114
1115 dev_info(&np->netdev->dev, "%s: %d xfer, %d noxfer, %d unused\n",
1116 __func__, xfer, noxfer, unused);
1117
1118 if (xfer) {
1119 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1120 /* Do all the remapping work and M2P updates. */
1121 MULTI_mmu_update(mcl, np->rx_mmu, mmu - np->rx_mmu,
1122 0, DOMID_SELF);
1123 mcl++;
1124 HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
1125 }
1126 }
1127
1128 while ((skb = __skb_dequeue(&free_list)) != NULL)
1129 dev_kfree_skb(skb);
1130
1131 spin_unlock_bh(&np->rx_lock);
1132}
1133
1134static void xennet_uninit(struct net_device *dev)
1135{
1136 struct netfront_info *np = netdev_priv(dev);
1137 xennet_release_tx_bufs(np);
1138 xennet_release_rx_bufs(np);
1139 gnttab_free_grant_references(np->gref_tx_head);
1140 gnttab_free_grant_references(np->gref_rx_head);
1141}
1142
1143static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev)
1144{
1145 int i, err;
1146 struct net_device *netdev;
1147 struct netfront_info *np;
1148
1149 netdev = alloc_etherdev(sizeof(struct netfront_info));
1150 if (!netdev) {
1151 printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
1152 __func__);
1153 return ERR_PTR(-ENOMEM);
1154 }
1155
1156 np = netdev_priv(netdev);
1157 np->xbdev = dev;
1158
1159 spin_lock_init(&np->tx_lock);
1160 spin_lock_init(&np->rx_lock);
1161
1162 skb_queue_head_init(&np->rx_batch);
1163 np->rx_target = RX_DFL_MIN_TARGET;
1164 np->rx_min_target = RX_DFL_MIN_TARGET;
1165 np->rx_max_target = RX_MAX_TARGET;
1166
1167 init_timer(&np->rx_refill_timer);
1168 np->rx_refill_timer.data = (unsigned long)netdev;
1169 np->rx_refill_timer.function = rx_refill_timeout;
1170
1171 /* Initialise tx_skbs as a free chain containing every entry. */
1172 np->tx_skb_freelist = 0;
1173 for (i = 0; i < NET_TX_RING_SIZE; i++) {
1174 np->tx_skbs[i].link = i+1;
1175 np->grant_tx_ref[i] = GRANT_INVALID_REF;
1176 }
1177
1178 /* Clear out rx_skbs */
1179 for (i = 0; i < NET_RX_RING_SIZE; i++) {
1180 np->rx_skbs[i] = NULL;
1181 np->grant_rx_ref[i] = GRANT_INVALID_REF;
1182 }
1183
1184 /* A grant for every tx ring slot */
1185 if (gnttab_alloc_grant_references(TX_MAX_TARGET,
1186 &np->gref_tx_head) < 0) {
1187 printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
1188 err = -ENOMEM;
1189 goto exit;
1190 }
1191 /* A grant for every rx ring slot */
1192 if (gnttab_alloc_grant_references(RX_MAX_TARGET,
1193 &np->gref_rx_head) < 0) {
1194 printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
1195 err = -ENOMEM;
1196 goto exit_free_tx;
1197 }
1198
1199 netdev->open = xennet_open;
1200 netdev->hard_start_xmit = xennet_start_xmit;
1201 netdev->stop = xennet_close;
1202 netdev->get_stats = xennet_get_stats;
1203 netdev->poll = xennet_poll;
1204 netdev->uninit = xennet_uninit;
1205 netdev->change_mtu = xennet_change_mtu;
1206 netdev->weight = 64;
1207 netdev->features = NETIF_F_IP_CSUM;
1208
1209 SET_ETHTOOL_OPS(netdev, &xennet_ethtool_ops);
1210 SET_MODULE_OWNER(netdev);
1211 SET_NETDEV_DEV(netdev, &dev->dev);
1212
1213 np->netdev = netdev;
1214
1215 netif_carrier_off(netdev);
1216
1217 return netdev;
1218
1219 exit_free_tx:
1220 gnttab_free_grant_references(np->gref_tx_head);
1221 exit:
1222 free_netdev(netdev);
1223 return ERR_PTR(err);
1224}
1225
1226/**
1227 * Entry point to this code when a new device is created. Allocate the basic
1228 * structures and the ring buffers for communication with the backend, and
1229 * inform the backend of the appropriate details for those.
1230 */
1231static int __devinit netfront_probe(struct xenbus_device *dev,
1232 const struct xenbus_device_id *id)
1233{
1234 int err;
1235 struct net_device *netdev;
1236 struct netfront_info *info;
1237
1238 netdev = xennet_create_dev(dev);
1239 if (IS_ERR(netdev)) {
1240 err = PTR_ERR(netdev);
1241 xenbus_dev_fatal(dev, err, "creating netdev");
1242 return err;
1243 }
1244
1245 info = netdev_priv(netdev);
1246 dev->dev.driver_data = info;
1247
1248 err = register_netdev(info->netdev);
1249 if (err) {
1250 printk(KERN_WARNING "%s: register_netdev err=%d\n",
1251 __func__, err);
1252 goto fail;
1253 }
1254
1255 err = xennet_sysfs_addif(info->netdev);
1256 if (err) {
1257 unregister_netdev(info->netdev);
1258 printk(KERN_WARNING "%s: add sysfs failed err=%d\n",
1259 __func__, err);
1260 goto fail;
1261 }
1262
1263 return 0;
1264
1265 fail:
1266 free_netdev(netdev);
1267 dev->dev.driver_data = NULL;
1268 return err;
1269}
1270
1271static void xennet_end_access(int ref, void *page)
1272{
1273 /* This frees the page as a side-effect */
1274 if (ref != GRANT_INVALID_REF)
1275 gnttab_end_foreign_access(ref, 0, (unsigned long)page);
1276}
1277
1278static void xennet_disconnect_backend(struct netfront_info *info)
1279{
1280 /* Stop old i/f to prevent errors whilst we rebuild the state. */
1281 spin_lock_bh(&info->rx_lock);
1282 spin_lock_irq(&info->tx_lock);
1283 netif_carrier_off(info->netdev);
1284 spin_unlock_irq(&info->tx_lock);
1285 spin_unlock_bh(&info->rx_lock);
1286
1287 if (info->netdev->irq)
1288 unbind_from_irqhandler(info->netdev->irq, info->netdev);
1289 info->evtchn = info->netdev->irq = 0;
1290
1291 /* End access and free the pages */
1292 xennet_end_access(info->tx_ring_ref, info->tx.sring);
1293 xennet_end_access(info->rx_ring_ref, info->rx.sring);
1294
1295 info->tx_ring_ref = GRANT_INVALID_REF;
1296 info->rx_ring_ref = GRANT_INVALID_REF;
1297 info->tx.sring = NULL;
1298 info->rx.sring = NULL;
1299}
1300
1301/**
1302 * We are reconnecting to the backend, due to a suspend/resume, or a backend
1303 * driver restart. We tear down our netif structure and recreate it, but
1304 * leave the device-layer structures intact so that this is transparent to the
1305 * rest of the kernel.
1306 */
1307static int netfront_resume(struct xenbus_device *dev)
1308{
1309 struct netfront_info *info = dev->dev.driver_data;
1310
1311 dev_dbg(&dev->dev, "%s\n", dev->nodename);
1312
1313 xennet_disconnect_backend(info);
1314 return 0;
1315}
1316
1317static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
1318{
1319 char *s, *e, *macstr;
1320 int i;
1321
1322 macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
1323 if (IS_ERR(macstr))
1324 return PTR_ERR(macstr);
1325
1326 for (i = 0; i < ETH_ALEN; i++) {
1327 mac[i] = simple_strtoul(s, &e, 16);
1328 if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
1329 kfree(macstr);
1330 return -ENOENT;
1331 }
1332 s = e+1;
1333 }
1334
1335 kfree(macstr);
1336 return 0;
1337}
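
xen_net_read_mac() parses the xenstore "mac" node, which holds a colon-separated hex string such as "00:16:3e:12:34:56", insisting on ':' after every byte except the last. A user-space equivalent of the parsing loop, with strtoul() in place of simple_strtoul():

#include <stdio.h>
#include <stdlib.h>

#define ETH_ALEN 6

static int parse_mac(const char *s, unsigned char mac[ETH_ALEN])
{
	char *e;
	int i;

	for (i = 0; i < ETH_ALEN; i++) {
		mac[i] = (unsigned char)strtoul(s, &e, 16);
		/* Each byte must be followed by ':', except the last by '\0'. */
		if (s == e || *e != (i == ETH_ALEN - 1 ? '\0' : ':'))
			return -1;
		s = e + 1;
	}
	return 0;
}

int main(void)
{
	unsigned char mac[ETH_ALEN];

	if (parse_mac("00:16:3e:12:34:56", mac) == 0)
		printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
		       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	return 0;
}
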
1338
1339static irqreturn_t xennet_interrupt(int irq, void *dev_id)
1340{
1341 struct net_device *dev = dev_id;
1342 struct netfront_info *np = netdev_priv(dev);
1343 unsigned long flags;
1344
1345 spin_lock_irqsave(&np->tx_lock, flags);
1346
1347 if (likely(netif_carrier_ok(dev))) {
1348 xennet_tx_buf_gc(dev);
1349 /* Under tx_lock: protects access to rx shared-ring indexes. */
1350 if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
1351 netif_rx_schedule(dev);
1352 }
1353
1354 spin_unlock_irqrestore(&np->tx_lock, flags);
1355
1356 return IRQ_HANDLED;
1357}
1358
1359static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
1360{
1361 struct xen_netif_tx_sring *txs;
1362 struct xen_netif_rx_sring *rxs;
1363 int err;
1364 struct net_device *netdev = info->netdev;
1365
1366 info->tx_ring_ref = GRANT_INVALID_REF;
1367 info->rx_ring_ref = GRANT_INVALID_REF;
1368 info->rx.sring = NULL;
1369 info->tx.sring = NULL;
1370 netdev->irq = 0;
1371
1372 err = xen_net_read_mac(dev, netdev->dev_addr);
1373 if (err) {
1374 xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
1375 goto fail;
1376 }
1377
1378 txs = (struct xen_netif_tx_sring *)get_zeroed_page(GFP_KERNEL);
1379 if (!txs) {
1380 err = -ENOMEM;
1381 xenbus_dev_fatal(dev, err, "allocating tx ring page");
1382 goto fail;
1383 }
1384 SHARED_RING_INIT(txs);
1385 FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
1386
1387 err = xenbus_grant_ring(dev, virt_to_mfn(txs));
1388 if (err < 0) {
1389 free_page((unsigned long)txs);
1390 goto fail;
1391 }
1392
1393 info->tx_ring_ref = err;
1394 rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_KERNEL);
1395 if (!rxs) {
1396 err = -ENOMEM;
1397 xenbus_dev_fatal(dev, err, "allocating rx ring page");
1398 goto fail;
1399 }
1400 SHARED_RING_INIT(rxs);
1401 FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
1402
1403 err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
1404 if (err < 0) {
1405 free_page((unsigned long)rxs);
1406 goto fail;
1407 }
1408 info->rx_ring_ref = err;
1409
1410 err = xenbus_alloc_evtchn(dev, &info->evtchn);
1411 if (err)
1412 goto fail;
1413
1414 err = bind_evtchn_to_irqhandler(info->evtchn, xennet_interrupt,
1415 IRQF_SAMPLE_RANDOM, netdev->name,
1416 netdev);
1417 if (err < 0)
1418 goto fail;
1419 netdev->irq = err;
1420 return 0;
1421
1422 fail:
1423 return err;
1424}
1425
1426/* Common code used when first setting up, and when resuming. */
1427static int talk_to_backend(struct xenbus_device *dev,
1428 struct netfront_info *info)
1429{
1430 const char *message;
1431 struct xenbus_transaction xbt;
1432 int err;
1433
1434 /* Create shared ring, alloc event channel. */
1435 err = setup_netfront(dev, info);
1436 if (err)
1437 goto out;
1438
1439again:
1440 err = xenbus_transaction_start(&xbt);
1441 if (err) {
1442 xenbus_dev_fatal(dev, err, "starting transaction");
1443 goto destroy_ring;
1444 }
1445
1446 err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref", "%u",
1447 info->tx_ring_ref);
1448 if (err) {
1449 message = "writing tx ring-ref";
1450 goto abort_transaction;
1451 }
1452 err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref", "%u",
1453 info->rx_ring_ref);
1454 if (err) {
1455 message = "writing rx ring-ref";
1456 goto abort_transaction;
1457 }
1458 err = xenbus_printf(xbt, dev->nodename,
1459 "event-channel", "%u", info->evtchn);
1460 if (err) {
1461 message = "writing event-channel";
1462 goto abort_transaction;
1463 }
1464
1465 err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
1466 1);
1467 if (err) {
1468 message = "writing request-rx-copy";
1469 goto abort_transaction;
1470 }
1471
1472 err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
1473 if (err) {
1474 message = "writing feature-rx-notify";
1475 goto abort_transaction;
1476 }
1477
1478 err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
1479 if (err) {
1480 message = "writing feature-sg";
1481 goto abort_transaction;
1482 }
1483
1484 err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1);
1485 if (err) {
1486 message = "writing feature-gso-tcpv4";
1487 goto abort_transaction;
1488 }
1489
1490 err = xenbus_transaction_end(xbt, 0);
1491 if (err) {
1492 if (err == -EAGAIN)
1493 goto again;
1494 xenbus_dev_fatal(dev, err, "completing transaction");
1495 goto destroy_ring;
1496 }
1497
1498 return 0;
1499
1500 abort_transaction:
1501 xenbus_transaction_end(xbt, 1);
1502 xenbus_dev_fatal(dev, err, "%s", message);
1503 destroy_ring:
1504 xennet_disconnect_backend(info);
1505 out:
1506 return err;
1507}
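
talk_to_backend() publishes all of the frontend's xenstore keys inside one transaction; if another writer raced with it, xenbus_transaction_end() returns -EAGAIN and the whole block is replayed from the transaction start. A stripped-down sketch of that retry shape with stubbed-out transaction calls (the txn_* stubs only mimic the -EAGAIN behaviour and the key/value pairs are illustrative; this is not the xenbus API):

#include <errno.h>
#include <stdio.h>

/* Stubs standing in for xenbus_transaction_start/printf/end. */
static int attempts;
static int txn_start(void) { return 0; }
static int txn_write(const char *k, const char *v)
{
	printf("write %s=%s\n", k, v);
	return 0;
}
static int txn_end(int abort)
{
	/* Pretend the first non-aborted attempt raced with another writer. */
	return (!abort && ++attempts < 2) ? -EAGAIN : 0;
}

static int publish_ring_refs(void)
{
	int err;

again:
	err = txn_start();
	if (err)
		return err;

	err = txn_write("tx-ring-ref", "8");
	if (err)
		goto abort;
	err = txn_write("event-channel", "12");
	if (err)
		goto abort;

	err = txn_end(0);
	if (err == -EAGAIN)
		goto again;     /* somebody raced with us: replay the writes */
	return err;

abort:
	txn_end(1);
	return err;
}

int main(void)
{
	return publish_ring_refs() ? 1 : 0;
}
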
1508
1509static int xennet_set_sg(struct net_device *dev, u32 data)
1510{
1511 if (data) {
1512 struct netfront_info *np = netdev_priv(dev);
1513 int val;
1514
1515 if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
1516 "%d", &val) < 0)
1517 val = 0;
1518 if (!val)
1519 return -ENOSYS;
1520 } else if (dev->mtu > ETH_DATA_LEN)
1521 dev->mtu = ETH_DATA_LEN;
1522
1523 return ethtool_op_set_sg(dev, data);
1524}
1525
1526static int xennet_set_tso(struct net_device *dev, u32 data)
1527{
1528 if (data) {
1529 struct netfront_info *np = netdev_priv(dev);
1530 int val;
1531
1532 if (xenbus_scanf(XBT_NIL, np->xbdev->otherend,
1533 "feature-gso-tcpv4", "%d", &val) < 0)
1534 val = 0;
1535 if (!val)
1536 return -ENOSYS;
1537 }
1538
1539 return ethtool_op_set_tso(dev, data);
1540}
1541
1542static void xennet_set_features(struct net_device *dev)
1543{
1544 /* Turn off all GSO bits except ROBUST. */
1545 dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
1546 dev->features |= NETIF_F_GSO_ROBUST;
1547 xennet_set_sg(dev, 0);
1548
1549 /* We need checksum offload to enable scatter/gather and TSO. */
1550 if (!(dev->features & NETIF_F_IP_CSUM))
1551 return;
1552
1553 if (!xennet_set_sg(dev, 1))
1554 xennet_set_tso(dev, 1);
1555}
1556
1557static int xennet_connect(struct net_device *dev)
1558{
1559 struct netfront_info *np = netdev_priv(dev);
1560 int i, requeue_idx, err;
1561 struct sk_buff *skb;
1562 grant_ref_t ref;
1563 struct xen_netif_rx_request *req;
1564 unsigned int feature_rx_copy;
1565
1566 err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
1567 "feature-rx-copy", "%u", &feature_rx_copy);
1568 if (err != 1)
1569 feature_rx_copy = 0;
1570
1571 if (!feature_rx_copy) {
1572 dev_info(&dev->dev,
1573			 "backend does not support copying receive path");
1574 return -ENODEV;
1575 }
1576
1577 err = talk_to_backend(np->xbdev, np);
1578 if (err)
1579 return err;
1580
1581 xennet_set_features(dev);
1582
1583 spin_lock_bh(&np->rx_lock);
1584 spin_lock_irq(&np->tx_lock);
1585
1586 /* Step 1: Discard all pending TX packet fragments. */
1587 xennet_release_tx_bufs(np);
1588
1589 /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
1590 for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
1591 if (!np->rx_skbs[i])
1592 continue;
1593
1594 skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
1595 ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
1596 req = RING_GET_REQUEST(&np->rx, requeue_idx);
1597
1598 gnttab_grant_foreign_access_ref(
1599 ref, np->xbdev->otherend_id,
1600 pfn_to_mfn(page_to_pfn(skb_shinfo(skb)->
1601 frags->page)),
1602 0);
1603 req->gref = ref;
1604 req->id = requeue_idx;
1605
1606 requeue_idx++;
1607 }
1608
1609 np->rx.req_prod_pvt = requeue_idx;
1610
1611 /*
1612 * Step 3: All public and private state should now be sane. Get
1613 * ready to start sending and receiving packets and give the driver
1614 * domain a kick because we've probably just requeued some
1615 * packets.
1616 */
1617 netif_carrier_on(np->netdev);
1618 notify_remote_via_irq(np->netdev->irq);
1619 xennet_tx_buf_gc(dev);
1620 xennet_alloc_rx_buffers(dev);
1621
1622 spin_unlock_irq(&np->tx_lock);
1623 spin_unlock_bh(&np->rx_lock);
1624
1625 return 0;
1626}
1627
1628/**
1629 * Callback received when the backend's state changes.
1630 */
1631static void backend_changed(struct xenbus_device *dev,
1632 enum xenbus_state backend_state)
1633{
1634 struct netfront_info *np = dev->dev.driver_data;
1635 struct net_device *netdev = np->netdev;
1636
1637 dev_dbg(&dev->dev, "%s\n", xenbus_strstate(backend_state));
1638
1639 switch (backend_state) {
1640 case XenbusStateInitialising:
1641 case XenbusStateInitialised:
1642 case XenbusStateConnected:
1643 case XenbusStateUnknown:
1644 case XenbusStateClosed:
1645 break;
1646
1647 case XenbusStateInitWait:
1648 if (dev->state != XenbusStateInitialising)
1649 break;
1650 if (xennet_connect(netdev) != 0)
1651 break;
1652 xenbus_switch_state(dev, XenbusStateConnected);
1653 break;
1654
1655 case XenbusStateClosing:
1656 xenbus_frontend_closed(dev);
1657 break;
1658 }
1659}
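/*
 * Rough sketch of the handshake handled above: once the backend reaches
 * InitWait, the frontend (still Initialising) sets up its rings and event
 * channel via xennet_connect() and advertises Connected; a backend moving
 * to Closing is handed to the generic xenbus_frontend_closed() path.
 */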
1660
1661static struct ethtool_ops xennet_ethtool_ops =
1662{
1663 .get_tx_csum = ethtool_op_get_tx_csum,
1664 .set_tx_csum = ethtool_op_set_tx_csum,
1665 .get_sg = ethtool_op_get_sg,
1666 .set_sg = xennet_set_sg,
1667 .get_tso = ethtool_op_get_tso,
1668 .set_tso = xennet_set_tso,
1669 .get_link = ethtool_op_get_link,
1670};
1671
1672#ifdef CONFIG_SYSFS
1673static ssize_t show_rxbuf_min(struct device *dev,
1674 struct device_attribute *attr, char *buf)
1675{
1676 struct net_device *netdev = to_net_dev(dev);
1677 struct netfront_info *info = netdev_priv(netdev);
1678
1679 return sprintf(buf, "%u\n", info->rx_min_target);
1680}
1681
1682static ssize_t store_rxbuf_min(struct device *dev,
1683 struct device_attribute *attr,
1684 const char *buf, size_t len)
1685{
1686 struct net_device *netdev = to_net_dev(dev);
1687 struct netfront_info *np = netdev_priv(netdev);
1688 char *endp;
1689 unsigned long target;
1690
1691 if (!capable(CAP_NET_ADMIN))
1692 return -EPERM;
1693
1694 target = simple_strtoul(buf, &endp, 0);
1695 if (endp == buf)
1696 return -EBADMSG;
1697
1698 if (target < RX_MIN_TARGET)
1699 target = RX_MIN_TARGET;
1700 if (target > RX_MAX_TARGET)
1701 target = RX_MAX_TARGET;
1702
1703 spin_lock_bh(&np->rx_lock);
1704 if (target > np->rx_max_target)
1705 np->rx_max_target = target;
1706 np->rx_min_target = target;
1707 if (target > np->rx_target)
1708 np->rx_target = target;
1709
1710 xennet_alloc_rx_buffers(netdev);
1711
1712 spin_unlock_bh(&np->rx_lock);
1713 return len;
1714}
1715
1716static ssize_t show_rxbuf_max(struct device *dev,
1717 struct device_attribute *attr, char *buf)
1718{
1719 struct net_device *netdev = to_net_dev(dev);
1720 struct netfront_info *info = netdev_priv(netdev);
1721
1722 return sprintf(buf, "%u\n", info->rx_max_target);
1723}
1724
1725static ssize_t store_rxbuf_max(struct device *dev,
1726 struct device_attribute *attr,
1727 const char *buf, size_t len)
1728{
1729 struct net_device *netdev = to_net_dev(dev);
1730 struct netfront_info *np = netdev_priv(netdev);
1731 char *endp;
1732 unsigned long target;
1733
1734 if (!capable(CAP_NET_ADMIN))
1735 return -EPERM;
1736
1737 target = simple_strtoul(buf, &endp, 0);
1738 if (endp == buf)
1739 return -EBADMSG;
1740
1741 if (target < RX_MIN_TARGET)
1742 target = RX_MIN_TARGET;
1743 if (target > RX_MAX_TARGET)
1744 target = RX_MAX_TARGET;
1745
1746 spin_lock_bh(&np->rx_lock);
1747 if (target < np->rx_min_target)
1748 np->rx_min_target = target;
1749 np->rx_max_target = target;
1750 if (target < np->rx_target)
1751 np->rx_target = target;
1752
1753 xennet_alloc_rx_buffers(netdev);
1754
1755 spin_unlock_bh(&np->rx_lock);
1756 return len;
1757}
1758
1759static ssize_t show_rxbuf_cur(struct device *dev,
1760 struct device_attribute *attr, char *buf)
1761{
1762 struct net_device *netdev = to_net_dev(dev);
1763 struct netfront_info *info = netdev_priv(netdev);
1764
1765 return sprintf(buf, "%u\n", info->rx_target);
1766}
1767
1768static struct device_attribute xennet_attrs[] = {
1769 __ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
1770 __ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
1771 __ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
1772};
1773
1774static int xennet_sysfs_addif(struct net_device *netdev)
1775{
1776 int i;
1777 int err;
1778
1779 for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
1780 err = device_create_file(&netdev->dev,
1781 &xennet_attrs[i]);
1782 if (err)
1783 goto fail;
1784 }
1785 return 0;
1786
1787 fail:
1788 while (--i >= 0)
1789 device_remove_file(&netdev->dev, &xennet_attrs[i]);
1790 return err;
1791}
1792
1793static void xennet_sysfs_delif(struct net_device *netdev)
1794{
1795 int i;
1796
1797 for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++)
1798 device_remove_file(&netdev->dev, &xennet_attrs[i]);
1799}
1800
1801#endif /* CONFIG_SYSFS */
1802
1803static struct xenbus_device_id netfront_ids[] = {
1804 { "vif" },
1805 { "" }
1806};
1807
1808
1809static int __devexit xennet_remove(struct xenbus_device *dev)
1810{
1811 struct netfront_info *info = dev->dev.driver_data;
1812
1813 dev_dbg(&dev->dev, "%s\n", dev->nodename);
1814
1815 unregister_netdev(info->netdev);
1816
1817 xennet_disconnect_backend(info);
1818
1819 del_timer_sync(&info->rx_refill_timer);
1820
1821 xennet_sysfs_delif(info->netdev);
1822
1823 free_netdev(info->netdev);
1824
1825 return 0;
1826}
1827
1828static struct xenbus_driver netfront = {
1829 .name = "vif",
1830 .owner = THIS_MODULE,
1831 .ids = netfront_ids,
1832 .probe = netfront_probe,
1833 .remove = __devexit_p(xennet_remove),
1834 .resume = netfront_resume,
1835 .otherend_changed = backend_changed,
1836};
1837
1838static int __init netif_init(void)
1839{
1840 if (!is_running_on_xen())
1841 return -ENODEV;
1842
1843 if (is_initial_xendomain())
1844 return 0;
1845
1846 printk(KERN_INFO "Initialising Xen virtual ethernet driver.\n");
1847
1848 return xenbus_register_frontend(&netfront);
1849}
1850module_init(netif_init);
1851
1852
1853static void __exit netif_exit(void)
1854{
1855 if (is_initial_xendomain())
1856 return;
1857
858 xenbus_unregister_driver(&netfront);
1859}
1860module_exit(netif_exit);
1861
1862MODULE_DESCRIPTION("Xen virtual network device frontend");
1863MODULE_LICENSE("GPL");
diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c
index 03baf1c64a2e..ed112ee16012 100644
--- a/drivers/pnp/pnpbios/core.c
+++ b/drivers/pnp/pnpbios/core.c
@@ -147,7 +147,7 @@ static int pnp_dock_event(int dock, struct pnp_docking_station_info *info)
 		info->location_id, info->serial, info->capabilities);
 	envp[i] = NULL;
 
-	value = call_usermodehelper (argv [0], argv, envp, 0);
+	value = call_usermodehelper (argv [0], argv, envp, UMH_WAIT_EXEC);
 	kfree (buf);
 	kfree (envp);
 	return 0;
diff --git a/drivers/sbus/char/bbc_envctrl.c b/drivers/sbus/char/bbc_envctrl.c
index a54e4140683a..e821a155b658 100644
--- a/drivers/sbus/char/bbc_envctrl.c
+++ b/drivers/sbus/char/bbc_envctrl.c
@@ -7,6 +7,7 @@
 #include <linux/kthread.h>
 #include <linux/delay.h>
 #include <linux/kmod.h>
+#include <linux/reboot.h>
 #include <asm/oplib.h>
 #include <asm/ebus.h>
 
@@ -170,8 +171,6 @@ static void get_current_temps(struct bbc_cpu_temperature *tp)
 static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp)
 {
 	static int shutting_down = 0;
-	static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
-	char *argv[] = { "/sbin/shutdown", "-h", "now", NULL };
 	char *type = "???";
 	s8 val = -1;
 
@@ -195,7 +194,7 @@ static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp)
 	printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n");
 
 	shutting_down = 1;
-	if (call_usermodehelper("/sbin/shutdown", argv, envp, 0) < 0)
+	if (orderly_poweroff(true) < 0)
 		printk(KERN_CRIT "envctrl: shutdown execution failed\n");
 }
 
diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c
index 8328acab47fd..dadabef116b6 100644
--- a/drivers/sbus/char/envctrl.c
+++ b/drivers/sbus/char/envctrl.c
@@ -26,6 +26,7 @@
 #include <linux/ioport.h>
 #include <linux/miscdevice.h>
 #include <linux/kmod.h>
+#include <linux/reboot.h>
 
 #include <asm/ebus.h>
 #include <asm/uaccess.h>
@@ -966,10 +967,6 @@ static struct i2c_child_t *envctrl_get_i2c_child(unsigned char mon_type)
 static void envctrl_do_shutdown(void)
 {
 	static int inprog = 0;
-	static char *envp[] = {
-		"HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
-	char *argv[] = {
-		"/sbin/shutdown", "-h", "now", NULL };
 	int ret;
 
 	if (inprog != 0)
@@ -977,7 +974,7 @@ static void envctrl_do_shutdown(void)
 
 	inprog = 1;
 	printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n");
-	ret = call_usermodehelper("/sbin/shutdown", argv, envp, 0);
+	ret = orderly_poweroff(true);
 	if (ret < 0) {
 		printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n");
 		inprog = 0; /* unlikely to succeed, but we could try again */
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
new file mode 100644
index 000000000000..56592f0d6cef
--- /dev/null
+++ b/drivers/xen/Makefile
@@ -0,0 +1,2 @@
1obj-y += grant-table.o
2obj-y += xenbus/
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
new file mode 100644
index 000000000000..ea94dbabf9a9
--- /dev/null
+++ b/drivers/xen/grant-table.c
@@ -0,0 +1,582 @@
1/******************************************************************************
2 * grant_table.c
3 *
4 * Granting foreign access to our memory reservation.
5 *
6 * Copyright (c) 2005-2006, Christopher Clark
7 * Copyright (c) 2004-2005, K A Fraser
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation; or, when distributed
12 * separately from the Linux kernel or incorporated into other
13 * software packages, subject to the following license:
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining a copy
16 * of this source file (the "Software"), to deal in the Software without
17 * restriction, including without limitation the rights to use, copy, modify,
18 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
19 * and to permit persons to whom the Software is furnished to do so, subject to
20 * the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 * IN THE SOFTWARE.
32 */
33
34#include <linux/module.h>
35#include <linux/sched.h>
36#include <linux/mm.h>
37#include <linux/vmalloc.h>
38#include <linux/uaccess.h>
39
40#include <xen/interface/xen.h>
41#include <xen/page.h>
42#include <xen/grant_table.h>
43
44#include <asm/pgtable.h>
45#include <asm/sync_bitops.h>
46
47
48/* External tools reserve first few grant table entries. */
49#define NR_RESERVED_ENTRIES 8
50#define GNTTAB_LIST_END 0xffffffff
51#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry))
52
53static grant_ref_t **gnttab_list;
54static unsigned int nr_grant_frames;
55static unsigned int boot_max_nr_grant_frames;
56static int gnttab_free_count;
57static grant_ref_t gnttab_free_head;
58static DEFINE_SPINLOCK(gnttab_list_lock);
59
60static struct grant_entry *shared;
61
62static struct gnttab_free_callback *gnttab_free_callback_list;
63
64static int gnttab_expand(unsigned int req_entries);
65
66#define RPP (PAGE_SIZE / sizeof(grant_ref_t))
67
68static inline grant_ref_t *__gnttab_entry(grant_ref_t entry)
69{
70 return &gnttab_list[(entry) / RPP][(entry) % RPP];
71}
72/* This can be used as an l-value */
73#define gnttab_entry(entry) (*__gnttab_entry(entry))
74
75static int get_free_entries(unsigned count)
76{
77 unsigned long flags;
78 int ref, rc;
79 grant_ref_t head;
80
81 spin_lock_irqsave(&gnttab_list_lock, flags);
82
83 if ((gnttab_free_count < count) &&
84 ((rc = gnttab_expand(count - gnttab_free_count)) < 0)) {
85 spin_unlock_irqrestore(&gnttab_list_lock, flags);
86 return rc;
87 }
88
89 ref = head = gnttab_free_head;
90 gnttab_free_count -= count;
91 while (count-- > 1)
92 head = gnttab_entry(head);
93 gnttab_free_head = gnttab_entry(head);
94 gnttab_entry(head) = GNTTAB_LIST_END;
95
96 spin_unlock_irqrestore(&gnttab_list_lock, flags);
97
98 return ref;
99}
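/*
 * Free-list layout: gnttab_list is an array of pages of grant_ref_t links,
 * and gnttab_entry(ref) acts as the "next" pointer for ref.  The function
 * above detaches a chain of 'count' references from the head of the list
 * and terminates the chain with GNTTAB_LIST_END so callers can walk it.
 */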
100
101static void do_free_callbacks(void)
102{
103 struct gnttab_free_callback *callback, *next;
104
105 callback = gnttab_free_callback_list;
106 gnttab_free_callback_list = NULL;
107
108 while (callback != NULL) {
109 next = callback->next;
110 if (gnttab_free_count >= callback->count) {
111 callback->next = NULL;
112 callback->fn(callback->arg);
113 } else {
114 callback->next = gnttab_free_callback_list;
115 gnttab_free_callback_list = callback;
116 }
117 callback = next;
118 }
119}
120
121static inline void check_free_callbacks(void)
122{
123 if (unlikely(gnttab_free_callback_list))
124 do_free_callbacks();
125}
126
127static void put_free_entry(grant_ref_t ref)
128{
129 unsigned long flags;
130 spin_lock_irqsave(&gnttab_list_lock, flags);
131 gnttab_entry(ref) = gnttab_free_head;
132 gnttab_free_head = ref;
133 gnttab_free_count++;
134 check_free_callbacks();
135 spin_unlock_irqrestore(&gnttab_list_lock, flags);
136}
137
138static void update_grant_entry(grant_ref_t ref, domid_t domid,
139 unsigned long frame, unsigned flags)
140{
141 /*
142 * Introducing a valid entry into the grant table:
143 * 1. Write ent->domid.
144 * 2. Write ent->frame:
145 * GTF_permit_access: Frame to which access is permitted.
146 * GTF_accept_transfer: Pseudo-phys frame slot being filled by new
147 * frame, or zero if none.
148 * 3. Write memory barrier (WMB).
149 * 4. Write ent->flags, inc. valid type.
150 */
151 shared[ref].frame = frame;
152 shared[ref].domid = domid;
153 wmb();
154 shared[ref].flags = flags;
155}
156
157/*
158 * Public grant-issuing interface functions
159 */
160void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
161 unsigned long frame, int readonly)
162{
163 update_grant_entry(ref, domid, frame,
164 GTF_permit_access | (readonly ? GTF_readonly : 0));
165}
166EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
167
168int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
169 int readonly)
170{
171 int ref;
172
173 ref = get_free_entries(1);
174 if (unlikely(ref < 0))
175 return -ENOSPC;
176
177 gnttab_grant_foreign_access_ref(ref, domid, frame, readonly);
178
179 return ref;
180}
181EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
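/*
 * Illustrative use only (names assumed): a frontend granting its backend
 * read/write access to a shared ring page might do
 *
 *	ref = gnttab_grant_foreign_access(dev->otherend_id,
 *					  virt_to_mfn(ring_page), 0);
 *	if (ref < 0)
 *		goto fail;
 */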
182
183int gnttab_query_foreign_access(grant_ref_t ref)
184{
185 u16 nflags;
186
187 nflags = shared[ref].flags;
188
189 return (nflags & (GTF_reading|GTF_writing));
190}
191EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
192
193int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
194{
195 u16 flags, nflags;
196
197 nflags = shared[ref].flags;
198 do {
199 flags = nflags;
200 if (flags & (GTF_reading|GTF_writing)) {
201 printk(KERN_ALERT "WARNING: g.e. still in use!\n");
202 return 0;
203 }
204 } while ((nflags = sync_cmpxchg(&shared[ref].flags, flags, 0)) != flags);
205
206 return 1;
207}
208EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
209
210void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
211 unsigned long page)
212{
213 if (gnttab_end_foreign_access_ref(ref, readonly)) {
214 put_free_entry(ref);
215 if (page != 0)
216 free_page(page);
217 } else {
218 /* XXX This needs to be fixed so that the ref and page are
219 placed on a list to be freed up later. */
220 printk(KERN_WARNING
221 "WARNING: leaking g.e. and page still in use!\n");
222 }
223}
224EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
225
226int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
227{
228 int ref;
229
230 ref = get_free_entries(1);
231 if (unlikely(ref < 0))
232 return -ENOSPC;
233 gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
234
235 return ref;
236}
237EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
238
239void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
240 unsigned long pfn)
241{
242 update_grant_entry(ref, domid, pfn, GTF_accept_transfer);
243}
244EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
245
246unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
247{
248 unsigned long frame;
249 u16 flags;
250
251 /*
252 * If a transfer is not even yet started, try to reclaim the grant
253 * reference and return failure (== 0).
254 */
255 while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
256 if (sync_cmpxchg(&shared[ref].flags, flags, 0) == flags)
257 return 0;
258 cpu_relax();
259 }
260
261 /* If a transfer is in progress then wait until it is completed. */
262 while (!(flags & GTF_transfer_completed)) {
263 flags = shared[ref].flags;
264 cpu_relax();
265 }
266
267 rmb(); /* Read the frame number /after/ reading completion status. */
268 frame = shared[ref].frame;
269 BUG_ON(frame == 0);
270
271 return frame;
272}
273EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
274
275unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
276{
277 unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
278 put_free_entry(ref);
279 return frame;
280}
281EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
282
283void gnttab_free_grant_reference(grant_ref_t ref)
284{
285 put_free_entry(ref);
286}
287EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
288
289void gnttab_free_grant_references(grant_ref_t head)
290{
291 grant_ref_t ref;
292 unsigned long flags;
293 int count = 1;
294 if (head == GNTTAB_LIST_END)
295 return;
296 spin_lock_irqsave(&gnttab_list_lock, flags);
297 ref = head;
298 while (gnttab_entry(ref) != GNTTAB_LIST_END) {
299 ref = gnttab_entry(ref);
300 count++;
301 }
302 gnttab_entry(ref) = gnttab_free_head;
303 gnttab_free_head = head;
304 gnttab_free_count += count;
305 check_free_callbacks();
306 spin_unlock_irqrestore(&gnttab_list_lock, flags);
307}
308EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
309
310int gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
311{
312 int h = get_free_entries(count);
313
314 if (h < 0)
315 return -ENOSPC;
316
317 *head = h;
318
319 return 0;
320}
321EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
322
323int gnttab_empty_grant_references(const grant_ref_t *private_head)
324{
325 return (*private_head == GNTTAB_LIST_END);
326}
327EXPORT_SYMBOL_GPL(gnttab_empty_grant_references);
328
329int gnttab_claim_grant_reference(grant_ref_t *private_head)
330{
331 grant_ref_t g = *private_head;
332 if (unlikely(g == GNTTAB_LIST_END))
333 return -ENOSPC;
334 *private_head = gnttab_entry(g);
335 return g;
336}
337EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
338
339void gnttab_release_grant_reference(grant_ref_t *private_head,
340 grant_ref_t release)
341{
342 gnttab_entry(release) = *private_head;
343 *private_head = release;
344}
345EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
346
347void gnttab_request_free_callback(struct gnttab_free_callback *callback,
348 void (*fn)(void *), void *arg, u16 count)
349{
350 unsigned long flags;
351 spin_lock_irqsave(&gnttab_list_lock, flags);
352 if (callback->next)
353 goto out;
354 callback->fn = fn;
355 callback->arg = arg;
356 callback->count = count;
357 callback->next = gnttab_free_callback_list;
358 gnttab_free_callback_list = callback;
359 check_free_callbacks();
360out:
361 spin_unlock_irqrestore(&gnttab_list_lock, flags);
362}
363EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
364
365void gnttab_cancel_free_callback(struct gnttab_free_callback *callback)
366{
367 struct gnttab_free_callback **pcb;
368 unsigned long flags;
369
370 spin_lock_irqsave(&gnttab_list_lock, flags);
371 for (pcb = &gnttab_free_callback_list; *pcb; pcb = &(*pcb)->next) {
372 if (*pcb == callback) {
373 *pcb = callback->next;
374 break;
375 }
376 }
377 spin_unlock_irqrestore(&gnttab_list_lock, flags);
378}
379EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback);
380
381static int grow_gnttab_list(unsigned int more_frames)
382{
383 unsigned int new_nr_grant_frames, extra_entries, i;
384
385 new_nr_grant_frames = nr_grant_frames + more_frames;
386 extra_entries = more_frames * GREFS_PER_GRANT_FRAME;
387
388 for (i = nr_grant_frames; i < new_nr_grant_frames; i++) {
389 gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC);
390 if (!gnttab_list[i])
391 goto grow_nomem;
392 }
393
394
395 for (i = GREFS_PER_GRANT_FRAME * nr_grant_frames;
396 i < GREFS_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++)
397 gnttab_entry(i) = i + 1;
398
399 gnttab_entry(i) = gnttab_free_head;
400 gnttab_free_head = GREFS_PER_GRANT_FRAME * nr_grant_frames;
401 gnttab_free_count += extra_entries;
402
403 nr_grant_frames = new_nr_grant_frames;
404
405 check_free_callbacks();
406
407 return 0;
408
409grow_nomem:
410 for ( ; i >= nr_grant_frames; i--)
411 free_page((unsigned long) gnttab_list[i]);
412 return -ENOMEM;
413}
414
415static unsigned int __max_nr_grant_frames(void)
416{
417 struct gnttab_query_size query;
418 int rc;
419
420 query.dom = DOMID_SELF;
421
422 rc = HYPERVISOR_grant_table_op(GNTTABOP_query_size, &query, 1);
423 if ((rc < 0) || (query.status != GNTST_okay))
424 return 4; /* Legacy max supported number of frames */
425
426 return query.max_nr_frames;
427}
428
429static inline unsigned int max_nr_grant_frames(void)
430{
431 unsigned int xen_max = __max_nr_grant_frames();
432
433 if (xen_max > boot_max_nr_grant_frames)
434 return boot_max_nr_grant_frames;
435 return xen_max;
436}
437
438static int map_pte_fn(pte_t *pte, struct page *pmd_page,
439 unsigned long addr, void *data)
440{
441 unsigned long **frames = (unsigned long **)data;
442
443 set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
444 (*frames)++;
445 return 0;
446}
447
448static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
449 unsigned long addr, void *data)
450{
451
452 set_pte_at(&init_mm, addr, pte, __pte(0));
453 return 0;
454}
455
456static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
457{
458 struct gnttab_setup_table setup;
459 unsigned long *frames;
460 unsigned int nr_gframes = end_idx + 1;
461 int rc;
462
463 frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
464 if (!frames)
465 return -ENOMEM;
466
467 setup.dom = DOMID_SELF;
468 setup.nr_frames = nr_gframes;
469 setup.frame_list = frames;
470
471 rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
472 if (rc == -ENOSYS) {
473 kfree(frames);
474 return -ENOSYS;
475 }
476
477 BUG_ON(rc || setup.status);
478
479 if (shared == NULL) {
480 struct vm_struct *area;
481 area = alloc_vm_area(PAGE_SIZE * max_nr_grant_frames());
482 BUG_ON(area == NULL);
483 shared = area->addr;
484 }
485 rc = apply_to_page_range(&init_mm, (unsigned long)shared,
486 PAGE_SIZE * nr_gframes,
487 map_pte_fn, &frames);
488 BUG_ON(rc);
489 frames -= nr_gframes; /* adjust after map_pte_fn() */
490
491 kfree(frames);
492
493 return 0;
494}
495
496static int gnttab_resume(void)
497{
498 if (max_nr_grant_frames() < nr_grant_frames)
499 return -ENOSYS;
500 return gnttab_map(0, nr_grant_frames - 1);
501}
502
503static int gnttab_suspend(void)
504{
505 apply_to_page_range(&init_mm, (unsigned long)shared,
506 PAGE_SIZE * nr_grant_frames,
507 unmap_pte_fn, NULL);
508
509 return 0;
510}
511
512static int gnttab_expand(unsigned int req_entries)
513{
514 int rc;
515 unsigned int cur, extra;
516
517 cur = nr_grant_frames;
518 extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) /
519 GREFS_PER_GRANT_FRAME);
520 if (cur + extra > max_nr_grant_frames())
521 return -ENOSPC;
522
523 rc = gnttab_map(cur, cur + extra - 1);
524 if (rc == 0)
525 rc = grow_gnttab_list(extra);
526
527 return rc;
528}
529
530static int __devinit gnttab_init(void)
531{
532 int i;
533 unsigned int max_nr_glist_frames;
534 unsigned int nr_init_grefs;
535
536 if (!is_running_on_xen())
537 return -ENODEV;
538
539 nr_grant_frames = 1;
540 boot_max_nr_grant_frames = __max_nr_grant_frames();
541
542 /* Determine the maximum number of frames required for the
543 * grant reference free list on the current hypervisor.
544 */
545 max_nr_glist_frames = (boot_max_nr_grant_frames *
546 GREFS_PER_GRANT_FRAME /
547 (PAGE_SIZE / sizeof(grant_ref_t)));
548
549 gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *),
550 GFP_KERNEL);
551 if (gnttab_list == NULL)
552 return -ENOMEM;
553
554 for (i = 0; i < nr_grant_frames; i++) {
555 gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL);
556 if (gnttab_list[i] == NULL)
557 goto ini_nomem;
558 }
559
560 if (gnttab_resume() < 0)
561 return -ENODEV;
562
563 nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME;
564
565 for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++)
566 gnttab_entry(i) = i + 1;
567
568 gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END;
569 gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
570 gnttab_free_head = NR_RESERVED_ENTRIES;
571
572 printk("Grant table initialized\n");
573 return 0;
574
575 ini_nomem:
576 for (i--; i >= 0; i--)
577 free_page((unsigned long)gnttab_list[i]);
578 kfree(gnttab_list);
579 return -ENOMEM;
580}
581
582core_initcall(gnttab_init);
diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
new file mode 100644
index 000000000000..5571f5b84223
--- /dev/null
+++ b/drivers/xen/xenbus/Makefile
@@ -0,0 +1,7 @@
1obj-y += xenbus.o
2
3xenbus-objs =
4xenbus-objs += xenbus_client.o
5xenbus-objs += xenbus_comms.o
6xenbus-objs += xenbus_xs.o
7xenbus-objs += xenbus_probe.o
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
new file mode 100644
index 000000000000..9fd2f70ab46d
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -0,0 +1,569 @@
1/******************************************************************************
2 * Client-facing interface for the Xenbus driver. In other words, the
3 * interface between the Xenbus and the device-specific code, be it the
4 * frontend or the backend of that driver.
5 *
6 * Copyright (C) 2005 XenSource Ltd
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#include <linux/types.h>
34#include <linux/vmalloc.h>
35#include <asm/xen/hypervisor.h>
36#include <xen/interface/xen.h>
37#include <xen/interface/event_channel.h>
38#include <xen/events.h>
39#include <xen/grant_table.h>
40#include <xen/xenbus.h>
41
42const char *xenbus_strstate(enum xenbus_state state)
43{
44 static const char *const name[] = {
45 [ XenbusStateUnknown ] = "Unknown",
46 [ XenbusStateInitialising ] = "Initialising",
47 [ XenbusStateInitWait ] = "InitWait",
48 [ XenbusStateInitialised ] = "Initialised",
49 [ XenbusStateConnected ] = "Connected",
50 [ XenbusStateClosing ] = "Closing",
51 [ XenbusStateClosed ] = "Closed",
52 };
53 return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
54}
55EXPORT_SYMBOL_GPL(xenbus_strstate);
56
57/**
58 * xenbus_watch_path - register a watch
59 * @dev: xenbus device
60 * @path: path to watch
61 * @watch: watch to register
62 * @callback: callback to register
63 *
64 * Register a @watch on the given path, using the given xenbus_watch structure
65 * for storage, and the given @callback function as the callback. Return 0 on
66 * success, or -errno on error. On success, the given @path will be saved as
67 * @watch->node, and remains the caller's to free. On error, @watch->node will
68 * be NULL, the device will switch to %XenbusStateClosing, and the error will
69 * be saved in the store.
70 */
71int xenbus_watch_path(struct xenbus_device *dev, const char *path,
72 struct xenbus_watch *watch,
73 void (*callback)(struct xenbus_watch *,
74 const char **, unsigned int))
75{
76 int err;
77
78 watch->node = path;
79 watch->callback = callback;
80
81 err = register_xenbus_watch(watch);
82
83 if (err) {
84 watch->node = NULL;
85 watch->callback = NULL;
86 xenbus_dev_fatal(dev, err, "adding watch on %s", path);
87 }
88
89 return err;
90}
91EXPORT_SYMBOL_GPL(xenbus_watch_path);
92
93
94/**
95 * xenbus_watch_pathfmt - register a watch on a sprintf-formatted path
96 * @dev: xenbus device
97 * @watch: watch to register
98 * @callback: callback to register
99 * @pathfmt: format of path to watch
100 *
101 * Register a watch on the path built from the given @pathfmt format string
102 * and its arguments, using the given xenbus_watch structure for storage,
103 * and the given @callback function as the callback. Return 0 on success,
104 * or -errno on error. On success, the formatted path will be saved as
105 * @watch->node, and becomes the caller's to kfree(). On error, watch->node
106 * will be NULL, so the caller has nothing to free, the device will switch
107 * to %XenbusStateClosing, and the error will be saved in the store.
108 */
109int xenbus_watch_pathfmt(struct xenbus_device *dev,
110 struct xenbus_watch *watch,
111 void (*callback)(struct xenbus_watch *,
112 const char **, unsigned int),
113 const char *pathfmt, ...)
114{
115 int err;
116 va_list ap;
117 char *path;
118
119 va_start(ap, pathfmt);
120 path = kvasprintf(GFP_KERNEL, pathfmt, ap);
121 va_end(ap);
122
123 if (!path) {
124 xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
125 return -ENOMEM;
126 }
127 err = xenbus_watch_path(dev, path, watch, callback);
128
129 if (err)
130 kfree(path);
131 return err;
132}
133EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
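/*
 * Illustrative use only (watch structure and callback names assumed): a
 * frontend typically watches its peer's state node like this:
 *
 *	err = xenbus_watch_pathfmt(dev, &dev->otherend_watch,
 *				   otherend_changed, "%s/state", dev->otherend);
 */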
134
135
136/**
137 * xenbus_switch_state
138 * @dev: xenbus device
140 * @state: new state
141 *
142 * Advertise in the store a change of the given driver to the given @state.
143 * Return 0 on success, or -errno on error. On error, the device will switch
144 * to XenbusStateClosing, and the error will be saved in the store.
145 */
146int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
147{
148 /* We check whether the state is currently set to the given value, and
149 if not, then the state is set. We don't want to unconditionally
150 write the given state, because we don't want to fire watches
151 unnecessarily. Furthermore, if the node has gone, we don't write
152 to it, as the device will be tearing down, and we don't want to
153 resurrect that directory.
154
155 Note that, because of this cached value of our state, this function
156 will not work inside a Xenstore transaction (something it
157 attempted to do in the past) because dev->state would not get
158 reset if the transaction was aborted.
159
160 */
161
162 int current_state;
163 int err;
164
165 if (state == dev->state)
166 return 0;
167
168 err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
169 &current_state);
170 if (err != 1)
171 return 0;
172
173 err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
174 if (err) {
175 if (state != XenbusStateClosing) /* Avoid looping */
176 xenbus_dev_fatal(dev, err, "writing new state");
177 return err;
178 }
179
180 dev->state = state;
181
182 return 0;
183}
184EXPORT_SYMBOL_GPL(xenbus_switch_state);
185
186int xenbus_frontend_closed(struct xenbus_device *dev)
187{
188 xenbus_switch_state(dev, XenbusStateClosed);
189 complete(&dev->down);
190 return 0;
191}
192EXPORT_SYMBOL_GPL(xenbus_frontend_closed);
193
194/**
195 * Return the path to the error node for the given device, or NULL on failure.
196 * If the value returned is non-NULL, then it is the caller's to kfree.
197 */
198static char *error_path(struct xenbus_device *dev)
199{
200 return kasprintf(GFP_KERNEL, "error/%s", dev->nodename);
201}
202
203
204static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
205 const char *fmt, va_list ap)
206{
207 int ret;
208 unsigned int len;
209 char *printf_buffer = NULL;
210 char *path_buffer = NULL;
211
212#define PRINTF_BUFFER_SIZE 4096
213 printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
214 if (printf_buffer == NULL)
215 goto fail;
216
217 len = sprintf(printf_buffer, "%i ", -err);
218 ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
219
220 BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
221
222 dev_err(&dev->dev, "%s\n", printf_buffer);
223
224 path_buffer = error_path(dev);
225
226 if (path_buffer == NULL) {
227 dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
228 dev->nodename, printf_buffer);
229 goto fail;
230 }
231
232 if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
233 dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
234 dev->nodename, printf_buffer);
235 goto fail;
236 }
237
238fail:
239 kfree(printf_buffer);
240 kfree(path_buffer);
241}
242
243
244/**
245 * xenbus_dev_error
246 * @dev: xenbus device
247 * @err: error to report
248 * @fmt: error message format
249 *
250 * Report the given negative errno into the store, along with the given
251 * formatted message.
252 */
253void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...)
254{
255 va_list ap;
256
257 va_start(ap, fmt);
258 xenbus_va_dev_error(dev, err, fmt, ap);
259 va_end(ap);
260}
261EXPORT_SYMBOL_GPL(xenbus_dev_error);
262
263/**
264 * xenbus_dev_fatal
265 * @dev: xenbus device
266 * @err: error to report
267 * @fmt: error message format
268 *
269 * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
270 * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
271 * closedown of this driver and its peer.
272 */
273
274void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
275{
276 va_list ap;
277
278 va_start(ap, fmt);
279 xenbus_va_dev_error(dev, err, fmt, ap);
280 va_end(ap);
281
282 xenbus_switch_state(dev, XenbusStateClosing);
283}
284EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
285
286/**
287 * xenbus_grant_ring
288 * @dev: xenbus device
289 * @ring_mfn: mfn of ring to grant
290
291 * Grant access to the given @ring_mfn to the peer of the given device. Return
292 * 0 on success, or -errno on error. On error, the device will switch to
293 * XenbusStateClosing, and the error will be saved in the store.
294 */
295int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
296{
297 int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
298 if (err < 0)
299 xenbus_dev_fatal(dev, err, "granting access to ring page");
300 return err;
301}
302EXPORT_SYMBOL_GPL(xenbus_grant_ring);
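/*
 * Illustrative use only (names assumed): frontends typically grant their
 * freshly allocated shared ring and stash the reference for the xenstore
 * handshake:
 *
 *	err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
 *	if (err < 0)
 *		goto fail;
 *	info->ring_ref = err;
 */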
303
304
305/**
306 * Allocate an event channel for the given xenbus_device, assigning the newly
307 * created local port to *port. Return 0 on success, or -errno on error. On
308 * error, the device will switch to XenbusStateClosing, and the error will be
309 * saved in the store.
310 */
311int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
312{
313 struct evtchn_alloc_unbound alloc_unbound;
314 int err;
315
316 alloc_unbound.dom = DOMID_SELF;
317 alloc_unbound.remote_dom = dev->otherend_id;
318
319 err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
320 &alloc_unbound);
321 if (err)
322 xenbus_dev_fatal(dev, err, "allocating event channel");
323 else
324 *port = alloc_unbound.port;
325
326 return err;
327}
328EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
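/*
 * Illustrative use only (names assumed): allocate the channel, then bind it
 * to a local irq handler:
 *
 *	err = xenbus_alloc_evtchn(dev, &info->evtchn);
 *	if (err)
 *		goto fail;
 *	err = bind_evtchn_to_irqhandler(info->evtchn, my_interrupt, 0,
 *					"mydev", info);
 */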
329
330
331/**
332 * Bind to an existing interdomain event channel in another domain. Returns 0
333 * on success and stores the local port in *port. On error, returns -errno,
334 * switches the device to XenbusStateClosing, and saves the error in XenStore.
335 */
336int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
337{
338 struct evtchn_bind_interdomain bind_interdomain;
339 int err;
340
341 bind_interdomain.remote_dom = dev->otherend_id;
342 bind_interdomain.remote_port = remote_port;
343
344 err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
345 &bind_interdomain);
346 if (err)
347 xenbus_dev_fatal(dev, err,
348 "binding to event channel %d from domain %d",
349 remote_port, dev->otherend_id);
350 else
351 *port = bind_interdomain.local_port;
352
353 return err;
354}
355EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
356
357
358/**
359 * Free an existing event channel. Returns 0 on success or -errno on error.
360 */
361int xenbus_free_evtchn(struct xenbus_device *dev, int port)
362{
363 struct evtchn_close close;
364 int err;
365
366 close.port = port;
367
368 err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
369 if (err)
370 xenbus_dev_error(dev, err, "freeing event channel %d", port);
371
372 return err;
373}
374EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
375
376
377/**
378 * xenbus_map_ring_valloc
379 * @dev: xenbus device
380 * @gnt_ref: grant reference
381 * @vaddr: pointer to address to be filled out by mapping
382 *
383 * Based on Rusty Russell's skeleton driver's map_page.
384 * Map a page of memory into this domain from another domain's grant table.
385 * xenbus_map_ring_valloc allocates a page of virtual address space, maps the
386 * page to that address, and sets *vaddr to that address.
387 * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
388 * or -ENOMEM on error. If an error is returned, device will switch to
389 * XenbusStateClosing and the error message will be saved in XenStore.
390 */
391int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
392{
393 struct gnttab_map_grant_ref op = {
394 .flags = GNTMAP_host_map,
395 .ref = gnt_ref,
396 .dom = dev->otherend_id,
397 };
398 struct vm_struct *area;
399
400 *vaddr = NULL;
401
402 area = alloc_vm_area(PAGE_SIZE);
403 if (!area)
404 return -ENOMEM;
405
406 op.host_addr = (unsigned long)area->addr;
407
408 if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
409 BUG();
410
411 if (op.status != GNTST_okay) {
412 free_vm_area(area);
413 xenbus_dev_fatal(dev, op.status,
414 "mapping in shared page %d from domain %d",
415 gnt_ref, dev->otherend_id);
416 return op.status;
417 }
418
419 /* Stuff the handle in an unused field */
420 area->phys_addr = (unsigned long)op.handle;
421
422 *vaddr = area->addr;
423 return 0;
424}
425EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
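/*
 * Illustrative use only (backend side, names assumed): map the frontend's
 * ring into a fresh vmalloc area and initialise the shared ring over it:
 *
 *	err = xenbus_map_ring_valloc(dev, ring_ref, &addr);
 *	if (err)
 *		goto fail;
 *	BACK_RING_INIT(&info->ring, (struct myif_sring *)addr, PAGE_SIZE);
 */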
426
427
428/**
429 * xenbus_map_ring
430 * @dev: xenbus device
431 * @gnt_ref: grant reference
432 * @handle: pointer to grant handle to be filled
433 * @vaddr: address to be mapped to
434 *
435 * Map a page of memory into this domain from another domain's grant table.
436 * xenbus_map_ring does not allocate the virtual address space (you must do
437 * this yourself!). It only maps in the page to the specified address.
438 * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
439 * or -ENOMEM on error. If an error is returned, device will switch to
440 * XenbusStateClosing and the error message will be saved in XenStore.
441 */
442int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
443 grant_handle_t *handle, void *vaddr)
444{
445 struct gnttab_map_grant_ref op = {
446 .host_addr = (unsigned long)vaddr,
447 .flags = GNTMAP_host_map,
448 .ref = gnt_ref,
449 .dom = dev->otherend_id,
450 };
451
452 if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
453 BUG();
454
455 if (op.status != GNTST_okay) {
456 xenbus_dev_fatal(dev, op.status,
457 "mapping in shared page %d from domain %d",
458 gnt_ref, dev->otherend_id);
459 } else
460 *handle = op.handle;
461
462 return op.status;
463}
464EXPORT_SYMBOL_GPL(xenbus_map_ring);
465
466
467/**
468 * xenbus_unmap_ring_vfree
469 * @dev: xenbus device
470 * @vaddr: addr to unmap
471 *
472 * Based on Rusty Russell's skeleton driver's unmap_page.
473 * Unmap a page of memory in this domain that was imported from another domain.
474 * Use xenbus_unmap_ring_vfree if you mapped in your memory with
475 * xenbus_map_ring_valloc (it will free the virtual address space).
476 * Returns 0 on success and returns GNTST_* on error
477 * (see xen/include/interface/grant_table.h).
478 */
479int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
480{
481 struct vm_struct *area;
482 struct gnttab_unmap_grant_ref op = {
483 .host_addr = (unsigned long)vaddr,
484 };
485
486 /* It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr)
487 * method so that we don't have to muck with vmalloc internals here.
488 * We could force the user to hang on to their struct vm_struct from
489 * xenbus_map_ring_valloc, but these 6 lines considerably simplify
490 * this API.
491 */
492 read_lock(&vmlist_lock);
493 for (area = vmlist; area != NULL; area = area->next) {
494 if (area->addr == vaddr)
495 break;
496 }
497 read_unlock(&vmlist_lock);
498
499 if (!area) {
500 xenbus_dev_error(dev, -ENOENT,
501 "can't find mapped virtual address %p", vaddr);
502 return GNTST_bad_virt_addr;
503 }
504
505 op.handle = (grant_handle_t)area->phys_addr;
506
507 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
508 BUG();
509
510 if (op.status == GNTST_okay)
511 free_vm_area(area);
512 else
513 xenbus_dev_error(dev, op.status,
514 "unmapping page at handle %d error %d",
515 (int16_t)area->phys_addr, op.status);
516
517 return op.status;
518}
519EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
520
521
522/**
523 * xenbus_unmap_ring
524 * @dev: xenbus device
525 * @handle: grant handle
526 * @vaddr: addr to unmap
527 *
528 * Unmap a page of memory in this domain that was imported from another domain.
529 * Returns 0 on success and returns GNTST_* on error
530 * (see xen/include/interface/grant_table.h).
531 */
532int xenbus_unmap_ring(struct xenbus_device *dev,
533 grant_handle_t handle, void *vaddr)
534{
535 struct gnttab_unmap_grant_ref op = {
536 .host_addr = (unsigned long)vaddr,
537 .handle = handle,
538 };
539
540 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
541 BUG();
542
543 if (op.status != GNTST_okay)
544 xenbus_dev_error(dev, op.status,
545 "unmapping page at handle %d error %d",
546 handle, op.status);
547
548 return op.status;
549}
550EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
551
552
553/**
554 * xenbus_read_driver_state
555 * @path: path for driver
556 *
557 * Return the state of the driver rooted at the given store path, or
558 * XenbusStateUnknown if no state can be read.
559 */
560enum xenbus_state xenbus_read_driver_state(const char *path)
561{
562 enum xenbus_state result;
563 int err = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL);
564 if (err)
565 result = XenbusStateUnknown;
566
567 return result;
568}
569EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
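/*
 * Illustrative use only (names assumed): a frontend can check how far its
 * backend has progressed, e.g.
 *
 *	if (xenbus_read_driver_state(dev->otherend) == XenbusStateClosed)
 *		cleanup_can_proceed(info);
 */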
diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c
new file mode 100644
index 000000000000..6efbe3f29ca5
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_comms.c
@@ -0,0 +1,233 @@
1/******************************************************************************
2 * xenbus_comms.c
3 *
4 * Low level code to talk to Xen Store: ringbuffer and event channel.
5 *
6 * Copyright (C) 2005 Rusty Russell, IBM Corporation
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#include <linux/wait.h>
34#include <linux/interrupt.h>
35#include <linux/sched.h>
36#include <linux/err.h>
37#include <xen/xenbus.h>
38#include <asm/xen/hypervisor.h>
39#include <xen/events.h>
40#include <xen/page.h>
41#include "xenbus_comms.h"
42
43static int xenbus_irq;
44
45static DECLARE_WORK(probe_work, xenbus_probe);
46
47static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
48
49static irqreturn_t wake_waiting(int irq, void *unused)
50{
51 if (unlikely(xenstored_ready == 0)) {
52 xenstored_ready = 1;
53 schedule_work(&probe_work);
54 }
55
56 wake_up(&xb_waitq);
57 return IRQ_HANDLED;
58}
59
60static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
61{
62 return ((prod - cons) <= XENSTORE_RING_SIZE);
63}
64
65static void *get_output_chunk(XENSTORE_RING_IDX cons,
66 XENSTORE_RING_IDX prod,
67 char *buf, uint32_t *len)
68{
69 *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
70 if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
71 *len = XENSTORE_RING_SIZE - (prod - cons);
72 return buf + MASK_XENSTORE_IDX(prod);
73}
74
75static const void *get_input_chunk(XENSTORE_RING_IDX cons,
76 XENSTORE_RING_IDX prod,
77 const char *buf, uint32_t *len)
78{
79 *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
80 if ((prod - cons) < *len)
81 *len = prod - cons;
82 return buf + MASK_XENSTORE_IDX(cons);
83}
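/*
 * The xenstore ring uses free-running producer/consumer indices; only the
 * low bits (MASK_XENSTORE_IDX) index into the buffer, while prod - cons is
 * the number of bytes in flight, which check_indexes() above sanity-checks.
 */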
84
85/**
86 * xb_write - low level write
87 * @data: buffer to send
88 * @len: length of buffer
89 *
90 * Returns 0 on success, error otherwise.
91 */
92int xb_write(const void *data, unsigned len)
93{
94 struct xenstore_domain_interface *intf = xen_store_interface;
95 XENSTORE_RING_IDX cons, prod;
96 int rc;
97
98 while (len != 0) {
99 void *dst;
100 unsigned int avail;
101
102 rc = wait_event_interruptible(
103 xb_waitq,
104 (intf->req_prod - intf->req_cons) !=
105 XENSTORE_RING_SIZE);
106 if (rc < 0)
107 return rc;
108
109 /* Read indexes, then verify. */
110 cons = intf->req_cons;
111 prod = intf->req_prod;
112 if (!check_indexes(cons, prod)) {
113 intf->req_cons = intf->req_prod = 0;
114 return -EIO;
115 }
116
117 dst = get_output_chunk(cons, prod, intf->req, &avail);
118 if (avail == 0)
119 continue;
120 if (avail > len)
121 avail = len;
122
123 /* Must write data /after/ reading the consumer index. */
124 mb();
125
126 memcpy(dst, data, avail);
127 data += avail;
128 len -= avail;
129
130 /* Other side must not see new producer until data is there. */
131 wmb();
132 intf->req_prod += avail;
133
134 /* Implies mb(): other side will see the updated producer. */
135 notify_remote_via_evtchn(xen_store_evtchn);
136 }
137
138 return 0;
139}
140
141int xb_data_to_read(void)
142{
143 struct xenstore_domain_interface *intf = xen_store_interface;
144 return (intf->rsp_cons != intf->rsp_prod);
145}
146
147int xb_wait_for_data_to_read(void)
148{
149 return wait_event_interruptible(xb_waitq, xb_data_to_read());
150}
151
152int xb_read(void *data, unsigned len)
153{
154 struct xenstore_domain_interface *intf = xen_store_interface;
155 XENSTORE_RING_IDX cons, prod;
156 int rc;
157
158 while (len != 0) {
159 unsigned int avail;
160 const char *src;
161
162 rc = xb_wait_for_data_to_read();
163 if (rc < 0)
164 return rc;
165
166 /* Read indexes, then verify. */
167 cons = intf->rsp_cons;
168 prod = intf->rsp_prod;
169 if (!check_indexes(cons, prod)) {
170 intf->rsp_cons = intf->rsp_prod = 0;
171 return -EIO;
172 }
173
174 src = get_input_chunk(cons, prod, intf->rsp, &avail);
175 if (avail == 0)
176 continue;
177 if (avail > len)
178 avail = len;
179
180 /* Must read data /after/ reading the producer index. */
181 rmb();
182
183 memcpy(data, src, avail);
184 data += avail;
185 len -= avail;
186
187 /* Other side must not see free space until we've copied out */
188 mb();
189 intf->rsp_cons += avail;
190
191 pr_debug("Finished read of %i bytes (%i to go)\n", avail, len);
192
193 /* Implies mb(): other side will see the updated consumer. */
194 notify_remote_via_evtchn(xen_store_evtchn);
195 }
196
197 return 0;
198}
199
200/**
201 * xb_init_comms - Set up interrupt handler off store event channel.
202 */
203int xb_init_comms(void)
204{
205 struct xenstore_domain_interface *intf = xen_store_interface;
206 int err;
207
208 if (intf->req_prod != intf->req_cons)
209 printk(KERN_ERR "XENBUS request ring is not quiescent "
210 "(%08x:%08x)!\n", intf->req_cons, intf->req_prod);
211
212 if (intf->rsp_prod != intf->rsp_cons) {
213 printk(KERN_WARNING "XENBUS response ring is not quiescent "
214 "(%08x:%08x): fixing up\n",
215 intf->rsp_cons, intf->rsp_prod);
216 intf->rsp_cons = intf->rsp_prod;
217 }
218
219 if (xenbus_irq)
220 unbind_from_irqhandler(xenbus_irq, &xb_waitq);
221
222 err = bind_evtchn_to_irqhandler(
223 xen_store_evtchn, wake_waiting,
224 0, "xenbus", &xb_waitq);
225 if (err <= 0) {
226 printk(KERN_ERR "XENBUS request irq failed %i\n", err);
227 return err;
228 }
229
230 xenbus_irq = err;
231
232 return 0;
233}
diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h
new file mode 100644
index 000000000000..c21db7513736
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_comms.h
@@ -0,0 +1,46 @@
1/*
2 * Private include for xenbus communications.
3 *
4 * Copyright (C) 2005 Rusty Russell, IBM Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License version 2
8 * as published by the Free Software Foundation; or, when distributed
9 * separately from the Linux kernel or incorporated into other
10 * software packages, subject to the following license:
11 *
12 * Permission is hereby granted, free of charge, to any person obtaining a copy
13 * of this source file (the "Software"), to deal in the Software without
14 * restriction, including without limitation the rights to use, copy, modify,
15 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
16 * and to permit persons to whom the Software is furnished to do so, subject to
17 * the following conditions:
18 *
19 * The above copyright notice and this permission notice shall be included in
20 * all copies or substantial portions of the Software.
21 *
22 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
28 * IN THE SOFTWARE.
29 */
30
31#ifndef _XENBUS_COMMS_H
32#define _XENBUS_COMMS_H
33
34int xs_init(void);
35int xb_init_comms(void);
36
37/* Low level routines. */
38int xb_write(const void *data, unsigned len);
39int xb_read(void *data, unsigned len);
40int xb_data_to_read(void);
41int xb_wait_for_data_to_read(void);
42int xs_input_avail(void);
43extern struct xenstore_domain_interface *xen_store_interface;
44extern int xen_store_evtchn;
45
46#endif /* _XENBUS_COMMS_H */
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
new file mode 100644
index 000000000000..0b769f7c4a48
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -0,0 +1,935 @@
1/******************************************************************************
2 * Talks to Xen Store to figure out what devices we have.
3 *
4 * Copyright (C) 2005 Rusty Russell, IBM Corporation
5 * Copyright (C) 2005 Mike Wray, Hewlett-Packard
6 * Copyright (C) 2005, 2006 XenSource Ltd
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#define DPRINTK(fmt, args...) \
34 pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
35 __func__, __LINE__, ##args)
36
37#include <linux/kernel.h>
38#include <linux/err.h>
39#include <linux/string.h>
40#include <linux/ctype.h>
41#include <linux/fcntl.h>
42#include <linux/mm.h>
43#include <linux/notifier.h>
44#include <linux/kthread.h>
45#include <linux/mutex.h>
46#include <linux/io.h>
47
48#include <asm/page.h>
49#include <asm/pgtable.h>
50#include <asm/xen/hypervisor.h>
51#include <xen/xenbus.h>
52#include <xen/events.h>
53#include <xen/page.h>
54
55#include "xenbus_comms.h"
56#include "xenbus_probe.h"
57
58int xen_store_evtchn;
59struct xenstore_domain_interface *xen_store_interface;
60static unsigned long xen_store_mfn;
61
62static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
63
64static void wait_for_devices(struct xenbus_driver *xendrv);
65
66static int xenbus_probe_frontend(const char *type, const char *name);
67
68static void xenbus_dev_shutdown(struct device *_dev);
69
70/* If something in array of ids matches this device, return it. */
71static const struct xenbus_device_id *
72match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
73{
74 for (; *arr->devicetype != '\0'; arr++) {
75 if (!strcmp(arr->devicetype, dev->devicetype))
76 return arr;
77 }
78 return NULL;
79}
80
81int xenbus_match(struct device *_dev, struct device_driver *_drv)
82{
83 struct xenbus_driver *drv = to_xenbus_driver(_drv);
84
85 if (!drv->ids)
86 return 0;
87
88 return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
89}
90
91/* device/<type>/<id> => <type>-<id> */
92static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
93{
94 nodename = strchr(nodename, '/');
95 if (!nodename || strlen(nodename + 1) >= BUS_ID_SIZE) {
96 printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
97 return -EINVAL;
98 }
99
100 strlcpy(bus_id, nodename + 1, BUS_ID_SIZE);
101 if (!strchr(bus_id, '/')) {
102 printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
103 return -EINVAL;
104 }
105 *strchr(bus_id, '/') = '-';
106 return 0;
107}
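/*
 * Illustrative example (not part of the original patch): for the xenstore
 * node "device/vif/0", frontend_bus_id() skips the leading "device/" and
 * rewrites the remaining '/' to '-', yielding the bus id "vif-0"; a node
 * without a second '/' is rejected with -EINVAL.
 */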
108
109
110static void free_otherend_details(struct xenbus_device *dev)
111{
112 kfree(dev->otherend);
113 dev->otherend = NULL;
114}
115
116
117static void free_otherend_watch(struct xenbus_device *dev)
118{
119 if (dev->otherend_watch.node) {
120 unregister_xenbus_watch(&dev->otherend_watch);
121 kfree(dev->otherend_watch.node);
122 dev->otherend_watch.node = NULL;
123 }
124}
125
126
127int read_otherend_details(struct xenbus_device *xendev,
128 char *id_node, char *path_node)
129{
130 int err = xenbus_gather(XBT_NIL, xendev->nodename,
131 id_node, "%i", &xendev->otherend_id,
132 path_node, NULL, &xendev->otherend,
133 NULL);
134 if (err) {
135 xenbus_dev_fatal(xendev, err,
136 "reading other end details from %s",
137 xendev->nodename);
138 return err;
139 }
140 if (strlen(xendev->otherend) == 0 ||
141 !xenbus_exists(XBT_NIL, xendev->otherend, "")) {
142 xenbus_dev_fatal(xendev, -ENOENT,
143 "unable to read other end from %s. "
144 "missing or inaccessible.",
145 xendev->nodename);
146 free_otherend_details(xendev);
147 return -ENOENT;
148 }
149
150 return 0;
151}
152
153
154static int read_backend_details(struct xenbus_device *xendev)
155{
156 return read_otherend_details(xendev, "backend-id", "backend");
157}
158
159
160/* Bus type for frontend drivers. */
161static struct xen_bus_type xenbus_frontend = {
162 .root = "device",
163 .levels = 2, /* device/type/<id> */
164 .get_bus_id = frontend_bus_id,
165 .probe = xenbus_probe_frontend,
166 .bus = {
167 .name = "xen",
168 .match = xenbus_match,
169 .probe = xenbus_dev_probe,
170 .remove = xenbus_dev_remove,
171 .shutdown = xenbus_dev_shutdown,
172 },
173};
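/*
 * Descriptive note (added for clarity): .root and .levels describe the
 * xenstore layout "device/<type>/<id>" that the probe code below walks,
 * and registering the embedded bus_type (done in xenbus_probe_init())
 * makes frontend devices visible under /sys/bus/xen when sysfs is enabled.
 */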
174
175static void otherend_changed(struct xenbus_watch *watch,
176 const char **vec, unsigned int len)
177{
178 struct xenbus_device *dev =
179 container_of(watch, struct xenbus_device, otherend_watch);
180 struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
181 enum xenbus_state state;
182
183 /* Protect us against watches firing on old details when the otherend
184 details change, say immediately after a resume. */
185 if (!dev->otherend ||
186 strncmp(dev->otherend, vec[XS_WATCH_PATH],
187 strlen(dev->otherend))) {
188 dev_dbg(&dev->dev, "Ignoring watch at %s", vec[XS_WATCH_PATH]);
189 return;
190 }
191
192 state = xenbus_read_driver_state(dev->otherend);
193
194 dev_dbg(&dev->dev, "state is %d, (%s), %s, %s",
195 state, xenbus_strstate(state), dev->otherend_watch.node,
196 vec[XS_WATCH_PATH]);
197
198 /*
199 * Ignore xenbus transitions during shutdown. This prevents us doing
200 * work that can fail e.g., when the rootfs is gone.
201 */
202 if (system_state > SYSTEM_RUNNING) {
 203		struct xen_bus_type *bus =
 204			container_of(dev->dev.bus, struct xen_bus_type, bus);
205 /* If we're frontend, drive the state machine to Closed. */
206 /* This should cause the backend to release our resources. */
207 if ((bus == &xenbus_frontend) && (state == XenbusStateClosing))
208 xenbus_frontend_closed(dev);
209 return;
210 }
211
212 if (drv->otherend_changed)
213 drv->otherend_changed(dev, state);
214}
215
216
217static int talk_to_otherend(struct xenbus_device *dev)
218{
219 struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
220
221 free_otherend_watch(dev);
222 free_otherend_details(dev);
223
224 return drv->read_otherend_details(dev);
225}
226
227
228static int watch_otherend(struct xenbus_device *dev)
229{
230 return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed,
231 "%s/%s", dev->otherend, "state");
232}
233
234
235int xenbus_dev_probe(struct device *_dev)
236{
237 struct xenbus_device *dev = to_xenbus_device(_dev);
238 struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
239 const struct xenbus_device_id *id;
240 int err;
241
242 DPRINTK("%s", dev->nodename);
243
244 if (!drv->probe) {
245 err = -ENODEV;
246 goto fail;
247 }
248
249 id = match_device(drv->ids, dev);
250 if (!id) {
251 err = -ENODEV;
252 goto fail;
253 }
254
255 err = talk_to_otherend(dev);
256 if (err) {
257 dev_warn(&dev->dev, "talk_to_otherend on %s failed.\n",
258 dev->nodename);
259 return err;
260 }
261
262 err = drv->probe(dev, id);
263 if (err)
264 goto fail;
265
266 err = watch_otherend(dev);
267 if (err) {
268 dev_warn(&dev->dev, "watch_otherend on %s failed.\n",
269 dev->nodename);
270 return err;
271 }
272
273 return 0;
274fail:
275 xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
276 xenbus_switch_state(dev, XenbusStateClosed);
277 return -ENODEV;
278}
279
280int xenbus_dev_remove(struct device *_dev)
281{
282 struct xenbus_device *dev = to_xenbus_device(_dev);
283 struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
284
285 DPRINTK("%s", dev->nodename);
286
287 free_otherend_watch(dev);
288 free_otherend_details(dev);
289
290 if (drv->remove)
291 drv->remove(dev);
292
293 xenbus_switch_state(dev, XenbusStateClosed);
294 return 0;
295}
296
297static void xenbus_dev_shutdown(struct device *_dev)
298{
299 struct xenbus_device *dev = to_xenbus_device(_dev);
300 unsigned long timeout = 5*HZ;
301
302 DPRINTK("%s", dev->nodename);
303
304 get_device(&dev->dev);
305 if (dev->state != XenbusStateConnected) {
306 printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__,
307 dev->nodename, xenbus_strstate(dev->state));
308 goto out;
309 }
310 xenbus_switch_state(dev, XenbusStateClosing);
311 timeout = wait_for_completion_timeout(&dev->down, timeout);
312 if (!timeout)
313 printk(KERN_INFO "%s: %s timeout closing device\n",
314 __func__, dev->nodename);
315 out:
316 put_device(&dev->dev);
317}
318
319int xenbus_register_driver_common(struct xenbus_driver *drv,
320 struct xen_bus_type *bus,
321 struct module *owner,
322 const char *mod_name)
323{
324 drv->driver.name = drv->name;
325 drv->driver.bus = &bus->bus;
326 drv->driver.owner = owner;
327 drv->driver.mod_name = mod_name;
328
329 return driver_register(&drv->driver);
330}
331
332int __xenbus_register_frontend(struct xenbus_driver *drv,
333 struct module *owner, const char *mod_name)
334{
335 int ret;
336
337 drv->read_otherend_details = read_backend_details;
338
339 ret = xenbus_register_driver_common(drv, &xenbus_frontend,
340 owner, mod_name);
341 if (ret)
342 return ret;
343
344 /* If this driver is loaded as a module wait for devices to attach. */
345 wait_for_devices(drv);
346
347 return 0;
348}
349EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
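/*
 * Usage sketch (illustrative, not part of this patch): a frontend driver
 * typically supplies a device-type table and a xenbus_driver, then
 * registers it, e.g.:
 *
 *	static struct xenbus_device_id foofront_ids[] = {
 *		{ "vfoo" },
 *		{ "" }
 *	};
 *
 *	static struct xenbus_driver foofront = {
 *		.name             = "vfoo",
 *		.ids              = foofront_ids,
 *		.probe            = foofront_probe,
 *		.otherend_changed = foofront_backend_changed,
 *	};
 *
 *	err = xenbus_register_frontend(&foofront);
 *
 * "vfoo", foofront_probe() and foofront_backend_changed() are hypothetical
 * names; xenbus_register_frontend() is assumed to be the <xen/xenbus.h>
 * wrapper that passes THIS_MODULE and KBUILD_MODNAME to this function.
 */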
350
351void xenbus_unregister_driver(struct xenbus_driver *drv)
352{
353 driver_unregister(&drv->driver);
354}
355EXPORT_SYMBOL_GPL(xenbus_unregister_driver);
356
357struct xb_find_info
358{
359 struct xenbus_device *dev;
360 const char *nodename;
361};
362
363static int cmp_dev(struct device *dev, void *data)
364{
365 struct xenbus_device *xendev = to_xenbus_device(dev);
366 struct xb_find_info *info = data;
367
368 if (!strcmp(xendev->nodename, info->nodename)) {
369 info->dev = xendev;
370 get_device(dev);
371 return 1;
372 }
373 return 0;
374}
375
376struct xenbus_device *xenbus_device_find(const char *nodename,
377 struct bus_type *bus)
378{
379 struct xb_find_info info = { .dev = NULL, .nodename = nodename };
380
381 bus_for_each_dev(bus, NULL, &info, cmp_dev);
382 return info.dev;
383}
384
385static int cleanup_dev(struct device *dev, void *data)
386{
387 struct xenbus_device *xendev = to_xenbus_device(dev);
388 struct xb_find_info *info = data;
389 int len = strlen(info->nodename);
390
391 DPRINTK("%s", info->nodename);
392
393 /* Match the info->nodename path, or any subdirectory of that path. */
394 if (strncmp(xendev->nodename, info->nodename, len))
395 return 0;
396
397 /* If the node name is longer, ensure it really is a subdirectory. */
398 if ((strlen(xendev->nodename) > len) && (xendev->nodename[len] != '/'))
399 return 0;
400
401 info->dev = xendev;
402 get_device(dev);
403 return 1;
404}
405
406static void xenbus_cleanup_devices(const char *path, struct bus_type *bus)
407{
408 struct xb_find_info info = { .nodename = path };
409
410 do {
411 info.dev = NULL;
412 bus_for_each_dev(bus, NULL, &info, cleanup_dev);
413 if (info.dev) {
414 device_unregister(&info.dev->dev);
415 put_device(&info.dev->dev);
416 }
417 } while (info.dev);
418}
419
420static void xenbus_dev_release(struct device *dev)
421{
422 if (dev)
423 kfree(to_xenbus_device(dev));
424}
425
426static ssize_t xendev_show_nodename(struct device *dev,
427 struct device_attribute *attr, char *buf)
428{
429 return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
430}
431DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
432
433static ssize_t xendev_show_devtype(struct device *dev,
434 struct device_attribute *attr, char *buf)
435{
436 return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
437}
438DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
439
440
441int xenbus_probe_node(struct xen_bus_type *bus,
442 const char *type,
443 const char *nodename)
444{
445 int err;
446 struct xenbus_device *xendev;
447 size_t stringlen;
448 char *tmpstring;
449
450 enum xenbus_state state = xenbus_read_driver_state(nodename);
451
452 if (state != XenbusStateInitialising) {
453 /* Device is not new, so ignore it. This can happen if a
454 device is going away after switching to Closed. */
455 return 0;
456 }
457
458 stringlen = strlen(nodename) + 1 + strlen(type) + 1;
459 xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL);
460 if (!xendev)
461 return -ENOMEM;
462
463 xendev->state = XenbusStateInitialising;
464
465 /* Copy the strings into the extra space. */
466
467 tmpstring = (char *)(xendev + 1);
468 strcpy(tmpstring, nodename);
469 xendev->nodename = tmpstring;
470
471 tmpstring += strlen(tmpstring) + 1;
472 strcpy(tmpstring, type);
473 xendev->devicetype = tmpstring;
474 init_completion(&xendev->down);
475
476 xendev->dev.bus = &bus->bus;
477 xendev->dev.release = xenbus_dev_release;
478
479 err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
480 if (err)
481 goto fail;
482
483 /* Register with generic device framework. */
484 err = device_register(&xendev->dev);
485 if (err)
486 goto fail;
487
488 err = device_create_file(&xendev->dev, &dev_attr_nodename);
489 if (err)
490 goto fail_unregister;
491
492 err = device_create_file(&xendev->dev, &dev_attr_devtype);
493 if (err)
494 goto fail_remove_file;
495
496 return 0;
497fail_remove_file:
498 device_remove_file(&xendev->dev, &dev_attr_nodename);
499fail_unregister:
500 device_unregister(&xendev->dev);
501fail:
502 kfree(xendev);
503 return err;
504}
505
506/* device/<typename>/<name> */
507static int xenbus_probe_frontend(const char *type, const char *name)
508{
509 char *nodename;
510 int err;
511
512 nodename = kasprintf(GFP_KERNEL, "%s/%s/%s",
513 xenbus_frontend.root, type, name);
514 if (!nodename)
515 return -ENOMEM;
516
517 DPRINTK("%s", nodename);
518
519 err = xenbus_probe_node(&xenbus_frontend, type, nodename);
520 kfree(nodename);
521 return err;
522}
523
524static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
525{
526 int err = 0;
527 char **dir;
528 unsigned int dir_n = 0;
529 int i;
530
531 dir = xenbus_directory(XBT_NIL, bus->root, type, &dir_n);
532 if (IS_ERR(dir))
533 return PTR_ERR(dir);
534
535 for (i = 0; i < dir_n; i++) {
536 err = bus->probe(type, dir[i]);
537 if (err)
538 break;
539 }
540 kfree(dir);
541 return err;
542}
543
544int xenbus_probe_devices(struct xen_bus_type *bus)
545{
546 int err = 0;
547 char **dir;
548 unsigned int i, dir_n;
549
550 dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n);
551 if (IS_ERR(dir))
552 return PTR_ERR(dir);
553
554 for (i = 0; i < dir_n; i++) {
555 err = xenbus_probe_device_type(bus, dir[i]);
556 if (err)
557 break;
558 }
559 kfree(dir);
560 return err;
561}
562
563static unsigned int char_count(const char *str, char c)
564{
565 unsigned int i, ret = 0;
566
567 for (i = 0; str[i]; i++)
568 if (str[i] == c)
569 ret++;
570 return ret;
571}
572
573static int strsep_len(const char *str, char c, unsigned int len)
574{
575 unsigned int i;
576
577 for (i = 0; str[i]; i++)
578 if (str[i] == c) {
579 if (len == 0)
580 return i;
581 len--;
582 }
583 return (len == 0) ? i : -ERANGE;
584}
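/*
 * Illustrative note (not part of the original patch): for a node such as
 * "device/vif/0/state" with bus->levels == 2, strsep_len(node, '/', 2)
 * returns the offset of the third '/', so the device root computed in
 * xenbus_dev_changed() below is "device/vif/0".
 */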
585
586void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
587{
588 int exists, rootlen;
589 struct xenbus_device *dev;
590 char type[BUS_ID_SIZE];
591 const char *p, *root;
592
593 if (char_count(node, '/') < 2)
594 return;
595
596 exists = xenbus_exists(XBT_NIL, node, "");
597 if (!exists) {
598 xenbus_cleanup_devices(node, &bus->bus);
599 return;
600 }
601
602 /* backend/<type>/... or device/<type>/... */
603 p = strchr(node, '/') + 1;
604 snprintf(type, BUS_ID_SIZE, "%.*s", (int)strcspn(p, "/"), p);
605 type[BUS_ID_SIZE-1] = '\0';
606
607 rootlen = strsep_len(node, '/', bus->levels);
608 if (rootlen < 0)
609 return;
610 root = kasprintf(GFP_KERNEL, "%.*s", rootlen, node);
611 if (!root)
612 return;
613
614 dev = xenbus_device_find(root, &bus->bus);
615 if (!dev)
616 xenbus_probe_node(bus, type, root);
617 else
618 put_device(&dev->dev);
619
620 kfree(root);
621}
622
623static void frontend_changed(struct xenbus_watch *watch,
624 const char **vec, unsigned int len)
625{
626 DPRINTK("");
627
628 xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
629}
630
631/* We watch for devices appearing and vanishing. */
632static struct xenbus_watch fe_watch = {
633 .node = "device",
634 .callback = frontend_changed,
635};
636
637static int suspend_dev(struct device *dev, void *data)
638{
639 int err = 0;
640 struct xenbus_driver *drv;
641 struct xenbus_device *xdev;
642
643 DPRINTK("");
644
645 if (dev->driver == NULL)
646 return 0;
647 drv = to_xenbus_driver(dev->driver);
648 xdev = container_of(dev, struct xenbus_device, dev);
649 if (drv->suspend)
650 err = drv->suspend(xdev);
651 if (err)
652 printk(KERN_WARNING
653 "xenbus: suspend %s failed: %i\n", dev->bus_id, err);
654 return 0;
655}
656
657static int suspend_cancel_dev(struct device *dev, void *data)
658{
659 int err = 0;
660 struct xenbus_driver *drv;
661 struct xenbus_device *xdev;
662
663 DPRINTK("");
664
665 if (dev->driver == NULL)
666 return 0;
667 drv = to_xenbus_driver(dev->driver);
668 xdev = container_of(dev, struct xenbus_device, dev);
669 if (drv->suspend_cancel)
670 err = drv->suspend_cancel(xdev);
671 if (err)
672 printk(KERN_WARNING
673 "xenbus: suspend_cancel %s failed: %i\n",
674 dev->bus_id, err);
675 return 0;
676}
677
678static int resume_dev(struct device *dev, void *data)
679{
680 int err;
681 struct xenbus_driver *drv;
682 struct xenbus_device *xdev;
683
684 DPRINTK("");
685
686 if (dev->driver == NULL)
687 return 0;
688
689 drv = to_xenbus_driver(dev->driver);
690 xdev = container_of(dev, struct xenbus_device, dev);
691
692 err = talk_to_otherend(xdev);
693 if (err) {
694 printk(KERN_WARNING
695 "xenbus: resume (talk_to_otherend) %s failed: %i\n",
696 dev->bus_id, err);
697 return err;
698 }
699
700 xdev->state = XenbusStateInitialising;
701
702 if (drv->resume) {
703 err = drv->resume(xdev);
704 if (err) {
705 printk(KERN_WARNING
706 "xenbus: resume %s failed: %i\n",
707 dev->bus_id, err);
708 return err;
709 }
710 }
711
712 err = watch_otherend(xdev);
713 if (err) {
714 printk(KERN_WARNING
715 "xenbus_probe: resume (watch_otherend) %s failed: "
716 "%d.\n", dev->bus_id, err);
717 return err;
718 }
719
720 return 0;
721}
722
723void xenbus_suspend(void)
724{
725 DPRINTK("");
726
727 bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
728 xenbus_backend_suspend(suspend_dev);
729 xs_suspend();
730}
731EXPORT_SYMBOL_GPL(xenbus_suspend);
732
733void xenbus_resume(void)
734{
735 xb_init_comms();
736 xs_resume();
737 bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
738 xenbus_backend_resume(resume_dev);
739}
740EXPORT_SYMBOL_GPL(xenbus_resume);
741
742void xenbus_suspend_cancel(void)
743{
744 xs_suspend_cancel();
745 bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev);
746 xenbus_backend_resume(suspend_cancel_dev);
747}
748EXPORT_SYMBOL_GPL(xenbus_suspend_cancel);
749
750/* A flag to determine if xenstored is 'ready' (i.e. has started) */
751int xenstored_ready = 0;
752
753
754int register_xenstore_notifier(struct notifier_block *nb)
755{
756 int ret = 0;
757
758 if (xenstored_ready > 0)
759 ret = nb->notifier_call(nb, 0, NULL);
760 else
761 blocking_notifier_chain_register(&xenstore_chain, nb);
762
763 return ret;
764}
765EXPORT_SYMBOL_GPL(register_xenstore_notifier);
766
767void unregister_xenstore_notifier(struct notifier_block *nb)
768{
769 blocking_notifier_chain_unregister(&xenstore_chain, nb);
770}
771EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
772
773void xenbus_probe(struct work_struct *unused)
774{
775 BUG_ON((xenstored_ready <= 0));
776
777 /* Enumerate devices in xenstore and watch for changes. */
778 xenbus_probe_devices(&xenbus_frontend);
779 register_xenbus_watch(&fe_watch);
780 xenbus_backend_probe_and_watch();
781
782 /* Notify others that xenstore is up */
783 blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
784}
785
786static int __init xenbus_probe_init(void)
787{
788 int err = 0;
789
790 DPRINTK("");
791
792 err = -ENODEV;
793 if (!is_running_on_xen())
794 goto out_error;
795
796 /* Register ourselves with the kernel bus subsystem */
797 err = bus_register(&xenbus_frontend.bus);
798 if (err)
799 goto out_error;
800
801 err = xenbus_backend_bus_register();
802 if (err)
803 goto out_unreg_front;
804
805 /*
806 * Domain0 doesn't have a store_evtchn or store_mfn yet.
807 */
808 if (is_initial_xendomain()) {
809 /* dom0 not yet supported */
810 } else {
811 xenstored_ready = 1;
812 xen_store_evtchn = xen_start_info->store_evtchn;
813 xen_store_mfn = xen_start_info->store_mfn;
814 }
815 xen_store_interface = mfn_to_virt(xen_store_mfn);
816
817 /* Initialize the interface to xenstore. */
818 err = xs_init();
819 if (err) {
820 printk(KERN_WARNING
821 "XENBUS: Error initializing xenstore comms: %i\n", err);
822 goto out_unreg_back;
823 }
824
825 if (!is_initial_xendomain())
826 xenbus_probe(NULL);
827
828 return 0;
829
830 out_unreg_back:
831 xenbus_backend_bus_unregister();
832
833 out_unreg_front:
834 bus_unregister(&xenbus_frontend.bus);
835
836 out_error:
837 return err;
838}
839
840postcore_initcall(xenbus_probe_init);
841
842MODULE_LICENSE("GPL");
843
844static int is_disconnected_device(struct device *dev, void *data)
845{
846 struct xenbus_device *xendev = to_xenbus_device(dev);
847 struct device_driver *drv = data;
848
849 /*
850 * A device with no driver will never connect. We care only about
851 * devices which should currently be in the process of connecting.
852 */
853 if (!dev->driver)
854 return 0;
855
856 /* Is this search limited to a particular driver? */
857 if (drv && (dev->driver != drv))
858 return 0;
859
860 return (xendev->state != XenbusStateConnected);
861}
862
863static int exists_disconnected_device(struct device_driver *drv)
864{
865 return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
866 is_disconnected_device);
867}
868
869static int print_device_status(struct device *dev, void *data)
870{
871 struct xenbus_device *xendev = to_xenbus_device(dev);
872 struct device_driver *drv = data;
873
874 /* Is this operation limited to a particular driver? */
875 if (drv && (dev->driver != drv))
876 return 0;
877
878 if (!dev->driver) {
879 /* Information only: is this too noisy? */
880 printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
881 xendev->nodename);
882 } else if (xendev->state != XenbusStateConnected) {
883 printk(KERN_WARNING "XENBUS: Timeout connecting "
884 "to device: %s (state %d)\n",
885 xendev->nodename, xendev->state);
886 }
887
888 return 0;
889}
890
891/* We only wait for device setup after most initcalls have run. */
892static int ready_to_wait_for_devices;
893
894/*
 895 * Wait, with a 10 second timeout, for all devices currently configured. We
 896 * need to do this to guarantee that the filesystems and/or network devices
 897 * needed for boot are available before we can allow the boot to proceed.
898 *
899 * This needs to be on a late_initcall, to happen after the frontend device
900 * drivers have been initialised, but before the root fs is mounted.
901 *
902 * A possible improvement here would be to have the tools add a per-device
903 * flag to the store entry, indicating whether it is needed at boot time.
904 * This would allow people who knew what they were doing to accelerate their
905 * boot slightly, but of course needs tools or manual intervention to set up
906 * those flags correctly.
907 */
908static void wait_for_devices(struct xenbus_driver *xendrv)
909{
910 unsigned long timeout = jiffies + 10*HZ;
911 struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
912
913 if (!ready_to_wait_for_devices || !is_running_on_xen())
914 return;
915
916 while (exists_disconnected_device(drv)) {
917 if (time_after(jiffies, timeout))
918 break;
919 schedule_timeout_interruptible(HZ/10);
920 }
921
922 bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
923 print_device_status);
924}
925
926#ifndef MODULE
927static int __init boot_wait_for_devices(void)
928{
929 ready_to_wait_for_devices = 1;
930 wait_for_devices(NULL);
931 return 0;
932}
933
934late_initcall(boot_wait_for_devices);
935#endif
diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h
new file mode 100644
index 000000000000..e09b19415a40
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe.h
@@ -0,0 +1,74 @@
1/******************************************************************************
2 * xenbus_probe.h
3 *
4 * Talks to Xen Store to figure out what devices we have.
5 *
6 * Copyright (C) 2005 Rusty Russell, IBM Corporation
7 * Copyright (C) 2005 XenSource Ltd.
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation; or, when distributed
12 * separately from the Linux kernel or incorporated into other
13 * software packages, subject to the following license:
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining a copy
16 * of this source file (the "Software"), to deal in the Software without
17 * restriction, including without limitation the rights to use, copy, modify,
18 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
19 * and to permit persons to whom the Software is furnished to do so, subject to
20 * the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 * IN THE SOFTWARE.
32 */
33
34#ifndef _XENBUS_PROBE_H
35#define _XENBUS_PROBE_H
36
37#ifdef CONFIG_XEN_BACKEND
38extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
39extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
40extern void xenbus_backend_probe_and_watch(void);
41extern int xenbus_backend_bus_register(void);
42extern void xenbus_backend_bus_unregister(void);
43#else
44static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
45static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
46static inline void xenbus_backend_probe_and_watch(void) {}
47static inline int xenbus_backend_bus_register(void) { return 0; }
48static inline void xenbus_backend_bus_unregister(void) {}
49#endif
50
51struct xen_bus_type
52{
53 char *root;
54 unsigned int levels;
55 int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename);
56 int (*probe)(const char *type, const char *dir);
57 struct bus_type bus;
58};
59
60extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
61extern int xenbus_dev_probe(struct device *_dev);
62extern int xenbus_dev_remove(struct device *_dev);
63extern int xenbus_register_driver_common(struct xenbus_driver *drv,
64 struct xen_bus_type *bus,
65 struct module *owner,
66 const char *mod_name);
67extern int xenbus_probe_node(struct xen_bus_type *bus,
68 const char *type,
69 const char *nodename);
70extern int xenbus_probe_devices(struct xen_bus_type *bus);
71
72extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);
73
74#endif
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
new file mode 100644
index 000000000000..9e943fbce81b
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -0,0 +1,861 @@
1/******************************************************************************
2 * xenbus_xs.c
3 *
4 * This is the kernel equivalent of the "xs" library. We don't need everything
5 * and we use xenbus_comms for communication.
6 *
7 * Copyright (C) 2005 Rusty Russell, IBM Corporation
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation; or, when distributed
12 * separately from the Linux kernel or incorporated into other
13 * software packages, subject to the following license:
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining a copy
16 * of this source file (the "Software"), to deal in the Software without
17 * restriction, including without limitation the rights to use, copy, modify,
18 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
19 * and to permit persons to whom the Software is furnished to do so, subject to
20 * the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 * IN THE SOFTWARE.
32 */
33
34#include <linux/unistd.h>
35#include <linux/errno.h>
36#include <linux/types.h>
37#include <linux/uio.h>
38#include <linux/kernel.h>
39#include <linux/string.h>
40#include <linux/err.h>
41#include <linux/slab.h>
42#include <linux/fcntl.h>
43#include <linux/kthread.h>
44#include <linux/rwsem.h>
45#include <linux/module.h>
46#include <linux/mutex.h>
47#include <xen/xenbus.h>
48#include "xenbus_comms.h"
49
50struct xs_stored_msg {
51 struct list_head list;
52
53 struct xsd_sockmsg hdr;
54
55 union {
56 /* Queued replies. */
57 struct {
58 char *body;
59 } reply;
60
61 /* Queued watch events. */
62 struct {
63 struct xenbus_watch *handle;
64 char **vec;
65 unsigned int vec_size;
66 } watch;
67 } u;
68};
69
70struct xs_handle {
71 /* A list of replies. Currently only one will ever be outstanding. */
72 struct list_head reply_list;
73 spinlock_t reply_lock;
74 wait_queue_head_t reply_waitq;
75
76 /*
77 * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex.
78 * response_mutex is never taken simultaneously with the other three.
79 */
80
81 /* One request at a time. */
82 struct mutex request_mutex;
83
84 /* Protect xenbus reader thread against save/restore. */
85 struct mutex response_mutex;
86
87 /* Protect transactions against save/restore. */
88 struct rw_semaphore transaction_mutex;
89
90 /* Protect watch (de)register against save/restore. */
91 struct rw_semaphore watch_mutex;
92};
93
94static struct xs_handle xs_state;
95
96/* List of registered watches, and a lock to protect it. */
97static LIST_HEAD(watches);
98static DEFINE_SPINLOCK(watches_lock);
99
100/* List of pending watch callback events, and a lock to protect it. */
101static LIST_HEAD(watch_events);
102static DEFINE_SPINLOCK(watch_events_lock);
103
104/*
105 * Details of the xenwatch callback kernel thread. The thread waits on the
106 * watch_events_waitq for work to do (queued on watch_events list). When it
107 * wakes up it acquires the xenwatch_mutex before reading the list and
108 * carrying out work.
109 */
110static pid_t xenwatch_pid;
111static DEFINE_MUTEX(xenwatch_mutex);
112static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
113
114static int get_error(const char *errorstring)
115{
116 unsigned int i;
117
118 for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) {
119 if (i == ARRAY_SIZE(xsd_errors) - 1) {
120 printk(KERN_WARNING
121 "XENBUS xen store gave: unknown error %s",
122 errorstring);
123 return EINVAL;
124 }
125 }
126 return xsd_errors[i].errnum;
127}
128
129static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
130{
131 struct xs_stored_msg *msg;
132 char *body;
133
134 spin_lock(&xs_state.reply_lock);
135
136 while (list_empty(&xs_state.reply_list)) {
137 spin_unlock(&xs_state.reply_lock);
138 /* XXX FIXME: Avoid synchronous wait for response here. */
139 wait_event(xs_state.reply_waitq,
140 !list_empty(&xs_state.reply_list));
141 spin_lock(&xs_state.reply_lock);
142 }
143
144 msg = list_entry(xs_state.reply_list.next,
145 struct xs_stored_msg, list);
146 list_del(&msg->list);
147
148 spin_unlock(&xs_state.reply_lock);
149
150 *type = msg->hdr.type;
151 if (len)
152 *len = msg->hdr.len;
153 body = msg->u.reply.body;
154
155 kfree(msg);
156
157 return body;
158}
159
160void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
161{
162 void *ret;
163 struct xsd_sockmsg req_msg = *msg;
164 int err;
165
166 if (req_msg.type == XS_TRANSACTION_START)
167 down_read(&xs_state.transaction_mutex);
168
169 mutex_lock(&xs_state.request_mutex);
170
171 err = xb_write(msg, sizeof(*msg) + msg->len);
172 if (err) {
173 msg->type = XS_ERROR;
174 ret = ERR_PTR(err);
175 } else
176 ret = read_reply(&msg->type, &msg->len);
177
178 mutex_unlock(&xs_state.request_mutex);
179
180 if ((msg->type == XS_TRANSACTION_END) ||
181 ((req_msg.type == XS_TRANSACTION_START) &&
182 (msg->type == XS_ERROR)))
183 up_read(&xs_state.transaction_mutex);
184
185 return ret;
186}
187
188/* Send message to xs, get kmalloc'ed reply. ERR_PTR() on error. */
189static void *xs_talkv(struct xenbus_transaction t,
190 enum xsd_sockmsg_type type,
191 const struct kvec *iovec,
192 unsigned int num_vecs,
193 unsigned int *len)
194{
195 struct xsd_sockmsg msg;
196 void *ret = NULL;
197 unsigned int i;
198 int err;
199
200 msg.tx_id = t.id;
201 msg.req_id = 0;
202 msg.type = type;
203 msg.len = 0;
204 for (i = 0; i < num_vecs; i++)
205 msg.len += iovec[i].iov_len;
206
207 mutex_lock(&xs_state.request_mutex);
208
209 err = xb_write(&msg, sizeof(msg));
210 if (err) {
211 mutex_unlock(&xs_state.request_mutex);
212 return ERR_PTR(err);
213 }
214
215 for (i = 0; i < num_vecs; i++) {
216 err = xb_write(iovec[i].iov_base, iovec[i].iov_len);
217 if (err) {
218 mutex_unlock(&xs_state.request_mutex);
219 return ERR_PTR(err);
220 }
221 }
222
223 ret = read_reply(&msg.type, len);
224
225 mutex_unlock(&xs_state.request_mutex);
226
227 if (IS_ERR(ret))
228 return ret;
229
230 if (msg.type == XS_ERROR) {
231 err = get_error(ret);
232 kfree(ret);
233 return ERR_PTR(-err);
234 }
235
236 if (msg.type != type) {
237 if (printk_ratelimit())
238 printk(KERN_WARNING
239 "XENBUS unexpected type [%d], expected [%d]\n",
240 msg.type, type);
241 kfree(ret);
242 return ERR_PTR(-EINVAL);
243 }
244 return ret;
245}
246
247/* Simplified version of xs_talkv: single message. */
248static void *xs_single(struct xenbus_transaction t,
249 enum xsd_sockmsg_type type,
250 const char *string,
251 unsigned int *len)
252{
253 struct kvec iovec;
254
255 iovec.iov_base = (void *)string;
256 iovec.iov_len = strlen(string) + 1;
257 return xs_talkv(t, type, &iovec, 1, len);
258}
259
260/* Many commands only need an ack, don't care what it says. */
261static int xs_error(char *reply)
262{
263 if (IS_ERR(reply))
264 return PTR_ERR(reply);
265 kfree(reply);
266 return 0;
267}
268
269static unsigned int count_strings(const char *strings, unsigned int len)
270{
271 unsigned int num;
272 const char *p;
273
274 for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1)
275 num++;
276
277 return num;
278}
279
280/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */
281static char *join(const char *dir, const char *name)
282{
283 char *buffer;
284
285 if (strlen(name) == 0)
286 buffer = kasprintf(GFP_KERNEL, "%s", dir);
287 else
288 buffer = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
289 return (!buffer) ? ERR_PTR(-ENOMEM) : buffer;
290}
291
292static char **split(char *strings, unsigned int len, unsigned int *num)
293{
294 char *p, **ret;
295
296 /* Count the strings. */
297 *num = count_strings(strings, len);
298
299 /* Transfer to one big alloc for easy freeing. */
300 ret = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL);
301 if (!ret) {
302 kfree(strings);
303 return ERR_PTR(-ENOMEM);
304 }
305 memcpy(&ret[*num], strings, len);
306 kfree(strings);
307
308 strings = (char *)&ret[*num];
309 for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
310 ret[(*num)++] = p;
311
312 return ret;
313}
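/*
 * Descriptive note (added for clarity): split() hands back one allocation
 * laid out as [ptr 0 .. ptr *num-1][string data]; each pointer indexes into
 * the copied string data, so callers such as xenbus_directory() free the
 * pointers and the strings together with a single kfree().
 */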
314
315char **xenbus_directory(struct xenbus_transaction t,
316 const char *dir, const char *node, unsigned int *num)
317{
318 char *strings, *path;
319 unsigned int len;
320
321 path = join(dir, node);
322 if (IS_ERR(path))
323 return (char **)path;
324
325 strings = xs_single(t, XS_DIRECTORY, path, &len);
326 kfree(path);
327 if (IS_ERR(strings))
328 return (char **)strings;
329
330 return split(strings, len, num);
331}
332EXPORT_SYMBOL_GPL(xenbus_directory);
333
334/* Check if a path exists. Return 1 if it does. */
335int xenbus_exists(struct xenbus_transaction t,
336 const char *dir, const char *node)
337{
338 char **d;
339 int dir_n;
340
341 d = xenbus_directory(t, dir, node, &dir_n);
342 if (IS_ERR(d))
343 return 0;
344 kfree(d);
345 return 1;
346}
347EXPORT_SYMBOL_GPL(xenbus_exists);
348
349/* Get the value of a single file.
 350 * Returns a kmalloced value: call kfree() on it after use.
351 * len indicates length in bytes.
352 */
353void *xenbus_read(struct xenbus_transaction t,
354 const char *dir, const char *node, unsigned int *len)
355{
356 char *path;
357 void *ret;
358
359 path = join(dir, node);
360 if (IS_ERR(path))
361 return (void *)path;
362
363 ret = xs_single(t, XS_READ, path, len);
364 kfree(path);
365 return ret;
366}
367EXPORT_SYMBOL_GPL(xenbus_read);
368
369/* Write the value of a single file.
370 * Returns -err on failure.
371 */
372int xenbus_write(struct xenbus_transaction t,
373 const char *dir, const char *node, const char *string)
374{
375 const char *path;
376 struct kvec iovec[2];
377 int ret;
378
379 path = join(dir, node);
380 if (IS_ERR(path))
381 return PTR_ERR(path);
382
383 iovec[0].iov_base = (void *)path;
384 iovec[0].iov_len = strlen(path) + 1;
385 iovec[1].iov_base = (void *)string;
386 iovec[1].iov_len = strlen(string);
387
388 ret = xs_error(xs_talkv(t, XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL));
389 kfree(path);
390 return ret;
391}
392EXPORT_SYMBOL_GPL(xenbus_write);
393
394/* Create a new directory. */
395int xenbus_mkdir(struct xenbus_transaction t,
396 const char *dir, const char *node)
397{
398 char *path;
399 int ret;
400
401 path = join(dir, node);
402 if (IS_ERR(path))
403 return PTR_ERR(path);
404
405 ret = xs_error(xs_single(t, XS_MKDIR, path, NULL));
406 kfree(path);
407 return ret;
408}
409EXPORT_SYMBOL_GPL(xenbus_mkdir);
410
411/* Destroy a file or directory (directories must be empty). */
412int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node)
413{
414 char *path;
415 int ret;
416
417 path = join(dir, node);
418 if (IS_ERR(path))
419 return PTR_ERR(path);
420
421 ret = xs_error(xs_single(t, XS_RM, path, NULL));
422 kfree(path);
423 return ret;
424}
425EXPORT_SYMBOL_GPL(xenbus_rm);
426
427/* Start a transaction: changes by others will not be seen during this
428 * transaction, and changes will not be visible to others until end.
429 */
430int xenbus_transaction_start(struct xenbus_transaction *t)
431{
432 char *id_str;
433
434 down_read(&xs_state.transaction_mutex);
435
436 id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
437 if (IS_ERR(id_str)) {
438 up_read(&xs_state.transaction_mutex);
439 return PTR_ERR(id_str);
440 }
441
442 t->id = simple_strtoul(id_str, NULL, 0);
443 kfree(id_str);
444 return 0;
445}
446EXPORT_SYMBOL_GPL(xenbus_transaction_start);
447
448/* End a transaction.
 449 * If abort is true, the transaction is discarded instead of committed.
450 */
451int xenbus_transaction_end(struct xenbus_transaction t, int abort)
452{
453 char abortstr[2];
454 int err;
455
456 if (abort)
457 strcpy(abortstr, "F");
458 else
459 strcpy(abortstr, "T");
460
461 err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
462
463 up_read(&xs_state.transaction_mutex);
464
465 return err;
466}
467EXPORT_SYMBOL_GPL(xenbus_transaction_end);
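/*
 * Typical transaction pattern (illustrative sketch, not part of this file);
 * xenstored reports a conflicting transaction as EAGAIN, so callers retry:
 *
 *	struct xenbus_transaction xbt;
 *	int err;
 * again:
 *	err = xenbus_transaction_start(&xbt);
 *	if (err)
 *		return err;
 *	err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u", ref);
 *	if (err) {
 *		xenbus_transaction_end(xbt, 1);
 *		return err;
 *	}
 *	err = xenbus_transaction_end(xbt, 0);
 *	if (err == -EAGAIN)
 *		goto again;
 *
 * "ring-ref" and ref are placeholders for whatever the driver writes.
 */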
468
469/* Single read and scanf: returns -errno or num scanned. */
470int xenbus_scanf(struct xenbus_transaction t,
471 const char *dir, const char *node, const char *fmt, ...)
472{
473 va_list ap;
474 int ret;
475 char *val;
476
477 val = xenbus_read(t, dir, node, NULL);
478 if (IS_ERR(val))
479 return PTR_ERR(val);
480
481 va_start(ap, fmt);
482 ret = vsscanf(val, fmt, ap);
483 va_end(ap);
484 kfree(val);
485 /* Distinctive errno. */
486 if (ret == 0)
487 return -ERANGE;
488 return ret;
489}
490EXPORT_SYMBOL_GPL(xenbus_scanf);
491
492/* Single printf and write: returns -errno or 0. */
493int xenbus_printf(struct xenbus_transaction t,
494 const char *dir, const char *node, const char *fmt, ...)
495{
496 va_list ap;
497 int ret;
498#define PRINTF_BUFFER_SIZE 4096
499 char *printf_buffer;
500
501 printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
502 if (printf_buffer == NULL)
503 return -ENOMEM;
504
505 va_start(ap, fmt);
506 ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap);
507 va_end(ap);
508
509 BUG_ON(ret > PRINTF_BUFFER_SIZE-1);
510 ret = xenbus_write(t, dir, node, printf_buffer);
511
512 kfree(printf_buffer);
513
514 return ret;
515}
516EXPORT_SYMBOL_GPL(xenbus_printf);
517
518/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */
519int xenbus_gather(struct xenbus_transaction t, const char *dir, ...)
520{
521 va_list ap;
522 const char *name;
523 int ret = 0;
524
525 va_start(ap, dir);
526 while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
527 const char *fmt = va_arg(ap, char *);
528 void *result = va_arg(ap, void *);
529 char *p;
530
531 p = xenbus_read(t, dir, name, NULL);
532 if (IS_ERR(p)) {
533 ret = PTR_ERR(p);
534 break;
535 }
536 if (fmt) {
537 if (sscanf(p, fmt, result) == 0)
538 ret = -EINVAL;
539 kfree(p);
540 } else
541 *(char **)result = p;
542 }
543 va_end(ap);
544 return ret;
545}
546EXPORT_SYMBOL_GPL(xenbus_gather);
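/*
 * Usage example (taken from read_otherend_details() in xenbus_probe.c in
 * this same patch): two values are gathered in one call, and a NULL format
 * returns the raw kmalloc'ed string:
 *
 *	err = xenbus_gather(XBT_NIL, xendev->nodename,
 *			    "backend-id", "%i", &xendev->otherend_id,
 *			    "backend", NULL, &xendev->otherend,
 *			    NULL);
 */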
547
548static int xs_watch(const char *path, const char *token)
549{
550 struct kvec iov[2];
551
552 iov[0].iov_base = (void *)path;
553 iov[0].iov_len = strlen(path) + 1;
554 iov[1].iov_base = (void *)token;
555 iov[1].iov_len = strlen(token) + 1;
556
557 return xs_error(xs_talkv(XBT_NIL, XS_WATCH, iov,
558 ARRAY_SIZE(iov), NULL));
559}
560
561static int xs_unwatch(const char *path, const char *token)
562{
563 struct kvec iov[2];
564
565 iov[0].iov_base = (char *)path;
566 iov[0].iov_len = strlen(path) + 1;
567 iov[1].iov_base = (char *)token;
568 iov[1].iov_len = strlen(token) + 1;
569
570 return xs_error(xs_talkv(XBT_NIL, XS_UNWATCH, iov,
571 ARRAY_SIZE(iov), NULL));
572}
573
574static struct xenbus_watch *find_watch(const char *token)
575{
576 struct xenbus_watch *i, *cmp;
577
578 cmp = (void *)simple_strtoul(token, NULL, 16);
579
580 list_for_each_entry(i, &watches, list)
581 if (i == cmp)
582 return i;
583
584 return NULL;
585}
586
587/* Register callback to watch this node. */
588int register_xenbus_watch(struct xenbus_watch *watch)
589{
590 /* Pointer in ascii is the token. */
591 char token[sizeof(watch) * 2 + 1];
592 int err;
593
594 sprintf(token, "%lX", (long)watch);
595
596 down_read(&xs_state.watch_mutex);
597
598 spin_lock(&watches_lock);
599 BUG_ON(find_watch(token));
600 list_add(&watch->list, &watches);
601 spin_unlock(&watches_lock);
602
603 err = xs_watch(watch->node, token);
604
605 /* Ignore errors due to multiple registration. */
606 if ((err != 0) && (err != -EEXIST)) {
607 spin_lock(&watches_lock);
608 list_del(&watch->list);
609 spin_unlock(&watches_lock);
610 }
611
612 up_read(&xs_state.watch_mutex);
613
614 return err;
615}
616EXPORT_SYMBOL_GPL(register_xenbus_watch);
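/*
 * Illustrative example (from xenbus_probe.c in this same patch): a watch is
 * just a node plus a callback,
 *
 *	static struct xenbus_watch fe_watch = {
 *		.node     = "device",
 *		.callback = frontend_changed,
 *	};
 *	register_xenbus_watch(&fe_watch);
 *
 * The watch pointer, printed in hex, serves as the xenstore token so that
 * incoming events can be mapped back to their handler by find_watch().
 */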
617
618void unregister_xenbus_watch(struct xenbus_watch *watch)
619{
620 struct xs_stored_msg *msg, *tmp;
621 char token[sizeof(watch) * 2 + 1];
622 int err;
623
624 sprintf(token, "%lX", (long)watch);
625
626 down_read(&xs_state.watch_mutex);
627
628 spin_lock(&watches_lock);
629 BUG_ON(!find_watch(token));
630 list_del(&watch->list);
631 spin_unlock(&watches_lock);
632
633 err = xs_unwatch(watch->node, token);
634 if (err)
635 printk(KERN_WARNING
636 "XENBUS Failed to release watch %s: %i\n",
637 watch->node, err);
638
639 up_read(&xs_state.watch_mutex);
640
641 /* Make sure there are no callbacks running currently (unless
 642	   it's us) */
643 if (current->pid != xenwatch_pid)
644 mutex_lock(&xenwatch_mutex);
645
646 /* Cancel pending watch events. */
647 spin_lock(&watch_events_lock);
648 list_for_each_entry_safe(msg, tmp, &watch_events, list) {
649 if (msg->u.watch.handle != watch)
650 continue;
651 list_del(&msg->list);
652 kfree(msg->u.watch.vec);
653 kfree(msg);
654 }
655 spin_unlock(&watch_events_lock);
656
657 if (current->pid != xenwatch_pid)
658 mutex_unlock(&xenwatch_mutex);
659}
660EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
661
662void xs_suspend(void)
663{
664 down_write(&xs_state.transaction_mutex);
665 down_write(&xs_state.watch_mutex);
666 mutex_lock(&xs_state.request_mutex);
667 mutex_lock(&xs_state.response_mutex);
668}
669
670void xs_resume(void)
671{
672 struct xenbus_watch *watch;
673 char token[sizeof(watch) * 2 + 1];
674
675 mutex_unlock(&xs_state.response_mutex);
676 mutex_unlock(&xs_state.request_mutex);
677 up_write(&xs_state.transaction_mutex);
678
679 /* No need for watches_lock: the watch_mutex is sufficient. */
680 list_for_each_entry(watch, &watches, list) {
681 sprintf(token, "%lX", (long)watch);
682 xs_watch(watch->node, token);
683 }
684
685 up_write(&xs_state.watch_mutex);
686}
687
688void xs_suspend_cancel(void)
689{
690 mutex_unlock(&xs_state.response_mutex);
691 mutex_unlock(&xs_state.request_mutex);
692 up_write(&xs_state.watch_mutex);
693 up_write(&xs_state.transaction_mutex);
694}
695
696static int xenwatch_thread(void *unused)
697{
698 struct list_head *ent;
699 struct xs_stored_msg *msg;
700
701 for (;;) {
702 wait_event_interruptible(watch_events_waitq,
703 !list_empty(&watch_events));
704
705 if (kthread_should_stop())
706 break;
707
708 mutex_lock(&xenwatch_mutex);
709
710 spin_lock(&watch_events_lock);
711 ent = watch_events.next;
712 if (ent != &watch_events)
713 list_del(ent);
714 spin_unlock(&watch_events_lock);
715
716 if (ent != &watch_events) {
717 msg = list_entry(ent, struct xs_stored_msg, list);
718 msg->u.watch.handle->callback(
719 msg->u.watch.handle,
720 (const char **)msg->u.watch.vec,
721 msg->u.watch.vec_size);
722 kfree(msg->u.watch.vec);
723 kfree(msg);
724 }
725
726 mutex_unlock(&xenwatch_mutex);
727 }
728
729 return 0;
730}
731
732static int process_msg(void)
733{
734 struct xs_stored_msg *msg;
735 char *body;
736 int err;
737
738 /*
739 * We must disallow save/restore while reading a xenstore message.
740 * A partial read across s/r leaves us out of sync with xenstored.
741 */
742 for (;;) {
743 err = xb_wait_for_data_to_read();
744 if (err)
745 return err;
746 mutex_lock(&xs_state.response_mutex);
747 if (xb_data_to_read())
748 break;
749 /* We raced with save/restore: pending data 'disappeared'. */
750 mutex_unlock(&xs_state.response_mutex);
751 }
752
753
754 msg = kmalloc(sizeof(*msg), GFP_KERNEL);
755 if (msg == NULL) {
756 err = -ENOMEM;
757 goto out;
758 }
759
760 err = xb_read(&msg->hdr, sizeof(msg->hdr));
761 if (err) {
762 kfree(msg);
763 goto out;
764 }
765
766 body = kmalloc(msg->hdr.len + 1, GFP_KERNEL);
767 if (body == NULL) {
768 kfree(msg);
769 err = -ENOMEM;
770 goto out;
771 }
772
773 err = xb_read(body, msg->hdr.len);
774 if (err) {
775 kfree(body);
776 kfree(msg);
777 goto out;
778 }
779 body[msg->hdr.len] = '\0';
780
781 if (msg->hdr.type == XS_WATCH_EVENT) {
782 msg->u.watch.vec = split(body, msg->hdr.len,
783 &msg->u.watch.vec_size);
784 if (IS_ERR(msg->u.watch.vec)) {
785 kfree(msg);
786 err = PTR_ERR(msg->u.watch.vec);
787 goto out;
788 }
789
790 spin_lock(&watches_lock);
791 msg->u.watch.handle = find_watch(
792 msg->u.watch.vec[XS_WATCH_TOKEN]);
793 if (msg->u.watch.handle != NULL) {
794 spin_lock(&watch_events_lock);
795 list_add_tail(&msg->list, &watch_events);
796 wake_up(&watch_events_waitq);
797 spin_unlock(&watch_events_lock);
798 } else {
799 kfree(msg->u.watch.vec);
800 kfree(msg);
801 }
802 spin_unlock(&watches_lock);
803 } else {
804 msg->u.reply.body = body;
805 spin_lock(&xs_state.reply_lock);
806 list_add_tail(&msg->list, &xs_state.reply_list);
807 spin_unlock(&xs_state.reply_lock);
808 wake_up(&xs_state.reply_waitq);
809 }
810
811 out:
812 mutex_unlock(&xs_state.response_mutex);
813 return err;
814}
815
816static int xenbus_thread(void *unused)
817{
818 int err;
819
820 for (;;) {
821 err = process_msg();
822 if (err)
823 printk(KERN_WARNING "XENBUS error %d while reading "
824 "message\n", err);
825 if (kthread_should_stop())
826 break;
827 }
828
829 return 0;
830}
831
832int xs_init(void)
833{
834 int err;
835 struct task_struct *task;
836
837 INIT_LIST_HEAD(&xs_state.reply_list);
838 spin_lock_init(&xs_state.reply_lock);
839 init_waitqueue_head(&xs_state.reply_waitq);
840
841 mutex_init(&xs_state.request_mutex);
842 mutex_init(&xs_state.response_mutex);
843 init_rwsem(&xs_state.transaction_mutex);
844 init_rwsem(&xs_state.watch_mutex);
845
846 /* Initialize the shared memory rings to talk to xenstored */
847 err = xb_init_comms();
848 if (err)
849 return err;
850
851 task = kthread_run(xenwatch_thread, NULL, "xenwatch");
852 if (IS_ERR(task))
853 return PTR_ERR(task);
854 xenwatch_pid = task->pid;
855
856 task = kthread_run(xenbus_thread, NULL, "xenbus");
857 if (IS_ERR(task))
858 return PTR_ERR(task);
859
860 return 0;
861}
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 352eb4a13f98..c4c36171240d 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -209,7 +209,7 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
 	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
 	envp[2] = NULL;
 
-	ret = call_usermodehelper(argv[0], argv, envp, 1);
+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
 	if (ret < 0)
 		mlog_errno(ret);
 }
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h
index 9e15ce0006eb..36f310632c49 100644
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -41,6 +41,7 @@ extern int irqbalance_disable(char *str);
 extern void fixup_irqs(cpumask_t map);
 #endif
 
+unsigned int do_IRQ(struct pt_regs *regs);
 void init_IRQ(void);
 void __init native_init_IRQ(void);
 
diff --git a/include/asm-i386/mach-default/irq_vectors_limits.h b/include/asm-i386/mach-default/irq_vectors_limits.h
index 7f161e760be6..a90c7a60109f 100644
--- a/include/asm-i386/mach-default/irq_vectors_limits.h
+++ b/include/asm-i386/mach-default/irq_vectors_limits.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_IRQ_VECTORS_LIMITS_H
 #define _ASM_IRQ_VECTORS_LIMITS_H
 
-#ifdef CONFIG_X86_IO_APIC
+#if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT)
 #define NR_IRQS 224
 # if (224 >= 32 * NR_CPUS)
 # define NR_IRQ_VECTORS NR_IRQS
diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h
index 8198d1cca1f3..7eb0b0b1fb3c 100644
--- a/include/asm-i386/mmu_context.h
+++ b/include/asm-i386/mmu_context.h
@@ -32,6 +32,8 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 #endif
 }
 
+void leave_mm(unsigned long cpu);
+
 static inline void switch_mm(struct mm_struct *prev,
 			     struct mm_struct *next,
 			     struct task_struct *tsk)
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index 7f846a7d6bcc..7df88be2dd9e 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -52,6 +52,8 @@ struct paravirt_ops
 	/* Basic arch-specific setup */
 	void (*arch_setup)(void);
 	char *(*memory_setup)(void);
+	void (*post_allocator_init)(void);
+
 	void (*init_IRQ)(void);
 	void (*time_init)(void);
 
@@ -116,7 +118,7 @@ struct paravirt_ops
 
 	u64 (*read_tsc)(void);
 	u64 (*read_pmc)(void);
-	u64 (*get_scheduled_cycles)(void);
+	unsigned long long (*sched_clock)(void);
 	unsigned long (*get_cpu_khz)(void);
 
 	/* Segment descriptor handling */
@@ -173,7 +175,7 @@ struct paravirt_ops
 			 unsigned long va);
 
 	/* Hooks for allocating/releasing pagetable pages */
-	void (*alloc_pt)(u32 pfn);
+	void (*alloc_pt)(struct mm_struct *mm, u32 pfn);
 	void (*alloc_pd)(u32 pfn);
 	void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
 	void (*release_pt)(u32 pfn);
@@ -260,6 +262,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len)
 unsigned paravirt_patch_insns(void *site, unsigned len,
 			      const char *start, const char *end);
 
+int paravirt_disable_iospace(void);
 
 /*
  * This generates an indirect call based on the operation type number.
@@ -563,7 +566,10 @@ static inline u64 paravirt_read_tsc(void)
 
 #define rdtscll(val) (val = paravirt_read_tsc())
 
-#define get_scheduled_cycles(val) (val = paravirt_ops.get_scheduled_cycles())
+static inline unsigned long long paravirt_sched_clock(void)
+{
+	return PVOP_CALL0(unsigned long long, sched_clock);
+}
 #define calculate_cpu_khz() (paravirt_ops.get_cpu_khz())
 
 #define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
@@ -669,6 +675,12 @@ static inline void setup_secondary_clock(void)
 }
 #endif
 
+static inline void paravirt_post_allocator_init(void)
+{
+	if (paravirt_ops.post_allocator_init)
+		(*paravirt_ops.post_allocator_init)();
+}
+
 static inline void paravirt_pagetable_setup_start(pgd_t *base)
 {
 	if (paravirt_ops.pagetable_setup_start)
@@ -725,9 +737,9 @@ static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 	PVOP_VCALL3(flush_tlb_others, &cpumask, mm, va);
 }
 
-static inline void paravirt_alloc_pt(unsigned pfn)
+static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn)
 {
-	PVOP_VCALL1(alloc_pt, pfn);
+	PVOP_VCALL2(alloc_pt, mm, pfn);
 }
 static inline void paravirt_release_pt(unsigned pfn)
 {
diff --git a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h
index d07b7afc2692..f2fc33ceb9f2 100644
--- a/include/asm-i386/pgalloc.h
+++ b/include/asm-i386/pgalloc.h
@@ -7,7 +7,7 @@
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
-#define paravirt_alloc_pt(pfn) do { } while (0)
+#define paravirt_alloc_pt(mm, pfn) do { } while (0)
 #define paravirt_alloc_pd(pfn) do { } while (0)
 #define paravirt_alloc_pd(pfn) do { } while (0)
 #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
@@ -17,13 +17,13 @@
 
 #define pmd_populate_kernel(mm, pmd, pte)				\
 do {									\
-	paravirt_alloc_pt(__pa(pte) >> PAGE_SHIFT);			\
+	paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);			\
 	set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));			\
 } while (0)
 
 #define pmd_populate(mm, pmd, pte)					\
 do {									\
-	paravirt_alloc_pt(page_to_pfn(pte));				\
+	paravirt_alloc_pt(mm, page_to_pfn(pte));			\
 	set_pmd(pmd, __pmd(_PAGE_TABLE +				\
 		((unsigned long long)page_to_pfn(pte) <<		\
 			(unsigned long long) PAGE_SHIFT)));		\
diff --git a/include/asm-i386/setup.h b/include/asm-i386/setup.h
index 0d5bff9dc4a5..7862fe858a9e 100644
--- a/include/asm-i386/setup.h
+++ b/include/asm-i386/setup.h
@@ -81,6 +81,10 @@ void __init add_memory_region(unsigned long long start,
81 81
82extern unsigned long init_pg_tables_end; 82extern unsigned long init_pg_tables_end;
83 83
84#ifndef CONFIG_PARAVIRT
85#define paravirt_post_allocator_init() do {} while (0)
86#endif
87
84#endif /* __ASSEMBLY__ */ 88#endif /* __ASSEMBLY__ */
85 89
86#endif /* __KERNEL__ */ 90#endif /* __KERNEL__ */
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index 0c7132787062..1f73bde165b1 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -43,9 +43,12 @@ extern u8 x86_cpu_to_apicid[];
43 43
44#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] 44#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
45 45
46extern void set_cpu_sibling_map(int cpu);
47
46#ifdef CONFIG_HOTPLUG_CPU 48#ifdef CONFIG_HOTPLUG_CPU
47extern void cpu_exit_clear(void); 49extern void cpu_exit_clear(void);
48extern void cpu_uninit(void); 50extern void cpu_uninit(void);
51extern void remove_siblinginfo(int cpu);
49#endif 52#endif
50 53
51struct smp_ops 54struct smp_ops
@@ -129,6 +132,8 @@ extern int __cpu_disable(void);
129extern void __cpu_die(unsigned int cpu); 132extern void __cpu_die(unsigned int cpu);
130extern unsigned int num_processors; 133extern unsigned int num_processors;
131 134
135void __cpuinit smp_store_cpu_info(int id);
136
132#endif /* !__ASSEMBLY__ */ 137#endif /* !__ASSEMBLY__ */
133 138
134#else /* CONFIG_SMP */ 139#else /* CONFIG_SMP */
diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h
index 153770e25faa..51a713e33a9e 100644
--- a/include/asm-i386/timer.h
+++ b/include/asm-i386/timer.h
@@ -15,8 +15,38 @@ extern int no_sync_cmos_clock;
15extern int recalibrate_cpu_khz(void); 15extern int recalibrate_cpu_khz(void);
16 16
17#ifndef CONFIG_PARAVIRT 17#ifndef CONFIG_PARAVIRT
18#define get_scheduled_cycles(val) rdtscll(val)
19#define calculate_cpu_khz() native_calculate_cpu_khz() 18#define calculate_cpu_khz() native_calculate_cpu_khz()
20#endif 19#endif
21 20
 21/* Accelerators for sched_clock()
22 * convert from cycles(64bits) => nanoseconds (64bits)
23 * basic equation:
24 * ns = cycles / (freq / ns_per_sec)
25 * ns = cycles * (ns_per_sec / freq)
26 * ns = cycles * (10^9 / (cpu_khz * 10^3))
27 * ns = cycles * (10^6 / cpu_khz)
28 *
29 * Then we use scaling math (suggested by george@mvista.com) to get:
30 * ns = cycles * (10^6 * SC / cpu_khz) / SC
31 * ns = cycles * cyc2ns_scale / SC
32 *
33 * And since SC is a constant power of two, we can convert the div
34 * into a shift.
35 *
 36 * We can use a khz divisor instead of mhz to keep better precision, since
37 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
38 * (mathieu.desnoyers@polymtl.ca)
39 *
40 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
41 */
42extern unsigned long cyc2ns_scale __read_mostly;
43
44#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
45
46static inline unsigned long long cycles_2_ns(unsigned long long cyc)
47{
48 return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
49}
50
51
22#endif 52#endif
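
The scaling comment above reduces to cyc2ns_scale = (10^6 << CYC2NS_SCALE_FACTOR) / cpu_khz. A short sketch of how that constant could be recomputed once the CPU frequency is known, mirroring the intent of set_cyc2ns_scale() in arch/i386/kernel/tsc.c (the function name here is illustrative):

static void set_cyc2ns_scale_sketch(unsigned long cpu_khz)
{
	/* ns = cycles * (10^6 * 2^CYC2NS_SCALE_FACTOR / cpu_khz) >> CYC2NS_SCALE_FACTOR */
	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR) / cpu_khz;
}
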
diff --git a/include/asm-i386/vmi_time.h b/include/asm-i386/vmi_time.h
index 213930b995cb..478188130328 100644
--- a/include/asm-i386/vmi_time.h
+++ b/include/asm-i386/vmi_time.h
@@ -49,7 +49,7 @@ extern struct vmi_timer_ops {
49extern void __init vmi_time_init(void); 49extern void __init vmi_time_init(void);
50extern unsigned long vmi_get_wallclock(void); 50extern unsigned long vmi_get_wallclock(void);
51extern int vmi_set_wallclock(unsigned long now); 51extern int vmi_set_wallclock(unsigned long now);
52extern unsigned long long vmi_get_sched_cycles(void); 52extern unsigned long long vmi_sched_clock(void);
53extern unsigned long vmi_cpu_khz(void); 53extern unsigned long vmi_cpu_khz(void);
54 54
55#ifdef CONFIG_X86_LOCAL_APIC 55#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/include/asm-i386/xen/hypercall.h b/include/asm-i386/xen/hypercall.h
new file mode 100644
index 000000000000..bc0ee7d961ca
--- /dev/null
+++ b/include/asm-i386/xen/hypercall.h
@@ -0,0 +1,413 @@
1/******************************************************************************
2 * hypercall.h
3 *
4 * Linux-specific hypervisor handling.
5 *
6 * Copyright (c) 2002-2004, K A Fraser
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#ifndef __HYPERCALL_H__
34#define __HYPERCALL_H__
35
36#include <linux/errno.h>
37#include <linux/string.h>
38
39#include <xen/interface/xen.h>
40#include <xen/interface/sched.h>
41#include <xen/interface/physdev.h>
42
43extern struct { char _entry[32]; } hypercall_page[];
44
45#define _hypercall0(type, name) \
46({ \
47 long __res; \
48 asm volatile ( \
49 "call %[call]" \
50 : "=a" (__res) \
51 : [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
52 : "memory" ); \
53 (type)__res; \
54})
55
56#define _hypercall1(type, name, a1) \
57({ \
58 long __res, __ign1; \
59 asm volatile ( \
60 "call %[call]" \
61 : "=a" (__res), "=b" (__ign1) \
62 : "1" ((long)(a1)), \
63 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
64 : "memory" ); \
65 (type)__res; \
66})
67
68#define _hypercall2(type, name, a1, a2) \
69({ \
70 long __res, __ign1, __ign2; \
71 asm volatile ( \
72 "call %[call]" \
73 : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \
74 : "1" ((long)(a1)), "2" ((long)(a2)), \
75 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
76 : "memory" ); \
77 (type)__res; \
78})
79
80#define _hypercall3(type, name, a1, a2, a3) \
81({ \
82 long __res, __ign1, __ign2, __ign3; \
83 asm volatile ( \
84 "call %[call]" \
85 : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
86 "=d" (__ign3) \
87 : "1" ((long)(a1)), "2" ((long)(a2)), \
88 "3" ((long)(a3)), \
89 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
90 : "memory" ); \
91 (type)__res; \
92})
93
94#define _hypercall4(type, name, a1, a2, a3, a4) \
95({ \
96 long __res, __ign1, __ign2, __ign3, __ign4; \
97 asm volatile ( \
98 "call %[call]" \
99 : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
100 "=d" (__ign3), "=S" (__ign4) \
101 : "1" ((long)(a1)), "2" ((long)(a2)), \
102 "3" ((long)(a3)), "4" ((long)(a4)), \
103 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
104 : "memory" ); \
105 (type)__res; \
106})
107
108#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
109({ \
110 long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \
111 asm volatile ( \
112 "call %[call]" \
113 : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
114 "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \
115 : "1" ((long)(a1)), "2" ((long)(a2)), \
116 "3" ((long)(a3)), "4" ((long)(a4)), \
117 "5" ((long)(a5)), \
118 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
119 : "memory" ); \
120 (type)__res; \
121})
122
123static inline int
124HYPERVISOR_set_trap_table(struct trap_info *table)
125{
126 return _hypercall1(int, set_trap_table, table);
127}
128
129static inline int
130HYPERVISOR_mmu_update(struct mmu_update *req, int count,
131 int *success_count, domid_t domid)
132{
133 return _hypercall4(int, mmu_update, req, count, success_count, domid);
134}
135
136static inline int
137HYPERVISOR_mmuext_op(struct mmuext_op *op, int count,
138 int *success_count, domid_t domid)
139{
140 return _hypercall4(int, mmuext_op, op, count, success_count, domid);
141}
142
143static inline int
144HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
145{
146 return _hypercall2(int, set_gdt, frame_list, entries);
147}
148
149static inline int
150HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
151{
152 return _hypercall2(int, stack_switch, ss, esp);
153}
154
155static inline int
156HYPERVISOR_set_callbacks(unsigned long event_selector,
157 unsigned long event_address,
158 unsigned long failsafe_selector,
159 unsigned long failsafe_address)
160{
161 return _hypercall4(int, set_callbacks,
162 event_selector, event_address,
163 failsafe_selector, failsafe_address);
164}
165
166static inline int
167HYPERVISOR_fpu_taskswitch(int set)
168{
169 return _hypercall1(int, fpu_taskswitch, set);
170}
171
172static inline int
173HYPERVISOR_sched_op(int cmd, unsigned long arg)
174{
175 return _hypercall2(int, sched_op, cmd, arg);
176}
177
178static inline long
179HYPERVISOR_set_timer_op(u64 timeout)
180{
181 unsigned long timeout_hi = (unsigned long)(timeout>>32);
182 unsigned long timeout_lo = (unsigned long)timeout;
183 return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
184}
185
186static inline int
187HYPERVISOR_set_debugreg(int reg, unsigned long value)
188{
189 return _hypercall2(int, set_debugreg, reg, value);
190}
191
192static inline unsigned long
193HYPERVISOR_get_debugreg(int reg)
194{
195 return _hypercall1(unsigned long, get_debugreg, reg);
196}
197
198static inline int
199HYPERVISOR_update_descriptor(u64 ma, u64 desc)
200{
201 return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
202}
203
204static inline int
205HYPERVISOR_memory_op(unsigned int cmd, void *arg)
206{
207 return _hypercall2(int, memory_op, cmd, arg);
208}
209
210static inline int
211HYPERVISOR_multicall(void *call_list, int nr_calls)
212{
213 return _hypercall2(int, multicall, call_list, nr_calls);
214}
215
216static inline int
217HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
218 unsigned long flags)
219{
220 unsigned long pte_hi = 0;
221#ifdef CONFIG_X86_PAE
222 pte_hi = new_val.pte_high;
223#endif
224 return _hypercall4(int, update_va_mapping, va,
225 new_val.pte_low, pte_hi, flags);
226}
227
228static inline int
229HYPERVISOR_event_channel_op(int cmd, void *arg)
230{
231 int rc = _hypercall2(int, event_channel_op, cmd, arg);
232 if (unlikely(rc == -ENOSYS)) {
233 struct evtchn_op op;
234 op.cmd = cmd;
235 memcpy(&op.u, arg, sizeof(op.u));
236 rc = _hypercall1(int, event_channel_op_compat, &op);
237 memcpy(arg, &op.u, sizeof(op.u));
238 }
239 return rc;
240}
241
242static inline int
243HYPERVISOR_xen_version(int cmd, void *arg)
244{
245 return _hypercall2(int, xen_version, cmd, arg);
246}
247
248static inline int
249HYPERVISOR_console_io(int cmd, int count, char *str)
250{
251 return _hypercall3(int, console_io, cmd, count, str);
252}
253
254static inline int
255HYPERVISOR_physdev_op(int cmd, void *arg)
256{
257 int rc = _hypercall2(int, physdev_op, cmd, arg);
258 if (unlikely(rc == -ENOSYS)) {
259 struct physdev_op op;
260 op.cmd = cmd;
261 memcpy(&op.u, arg, sizeof(op.u));
262 rc = _hypercall1(int, physdev_op_compat, &op);
263 memcpy(arg, &op.u, sizeof(op.u));
264 }
265 return rc;
266}
267
268static inline int
269HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
270{
271 return _hypercall3(int, grant_table_op, cmd, uop, count);
272}
273
274static inline int
275HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
276 unsigned long flags, domid_t domid)
277{
278 unsigned long pte_hi = 0;
279#ifdef CONFIG_X86_PAE
280 pte_hi = new_val.pte_high;
281#endif
282 return _hypercall5(int, update_va_mapping_otherdomain, va,
283 new_val.pte_low, pte_hi, flags, domid);
284}
285
286static inline int
287HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type)
288{
289 return _hypercall2(int, vm_assist, cmd, type);
290}
291
292static inline int
293HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
294{
295 return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
296}
297
298static inline int
299HYPERVISOR_suspend(unsigned long srec)
300{
301 return _hypercall3(int, sched_op, SCHEDOP_shutdown,
302 SHUTDOWN_suspend, srec);
303}
304
305static inline int
306HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
307{
308 return _hypercall2(int, nmi_op, op, arg);
309}
310
311static inline void
312MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
313 pte_t new_val, unsigned long flags)
314{
315 mcl->op = __HYPERVISOR_update_va_mapping;
316 mcl->args[0] = va;
317#ifdef CONFIG_X86_PAE
318 mcl->args[1] = new_val.pte_low;
319 mcl->args[2] = new_val.pte_high;
320#else
321 mcl->args[1] = new_val.pte_low;
322 mcl->args[2] = 0;
323#endif
324 mcl->args[3] = flags;
325}
326
327static inline void
328MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
329 void *uop, unsigned int count)
330{
331 mcl->op = __HYPERVISOR_grant_table_op;
332 mcl->args[0] = cmd;
333 mcl->args[1] = (unsigned long)uop;
334 mcl->args[2] = count;
335}
336
337static inline void
338MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va,
339 pte_t new_val, unsigned long flags,
340 domid_t domid)
341{
342 mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
343 mcl->args[0] = va;
344#ifdef CONFIG_X86_PAE
345 mcl->args[1] = new_val.pte_low;
346 mcl->args[2] = new_val.pte_high;
347#else
348 mcl->args[1] = new_val.pte_low;
349 mcl->args[2] = 0;
350#endif
351 mcl->args[3] = flags;
352 mcl->args[4] = domid;
353}
354
355static inline void
356MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
357 struct desc_struct desc)
358{
359 mcl->op = __HYPERVISOR_update_descriptor;
360 mcl->args[0] = maddr;
361 mcl->args[1] = maddr >> 32;
362 mcl->args[2] = desc.a;
363 mcl->args[3] = desc.b;
364}
365
366static inline void
367MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
368{
369 mcl->op = __HYPERVISOR_memory_op;
370 mcl->args[0] = cmd;
371 mcl->args[1] = (unsigned long)arg;
372}
373
374static inline void
375MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
376 int count, int *success_count, domid_t domid)
377{
378 mcl->op = __HYPERVISOR_mmu_update;
379 mcl->args[0] = (unsigned long)req;
380 mcl->args[1] = count;
381 mcl->args[2] = (unsigned long)success_count;
382 mcl->args[3] = domid;
383}
384
385static inline void
386MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
387 int *success_count, domid_t domid)
388{
389 mcl->op = __HYPERVISOR_mmuext_op;
390 mcl->args[0] = (unsigned long)op;
391 mcl->args[1] = count;
392 mcl->args[2] = (unsigned long)success_count;
393 mcl->args[3] = domid;
394}
395
396static inline void
397MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
398{
399 mcl->op = __HYPERVISOR_set_gdt;
400 mcl->args[0] = (unsigned long)frames;
401 mcl->args[1] = entries;
402}
403
404static inline void
405MULTI_stack_switch(struct multicall_entry *mcl,
406 unsigned long ss, unsigned long esp)
407{
408 mcl->op = __HYPERVISOR_stack_switch;
409 mcl->args[0] = ss;
410 mcl->args[1] = esp;
411}
412
413#endif /* __HYPERCALL_H__ */
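
For context, the _hypercallN() macros above return the result in EAX and pass arguments in EBX, ECX, EDX, ESI and EDI, calling through the fixed per-hypercall slot in hypercall_page. A hedged usage sketch built on the wrappers in this header; XENVER_version is defined in xen/interface/version.h, which is not pulled in by this file directly:

static int xen_version_sketch(void)
{
	/* returns (major << 16) | minor for the running hypervisor */
	return HYPERVISOR_xen_version(XENVER_version, NULL);
}
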
diff --git a/include/asm-i386/xen/hypervisor.h b/include/asm-i386/xen/hypervisor.h
new file mode 100644
index 000000000000..8e15dd28c91f
--- /dev/null
+++ b/include/asm-i386/xen/hypervisor.h
@@ -0,0 +1,73 @@
1/******************************************************************************
2 * hypervisor.h
3 *
4 * Linux-specific hypervisor handling.
5 *
6 * Copyright (c) 2002-2004, K A Fraser
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#ifndef __HYPERVISOR_H__
34#define __HYPERVISOR_H__
35
36#include <linux/types.h>
37#include <linux/kernel.h>
38#include <linux/version.h>
39
40#include <xen/interface/xen.h>
41#include <xen/interface/version.h>
42
43#include <asm/ptrace.h>
44#include <asm/page.h>
45#include <asm/desc.h>
46#if defined(__i386__)
47# ifdef CONFIG_X86_PAE
48# include <asm-generic/pgtable-nopud.h>
49# else
50# include <asm-generic/pgtable-nopmd.h>
51# endif
52#endif
53#include <asm/xen/hypercall.h>
54
55/* arch/i386/kernel/setup.c */
56extern struct shared_info *HYPERVISOR_shared_info;
57extern struct start_info *xen_start_info;
58#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
59
60/* arch/i386/mach-xen/evtchn.c */
61/* Force a proper event-channel callback from Xen. */
62extern void force_evtchn_callback(void);
63
64/* Turn jiffies into Xen system time. */
65u64 jiffies_to_st(unsigned long jiffies);
66
67
68#define MULTI_UVMFLAGS_INDEX 3
69#define MULTI_UVMDOMID_INDEX 4
70
71#define is_running_on_xen() (xen_start_info ? 1 : 0)
72
73#endif /* __HYPERVISOR_H__ */
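
A small sketch of how the predicates above are typically used to guard Xen-only setup; the function name and message are placeholders:

static int xen_guest_check_sketch(void)
{
	if (!is_running_on_xen())	/* xen_start_info is NULL on bare metal */
		return -ENODEV;

	if (is_initial_xendomain())
		printk(KERN_INFO "xen: running as the initial (control) domain\n");

	return 0;
}
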
diff --git a/include/asm-i386/xen/interface.h b/include/asm-i386/xen/interface.h
new file mode 100644
index 000000000000..165c3968e138
--- /dev/null
+++ b/include/asm-i386/xen/interface.h
@@ -0,0 +1,188 @@
1/******************************************************************************
2 * arch-x86_32.h
3 *
4 * Guest OS interface to x86 32-bit Xen.
5 *
6 * Copyright (c) 2004, K A Fraser
7 */
8
9#ifndef __XEN_PUBLIC_ARCH_X86_32_H__
10#define __XEN_PUBLIC_ARCH_X86_32_H__
11
12#ifdef __XEN__
13#define __DEFINE_GUEST_HANDLE(name, type) \
14 typedef struct { type *p; } __guest_handle_ ## name
15#else
16#define __DEFINE_GUEST_HANDLE(name, type) \
17 typedef type * __guest_handle_ ## name
18#endif
19
20#define DEFINE_GUEST_HANDLE_STRUCT(name) \
21 __DEFINE_GUEST_HANDLE(name, struct name)
22#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
23#define GUEST_HANDLE(name) __guest_handle_ ## name
24
25#ifndef __ASSEMBLY__
26/* Guest handles for primitive C types. */
27__DEFINE_GUEST_HANDLE(uchar, unsigned char);
28__DEFINE_GUEST_HANDLE(uint, unsigned int);
29__DEFINE_GUEST_HANDLE(ulong, unsigned long);
30DEFINE_GUEST_HANDLE(char);
31DEFINE_GUEST_HANDLE(int);
32DEFINE_GUEST_HANDLE(long);
33DEFINE_GUEST_HANDLE(void);
34#endif
35
36/*
37 * SEGMENT DESCRIPTOR TABLES
38 */
39/*
40 * A number of GDT entries are reserved by Xen. These are not situated at the
41 * start of the GDT because some stupid OSes export hard-coded selector values
42 * in their ABI. These hard-coded values are always near the start of the GDT,
43 * so Xen places itself out of the way, at the far end of the GDT.
44 */
45#define FIRST_RESERVED_GDT_PAGE 14
46#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
47#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
48
49/*
50 * These flat segments are in the Xen-private section of every GDT. Since these
51 * are also present in the initial GDT, many OSes will be able to avoid
52 * installing their own GDT.
53 */
54#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
55#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
56#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
57#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
58#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
59#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
60
61#define FLAT_KERNEL_CS FLAT_RING1_CS
62#define FLAT_KERNEL_DS FLAT_RING1_DS
63#define FLAT_KERNEL_SS FLAT_RING1_SS
64#define FLAT_USER_CS FLAT_RING3_CS
65#define FLAT_USER_DS FLAT_RING3_DS
66#define FLAT_USER_SS FLAT_RING3_SS
67
68/* And the trap vector is... */
69#define TRAP_INSTR "int $0x82"
70
71/*
72 * Virtual addresses beyond this are not modifiable by guest OSes. The
73 * machine->physical mapping table starts at this address, read-only.
74 */
75#ifdef CONFIG_X86_PAE
76#define __HYPERVISOR_VIRT_START 0xF5800000
77#else
78#define __HYPERVISOR_VIRT_START 0xFC000000
79#endif
80
81#ifndef HYPERVISOR_VIRT_START
82#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
83#endif
84
85#ifndef machine_to_phys_mapping
86#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
87#endif
88
89/* Maximum number of virtual CPUs in multi-processor guests. */
90#define MAX_VIRT_CPUS 32
91
92#ifndef __ASSEMBLY__
93
94/*
95 * Send an array of these to HYPERVISOR_set_trap_table()
96 */
97#define TI_GET_DPL(_ti) ((_ti)->flags & 3)
98#define TI_GET_IF(_ti) ((_ti)->flags & 4)
99#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl))
100#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
101
102struct trap_info {
103 uint8_t vector; /* exception vector */
104 uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
105 uint16_t cs; /* code selector */
106 unsigned long address; /* code offset */
107};
108DEFINE_GUEST_HANDLE_STRUCT(trap_info);
109
110struct cpu_user_regs {
111 uint32_t ebx;
112 uint32_t ecx;
113 uint32_t edx;
114 uint32_t esi;
115 uint32_t edi;
116 uint32_t ebp;
117 uint32_t eax;
118 uint16_t error_code; /* private */
119 uint16_t entry_vector; /* private */
120 uint32_t eip;
121 uint16_t cs;
122 uint8_t saved_upcall_mask;
123 uint8_t _pad0;
124 uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
125 uint32_t esp;
126 uint16_t ss, _pad1;
127 uint16_t es, _pad2;
128 uint16_t ds, _pad3;
129 uint16_t fs, _pad4;
130 uint16_t gs, _pad5;
131};
132DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
133
134typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
135
136/*
137 * The following is all CPU context. Note that the fpu_ctxt block is filled
138 * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
139 */
140struct vcpu_guest_context {
141 /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
142 struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
143#define VGCF_I387_VALID (1<<0)
144#define VGCF_HVM_GUEST (1<<1)
145#define VGCF_IN_KERNEL (1<<2)
146 unsigned long flags; /* VGCF_* flags */
147 struct cpu_user_regs user_regs; /* User-level CPU registers */
148 struct trap_info trap_ctxt[256]; /* Virtual IDT */
149 unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
150 unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
151 unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
152 unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
153 unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
154 unsigned long event_callback_cs; /* CS:EIP of event callback */
155 unsigned long event_callback_eip;
156 unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
157 unsigned long failsafe_callback_eip;
158 unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
159};
160DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
161
162struct arch_shared_info {
163 unsigned long max_pfn; /* max pfn that appears in table */
164 /* Frame containing list of mfns containing list of mfns containing p2m. */
165 unsigned long pfn_to_mfn_frame_list_list;
166 unsigned long nmi_reason;
167};
168
169struct arch_vcpu_info {
170 unsigned long cr2;
171 unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
172};
173
174#endif /* !__ASSEMBLY__ */
175
176/*
177 * Prefix forces emulation of some non-trapping instructions.
178 * Currently only CPUID.
179 */
180#ifdef __ASSEMBLY__
181#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
182#define XEN_CPUID XEN_EMULATE_PREFIX cpuid
183#else
184#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
185#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
186#endif
187
188#endif
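
The emulate prefix above is how a paravirtualized guest forces Xen to intercept CPUID. A sketch of a wrapper along the lines of the one used by the Xen enlighten code, assuming GCC inline asm (the function name is illustrative):

static void xen_cpuid_sketch(unsigned int *ax, unsigned int *bx,
			     unsigned int *cx, unsigned int *dx)
{
	asm(XEN_EMULATE_PREFIX "cpuid"
	    : "=a" (*ax), "=b" (*bx), "=c" (*cx), "=d" (*dx)
	    : "0" (*ax), "2" (*cx));
}
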
diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h
index 9a1e0674e56c..e831759b2fb5 100644
--- a/include/linux/elfnote.h
+++ b/include/linux/elfnote.h
@@ -38,17 +38,25 @@
38 * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two") 38 * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two")
39 * ELFNOTE(XYZCo, 12, .long, 0xdeadbeef) 39 * ELFNOTE(XYZCo, 12, .long, 0xdeadbeef)
40 */ 40 */
41#define ELFNOTE(name, type, desctype, descdata) \ 41#define ELFNOTE_START(name, type, flags) \
42.pushsection .note.name, "",@note ; \ 42.pushsection .note.name, flags,@note ; \
43 .align 4 ; \ 43 .balign 4 ; \
44 .long 2f - 1f /* namesz */ ; \ 44 .long 2f - 1f /* namesz */ ; \
45 .long 4f - 3f /* descsz */ ; \ 45 .long 4484f - 3f /* descsz */ ; \
46 .long type ; \ 46 .long type ; \
471:.asciz #name ; \ 471:.asciz #name ; \
482:.align 4 ; \ 482:.balign 4 ; \
493:desctype descdata ; \ 493:
504:.align 4 ; \ 50
51#define ELFNOTE_END \
524484:.balign 4 ; \
51.popsection ; 53.popsection ;
54
55#define ELFNOTE(name, type, desc) \
56 ELFNOTE_START(name, type, "") \
57 desc ; \
58 ELFNOTE_END
59
52#else /* !__ASSEMBLER__ */ 60#else /* !__ASSEMBLER__ */
53#include <linux/elf.h> 61#include <linux/elf.h>
54/* 62/*
diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 10f505c8431d..5dc13848891b 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -36,13 +36,57 @@ static inline int request_module(const char * name, ...) { return -ENOSYS; }
36#define try_then_request_module(x, mod...) ((x) ?: (request_module(mod), (x))) 36#define try_then_request_module(x, mod...) ((x) ?: (request_module(mod), (x)))
37 37
38struct key; 38struct key;
39extern int call_usermodehelper_keys(char *path, char *argv[], char *envp[], 39struct file;
40 struct key *session_keyring, int wait); 40struct subprocess_info;
41
42/* Allocate a subprocess_info structure */
43struct subprocess_info *call_usermodehelper_setup(char *path,
44 char **argv, char **envp);
45
46/* Set various pieces of state into the subprocess_info structure */
47void call_usermodehelper_setkeys(struct subprocess_info *info,
48 struct key *session_keyring);
49int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
50 struct file **filp);
51void call_usermodehelper_setcleanup(struct subprocess_info *info,
52 void (*cleanup)(char **argv, char **envp));
53
54enum umh_wait {
55 UMH_NO_WAIT = -1, /* don't wait at all */
56 UMH_WAIT_EXEC = 0, /* wait for the exec, but not the process */
57 UMH_WAIT_PROC = 1, /* wait for the process to complete */
58};
59
60/* Actually execute the sub-process */
61int call_usermodehelper_exec(struct subprocess_info *info, enum umh_wait wait);
62
63/* Free the subprocess_info. This is only needed if you're not going
64 to call call_usermodehelper_exec */
65void call_usermodehelper_freeinfo(struct subprocess_info *info);
41 66
42static inline int 67static inline int
43call_usermodehelper(char *path, char **argv, char **envp, int wait) 68call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait)
44{ 69{
45 return call_usermodehelper_keys(path, argv, envp, NULL, wait); 70 struct subprocess_info *info;
71
72 info = call_usermodehelper_setup(path, argv, envp);
73 if (info == NULL)
74 return -ENOMEM;
75 return call_usermodehelper_exec(info, wait);
76}
77
78static inline int
79call_usermodehelper_keys(char *path, char **argv, char **envp,
80 struct key *session_keyring, enum umh_wait wait)
81{
82 struct subprocess_info *info;
83
84 info = call_usermodehelper_setup(path, argv, envp);
85 if (info == NULL)
86 return -ENOMEM;
87
88 call_usermodehelper_setkeys(info, session_keyring);
89 return call_usermodehelper_exec(info, wait);
46} 90}
47 91
48extern void usermodehelper_init(void); 92extern void usermodehelper_init(void);
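
A hedged sketch of the new split API as declared above: allocate the subprocess_info, optionally attach state, then exec. The helper path and arguments are placeholders:

static int run_helper_sketch(void)
{
	char *argv[] = { "/sbin/helper", "--example", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp);
	if (info == NULL)
		return -ENOMEM;

	/* wait until the helper process has exited */
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}
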
diff --git a/include/linux/major.h b/include/linux/major.h
index 7e7c9093919a..0cb98053537a 100644
--- a/include/linux/major.h
+++ b/include/linux/major.h
@@ -158,6 +158,8 @@
158#define VXSPEC_MAJOR 200 /* VERITAS volume config driver */ 158#define VXSPEC_MAJOR 200 /* VERITAS volume config driver */
159#define VXDMP_MAJOR 201 /* VERITAS volume multipath driver */ 159#define VXDMP_MAJOR 201 /* VERITAS volume multipath driver */
160 160
161#define XENVBD_MAJOR 202 /* Xen virtual block device */
162
161#define MSR_MAJOR 202 163#define MSR_MAJOR 202
162#define CPUID_MAJOR 203 164#define CPUID_MAJOR 203
163 165
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ae2d79f2107e..731cd2ac3227 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -92,6 +92,7 @@
92 92
93/* PG_owner_priv_1 users should have descriptive aliases */ 93/* PG_owner_priv_1 users should have descriptive aliases */
94#define PG_checked PG_owner_priv_1 /* Used by some filesystems */ 94#define PG_checked PG_owner_priv_1 /* Used by some filesystems */
95#define PG_pinned PG_owner_priv_1 /* Xen pinned pagetable */
95 96
96#if (BITS_PER_LONG > 32) 97#if (BITS_PER_LONG > 32)
97/* 98/*
@@ -170,6 +171,10 @@ static inline void SetPageUptodate(struct page *page)
170#define SetPageChecked(page) set_bit(PG_checked, &(page)->flags) 171#define SetPageChecked(page) set_bit(PG_checked, &(page)->flags)
171#define ClearPageChecked(page) clear_bit(PG_checked, &(page)->flags) 172#define ClearPageChecked(page) clear_bit(PG_checked, &(page)->flags)
172 173
174#define PagePinned(page) test_bit(PG_pinned, &(page)->flags)
175#define SetPagePinned(page) set_bit(PG_pinned, &(page)->flags)
176#define ClearPagePinned(page) clear_bit(PG_pinned, &(page)->flags)
177
173#define PageReserved(page) test_bit(PG_reserved, &(page)->flags) 178#define PageReserved(page) test_bit(PG_reserved, &(page)->flags)
174#define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags) 179#define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags)
175#define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) 180#define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags)
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 1dd1c707311f..85ea63f462af 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -67,6 +67,11 @@ extern void kernel_power_off(void);
67 67
68void ctrl_alt_del(void); 68void ctrl_alt_del(void);
69 69
70#define POWEROFF_CMD_PATH_LEN 256
71extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN];
72
73extern int orderly_poweroff(bool force);
74
70/* 75/*
71 * Emergency restart, callable from an interrupt handler. 76 * Emergency restart, callable from an interrupt handler.
72 */ 77 */
diff --git a/include/linux/string.h b/include/linux/string.h
index 7f2eb6a477f9..836062b7582a 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -105,8 +105,12 @@ extern void * memchr(const void *,int,__kernel_size_t);
105#endif 105#endif
106 106
107extern char *kstrdup(const char *s, gfp_t gfp); 107extern char *kstrdup(const char *s, gfp_t gfp);
108extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
108extern void *kmemdup(const void *src, size_t len, gfp_t gfp); 109extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
109 110
111extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
112extern void argv_free(char **argv);
113
110#ifdef __cplusplus 114#ifdef __cplusplus
111} 115}
112#endif 116#endif
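
A short sketch of the argv_split()/argv_free() pair declared above; the command string is a placeholder:

static int argv_split_sketch(void)
{
	int argc;
	char **argv = argv_split(GFP_KERNEL, "/sbin/poweroff -f", &argc);

	if (argv == NULL)
		return -ENOMEM;
	/* argv[0..argc-1] now hold the whitespace-separated words */
	argv_free(argv);
	return 0;
}
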
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 132b260aef1e..c2b10cae5da5 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -70,6 +70,10 @@ extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
70 struct page ***pages); 70 struct page ***pages);
71extern void unmap_kernel_range(unsigned long addr, unsigned long size); 71extern void unmap_kernel_range(unsigned long addr, unsigned long size);
72 72
73/* Allocate/destroy a 'vmalloc' VM area. */
74extern struct vm_struct *alloc_vm_area(size_t size);
75extern void free_vm_area(struct vm_struct *area);
76
73/* 77/*
 74 * Internals. Don't use. 78 * Internals. Don't use.
75 */ 79 */
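
A sketch of the alloc_vm_area()/free_vm_area() pair declared above, as a caller might use it to reserve kernel virtual address space whose mappings are filled in later (for example by the hypervisor); the function name is illustrative:

static void *reserve_kva_sketch(void)
{
	struct vm_struct *area = alloc_vm_area(PAGE_SIZE);

	if (area == NULL)
		return NULL;
	/* area->addr is a page-aligned chunk of kernel VA reserved for later use;
	 * release it again with free_vm_area(area) */
	return area->addr;
}
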
diff --git a/include/xen/events.h b/include/xen/events.h
new file mode 100644
index 000000000000..2bde54d29be5
--- /dev/null
+++ b/include/xen/events.h
@@ -0,0 +1,48 @@
1#ifndef _XEN_EVENTS_H
2#define _XEN_EVENTS_H
3
4#include <linux/interrupt.h>
5
6#include <xen/interface/event_channel.h>
7#include <asm/xen/hypercall.h>
8
9enum ipi_vector {
10 XEN_RESCHEDULE_VECTOR,
11 XEN_CALL_FUNCTION_VECTOR,
12
13 XEN_NR_IPIS,
14};
15
16int bind_evtchn_to_irq(unsigned int evtchn);
17int bind_evtchn_to_irqhandler(unsigned int evtchn,
18 irq_handler_t handler,
19 unsigned long irqflags, const char *devname,
20 void *dev_id);
21int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
22 irq_handler_t handler,
23 unsigned long irqflags, const char *devname,
24 void *dev_id);
25int bind_ipi_to_irqhandler(enum ipi_vector ipi,
26 unsigned int cpu,
27 irq_handler_t handler,
28 unsigned long irqflags,
29 const char *devname,
30 void *dev_id);
31
32/*
33 * Common unbind function for all event sources. Takes IRQ to unbind from.
34 * Automatically closes the underlying event channel (even for bindings
35 * made with bind_evtchn_to_irqhandler()).
36 */
37void unbind_from_irqhandler(unsigned int irq, void *dev_id);
38
39void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector);
40
41static inline void notify_remote_via_evtchn(int port)
42{
43 struct evtchn_send send = { .port = port };
44 (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
45}
46
47extern void notify_remote_via_irq(int irq);
48#endif /* _XEN_EVENTS_H */
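
A sketch of binding a virtual IRQ with the API above; the handler body, flags and device name are placeholders, and VIRQ_TIMER comes from xen/interface/xen.h:

static irqreturn_t xen_virq_handler_sketch(int irq, void *dev_id)
{
	/* acknowledge/process the virtual interrupt here */
	return IRQ_HANDLED;
}

static int bind_timer_virq_sketch(unsigned int cpu)
{
	int irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu,
					  xen_virq_handler_sketch,
					  IRQF_DISABLED, "timer-sketch", NULL);

	return irq < 0 ? irq : 0;
}
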
diff --git a/include/xen/features.h b/include/xen/features.h
new file mode 100644
index 000000000000..27292d4d2a6a
--- /dev/null
+++ b/include/xen/features.h
@@ -0,0 +1,23 @@
1/******************************************************************************
2 * features.h
3 *
4 * Query the features reported by Xen.
5 *
6 * Copyright (c) 2006, Ian Campbell
7 */
8
9#ifndef __XEN_FEATURES_H__
10#define __XEN_FEATURES_H__
11
12#include <xen/interface/features.h>
13
14void xen_setup_features(void);
15
16extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];
17
18static inline int xen_feature(int flag)
19{
20 return xen_features[flag];
21}
22
 23#endif /* __XEN_FEATURES_H__ */
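
A usage sketch: after xen_setup_features() has filled in the map, individual flags are tested with xen_feature(); the message is a placeholder:

static void report_features_sketch(void)
{
	xen_setup_features();

	if (xen_feature(XENFEAT_auto_translated_physmap))
		printk(KERN_INFO "xen: auto-translated physmap in use\n");
}
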
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
new file mode 100644
index 000000000000..761c83498e03
--- /dev/null
+++ b/include/xen/grant_table.h
@@ -0,0 +1,107 @@
1/******************************************************************************
2 * grant_table.h
3 *
4 * Two sets of functionality:
5 * 1. Granting foreign access to our memory reservation.
6 * 2. Accessing others' memory reservations via grant references.
7 * (i.e., mechanisms for both sender and recipient of grant references)
8 *
9 * Copyright (c) 2004-2005, K A Fraser
10 * Copyright (c) 2005, Christopher Clark
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License version 2
14 * as published by the Free Software Foundation; or, when distributed
15 * separately from the Linux kernel or incorporated into other
16 * software packages, subject to the following license:
17 *
18 * Permission is hereby granted, free of charge, to any person obtaining a copy
19 * of this source file (the "Software"), to deal in the Software without
20 * restriction, including without limitation the rights to use, copy, modify,
21 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
22 * and to permit persons to whom the Software is furnished to do so, subject to
23 * the following conditions:
24 *
25 * The above copyright notice and this permission notice shall be included in
26 * all copies or substantial portions of the Software.
27 *
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
33 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
34 * IN THE SOFTWARE.
35 */
36
37#ifndef __ASM_GNTTAB_H__
38#define __ASM_GNTTAB_H__
39
40#include <asm/xen/hypervisor.h>
41#include <xen/interface/grant_table.h>
42
43/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
44#define NR_GRANT_FRAMES 4
45
46struct gnttab_free_callback {
47 struct gnttab_free_callback *next;
48 void (*fn)(void *);
49 void *arg;
50 u16 count;
51};
52
53int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
54 int readonly);
55
56/*
57 * End access through the given grant reference, iff the grant entry is no
58 * longer in use. Return 1 if the grant entry was freed, 0 if it is still in
59 * use.
60 */
61int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly);
62
63/*
64 * Eventually end access through the given grant reference, and once that
65 * access has been ended, free the given page too. Access will be ended
66 * immediately iff the grant entry is not in use, otherwise it will happen
67 * some time later. page may be 0, in which case no freeing will occur.
68 */
69void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
70 unsigned long page);
71
72int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
73
74unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
75unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
76
77int gnttab_query_foreign_access(grant_ref_t ref);
78
79/*
80 * operations on reserved batches of grant references
81 */
82int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
83
84void gnttab_free_grant_reference(grant_ref_t ref);
85
86void gnttab_free_grant_references(grant_ref_t head);
87
88int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
89
90int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
91
92void gnttab_release_grant_reference(grant_ref_t *private_head,
93 grant_ref_t release);
94
95void gnttab_request_free_callback(struct gnttab_free_callback *callback,
96 void (*fn)(void *), void *arg, u16 count);
97void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
98
99void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
100 unsigned long frame, int readonly);
101
102void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
103 unsigned long pfn);
104
105#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
106
107#endif /* __ASM_GNTTAB_H__ */
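
A hedged sketch of the common frontend pattern built on the API above: grant a peer domain access to one of our frames, hand it the reference out of band, and revoke it later. virt_to_mfn() is assumed from the arch Xen page helpers, and remote_domid is a placeholder:

static int share_page_sketch(domid_t remote_domid, void *shared_page)
{
	unsigned long mfn = virt_to_mfn(shared_page);	/* assumed helper */
	int ref = gnttab_grant_foreign_access(remote_domid, mfn, 0 /* rw */);

	if (ref < 0)
		return ref;

	/* ...publish 'ref' to the peer (e.g. via xenstore), use it, then: */
	gnttab_end_foreign_access(ref, 0 /* rw */, 0 /* don't free the page */);
	return 0;
}
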
diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h
new file mode 100644
index 000000000000..21c0ecfd786d
--- /dev/null
+++ b/include/xen/hvc-console.h
@@ -0,0 +1,6 @@
1#ifndef XEN_HVC_CONSOLE_H
2#define XEN_HVC_CONSOLE_H
3
4extern struct console xenboot_console;
5
6#endif /* XEN_HVC_CONSOLE_H */
diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h
new file mode 100644
index 000000000000..a64d3df5bd95
--- /dev/null
+++ b/include/xen/interface/elfnote.h
@@ -0,0 +1,133 @@
1/******************************************************************************
2 * elfnote.h
3 *
4 * Definitions used for the Xen ELF notes.
5 *
6 * Copyright (c) 2006, Ian Campbell, XenSource Ltd.
7 */
8
9#ifndef __XEN_PUBLIC_ELFNOTE_H__
10#define __XEN_PUBLIC_ELFNOTE_H__
11
12/*
13 * The notes should live in a SHT_NOTE segment and have "Xen" in the
14 * name field.
15 *
16 * Numeric types are either 4 or 8 bytes depending on the content of
17 * the desc field.
18 *
 19 * LEGACY indicates the fields in the legacy __xen_guest string which
 20 * this note type replaces.
21 */
22
23/*
24 * NAME=VALUE pair (string).
25 *
26 * LEGACY: FEATURES and PAE
27 */
28#define XEN_ELFNOTE_INFO 0
29
30/*
31 * The virtual address of the entry point (numeric).
32 *
33 * LEGACY: VIRT_ENTRY
34 */
35#define XEN_ELFNOTE_ENTRY 1
36
37/* The virtual address of the hypercall transfer page (numeric).
38 *
39 * LEGACY: HYPERCALL_PAGE. (n.b. legacy value is a physical page
40 * number not a virtual address)
41 */
42#define XEN_ELFNOTE_HYPERCALL_PAGE 2
43
44/* The virtual address where the kernel image should be mapped (numeric).
45 *
46 * Defaults to 0.
47 *
48 * LEGACY: VIRT_BASE
49 */
50#define XEN_ELFNOTE_VIRT_BASE 3
51
52/*
 53 * The offset of the ELF paddr field from the actual required
 54 * pseudo-physical address (numeric).
55 *
56 * This is used to maintain backwards compatibility with older kernels
57 * which wrote __PAGE_OFFSET into that field. This field defaults to 0
58 * if not present.
59 *
60 * LEGACY: ELF_PADDR_OFFSET. (n.b. legacy default is VIRT_BASE)
61 */
62#define XEN_ELFNOTE_PADDR_OFFSET 4
63
64/*
65 * The version of Xen that we work with (string).
66 *
67 * LEGACY: XEN_VER
68 */
69#define XEN_ELFNOTE_XEN_VERSION 5
70
71/*
72 * The name of the guest operating system (string).
73 *
74 * LEGACY: GUEST_OS
75 */
76#define XEN_ELFNOTE_GUEST_OS 6
77
78/*
79 * The version of the guest operating system (string).
80 *
81 * LEGACY: GUEST_VER
82 */
83#define XEN_ELFNOTE_GUEST_VERSION 7
84
85/*
86 * The loader type (string).
87 *
88 * LEGACY: LOADER
89 */
90#define XEN_ELFNOTE_LOADER 8
91
92/*
93 * The kernel supports PAE (x86/32 only, string = "yes" or "no").
94 *
95 * LEGACY: PAE (n.b. The legacy interface included a provision to
96 * indicate 'extended-cr3' support allowing L3 page tables to be
97 * placed above 4G. It is assumed that any kernel new enough to use
98 * these ELF notes will include this and therefore "yes" here is
 99 * equivalent to "yes[extended-cr3]" in the __xen_guest interface.)
100 */
101#define XEN_ELFNOTE_PAE_MODE 9
102
103/*
104 * The features supported/required by this kernel (string).
105 *
106 * The string must consist of a list of feature names (as given in
107 * features.h, without the "XENFEAT_" prefix) separated by '|'
108 * characters. If a feature is required for the kernel to function
109 * then the feature name must be preceded by a '!' character.
110 *
111 * LEGACY: FEATURES
112 */
113#define XEN_ELFNOTE_FEATURES 10
114
115/*
116 * The kernel requires the symbol table to be loaded (string = "yes" or "no")
117 * LEGACY: BSD_SYMTAB (n.b. The legacy treated the presence or absence
118 * of this string as a boolean flag rather than requiring "yes" or
119 * "no".
120 */
121#define XEN_ELFNOTE_BSD_SYMTAB 11
122
123#endif /* __XEN_PUBLIC_ELFNOTE_H__ */
124
125/*
126 * Local variables:
127 * mode: C
128 * c-set-style: "BSD"
129 * c-basic-offset: 4
130 * tab-width: 4
131 * indent-tabs-mode: nil
132 * End:
133 */
diff --git a/include/xen/interface/event_channel.h b/include/xen/interface/event_channel.h
new file mode 100644
index 000000000000..919b5bdcb2bd
--- /dev/null
+++ b/include/xen/interface/event_channel.h
@@ -0,0 +1,195 @@
1/******************************************************************************
2 * event_channel.h
3 *
4 * Event channels between domains.
5 *
6 * Copyright (c) 2003-2004, K A Fraser.
7 */
8
9#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__
10#define __XEN_PUBLIC_EVENT_CHANNEL_H__
11
12typedef uint32_t evtchn_port_t;
13DEFINE_GUEST_HANDLE(evtchn_port_t);
14
15/*
16 * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
17 * accepting interdomain bindings from domain <remote_dom>. A fresh port
18 * is allocated in <dom> and returned as <port>.
19 * NOTES:
20 * 1. If the caller is unprivileged then <dom> must be DOMID_SELF.
21 * 2. <rdom> may be DOMID_SELF, allowing loopback connections.
22 */
23#define EVTCHNOP_alloc_unbound 6
24struct evtchn_alloc_unbound {
25 /* IN parameters */
26 domid_t dom, remote_dom;
27 /* OUT parameters */
28 evtchn_port_t port;
29};
30
31/*
32 * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
33 * the calling domain and <remote_dom>. <remote_dom,remote_port> must identify
34 * a port that is unbound and marked as accepting bindings from the calling
35 * domain. A fresh port is allocated in the calling domain and returned as
36 * <local_port>.
37 * NOTES:
38 * 2. <remote_dom> may be DOMID_SELF, allowing loopback connections.
39 */
40#define EVTCHNOP_bind_interdomain 0
41struct evtchn_bind_interdomain {
42 /* IN parameters. */
43 domid_t remote_dom;
44 evtchn_port_t remote_port;
45 /* OUT parameters. */
46 evtchn_port_t local_port;
47};
48
49/*
50 * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <irq> on specified
51 * vcpu.
52 * NOTES:
53 * 1. A virtual IRQ may be bound to at most one event channel per vcpu.
54 * 2. The allocated event channel is bound to the specified vcpu. The binding
55 * may not be changed.
56 */
57#define EVTCHNOP_bind_virq 1
58struct evtchn_bind_virq {
59 /* IN parameters. */
60 uint32_t virq;
61 uint32_t vcpu;
62 /* OUT parameters. */
63 evtchn_port_t port;
64};
65
66/*
67 * EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <irq>.
68 * NOTES:
69 * 1. A physical IRQ may be bound to at most one event channel per domain.
70 * 2. Only a sufficiently-privileged domain may bind to a physical IRQ.
71 */
72#define EVTCHNOP_bind_pirq 2
73struct evtchn_bind_pirq {
74 /* IN parameters. */
75 uint32_t pirq;
76#define BIND_PIRQ__WILL_SHARE 1
77 uint32_t flags; /* BIND_PIRQ__* */
78 /* OUT parameters. */
79 evtchn_port_t port;
80};
81
82/*
83 * EVTCHNOP_bind_ipi: Bind a local event channel to receive events.
84 * NOTES:
85 * 1. The allocated event channel is bound to the specified vcpu. The binding
86 * may not be changed.
87 */
88#define EVTCHNOP_bind_ipi 7
89struct evtchn_bind_ipi {
90 uint32_t vcpu;
91 /* OUT parameters. */
92 evtchn_port_t port;
93};
94
95/*
96 * EVTCHNOP_close: Close a local event channel <port>. If the channel is
97 * interdomain then the remote end is placed in the unbound state
98 * (EVTCHNSTAT_unbound), awaiting a new connection.
99 */
100#define EVTCHNOP_close 3
101struct evtchn_close {
102 /* IN parameters. */
103 evtchn_port_t port;
104};
105
106/*
107 * EVTCHNOP_send: Send an event to the remote end of the channel whose local
108 * endpoint is <port>.
109 */
110#define EVTCHNOP_send 4
111struct evtchn_send {
112 /* IN parameters. */
113 evtchn_port_t port;
114};
115
116/*
117 * EVTCHNOP_status: Get the current status of the communication channel which
118 * has an endpoint at <dom, port>.
119 * NOTES:
120 * 1. <dom> may be specified as DOMID_SELF.
121 * 2. Only a sufficiently-privileged domain may obtain the status of an event
122 * channel for which <dom> is not DOMID_SELF.
123 */
124#define EVTCHNOP_status 5
125struct evtchn_status {
126 /* IN parameters */
127 domid_t dom;
128 evtchn_port_t port;
129 /* OUT parameters */
130#define EVTCHNSTAT_closed 0 /* Channel is not in use. */
131#define EVTCHNSTAT_unbound 1 /* Channel is waiting interdom connection.*/
132#define EVTCHNSTAT_interdomain 2 /* Channel is connected to remote domain. */
133#define EVTCHNSTAT_pirq 3 /* Channel is bound to a phys IRQ line. */
134#define EVTCHNSTAT_virq 4 /* Channel is bound to a virtual IRQ line */
135#define EVTCHNSTAT_ipi 5 /* Channel is bound to a virtual IPI line */
136 uint32_t status;
137 uint32_t vcpu; /* VCPU to which this channel is bound. */
138 union {
139 struct {
140 domid_t dom;
141 } unbound; /* EVTCHNSTAT_unbound */
142 struct {
143 domid_t dom;
144 evtchn_port_t port;
145 } interdomain; /* EVTCHNSTAT_interdomain */
146 uint32_t pirq; /* EVTCHNSTAT_pirq */
147 uint32_t virq; /* EVTCHNSTAT_virq */
148 } u;
149};
150
151/*
152 * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
153 * event is pending.
154 * NOTES:
155 * 1. IPI- and VIRQ-bound channels always notify the vcpu that initialised
156 * the binding. This binding cannot be changed.
157 * 2. All other channels notify vcpu0 by default. This default is set when
158 * the channel is allocated (a port that is freed and subsequently reused
159 * has its binding reset to vcpu0).
160 */
161#define EVTCHNOP_bind_vcpu 8
162struct evtchn_bind_vcpu {
163 /* IN parameters. */
164 evtchn_port_t port;
165 uint32_t vcpu;
166};
167
168/*
169 * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver
170 * a notification to the appropriate VCPU if an event is pending.
171 */
172#define EVTCHNOP_unmask 9
173struct evtchn_unmask {
174 /* IN parameters. */
175 evtchn_port_t port;
176};
177
178struct evtchn_op {
179 uint32_t cmd; /* EVTCHNOP_* */
180 union {
181 struct evtchn_alloc_unbound alloc_unbound;
182 struct evtchn_bind_interdomain bind_interdomain;
183 struct evtchn_bind_virq bind_virq;
184 struct evtchn_bind_pirq bind_pirq;
185 struct evtchn_bind_ipi bind_ipi;
186 struct evtchn_close close;
187 struct evtchn_send send;
188 struct evtchn_status status;
189 struct evtchn_bind_vcpu bind_vcpu;
190 struct evtchn_unmask unmask;
191 } u;
192};
193DEFINE_GUEST_HANDLE_STRUCT(evtchn_op);
194
195#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
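
A sketch of issuing one of these operations through the HYPERVISOR_event_channel_op() wrapper from asm-i386/xen/hypercall.h; DOMID_SELF comes from xen/interface/xen.h and remote_domid is a placeholder:

static int alloc_unbound_port_sketch(domid_t remote_domid, evtchn_port_t *port)
{
	struct evtchn_alloc_unbound op = {
		.dom        = DOMID_SELF,	/* allocate the port locally */
		.remote_dom = remote_domid,	/* peer allowed to bind to it */
	};
	int err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &op);

	if (err == 0)
		*port = op.port;
	return err;
}
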
diff --git a/include/xen/interface/features.h b/include/xen/interface/features.h
new file mode 100644
index 000000000000..d73228d16488
--- /dev/null
+++ b/include/xen/interface/features.h
@@ -0,0 +1,43 @@
1/******************************************************************************
2 * features.h
3 *
4 * Feature flags, reported by XENVER_get_features.
5 *
6 * Copyright (c) 2006, Keir Fraser <keir@xensource.com>
7 */
8
9#ifndef __XEN_PUBLIC_FEATURES_H__
10#define __XEN_PUBLIC_FEATURES_H__
11
12/*
13 * If set, the guest does not need to write-protect its pagetables, and can
14 * update them via direct writes.
15 */
16#define XENFEAT_writable_page_tables 0
17
18/*
19 * If set, the guest does not need to write-protect its segment descriptor
20 * tables, and can update them via direct writes.
21 */
22#define XENFEAT_writable_descriptor_tables 1
23
24/*
25 * If set, translation between the guest's 'pseudo-physical' address space
26 * and the host's machine address space are handled by the hypervisor. In this
27 * mode the guest does not need to perform phys-to/from-machine translations
28 * when performing page table operations.
29 */
30#define XENFEAT_auto_translated_physmap 2
31
32/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
33#define XENFEAT_supervisor_mode_kernel 3
34
35/*
36 * If set, the guest does not need to allocate x86 PAE page directories
37 * below 4GB. This flag is usually implied by auto_translated_physmap.
38 */
39#define XENFEAT_pae_pgdir_above_4gb 4
40
41#define XENFEAT_NR_SUBMAPS 1
42
43#endif /* __XEN_PUBLIC_FEATURES_H__ */
diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h
new file mode 100644
index 000000000000..219049802cf2
--- /dev/null
+++ b/include/xen/interface/grant_table.h
@@ -0,0 +1,375 @@
1/******************************************************************************
2 * grant_table.h
3 *
4 * Interface for granting foreign access to page frames, and receiving
5 * page-ownership transfers.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to
9 * deal in the Software without restriction, including without limitation the
10 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
11 * sell copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
24 *
25 * Copyright (c) 2004, K A Fraser
26 */
27
28#ifndef __XEN_PUBLIC_GRANT_TABLE_H__
29#define __XEN_PUBLIC_GRANT_TABLE_H__
30
31
32/***********************************
33 * GRANT TABLE REPRESENTATION
34 */
35
36/* Some rough guidelines on accessing and updating grant-table entries
37 * in a concurrency-safe manner. For more information, Linux contains a
38 * reference implementation for guest OSes (arch/xen/kernel/grant_table.c).
39 *
40 * NB. WMB is a no-op on current-generation x86 processors. However, a
41 * compiler barrier will still be required.
42 *
43 * Introducing a valid entry into the grant table:
44 * 1. Write ent->domid.
45 * 2. Write ent->frame:
46 * GTF_permit_access: Frame to which access is permitted.
47 * GTF_accept_transfer: Pseudo-phys frame slot being filled by new
48 * frame, or zero if none.
49 * 3. Write memory barrier (WMB).
50 * 4. Write ent->flags, inc. valid type.
51 *
52 * Invalidating an unused GTF_permit_access entry:
53 * 1. flags = ent->flags.
54 * 2. Observe that !(flags & (GTF_reading|GTF_writing)).
55 * 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
56 * NB. No need for WMB as reuse of entry is control-dependent on success of
57 * step 3, and all architectures guarantee ordering of ctrl-dep writes.
58 *
59 * Invalidating an in-use GTF_permit_access entry:
60 * This cannot be done directly. Request assistance from the domain controller
61 * which can set a timeout on the use of a grant entry and take necessary
62 * action. (NB. This is not yet implemented!).
63 *
64 * Invalidating an unused GTF_accept_transfer entry:
65 * 1. flags = ent->flags.
66 * 2. Observe that !(flags & GTF_transfer_committed). [*]
67 * 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
68 * NB. No need for WMB as reuse of entry is control-dependent on success of
69 * step 3, and all architectures guarantee ordering of ctrl-dep writes.
70 * [*] If GTF_transfer_committed is set then the grant entry is 'committed'.
71 * The guest must /not/ modify the grant entry until the address of the
72 * transferred frame is written. It is safe for the guest to spin waiting
73 * for this to occur (detect by observing GTF_transfer_completed in
74 * ent->flags).
75 *
76 * Invalidating a committed GTF_accept_transfer entry:
77 * 1. Wait for (ent->flags & GTF_transfer_completed).
78 *
79 * Changing a GTF_permit_access from writable to read-only:
80 * Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing.
81 *
82 * Changing a GTF_permit_access from read-only to writable:
83 * Use SMP-safe bit-setting instruction.
84 */
85
86/*
87 * A grant table comprises a packed array of grant entries in one or more
88 * page frames shared between Xen and a guest.
89 * [XEN]: This field is written by Xen and read by the sharing guest.
90 * [GST]: This field is written by the guest and read by Xen.
91 */
92struct grant_entry {
93 /* GTF_xxx: various type and flag information. [XEN,GST] */
94 uint16_t flags;
95 /* The domain being granted foreign privileges. [GST] */
96 domid_t domid;
97 /*
98 * GTF_permit_access: Frame that @domid is allowed to map and access. [GST]
99 * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN]
100 */
101 uint32_t frame;
102};
103
104/*
105 * Type of grant entry.
106 * GTF_invalid: This grant entry grants no privileges.
107 * GTF_permit_access: Allow @domid to map/access @frame.
108 * GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
109 * to this guest. Xen writes the page number to @frame.
110 */
111#define GTF_invalid (0U<<0)
112#define GTF_permit_access (1U<<0)
113#define GTF_accept_transfer (2U<<0)
114#define GTF_type_mask (3U<<0)
115
116/*
117 * Subflags for GTF_permit_access.
118 * GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
119 * GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
120 * GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
121 */
122#define _GTF_readonly (2)
123#define GTF_readonly (1U<<_GTF_readonly)
124#define _GTF_reading (3)
125#define GTF_reading (1U<<_GTF_reading)
126#define _GTF_writing (4)
127#define GTF_writing (1U<<_GTF_writing)
128
129/*
130 * Subflags for GTF_accept_transfer:
131 * GTF_transfer_committed: Xen sets this flag to indicate that it is committed
132 * to transferring ownership of a page frame. When a guest sees this flag
133 * it must /not/ modify the grant entry until GTF_transfer_completed is
134 * set by Xen.
135 * GTF_transfer_completed: It is safe for the guest to spin-wait on this flag
136 * after reading GTF_transfer_committed. Xen will always write the frame
137 * address, followed by ORing this flag, in a timely manner.
138 */
139#define _GTF_transfer_committed (2)
140#define GTF_transfer_committed (1U<<_GTF_transfer_committed)
141#define _GTF_transfer_completed (3)
142#define GTF_transfer_completed (1U<<_GTF_transfer_completed)
143
144
145/***********************************
146 * GRANT TABLE QUERIES AND USES
147 */
148
149/*
150 * Reference to a grant entry in a specified domain's grant table.
151 */
152typedef uint32_t grant_ref_t;
153
154/*
155 * Handle to track a mapping created via a grant reference.
156 */
157typedef uint32_t grant_handle_t;
158
159/*
160 * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access
161 * by devices and/or host CPUs. If successful, <handle> is a tracking number
162 * that must be presented later to destroy the mapping(s). On error, <handle>
163 * is a negative status code.
164 * NOTES:
165 * 1. If GNTMAP_device_map is specified then <dev_bus_addr> is the address
166 * via which I/O devices may access the granted frame.
167 * 2. If GNTMAP_host_map is specified then a mapping will be added at
168 * either a host virtual address in the current address space, or at
169 * a PTE at the specified machine address. The type of mapping to
170 * perform is selected through the GNTMAP_contains_pte flag, and the
171 * address is specified in <host_addr>.
172 * 3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a
173 * host mapping is destroyed by other means then it is *NOT* guaranteed
174 * to be accounted to the correct grant reference!
175 */
176#define GNTTABOP_map_grant_ref 0
177struct gnttab_map_grant_ref {
178 /* IN parameters. */
179 uint64_t host_addr;
180 uint32_t flags; /* GNTMAP_* */
181 grant_ref_t ref;
182 domid_t dom;
183 /* OUT parameters. */
184 int16_t status; /* GNTST_* */
185 grant_handle_t handle;
186 uint64_t dev_bus_addr;
187};
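A hedged sketch of a single host mapping, assuming the HYPERVISOR_grant_table_op(cmd, uop, count) wrapper from the hypercall header added elsewhere in this series; the function name and its arguments are placeholders:

static int sketch_map_foreign_page(unsigned long vaddr, domid_t remote_domid,
				   grant_ref_t remote_ref,
				   grant_handle_t *handle)
{
	struct gnttab_map_grant_ref op = {
		.host_addr = vaddr,		/* map here in our address space */
		.flags     = GNTMAP_host_map,	/* host CPU mapping, read/write */
		.ref       = remote_ref,	/* grant issued by the peer */
		.dom       = remote_domid,
	};

	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
		BUG();			/* the hypercall itself should not fail */
	if (op.status != GNTST_okay)
		return op.status;	/* e.g. GNTST_bad_gntref */
	*handle = op.handle;		/* needed for GNTTABOP_unmap_grant_ref */
	return 0;
}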
188
189/*
190 * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
191 * tracked by <handle>. If <host_addr> or <dev_bus_addr> is zero, that
192 * field is ignored. If non-zero, they must refer to a device/host mapping
193 * that is tracked by <handle>.
194 * NOTES:
195 * 1. The call may fail in an undefined manner if either mapping is not
196 * tracked by <handle>.
197 * 2. After executing a batch of unmaps, it is guaranteed that no stale
198 * mappings will remain in the device or host TLBs.
199 */
200#define GNTTABOP_unmap_grant_ref 1
201struct gnttab_unmap_grant_ref {
202 /* IN parameters. */
203 uint64_t host_addr;
204 uint64_t dev_bus_addr;
205 grant_handle_t handle;
206 /* OUT parameters. */
207 int16_t status; /* GNTST_* */
208};
209
210/*
211 * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
212 * <nr_frames> pages. The frame addresses are written to the <frame_list>.
213 * Only <nr_frames> addresses are written, even if the table is larger.
214 * NOTES:
215 * 1. <dom> may be specified as DOMID_SELF.
216 * 2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
217 * 3. Xen may not support more than a single grant-table page per domain.
218 */
219#define GNTTABOP_setup_table 2
220struct gnttab_setup_table {
221 /* IN parameters. */
222 domid_t dom;
223 uint32_t nr_frames;
224 /* OUT parameters. */
225 int16_t status; /* GNTST_* */
226 ulong *frame_list;
227};
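A sketch of how a guest might fetch the frames backing its own grant table; note that in this Linux copy of the header frame_list is a plain pointer rather than a guest handle. The function name is hypothetical:

static int sketch_get_gnttab_frames(unsigned long *frames, unsigned int nr)
{
	struct gnttab_setup_table setup = {
		.dom        = DOMID_SELF,
		.nr_frames  = nr,
		.frame_list = frames,
	};

	if (HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1))
		BUG();
	if (setup.status != GNTST_okay)
		return setup.status;
	/* frames[0..nr-1] now hold the machine frames of the shared table. */
	return 0;
}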
228
229/*
230 * GNTTABOP_dump_table: Dump the contents of the grant table to the
231 * xen console. Debugging use only.
232 */
233#define GNTTABOP_dump_table 3
234struct gnttab_dump_table {
235 /* IN parameters. */
236 domid_t dom;
237 /* OUT parameters. */
238 int16_t status; /* GNTST_* */
239};
240
241/*
242 * GNTTABOP_transfer_grant_ref: Transfer <frame> to a foreign domain. The
243 * foreign domain has previously registered its interest in the transfer via
244 * <domid, ref>.
245 *
246 * Note that, even if the transfer fails, the specified page no longer belongs
247 * to the calling domain *unless* the error is GNTST_bad_page.
248 */
249#define GNTTABOP_transfer 4
250struct gnttab_transfer {
251 /* IN parameters. */
252 unsigned long mfn;
253 domid_t domid;
254 grant_ref_t ref;
255 /* OUT parameters. */
256 int16_t status;
257};
258
259
260/*
261 * GNTTABOP_copy: Hypervisor-based copy.
262 * The source and destination can be either MFNs or, for foreign domains,
263 * grant references. The foreign domain has to grant read/write access
264 * in its grant table.
265 *
266 * The flags specify whether the source and destination are an MFN
267 * or a grant reference.
268 *
269 * Note that this can also be used to copy data between two domains
270 * via a third party if the source and destination domains have previously
271 * granted appropriate access to their pages to the third party.
272 *
273 * source_offset specifies an offset in the source frame, dest_offset
274 * the offset in the target frame, and len specifies the number of
275 * bytes to be copied.
276 */
277
278#define _GNTCOPY_source_gref (0)
279#define GNTCOPY_source_gref (1<<_GNTCOPY_source_gref)
280#define _GNTCOPY_dest_gref (1)
281#define GNTCOPY_dest_gref (1<<_GNTCOPY_dest_gref)
282
283#define GNTTABOP_copy 5
284struct gnttab_copy {
285 /* IN parameters. */
286 struct {
287 union {
288 grant_ref_t ref;
289 unsigned long gmfn;
290 } u;
291 domid_t domid;
292 uint16_t offset;
293 } source, dest;
294 uint16_t len;
295 uint16_t flags; /* GNTCOPY_* */
296 /* OUT parameters. */
297 int16_t status;
298};
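A hedged sketch of one copy descriptor: len bytes are copied out of one of our own frames into a frame the peer has granted us write access to; the function name and arguments are placeholders:

static int sketch_copy_to_peer(unsigned long local_mfn, domid_t remote_domid,
			       grant_ref_t remote_gref, uint16_t len)
{
	struct gnttab_copy copy = {
		.source.u.gmfn = local_mfn,	/* our frame: plain MFN */
		.source.domid  = DOMID_SELF,
		.source.offset = 0,
		.dest.u.ref    = remote_gref,	/* peer's frame: grant ref */
		.dest.domid    = remote_domid,
		.dest.offset   = 0,
		.len           = len,
		.flags         = GNTCOPY_dest_gref, /* only dest is a gref */
	};

	HYPERVISOR_grant_table_op(GNTTABOP_copy, &copy, 1);
	return copy.status;	/* e.g. GNTST_bad_copy_arg on page overrun */
}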
299
300/*
301 * GNTTABOP_query_size: Query the current and maximum sizes of the shared
302 * grant table.
303 * NOTES:
304 * 1. <dom> may be specified as DOMID_SELF.
305 * 2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
306 */
307#define GNTTABOP_query_size 6
308struct gnttab_query_size {
309 /* IN parameters. */
310 domid_t dom;
311 /* OUT parameters. */
312 uint32_t nr_frames;
313 uint32_t max_nr_frames;
314 int16_t status; /* GNTST_* */
315};
316
317
318/*
319 * Bitfield values for update_pin_status.flags.
320 */
321 /* Map the grant entry for access by I/O devices. */
322#define _GNTMAP_device_map (0)
323#define GNTMAP_device_map (1<<_GNTMAP_device_map)
324 /* Map the grant entry for access by host CPUs. */
325#define _GNTMAP_host_map (1)
326#define GNTMAP_host_map (1<<_GNTMAP_host_map)
327 /* Accesses to the granted frame will be restricted to read-only access. */
328#define _GNTMAP_readonly (2)
329#define GNTMAP_readonly (1<<_GNTMAP_readonly)
330 /*
331 * GNTMAP_host_map subflag:
332 * 0 => The host mapping is usable only by the guest OS.
333 * 1 => The host mapping is usable by guest OS + current application.
334 */
335#define _GNTMAP_application_map (3)
336#define GNTMAP_application_map (1<<_GNTMAP_application_map)
337
338 /*
339 * GNTMAP_contains_pte subflag:
340 * 0 => This map request contains a host virtual address.
341 * 1 => This map request contains the machine address of the PTE to update.
342 */
343#define _GNTMAP_contains_pte (4)
344#define GNTMAP_contains_pte (1<<_GNTMAP_contains_pte)
345
346/*
347 * Values for error status returns. All errors are -ve.
348 */
349#define GNTST_okay (0) /* Normal return. */
350#define GNTST_general_error (-1) /* General undefined error. */
351#define GNTST_bad_domain (-2) /* Unrecognised domain id. */
352#define GNTST_bad_gntref (-3) /* Unrecognised or inappropriate gntref. */
353#define GNTST_bad_handle (-4) /* Unrecognised or inappropriate handle. */
354#define GNTST_bad_virt_addr (-5) /* Inappropriate virtual address to map. */
355#define GNTST_bad_dev_addr (-6) /* Inappropriate device address to unmap.*/
356#define GNTST_no_device_space (-7) /* Out of space in I/O MMU. */
357#define GNTST_permission_denied (-8) /* Not enough privilege for operation. */
358#define GNTST_bad_page (-9) /* Specified page was invalid for op. */
359#define GNTST_bad_copy_arg (-10) /* copy arguments cross page boundary */
360
361#define GNTTABOP_error_msgs { \
362 "okay", \
363 "undefined error", \
364 "unrecognised domain id", \
365 "invalid grant reference", \
366 "invalid mapping handle", \
367 "invalid virtual address", \
368 "invalid device address", \
369 "no spare translation slot in the I/O MMU", \
370 "permission denied", \
371 "bad page", \
372 "copy arguments cross page boundary" \
373}
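The string table lines up with the GNTST_* codes above (index 0 is GNTST_okay, index 1 corresponds to GNTST_general_error, and so on), so a small helper can turn a status into a message. This is only a sketch, not something the header itself provides:

static const char *sketch_gnttab_strerror(int status)
{
	static const char *const msgs[] = GNTTABOP_error_msgs;

	status = -status;		/* GNTST_* errors are negative */
	if (status < 0 || status >= (int)(sizeof(msgs) / sizeof(msgs[0])))
		return "unknown grant-table status";
	return msgs[status];
}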
374
375#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
new file mode 100644
index 000000000000..c2d1fa4dc1ee
--- /dev/null
+++ b/include/xen/interface/io/blkif.h
@@ -0,0 +1,94 @@
1/******************************************************************************
2 * blkif.h
3 *
4 * Unified block-device I/O interface for Xen guest OSes.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser
7 */
8
9#ifndef __XEN_PUBLIC_IO_BLKIF_H__
10#define __XEN_PUBLIC_IO_BLKIF_H__
11
12#include "ring.h"
13#include "../grant_table.h"
14
15/*
16 * Front->back notifications: When enqueuing a new request, sending a
17 * notification can be made conditional on req_event (i.e., the generic
18 * hold-off mechanism provided by the ring macros). Backends must set
19 * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
20 *
21 * Back->front notifications: When enqueuing a new response, sending a
22 * notification can be made conditional on rsp_event (i.e., the generic
23 * hold-off mechanism provided by the ring macros). Frontends must set
24 * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
25 */
26
27typedef uint16_t blkif_vdev_t;
28typedef uint64_t blkif_sector_t;
29
30/*
31 * REQUEST CODES.
32 */
33#define BLKIF_OP_READ 0
34#define BLKIF_OP_WRITE 1
35/*
36 * Recognised only if "feature-barrier" is present in backend xenbus info.
37 * The "feature-barrier" node contains a boolean indicating whether barrier
38 * requests are likely to succeed or fail. Either way, a barrier request
39 * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
40 * the underlying block-device hardware. The boolean simply indicates whether
41 * or not it is worthwhile for the frontend to attempt barrier requests.
42 * If a backend does not recognise BLKIF_OP_WRITE_BARRIER, it should *not*
43 * create the "feature-barrier" node!
44 */
45#define BLKIF_OP_WRITE_BARRIER 2
46
47/*
48 * Maximum scatter/gather segments per request.
49 * This is carefully chosen so that sizeof(struct blkif_sring) <= PAGE_SIZE.
50 * NB. This could be 12 if the ring indexes weren't stored in the same page.
51 */
52#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
53
54struct blkif_request {
55 uint8_t operation; /* BLKIF_OP_??? */
56 uint8_t nr_segments; /* number of segments */
57 blkif_vdev_t handle; /* only for read/write requests */
58 uint64_t id; /* private guest value, echoed in resp */
59 blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
60 struct blkif_request_segment {
61 grant_ref_t gref; /* reference to I/O buffer frame */
62 /* @first_sect: first sector in frame to transfer (inclusive). */
63 /* @last_sect: last sector in frame to transfer (inclusive). */
64 uint8_t first_sect, last_sect;
65 } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
66};
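To show how the segment fields fit together, here is a hedged sketch of a one-segment, whole-page read request placed on a blkif front ring (the ring macros are defined in ring.h, added later in this patch); the function name and arguments are placeholders:

static void sketch_queue_page_read(struct blkif_front_ring *ring,
				   blkif_vdev_t vdev, grant_ref_t gref,
				   uint64_t id, blkif_sector_t start_sector)
{
	struct blkif_request *req =
		RING_GET_REQUEST(ring, ring->req_prod_pvt);

	req->operation     = BLKIF_OP_READ;
	req->handle        = vdev;		/* virtual device to read */
	req->id            = id;		/* echoed in the response */
	req->sector_number = start_sector;	/* in 512-byte sectors */
	req->nr_segments   = 1;
	req->seg[0].gref       = gref;		/* grant covering the page */
	req->seg[0].first_sect = 0;
	req->seg[0].last_sect  = 7;		/* sectors 0..7: one 4K page */

	ring->req_prod_pvt++;	/* then push and notify as usual */
}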
67
68struct blkif_response {
69 uint64_t id; /* copied from request */
70 uint8_t operation; /* copied from request */
71 int16_t status; /* BLKIF_RSP_??? */
72};
73
74/*
75 * STATUS RETURN CODES.
76 */
77 /* Operation not supported (only happens on barrier writes). */
78#define BLKIF_RSP_EOPNOTSUPP -2
79 /* Operation failed for some unspecified reason (-EIO). */
80#define BLKIF_RSP_ERROR -1
81 /* Operation completed successfully. */
82#define BLKIF_RSP_OKAY 0
83
84/*
85 * Generate blkif ring structures and types.
86 */
87
88DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
89
90#define VDISK_CDROM 0x1
91#define VDISK_REMOVABLE 0x2
92#define VDISK_READONLY 0x4
93
94#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
diff --git a/include/xen/interface/io/console.h b/include/xen/interface/io/console.h
new file mode 100644
index 000000000000..e563de70f784
--- /dev/null
+++ b/include/xen/interface/io/console.h
@@ -0,0 +1,23 @@
1/******************************************************************************
2 * console.h
3 *
4 * Console I/O interface for Xen guest OSes.
5 *
6 * Copyright (c) 2005, Keir Fraser
7 */
8
9#ifndef __XEN_PUBLIC_IO_CONSOLE_H__
10#define __XEN_PUBLIC_IO_CONSOLE_H__
11
12typedef uint32_t XENCONS_RING_IDX;
13
14#define MASK_XENCONS_IDX(idx, ring) ((idx) & (sizeof(ring)-1))
15
16struct xencons_interface {
17 char in[1024];
18 char out[2048];
19 XENCONS_RING_IDX in_cons, in_prod;
20 XENCONS_RING_IDX out_cons, out_prod;
21};
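A minimal producer sketch for the output ring, assuming intf points at the shared console page; the event-channel notification that wakes the backend is left to the caller, and the function name is hypothetical:

static int sketch_console_write(struct xencons_interface *intf,
				const char *data, int len)
{
	int sent = 0;
	XENCONS_RING_IDX cons = intf->out_cons;
	XENCONS_RING_IDX prod = intf->out_prod;

	mb();			/* read the indexes before filling the ring */
	while (sent < len && (prod - cons) < sizeof(intf->out))
		intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
	wmb();			/* ring contents visible before producer bump */
	intf->out_prod = prod;
	return sent;		/* caller kicks the console event channel */
}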
22
23#endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */
diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h
new file mode 100644
index 000000000000..518481c95f18
--- /dev/null
+++ b/include/xen/interface/io/netif.h
@@ -0,0 +1,158 @@
1/******************************************************************************
2 * netif.h
3 *
4 * Unified network-device I/O interface for Xen guest OSes.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser
7 */
8
9#ifndef __XEN_PUBLIC_IO_NETIF_H__
10#define __XEN_PUBLIC_IO_NETIF_H__
11
12#include "ring.h"
13#include "../grant_table.h"
14
15/*
16 * Notifications after enqueuing any type of message should be conditional on
17 * the appropriate req_event or rsp_event field in the shared ring.
18 * If the client sends notification for rx requests then it should specify
19 * feature 'feature-rx-notify' via xenbus. Otherwise the backend will assume
20 * that it cannot safely queue packets (as it may not be kicked to send them).
21 */
22
23/*
24 * This is the 'wire' format for packets:
25 * Request 1: netif_tx_request -- NETTXF_* (any flags)
26 * [Request 2: netif_tx_extra] (only if request 1 has NETTXF_extra_info)
27 * [Request 3: netif_tx_extra] (only if request 2 has XEN_NETIF_EXTRA_FLAG_MORE)
28 * Request 4: netif_tx_request -- NETTXF_more_data
29 * Request 5: netif_tx_request -- NETTXF_more_data
30 * ...
31 * Request N: netif_tx_request -- 0
32 */
33
34/* Protocol checksum field is blank in the packet (hardware offload)? */
35#define _NETTXF_csum_blank (0)
36#define NETTXF_csum_blank (1U<<_NETTXF_csum_blank)
37
38/* Packet data has been validated against protocol checksum. */
39#define _NETTXF_data_validated (1)
40#define NETTXF_data_validated (1U<<_NETTXF_data_validated)
41
42/* Packet continues in the next request descriptor. */
43#define _NETTXF_more_data (2)
44#define NETTXF_more_data (1U<<_NETTXF_more_data)
45
46/* Packet to be followed by extra descriptor(s). */
47#define _NETTXF_extra_info (3)
48#define NETTXF_extra_info (1U<<_NETTXF_extra_info)
49
50struct xen_netif_tx_request {
51 grant_ref_t gref; /* Reference to buffer page */
52 uint16_t offset; /* Offset within buffer page */
53 uint16_t flags; /* NETTXF_* */
54 uint16_t id; /* Echoed in response message. */
55 uint16_t size; /* Packet size in bytes. */
56};
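A hedged sketch of the first (and, for a small single-page packet, only) slot, following the wire format described above; the function name and arguments are placeholders:

static void sketch_queue_tx(struct xen_netif_tx_front_ring *ring,
			    grant_ref_t gref, uint16_t data_offset,
			    uint16_t pkt_len, uint16_t slot_id)
{
	struct xen_netif_tx_request *tx =
		RING_GET_REQUEST(ring, ring->req_prod_pvt);

	tx->gref   = gref;		/* grant covering the packet's page */
	tx->offset = data_offset;	/* packet start within that page */
	tx->size   = pkt_len;		/* total packet length in bytes */
	tx->id     = slot_id;		/* echoed in xen_netif_tx_response */
	tx->flags  = NETTXF_csum_blank | NETTXF_data_validated;

	ring->req_prod_pvt++;		/* then push and notify the backend */
}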
57
58/* Types of netif_extra_info descriptors. */
59#define XEN_NETIF_EXTRA_TYPE_NONE (0) /* Never used - invalid */
60#define XEN_NETIF_EXTRA_TYPE_GSO (1) /* u.gso */
61#define XEN_NETIF_EXTRA_TYPE_MAX (2)
62
63/* netif_extra_info flags. */
64#define _XEN_NETIF_EXTRA_FLAG_MORE (0)
65#define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
66
67/* GSO types - only TCPv4 currently supported. */
68#define XEN_NETIF_GSO_TYPE_TCPV4 (1)
69
70/*
71 * This structure needs to fit within both netif_tx_request and
72 * netif_rx_response for compatibility.
73 */
74struct xen_netif_extra_info {
75 uint8_t type; /* XEN_NETIF_EXTRA_TYPE_* */
76 uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */
77
78 union {
79 struct {
80 /*
81 * Maximum payload size of each segment. For
82 * example, for TCP this is just the path MSS.
83 */
84 uint16_t size;
85
86 /*
87 * GSO type. This determines the protocol of
88 * the packet and any extra features required
89 * to segment the packet properly.
90 */
91 uint8_t type; /* XEN_NETIF_GSO_TYPE_* */
92
93 /* Future expansion. */
94 uint8_t pad;
95
96 /*
97 * GSO features. This specifies any extra GSO
98 * features required to process this packet,
99 * such as ECN support for TCPv4.
100 */
101 uint16_t features; /* XEN_NETIF_GSO_FEAT_* */
102 } gso;
103
104 uint16_t pad[3];
105 } u;
106};
107
108struct xen_netif_tx_response {
109 uint16_t id;
110 int16_t status; /* NETIF_RSP_* */
111};
112
113struct xen_netif_rx_request {
114 uint16_t id; /* Echoed in response message. */
115 grant_ref_t gref; /* Reference to incoming granted frame */
116};
117
118/* Packet data has been validated against protocol checksum. */
119#define _NETRXF_data_validated (0)
120#define NETRXF_data_validated (1U<<_NETRXF_data_validated)
121
122/* Protocol checksum field is blank in the packet (hardware offload)? */
123#define _NETRXF_csum_blank (1)
124#define NETRXF_csum_blank (1U<<_NETRXF_csum_blank)
125
126/* Packet continues in the next request descriptor. */
127#define _NETRXF_more_data (2)
128#define NETRXF_more_data (1U<<_NETRXF_more_data)
129
130/* Packet to be followed by extra descriptor(s). */
131#define _NETRXF_extra_info (3)
132#define NETRXF_extra_info (1U<<_NETRXF_extra_info)
133
134struct xen_netif_rx_response {
135 uint16_t id;
136 uint16_t offset; /* Offset in page of start of received packet */
137 uint16_t flags; /* NETRXF_* */
138 int16_t status; /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
139};
140
141/*
142 * Generate netif ring structures and types.
143 */
144
145DEFINE_RING_TYPES(xen_netif_tx,
146 struct xen_netif_tx_request,
147 struct xen_netif_tx_response);
148DEFINE_RING_TYPES(xen_netif_rx,
149 struct xen_netif_rx_request,
150 struct xen_netif_rx_response);
151
152#define NETIF_RSP_DROPPED -2
153#define NETIF_RSP_ERROR -1
154#define NETIF_RSP_OKAY 0
155/* No response: used for auxiliary requests (e.g., netif_tx_extra). */
156#define NETIF_RSP_NULL 1
157
158#endif
diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h
new file mode 100644
index 000000000000..e8cbf431c8cc
--- /dev/null
+++ b/include/xen/interface/io/ring.h
@@ -0,0 +1,260 @@
1/******************************************************************************
2 * ring.h
3 *
4 * Shared producer-consumer ring macros.
5 *
6 * Tim Deegan and Andrew Warfield November 2004.
7 */
8
9#ifndef __XEN_PUBLIC_IO_RING_H__
10#define __XEN_PUBLIC_IO_RING_H__
11
12typedef unsigned int RING_IDX;
13
14/* Round a 32-bit unsigned constant down to the nearest power of two. */
15#define __RD2(_x) (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1))
16#define __RD4(_x) (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2 : __RD2(_x))
17#define __RD8(_x) (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4 : __RD4(_x))
18#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8 : __RD8(_x))
19#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
20
21/*
22 * Calculate size of a shared ring, given the total available space for the
23 * ring and indexes (_sz), and the name tag of the request/response structure.
24 * A ring contains as many entries as will fit, rounded down to the nearest
25 * power of two (so we can mask with (size-1) to loop around).
26 */
27#define __RING_SIZE(_s, _sz) \
28 (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
29
30/*
31 * Macros to make the correct C datatypes for a new kind of ring.
32 *
33 * To make a new ring datatype, you need to have two message structures,
34 * let's say struct request, and struct response already defined.
35 *
36 * In a header where you want the ring datatype declared, you then do:
37 *
38 * DEFINE_RING_TYPES(mytag, struct request, struct response);
39 *
40 * These expand out to give you a set of types, as you can see below.
41 * The most important of these are:
42 *
43 * struct mytag_sring - The shared ring.
44 * struct mytag_front_ring - The 'front' half of the ring.
45 * struct mytag_back_ring - The 'back' half of the ring.
46 *
47 * To initialize a ring in your code you need to know the location and size
48 * of the shared memory area (PAGE_SIZE, for instance). To initialise
49 * the front half:
50 *
51 * struct mytag_front_ring front_ring;
52 * SHARED_RING_INIT((struct mytag_sring *)shared_page);
53 * FRONT_RING_INIT(&front_ring, (struct mytag_sring *)shared_page,
54 * PAGE_SIZE);
55 *
56 * Initializing the back follows similarly (note that only the front
57 * initializes the shared ring):
58 *
59 * struct mytag_back_ring back_ring;
60 * BACK_RING_INIT(&back_ring, (struct mytag_sring *)shared_page,
61 * PAGE_SIZE);
62 */
63
64#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \
65 \
66/* Shared ring entry */ \
67union __name##_sring_entry { \
68 __req_t req; \
69 __rsp_t rsp; \
70}; \
71 \
72/* Shared ring page */ \
73struct __name##_sring { \
74 RING_IDX req_prod, req_event; \
75 RING_IDX rsp_prod, rsp_event; \
76 uint8_t pad[48]; \
77 union __name##_sring_entry ring[1]; /* variable-length */ \
78}; \
79 \
80/* "Front" end's private variables */ \
81struct __name##_front_ring { \
82 RING_IDX req_prod_pvt; \
83 RING_IDX rsp_cons; \
84 unsigned int nr_ents; \
85 struct __name##_sring *sring; \
86}; \
87 \
88/* "Back" end's private variables */ \
89struct __name##_back_ring { \
90 RING_IDX rsp_prod_pvt; \
91 RING_IDX req_cons; \
92 unsigned int nr_ents; \
93 struct __name##_sring *sring; \
94};
95
96/*
97 * Macros for manipulating rings.
98 *
99 * FRONT_RING_whatever works on the "front end" of a ring: here
100 * requests are pushed on to the ring and responses taken off it.
101 *
102 * BACK_RING_whatever works on the "back end" of a ring: here
103 * requests are taken off the ring and responses put on.
104 *
105 * N.B. these macros do NO INTERLOCKS OR FLOW CONTROL.
106 * This is OK in 1-for-1 request-response situations where the
107 * requestor (front end) never has more than RING_SIZE()-1
108 * outstanding requests.
109 */
110
111/* Initialising empty rings */
112#define SHARED_RING_INIT(_s) do { \
113 (_s)->req_prod = (_s)->rsp_prod = 0; \
114 (_s)->req_event = (_s)->rsp_event = 1; \
115 memset((_s)->pad, 0, sizeof((_s)->pad)); \
116} while (0)
117
118#define FRONT_RING_INIT(_r, _s, __size) do { \
119 (_r)->req_prod_pvt = 0; \
120 (_r)->rsp_cons = 0; \
121 (_r)->nr_ents = __RING_SIZE(_s, __size); \
122 (_r)->sring = (_s); \
123} while (0)
124
125#define BACK_RING_INIT(_r, _s, __size) do { \
126 (_r)->rsp_prod_pvt = 0; \
127 (_r)->req_cons = 0; \
128 (_r)->nr_ents = __RING_SIZE(_s, __size); \
129 (_r)->sring = (_s); \
130} while (0)
131
132/* Initialize to existing shared indexes -- for recovery */
133#define FRONT_RING_ATTACH(_r, _s, __size) do { \
134 (_r)->sring = (_s); \
135 (_r)->req_prod_pvt = (_s)->req_prod; \
136 (_r)->rsp_cons = (_s)->rsp_prod; \
137 (_r)->nr_ents = __RING_SIZE(_s, __size); \
138} while (0)
139
140#define BACK_RING_ATTACH(_r, _s, __size) do { \
141 (_r)->sring = (_s); \
142 (_r)->rsp_prod_pvt = (_s)->rsp_prod; \
143 (_r)->req_cons = (_s)->req_prod; \
144 (_r)->nr_ents = __RING_SIZE(_s, __size); \
145} while (0)
146
147/* How big is this ring? */
148#define RING_SIZE(_r) \
149 ((_r)->nr_ents)
150
151/* Number of free requests (for use on front side only). */
152#define RING_FREE_REQUESTS(_r) \
153 (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons))
154
155/* Test if there is an empty slot available on the front ring.
156 * (This is only meaningful from the front.)
157 */
158#define RING_FULL(_r) \
159 (RING_FREE_REQUESTS(_r) == 0)
160
161/* Test if there are outstanding messages to be processed on a ring. */
162#define RING_HAS_UNCONSUMED_RESPONSES(_r) \
163 ((_r)->sring->rsp_prod - (_r)->rsp_cons)
164
165#define RING_HAS_UNCONSUMED_REQUESTS(_r) \
166 ({ \
167 unsigned int req = (_r)->sring->req_prod - (_r)->req_cons; \
168 unsigned int rsp = RING_SIZE(_r) - \
169 ((_r)->req_cons - (_r)->rsp_prod_pvt); \
170 req < rsp ? req : rsp; \
171 })
172
173/* Direct access to individual ring elements, by index. */
174#define RING_GET_REQUEST(_r, _idx) \
175 (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))
176
177#define RING_GET_RESPONSE(_r, _idx) \
178 (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))
179
180/* Loop termination condition: Would the specified index overflow the ring? */
181#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \
182 (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
183
184#define RING_PUSH_REQUESTS(_r) do { \
185 wmb(); /* back sees requests /before/ updated producer index */ \
186 (_r)->sring->req_prod = (_r)->req_prod_pvt; \
187} while (0)
188
189#define RING_PUSH_RESPONSES(_r) do { \
190 wmb(); /* front sees responses /before/ updated producer index */ \
191 (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \
192} while (0)
193
194/*
195 * Notification hold-off (req_event and rsp_event):
196 *
197 * When queueing requests or responses on a shared ring, it may not always be
198 * necessary to notify the remote end. For example, if requests are in flight
199 * in a backend, the front may be able to queue further requests without
200 * notifying the back (if the back checks for new requests when it queues
201 * responses).
202 *
203 * When enqueuing requests or responses:
204 *
205 * Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The second argument
206 * is a boolean return value. True indicates that the receiver requires an
207 * asynchronous notification.
208 *
209 * After dequeuing requests or responses (before sleeping the connection):
210 *
211 * Use RING_FINAL_CHECK_FOR_REQUESTS() or RING_FINAL_CHECK_FOR_RESPONSES().
212 * The second argument is a boolean return value. True indicates that there
213 * are pending messages on the ring (i.e., the connection should not be put
214 * to sleep).
215 *
216 * These macros will set the req_event/rsp_event field to trigger a
217 * notification on the very next message that is enqueued. If you want to
218 * create batches of work (i.e., only receive a notification after several
219 * messages have been enqueued) then you will need to create a customised
220 * version of the FINAL_CHECK macro in your own code, which sets the event
221 * field appropriately.
222 */
223
224#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \
225 RING_IDX __old = (_r)->sring->req_prod; \
226 RING_IDX __new = (_r)->req_prod_pvt; \
227 wmb(); /* back sees requests /before/ updated producer index */ \
228 (_r)->sring->req_prod = __new; \
229 mb(); /* back sees new requests /before/ we check req_event */ \
230 (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \
231 (RING_IDX)(__new - __old)); \
232} while (0)
233
234#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \
235 RING_IDX __old = (_r)->sring->rsp_prod; \
236 RING_IDX __new = (_r)->rsp_prod_pvt; \
237 wmb(); /* front sees responses /before/ updated producer index */ \
238 (_r)->sring->rsp_prod = __new; \
239 mb(); /* front sees new responses /before/ we check rsp_event */ \
240 (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \
241 (RING_IDX)(__new - __old)); \
242} while (0)
243
244#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \
245 (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
246 if (_work_to_do) break; \
247 (_r)->sring->req_event = (_r)->req_cons + 1; \
248 mb(); \
249 (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
250} while (0)
251
252#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \
253 (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
254 if (_work_to_do) break; \
255 (_r)->sring->rsp_event = (_r)->rsp_cons + 1; \
256 mb(); \
257 (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
258} while (0)
259
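Putting the pieces together, a hedged sketch of both halves of the protocol, assuming DEFINE_RING_TYPES(mytag, struct mytag_request, struct mytag_response) has been invoked; mytag, the ring variables and notify_remote() are placeholders for whatever the driver actually uses:

/* Front end: queue one request and honour the hold-off scheme above. */
static void sketch_produce(struct mytag_front_ring *front)
{
	struct mytag_request *req;
	int notify;

	req = RING_GET_REQUEST(front, front->req_prod_pvt);
	/* ... fill in *req ... */
	front->req_prod_pvt++;

	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(front, notify);
	if (notify)
		notify_remote();	/* placeholder event-channel kick */
}

/* Back end: drain requests, then re-arm req_event before sleeping. */
static void sketch_consume(struct mytag_back_ring *back)
{
	int work_to_do;

	do {
		while (RING_HAS_UNCONSUMED_REQUESTS(back)) {
			struct mytag_request *req =
				RING_GET_REQUEST(back, back->req_cons++);
			/* ... process *req, queue a response ... */
		}
		RING_FINAL_CHECK_FOR_REQUESTS(back, work_to_do);
	} while (work_to_do);
}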
260#endif /* __XEN_PUBLIC_IO_RING_H__ */
diff --git a/include/xen/interface/io/xenbus.h b/include/xen/interface/io/xenbus.h
new file mode 100644
index 000000000000..46508c7fa399
--- /dev/null
+++ b/include/xen/interface/io/xenbus.h
@@ -0,0 +1,44 @@
1/*****************************************************************************
2 * xenbus.h
3 *
4 * Xenbus protocol details.
5 *
6 * Copyright (C) 2005 XenSource Ltd.
7 */
8
9#ifndef _XEN_PUBLIC_IO_XENBUS_H
10#define _XEN_PUBLIC_IO_XENBUS_H
11
12/* The state of either end of the Xenbus, i.e. the current communication
13 status of initialisation across the bus. States here imply nothing about
14 the state of the connection between the driver and the kernel's device
15 layers. */
16enum xenbus_state
17{
18 XenbusStateUnknown = 0,
19 XenbusStateInitialising = 1,
20 XenbusStateInitWait = 2, /* Finished early
21 initialisation, but waiting
22 for information from the peer
23 or hotplug scripts. */
24 XenbusStateInitialised = 3, /* Initialised and waiting for a
25 connection from the peer. */
26 XenbusStateConnected = 4,
27 XenbusStateClosing = 5, /* The device is being closed
28 due to an error or an unplug
29 event. */
30 XenbusStateClosed = 6
31
32};
33
34#endif /* _XEN_PUBLIC_IO_XENBUS_H */
35
36/*
37 * Local variables:
38 * c-file-style: "linux"
39 * indent-tabs-mode: t
40 * c-indent-level: 8
41 * c-basic-offset: 8
42 * tab-width: 8
43 * End:
44 */
diff --git a/include/xen/interface/io/xs_wire.h b/include/xen/interface/io/xs_wire.h
new file mode 100644
index 000000000000..99fcffb372d1
--- /dev/null
+++ b/include/xen/interface/io/xs_wire.h
@@ -0,0 +1,87 @@
1/*
2 * Details of the "wire" protocol between Xen Store Daemon and client
3 * library or guest kernel.
4 * Copyright (C) 2005 Rusty Russell IBM Corporation
5 */
6
7#ifndef _XS_WIRE_H
8#define _XS_WIRE_H
9
10enum xsd_sockmsg_type
11{
12 XS_DEBUG,
13 XS_DIRECTORY,
14 XS_READ,
15 XS_GET_PERMS,
16 XS_WATCH,
17 XS_UNWATCH,
18 XS_TRANSACTION_START,
19 XS_TRANSACTION_END,
20 XS_INTRODUCE,
21 XS_RELEASE,
22 XS_GET_DOMAIN_PATH,
23 XS_WRITE,
24 XS_MKDIR,
25 XS_RM,
26 XS_SET_PERMS,
27 XS_WATCH_EVENT,
28 XS_ERROR,
29 XS_IS_DOMAIN_INTRODUCED
30};
31
32#define XS_WRITE_NONE "NONE"
33#define XS_WRITE_CREATE "CREATE"
34#define XS_WRITE_CREATE_EXCL "CREATE|EXCL"
35
36/* We hand errors as strings, for portability. */
37struct xsd_errors
38{
39 int errnum;
40 const char *errstring;
41};
42#define XSD_ERROR(x) { x, #x }
43static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
44 XSD_ERROR(EINVAL),
45 XSD_ERROR(EACCES),
46 XSD_ERROR(EEXIST),
47 XSD_ERROR(EISDIR),
48 XSD_ERROR(ENOENT),
49 XSD_ERROR(ENOMEM),
50 XSD_ERROR(ENOSPC),
51 XSD_ERROR(EIO),
52 XSD_ERROR(ENOTEMPTY),
53 XSD_ERROR(ENOSYS),
54 XSD_ERROR(EROFS),
55 XSD_ERROR(EBUSY),
56 XSD_ERROR(EAGAIN),
57 XSD_ERROR(EISCONN)
58};
59
60struct xsd_sockmsg
61{
62 uint32_t type; /* XS_??? */
63 uint32_t req_id;/* Request identifier, echoed in daemon's response. */
64 uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */
65 uint32_t len; /* Length of data following this. */
66
67 /* Generally followed by nul-terminated string(s). */
68};
69
70enum xs_watch_type
71{
72 XS_WATCH_PATH = 0,
73 XS_WATCH_TOKEN
74};
75
76/* Inter-domain shared memory communications. */
77#define XENSTORE_RING_SIZE 1024
78typedef uint32_t XENSTORE_RING_IDX;
79#define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE-1))
80struct xenstore_domain_interface {
81 char req[XENSTORE_RING_SIZE]; /* Requests to xenstore daemon. */
82 char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */
83 XENSTORE_RING_IDX req_cons, req_prod;
84 XENSTORE_RING_IDX rsp_cons, rsp_prod;
85};
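A hedged sketch of how a request is framed: the fixed-size xsd_sockmsg header goes into the req ring first, followed by msg.len bytes of payload (here a nul-terminated path for XS_READ). sketch_xs_put() is a placeholder for the usual masked-index copy loop, and the daemon still has to be notified via the xenstore event channel:

static void sketch_xs_read_request(struct xenstore_domain_interface *intf,
				   const char *path)
{
	struct xsd_sockmsg msg = {
		.type   = XS_READ,
		.req_id = 1,			/* echoed in the reply header */
		.tx_id  = 0,			/* not part of a transaction */
		.len    = strlen(path) + 1,	/* nul-terminated payload */
	};

	/* Placeholder: copy into intf->req[] using MASK_XENSTORE_IDX() and
	 * the req_prod/req_cons indexes, as in the console sketch earlier. */
	sketch_xs_put(intf, &msg, sizeof(msg));
	sketch_xs_put(intf, path, msg.len);
	/* ...then kick the event channel and wait for a reply in rsp[]. */
}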
86
87#endif /* _XS_WIRE_H */
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
new file mode 100644
index 000000000000..af36ead16817
--- /dev/null
+++ b/include/xen/interface/memory.h
@@ -0,0 +1,145 @@
1/******************************************************************************
2 * memory.h
3 *
4 * Memory reservation and information.
5 *
6 * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
7 */
8
9#ifndef __XEN_PUBLIC_MEMORY_H__
10#define __XEN_PUBLIC_MEMORY_H__
11
12/*
13 * Increase or decrease the specified domain's memory reservation. Returns a
14 * -ve errcode on failure, or the # extents successfully allocated or freed.
15 * arg == addr of struct xen_memory_reservation.
16 */
17#define XENMEM_increase_reservation 0
18#define XENMEM_decrease_reservation 1
19#define XENMEM_populate_physmap 6
20struct xen_memory_reservation {
21
22 /*
23 * XENMEM_increase_reservation:
24 * OUT: MFN (*not* GMFN) bases of extents that were allocated
25 * XENMEM_decrease_reservation:
26 * IN: GMFN bases of extents to free
27 * XENMEM_populate_physmap:
28 * IN: GPFN bases of extents to populate with memory
29 * OUT: GMFN bases of extents that were allocated
30 * (NB. This command also updates the mach_to_phys translation table)
31 */
32 GUEST_HANDLE(ulong) extent_start;
33
34 /* Number of extents, and size/alignment of each (2^extent_order pages). */
35 unsigned long nr_extents;
36 unsigned int extent_order;
37
38 /*
39 * Maximum # bits addressable by the user of the allocated region (e.g.,
40 * I/O devices often have a 32-bit limitation even in 64-bit systems). If
41 * zero then the user has no addressing restriction.
42 * This field is not used by XENMEM_decrease_reservation.
43 */
44 unsigned int address_bits;
45
46 /*
47 * Domain whose reservation is being changed.
48 * Unprivileged domains can specify only DOMID_SELF.
49 */
50 domid_t domid;
51
52};
53DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation);
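A hedged sketch of handing a single page back to Xen (the core of a balloon driver), assuming the HYPERVISOR_memory_op(cmd, arg) wrapper and the set_xen_guest_handle() helper from the interface headers added in this series; the function name is hypothetical:

static int sketch_release_page(unsigned long gmfn)
{
	struct xen_memory_reservation reservation = {
		.nr_extents   = 1,
		.extent_order = 0,	/* a single 4K page */
		.address_bits = 0,	/* ignored by decrease_reservation */
		.domid        = DOMID_SELF,
	};
	int rc;

	set_xen_guest_handle(reservation.extent_start, &gmfn);
	rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	return rc == 1 ? 0 : -1;	/* rc is the number of extents freed */
}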
54
55/*
56 * Returns the maximum machine frame number of mapped RAM in this system.
57 * This command always succeeds (it never returns an error code).
58 * arg == NULL.
59 */
60#define XENMEM_maximum_ram_page 2
61
62/*
63 * Returns the current or maximum memory reservation, in pages, of the
64 * specified domain (may be DOMID_SELF). Returns -ve errcode on failure.
65 * arg == addr of domid_t.
66 */
67#define XENMEM_current_reservation 3
68#define XENMEM_maximum_reservation 4
69
70/*
71 * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys
72 * mapping table. Architectures which do not have a m2p table do not implement
73 * this command.
74 * arg == addr of xen_machphys_mfn_list_t.
75 */
76#define XENMEM_machphys_mfn_list 5
77struct xen_machphys_mfn_list {
78 /*
79 * Size of the 'extent_start' array. Fewer entries will be filled if the
80 * machphys table is smaller than max_extents * 2MB.
81 */
82 unsigned int max_extents;
83
84 /*
85 * Pointer to buffer to fill with list of extent starts. If there are
86 * any large discontiguities in the machine address space, 2MB gaps in
87 * the machphys table will be represented by an MFN base of zero.
88 */
89 GUEST_HANDLE(ulong) extent_start;
90
91 /*
92 * Number of extents written to the above array. This will be smaller
93 * than 'max_extents' if the machphys table is smaller than max_e * 2MB.
94 */
95 unsigned int nr_extents;
96};
97DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
98
99/*
100 * Sets the GPFN at which a particular page appears in the specified guest's
101 * pseudophysical address space.
102 * arg == addr of xen_add_to_physmap_t.
103 */
104#define XENMEM_add_to_physmap 7
105struct xen_add_to_physmap {
106 /* Which domain to change the mapping for. */
107 domid_t domid;
108
109 /* Source mapping space. */
110#define XENMAPSPACE_shared_info 0 /* shared info page */
111#define XENMAPSPACE_grant_table 1 /* grant table page */
112 unsigned int space;
113
114 /* Index into source mapping space. */
115 unsigned long idx;
116
117 /* GPFN where the source mapping page should appear. */
118 unsigned long gpfn;
119};
120DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
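On an auto-translated guest, for instance, mapping the shared-info page into the pseudo-physical space might look like this sketch; shared_info_pfn is a placeholder chosen by the guest:

static void sketch_map_shared_info(unsigned long shared_info_pfn)
{
	struct xen_add_to_physmap xatp = {
		.domid = DOMID_SELF,
		.space = XENMAPSPACE_shared_info,
		.idx   = 0,			/* only one shared-info page */
		.gpfn  = shared_info_pfn,	/* where we want it to appear */
	};

	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
		BUG();
}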
121
122/*
123 * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
124 * code on failure. This call only works for auto-translated guests.
125 */
126#define XENMEM_translate_gpfn_list 8
127struct xen_translate_gpfn_list {
128 /* Which domain to translate for? */
129 domid_t domid;
130
131 /* Length of list. */
132 unsigned long nr_gpfns;
133
134 /* List of GPFNs to translate. */
135 GUEST_HANDLE(ulong) gpfn_list;
136
137 /*
138 * Output list to contain MFN translations. May be the same as the input
139 * list (in which case each input GPFN is overwritten with the output MFN).
140 */
141 GUEST_HANDLE(ulong) mfn_list;
142};
143DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
144
145#endif /* __XEN_PUBLIC_MEMORY_H__ */
diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h
new file mode 100644
index 000000000000..cd6939147cb6
--- /dev/null
+++ b/include/xen/interface/physdev.h
@@ -0,0 +1,145 @@
1/*
2 * Permission is hereby granted, free of charge, to any person obtaining a copy
3 * of this software and associated documentation files (the "Software"), to
4 * deal in the Software without restriction, including without limitation the
5 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
6 * sell copies of the Software, and to permit persons to whom the Software is
7 * furnished to do so, subject to the following conditions:
8 *
9 * The above copyright notice and this permission notice shall be included in
10 * all copies or substantial portions of the Software.
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
17 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
18 * DEALINGS IN THE SOFTWARE.
19 */
20
21#ifndef __XEN_PUBLIC_PHYSDEV_H__
22#define __XEN_PUBLIC_PHYSDEV_H__
23
24/*
25 * Prototype for this hypercall is:
26 * int physdev_op(int cmd, void *args)
27 * @cmd == PHYSDEVOP_??? (physdev operation).
28 * @args == Operation-specific extra arguments (NULL if none).
29 */
30
31/*
32 * Notify end-of-interrupt (EOI) for the specified IRQ.
33 * @arg == pointer to physdev_eoi structure.
34 */
35#define PHYSDEVOP_eoi 12
36struct physdev_eoi {
37 /* IN */
38 uint32_t irq;
39};
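A hedged sketch of acknowledging a PIRQ, assuming the HYPERVISOR_physdev_op(cmd, arg) wrapper; normally this is done only when the status query below reported XENIRQSTAT_needs_eoi for the line:

static void sketch_pirq_eoi(uint32_t irq)
{
	struct physdev_eoi eoi = { .irq = irq };
	int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);

	/* rc == 0 on success; a real handler would decide how to log errors. */
	(void)rc;
}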
40
41/*
42 * Query the status of an IRQ line.
43 * @arg == pointer to physdev_irq_status_query structure.
44 */
45#define PHYSDEVOP_irq_status_query 5
46struct physdev_irq_status_query {
47 /* IN */
48 uint32_t irq;
49 /* OUT */
50 uint32_t flags; /* XENIRQSTAT_* */
51};
52
53/* Need to call PHYSDEVOP_eoi when the IRQ has been serviced? */
54#define _XENIRQSTAT_needs_eoi (0)
55#define XENIRQSTAT_needs_eoi (1U<<_XENIRQSTAT_needs_eoi)
56
57/* IRQ shared by multiple guests? */
58#define _XENIRQSTAT_shared (1)
59#define XENIRQSTAT_shared (1U<<_XENIRQSTAT_shared)
60
61/*
62 * Set the current VCPU's I/O privilege level.
63 * @arg == pointer to physdev_set_iopl structure.
64 */
65#define PHYSDEVOP_set_iopl 6
66struct physdev_set_iopl {
67 /* IN */
68 uint32_t iopl;
69};
70
71/*
72 * Set the current VCPU's I/O-port permissions bitmap.
73 * @arg == pointer to physdev_set_iobitmap structure.
74 */
75#define PHYSDEVOP_set_iobitmap 7
76struct physdev_set_iobitmap {
77 /* IN */
78 uint8_t * bitmap;
79 uint32_t nr_ports;
80};
81
82/*
83 * Read or write an IO-APIC register.
84 * @arg == pointer to physdev_apic structure.
85 */
86#define PHYSDEVOP_apic_read 8
87#define PHYSDEVOP_apic_write 9
88struct physdev_apic {
89 /* IN */
90 unsigned long apic_physbase;
91 uint32_t reg;
92 /* IN or OUT */
93 uint32_t value;
94};
95
96/*
97 * Allocate or free a physical upcall vector for the specified IRQ line.
98 * @arg == pointer to physdev_irq structure.
99 */
100#define PHYSDEVOP_alloc_irq_vector 10
101#define PHYSDEVOP_free_irq_vector 11
102struct physdev_irq {
103 /* IN */
104 uint32_t irq;
105 /* IN or OUT */
106 uint32_t vector;
107};
108
109/*
110 * Argument to physdev_op_compat() hypercall. Superseded by the new physdev_op()
111 * hypercall since 0x00030202.
112 */
113struct physdev_op {
114 uint32_t cmd;
115 union {
116 struct physdev_irq_status_query irq_status_query;
117 struct physdev_set_iopl set_iopl;
118 struct physdev_set_iobitmap set_iobitmap;
119 struct physdev_apic apic_op;
120 struct physdev_irq irq_op;
121 } u;
122};
123
124/*
125 * Notify that some PIRQ-bound event channels have been unmasked.
126 * ** This command is obsolete since interface version 0x00030202 and is **
127 * ** unsupported by newer versions of Xen. **
128 */
129#define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4
130
131/*
132 * These all-capitals physdev operation names are superseded by the new names
133 * (defined above) since interface version 0x00030202.
134 */
135#define PHYSDEVOP_IRQ_STATUS_QUERY PHYSDEVOP_irq_status_query
136#define PHYSDEVOP_SET_IOPL PHYSDEVOP_set_iopl
137#define PHYSDEVOP_SET_IOBITMAP PHYSDEVOP_set_iobitmap
138#define PHYSDEVOP_APIC_READ PHYSDEVOP_apic_read
139#define PHYSDEVOP_APIC_WRITE PHYSDEVOP_apic_write
140#define PHYSDEVOP_ASSIGN_VECTOR PHYSDEVOP_alloc_irq_vector
141#define PHYSDEVOP_FREE_VECTOR PHYSDEVOP_free_irq_vector
142#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi
143#define PHYSDEVOP_IRQ_SHARED XENIRQSTAT_shared
144
145#endif /* __XEN_PUBLIC_PHYSDEV_H__ */
diff --git a/include/xen/interface/sched.h b/include/xen/interface/sched.h
new file mode 100644
index 000000000000..5fec575a800a
--- /dev/null
+++ b/include/xen/interface/sched.h
@@ -0,0 +1,77 @@
1/******************************************************************************
2 * sched.h
3 *
4 * Scheduler state interactions
5 *
6 * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
7 */
8
9#ifndef __XEN_PUBLIC_SCHED_H__
10#define __XEN_PUBLIC_SCHED_H__
11
12#include "event_channel.h"
13
14/*
15 * The prototype for this hypercall is:
16 * long sched_op_new(int cmd, void *arg)
17 * @cmd == SCHEDOP_??? (scheduler operation).
18 * @arg == Operation-specific extra argument(s), as described below.
19 *
20 * **NOTE**:
21 * Versions of Xen prior to 3.0.2 provide only the following legacy version
22 * of this hypercall, supporting only the commands yield, block and shutdown:
23 * long sched_op(int cmd, unsigned long arg)
24 * @cmd == SCHEDOP_??? (scheduler operation).
25 * @arg == 0 (SCHEDOP_yield and SCHEDOP_block)
26 * == SHUTDOWN_* code (SCHEDOP_shutdown)
27 */
28
29/*
30 * Voluntarily yield the CPU.
31 * @arg == NULL.
32 */
33#define SCHEDOP_yield 0
34
35/*
36 * Block execution of this VCPU until an event is received for processing.
37 * If called with event upcalls masked, this operation will atomically
38 * reenable event delivery and check for pending events before blocking the
39 * VCPU. This avoids a "wakeup waiting" race.
40 * @arg == NULL.
41 */
42#define SCHEDOP_block 1
43
44/*
45 * Halt execution of this domain (all VCPUs) and notify the system controller.
46 * @arg == pointer to sched_shutdown structure.
47 */
48#define SCHEDOP_shutdown 2
49struct sched_shutdown {
50 unsigned int reason; /* SHUTDOWN_* */
51};
52DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
53
54/*
55 * Poll a set of event-channel ports. Return when one or more are pending. An
56 * optional timeout may be specified.
57 * @arg == pointer to sched_poll structure.
58 */
59#define SCHEDOP_poll 3
60struct sched_poll {
61 GUEST_HANDLE(evtchn_port_t) ports;
62 unsigned int nr_ports;
63 uint64_t timeout;
64};
65DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
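A hedged sketch of blocking on a single event-channel port, assuming the HYPERVISOR_sched_op(cmd, arg) wrapper and set_xen_guest_handle(); a timeout of 0 is taken here to mean 'no timeout', and the function name is hypothetical:

static void sketch_poll_port(evtchn_port_t port)
{
	struct sched_poll poll = {
		.nr_ports = 1,
		.timeout  = 0,		/* assumed: 0 == wait indefinitely */
	};

	set_xen_guest_handle(poll.ports, &port);
	if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll))
		BUG();			/* returns once the port is pending */
}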
66
67/*
68 * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
69 * software to determine the appropriate action. For the most part, Xen does
70 * not care about the shutdown code.
71 */
72#define SHUTDOWN_poweroff 0 /* Domain exited normally. Clean up and kill. */
73#define SHUTDOWN_reboot 1 /* Clean up, kill, and then restart. */
74#define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */
75#define SHUTDOWN_crash 3 /* Tell controller we've crashed. */
76
77#endif /* __XEN_PUBLIC_SCHED_H__ */
diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
new file mode 100644
index 000000000000..ff61ea365997
--- /dev/null
+++ b/include/xen/interface/vcpu.h
@@ -0,0 +1,167 @@
1/******************************************************************************
2 * vcpu.h
3 *
4 * VCPU initialisation, query, and hotplug.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 *
24 * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
25 */
26
27#ifndef __XEN_PUBLIC_VCPU_H__
28#define __XEN_PUBLIC_VCPU_H__
29
30/*
31 * Prototype for this hypercall is:
32 * int vcpu_op(int cmd, int vcpuid, void *extra_args)
33 * @cmd == VCPUOP_??? (VCPU operation).
34 * @vcpuid == VCPU to operate on.
35 * @extra_args == Operation-specific extra arguments (NULL if none).
36 */
37
38/*
39 * Initialise a VCPU. Each VCPU can be initialised only once. A
40 * newly-initialised VCPU will not run until it is brought up by VCPUOP_up.
41 *
42 * @extra_arg == pointer to vcpu_guest_context structure containing initial
43 * state for the VCPU.
44 */
45#define VCPUOP_initialise 0
46
47/*
48 * Bring up a VCPU. This makes the VCPU runnable. This operation will fail
49 * if the VCPU has not been initialised (VCPUOP_initialise).
50 */
51#define VCPUOP_up 1
52
53/*
54 * Bring down a VCPU (i.e., make it non-runnable).
55 * There are a few caveats that callers should observe:
56 * 1. This operation may return, and VCPUOP_is_up may return false, before the
57 * VCPU stops running (i.e., the command is asynchronous). It is a good
58 * idea to ensure that the VCPU has entered a non-critical loop before
59 * bringing it down. Alternatively, this operation is guaranteed
60 * synchronous if invoked by the VCPU itself.
61 * 2. After a VCPU is initialised, there is currently no way to drop all its
62 * references to domain memory. Even a VCPU that is down still holds
63 * memory references via its pagetable base pointer and GDT. It is good
64 * practice to move a VCPU onto an 'idle' or default page table, LDT and
65 * GDT before bringing it down.
66 */
67#define VCPUOP_down 2
68
69/* Returns 1 if the given VCPU is up. */
70#define VCPUOP_is_up 3
71
72/*
73 * Return information about the state and running time of a VCPU.
74 * @extra_arg == pointer to vcpu_runstate_info structure.
75 */
76#define VCPUOP_get_runstate_info 4
77struct vcpu_runstate_info {
78 /* VCPU's current state (RUNSTATE_*). */
79 int state;
80 /* When was current state entered (system time, ns)? */
81 uint64_t state_entry_time;
82 /*
83 * Time spent in each RUNSTATE_* (ns). The sum of these times is
84 * guaranteed not to drift from system time.
85 */
86 uint64_t time[4];
87};
88
89/* VCPU is currently running on a physical CPU. */
90#define RUNSTATE_running 0
91
92/* VCPU is runnable, but not currently scheduled on any physical CPU. */
93#define RUNSTATE_runnable 1
94
95/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
96#define RUNSTATE_blocked 2
97
98/*
99 * VCPU is not runnable, but it is not blocked.
100 * This is a 'catch all' state for things like hotplug and pauses by the
101 * system administrator (or for critical sections in the hypervisor).
102 * RUNSTATE_blocked dominates this state (it is the preferred state).
103 */
104#define RUNSTATE_offline 3
105
106/*
107 * Register a shared memory area from which the guest may obtain its own
108 * runstate information without needing to execute a hypercall.
109 * Notes:
110 * 1. The registered address may be virtual or physical, depending on the
111 * platform. The virtual address should be registered on x86 systems.
112 * 2. Only one shared area may be registered per VCPU. The shared area is
113 * updated by the hypervisor each time the VCPU is scheduled. Thus
114 * runstate.state will always be RUNSTATE_running and
115 * runstate.state_entry_time will indicate the system time at which the
116 * VCPU was last scheduled to run.
117 * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
118 */
119#define VCPUOP_register_runstate_memory_area 5
120struct vcpu_register_runstate_memory_area {
121 union {
122 struct vcpu_runstate_info *v;
123 uint64_t p;
124 } addr;
125};
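A hedged sketch of registering a per-CPU runstate area, as a clocksource or stolen-time implementation might do; the per-CPU variable, the printk fallback and the function name are assumptions, not something this header mandates:

static DEFINE_PER_CPU(struct vcpu_runstate_info, sketch_runstate);

static void sketch_register_runstate(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(sketch_runstate, cpu);	/* virtual addr on x86 */
	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		printk(KERN_WARNING
		       "runstate area registration failed for cpu %d\n", cpu);
}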
126
127/*
128 * Set or stop a VCPU's periodic timer. Every VCPU has one periodic timer
129 * which can be set via these commands. Periods smaller than one millisecond
130 * may not be supported.
131 */
132#define VCPUOP_set_periodic_timer 6 /* arg == vcpu_set_periodic_timer_t */
133#define VCPUOP_stop_periodic_timer 7 /* arg == NULL */
134struct vcpu_set_periodic_timer {
135 uint64_t period_ns;
136};
137
138/*
139 * Set or stop a VCPU's single-shot timer. Every VCPU has one single-shot
140 * timer which can be set via these commands.
141 */
142#define VCPUOP_set_singleshot_timer 8 /* arg == vcpu_set_singleshot_timer_t */
143#define VCPUOP_stop_singleshot_timer 9 /* arg == NULL */
144struct vcpu_set_singleshot_timer {
145 uint64_t timeout_abs_ns;
146 uint32_t flags; /* VCPU_SSHOTTMR_??? */
147};
148
149/* Flags to VCPUOP_set_singleshot_timer. */
150 /* Require the timeout to be in the future (return -ETIME if it has already passed). */
151#define _VCPU_SSHOTTMR_future (0)
152#define VCPU_SSHOTTMR_future (1U << _VCPU_SSHOTTMR_future)
153
154/*
155 * Register a memory location in the guest address space for the
156 * vcpu_info structure. This allows the guest to place the vcpu_info
157 * structure in a convenient place, such as in a per-cpu data area.
158 * The pointer need not be page aligned, but the structure must not
159 * cross a page boundary.
160 */
161#define VCPUOP_register_vcpu_info 10 /* arg == struct vcpu_info */
162struct vcpu_register_vcpu_info {
163 uint32_t mfn; /* mfn of page to place vcpu_info */
164 uint32_t offset; /* offset within page */
165};
166
167#endif /* __XEN_PUBLIC_VCPU_H__ */
diff --git a/include/xen/interface/version.h b/include/xen/interface/version.h
new file mode 100644
index 000000000000..453235e923f0
--- /dev/null
+++ b/include/xen/interface/version.h
@@ -0,0 +1,60 @@
1/******************************************************************************
2 * version.h
3 *
4 * Xen version, type, and compile information.
5 *
6 * Copyright (c) 2005, Nguyen Anh Quynh <aquynh@gmail.com>
7 * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
8 */
9
10#ifndef __XEN_PUBLIC_VERSION_H__
11#define __XEN_PUBLIC_VERSION_H__
12
13/* NB. All ops return zero on success, except XENVER_version. */
14
15/* arg == NULL; returns major:minor (16:16). */
16#define XENVER_version 0
17
18/* arg == xen_extraversion_t. */
19#define XENVER_extraversion 1
20struct xen_extraversion {
21 char extraversion[16];
22};
23#define XEN_EXTRAVERSION_LEN (sizeof(struct xen_extraversion))
24
25/* arg == xen_compile_info_t. */
26#define XENVER_compile_info 2
27struct xen_compile_info {
28 char compiler[64];
29 char compile_by[16];
30 char compile_domain[32];
31 char compile_date[32];
32};
33
34#define XENVER_capabilities 3
35struct xen_capabilities_info {
36 char info[1024];
37};
38#define XEN_CAPABILITIES_INFO_LEN (sizeof(struct xen_capabilities_info))
39
40#define XENVER_changeset 4
41struct xen_changeset_info {
42 char info[64];
43};
44#define XEN_CHANGESET_INFO_LEN (sizeof(struct xen_changeset_info))
45
46#define XENVER_platform_parameters 5
47struct xen_platform_parameters {
48 unsigned long virt_start;
49};
50
51#define XENVER_get_features 6
52struct xen_feature_info {
53 unsigned int submap_idx; /* IN: which 32-bit submap to return */
54 uint32_t submap; /* OUT: 32-bit submap */
55};
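A hedged sketch of walking every submap into a flat per-bit array, roughly what a feature-setup helper would do; XENFEAT_NR_SUBMAPS comes from features.h, included just below, and the array name is a placeholder:

static u8 sketch_xen_features[XENFEAT_NR_SUBMAPS * 32];

static void sketch_read_features(void)
{
	struct xen_feature_info fi;
	int i, j;

	for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
		fi.submap_idx = i;
		if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
			break;
		for (j = 0; j < 32; j++)
			sketch_xen_features[i * 32 + j] =
				!!(fi.submap & (1U << j));
	}
}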
56
57/* Declares the features reported by XENVER_get_features. */
58#include "features.h"
59
60#endif /* __XEN_PUBLIC_VERSION_H__ */
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
new file mode 100644
index 000000000000..518a5bf79ed3
--- /dev/null
+++ b/include/xen/interface/xen.h
@@ -0,0 +1,447 @@
1/******************************************************************************
2 * xen.h
3 *
4 * Guest OS interface to Xen.
5 *
6 * Copyright (c) 2004, K A Fraser
7 */
8
9#ifndef __XEN_PUBLIC_XEN_H__
10#define __XEN_PUBLIC_XEN_H__
11
12#include <asm/xen/interface.h>
13
14/*
15 * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
16 */
17
18/*
19 * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5.
20 * EAX = return value
21 * (argument registers may be clobbered on return)
22 * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6.
23 * RAX = return value
24 * (argument registers not clobbered on return; RCX, R11 are)
25 */
26#define __HYPERVISOR_set_trap_table 0
27#define __HYPERVISOR_mmu_update 1
28#define __HYPERVISOR_set_gdt 2
29#define __HYPERVISOR_stack_switch 3
30#define __HYPERVISOR_set_callbacks 4
31#define __HYPERVISOR_fpu_taskswitch 5
32#define __HYPERVISOR_sched_op 6
33#define __HYPERVISOR_dom0_op 7
34#define __HYPERVISOR_set_debugreg 8
35#define __HYPERVISOR_get_debugreg 9
36#define __HYPERVISOR_update_descriptor 10
37#define __HYPERVISOR_memory_op 12
38#define __HYPERVISOR_multicall 13
39#define __HYPERVISOR_update_va_mapping 14
40#define __HYPERVISOR_set_timer_op 15
41#define __HYPERVISOR_event_channel_op_compat 16
42#define __HYPERVISOR_xen_version 17
43#define __HYPERVISOR_console_io 18
44#define __HYPERVISOR_physdev_op_compat 19
45#define __HYPERVISOR_grant_table_op 20
46#define __HYPERVISOR_vm_assist 21
47#define __HYPERVISOR_update_va_mapping_otherdomain 22
48#define __HYPERVISOR_iret 23 /* x86 only */
49#define __HYPERVISOR_vcpu_op 24
50#define __HYPERVISOR_set_segment_base 25 /* x86/64 only */
51#define __HYPERVISOR_mmuext_op 26
52#define __HYPERVISOR_acm_op 27
53#define __HYPERVISOR_nmi_op 28
54#define __HYPERVISOR_sched_op_new 29
55#define __HYPERVISOR_callback_op 30
56#define __HYPERVISOR_xenoprof_op 31
57#define __HYPERVISOR_event_channel_op 32
58#define __HYPERVISOR_physdev_op 33
59#define __HYPERVISOR_hvm_op 34
60
61/*
62 * VIRTUAL INTERRUPTS
63 *
64 * Virtual interrupts that a guest OS may receive from Xen.
65 */
66#define VIRQ_TIMER 0 /* Timebase update, and/or requested timeout. */
67#define VIRQ_DEBUG 1 /* Request guest to dump debug info. */
68#define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */
69#define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */
70#define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */
71#define NR_VIRQS 8
72
73/*
74 * MMU-UPDATE REQUESTS
75 *
76 * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
77 * A foreigndom (FD) can be specified (or DOMID_SELF for none).
78 * Where the FD has some effect, it is described below.
79 * ptr[1:0] specifies the appropriate MMU_* command.
80 *
81 * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
82 * Updates an entry in a page table. If updating an L1 table, and the new
83 * table entry is valid/present, the mapped frame must belong to the FD, if
84 * an FD has been specified. If attempting to map an I/O page then the
85 * caller assumes the privilege of the FD.
86 * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
87 * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
88 * ptr[:2] -- Machine address of the page-table entry to modify.
89 * val -- Value to write.
90 *
91 * ptr[1:0] == MMU_MACHPHYS_UPDATE:
92 * Updates an entry in the machine->pseudo-physical mapping table.
93 * ptr[:2] -- Machine address within the frame whose mapping to modify.
94 * The frame must belong to the FD, if one is specified.
95 * val -- Value to write into the mapping entry.
96 */
97#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */
98#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */
99
100/*
101 * MMU EXTENDED OPERATIONS
102 *
103 * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
104 * A foreigndom (FD) can be specified (or DOMID_SELF for none).
105 * Where the FD has some effect, it is described below.
106 *
107 * cmd: MMUEXT_(UN)PIN_*_TABLE
108 * mfn: Machine frame number to be (un)pinned as a p.t. page.
109 * The frame must belong to the FD, if one is specified.
110 *
111 * cmd: MMUEXT_NEW_BASEPTR
112 * mfn: Machine frame number of new page-table base to install in MMU.
113 *
114 * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]
115 * mfn: Machine frame number of new page-table base to install in MMU
116 * when in user space.
117 *
118 * cmd: MMUEXT_TLB_FLUSH_LOCAL
119 * No additional arguments. Flushes local TLB.
120 *
121 * cmd: MMUEXT_INVLPG_LOCAL
122 * linear_addr: Linear address to be flushed from the local TLB.
123 *
124 * cmd: MMUEXT_TLB_FLUSH_MULTI
125 * vcpumask: Pointer to bitmap of VCPUs to be flushed.
126 *
127 * cmd: MMUEXT_INVLPG_MULTI
128 * linear_addr: Linear address to be flushed.
129 * vcpumask: Pointer to bitmap of VCPUs to be flushed.
130 *
131 * cmd: MMUEXT_TLB_FLUSH_ALL
132 * No additional arguments. Flushes all VCPUs' TLBs.
133 *
134 * cmd: MMUEXT_INVLPG_ALL
135 * linear_addr: Linear address to be flushed from all VCPUs' TLBs.
136 *
137 * cmd: MMUEXT_FLUSH_CACHE
138 * No additional arguments. Writes back and flushes cache contents.
139 *
140 * cmd: MMUEXT_SET_LDT
141 * linear_addr: Linear address of LDT base (NB. must be page-aligned).
142 * nr_ents: Number of entries in LDT.
143 */
144#define MMUEXT_PIN_L1_TABLE 0
145#define MMUEXT_PIN_L2_TABLE 1
146#define MMUEXT_PIN_L3_TABLE 2
147#define MMUEXT_PIN_L4_TABLE 3
148#define MMUEXT_UNPIN_TABLE 4
149#define MMUEXT_NEW_BASEPTR 5
150#define MMUEXT_TLB_FLUSH_LOCAL 6
151#define MMUEXT_INVLPG_LOCAL 7
152#define MMUEXT_TLB_FLUSH_MULTI 8
153#define MMUEXT_INVLPG_MULTI 9
154#define MMUEXT_TLB_FLUSH_ALL 10
155#define MMUEXT_INVLPG_ALL 11
156#define MMUEXT_FLUSH_CACHE 12
157#define MMUEXT_SET_LDT 13
158#define MMUEXT_NEW_USER_BASEPTR 15
159
160#ifndef __ASSEMBLY__
161struct mmuext_op {
162 unsigned int cmd;
163 union {
164 /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
165 unsigned long mfn;
166 /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
167 unsigned long linear_addr;
168 } arg1;
169 union {
170 /* SET_LDT */
171 unsigned int nr_ents;
172 /* TLB_FLUSH_MULTI, INVLPG_MULTI */
173 void *vcpumask;
174 } arg2;
175};
176DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
177#endif
178
179/* These are passed as 'flags' to update_va_mapping. They can be ORed. */
180/* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap. */
181/* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer. */
182#define UVMF_NONE (0UL<<0) /* No flushing at all. */
183#define UVMF_TLB_FLUSH (1UL<<0) /* Flush entire TLB(s). */
184#define UVMF_INVLPG (2UL<<0) /* Flush only one entry. */
185#define UVMF_FLUSHTYPE_MASK (3UL<<0)
186#define UVMF_MULTI (0UL<<2) /* Flush subset of TLBs. */
187#define UVMF_LOCAL (0UL<<2) /* Flush local TLB. */
188#define UVMF_ALL (1UL<<2) /* Flush all TLBs. */
189
190/*
191 * Commands to HYPERVISOR_console_io().
192 */
193#define CONSOLEIO_write 0
194#define CONSOLEIO_read 1
195
196/*
197 * Commands to HYPERVISOR_vm_assist().
198 */
199#define VMASST_CMD_enable 0
200#define VMASST_CMD_disable 1
201#define VMASST_TYPE_4gb_segments 0
202#define VMASST_TYPE_4gb_segments_notify 1
203#define VMASST_TYPE_writable_pagetables 2
204#define VMASST_TYPE_pae_extended_cr3 3
205#define MAX_VMASST_TYPE 3
206
207#ifndef __ASSEMBLY__
208
209typedef uint16_t domid_t;
210
211/* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */
212#define DOMID_FIRST_RESERVED (0x7FF0U)
213
214/* DOMID_SELF is used in certain contexts to refer to oneself. */
215#define DOMID_SELF (0x7FF0U)
216
217/*
218 * DOMID_IO is used to restrict page-table updates to mapping I/O memory.
219 * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO
220 * is useful to ensure that no mappings to the OS's own heap are accidentally
221 * installed. (e.g., in Linux this could cause havoc as reference counts
222 * aren't adjusted on the I/O-mapping code path).
223 * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can
224 * be specified by any calling domain.
225 */
226#define DOMID_IO (0x7FF1U)
227
228/*
229 * DOMID_XEN is used to allow privileged domains to map restricted parts of
230 * Xen's heap space (e.g., the machine_to_phys table).
231 * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if
232 * the caller is privileged.
233 */
234#define DOMID_XEN (0x7FF2U)
235
236/*
237 * Send an array of these to HYPERVISOR_mmu_update().
238 * NB. The fields are natural pointer/address size for this architecture.
239 */
240struct mmu_update {
241 uint64_t ptr; /* Machine address of PTE. */
242 uint64_t val; /* New contents of PTE. */
243};
244DEFINE_GUEST_HANDLE_STRUCT(mmu_update);
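
/*
 * Illustrative sketch only (not part of this interface): batching two
 * (ptr, val) requests as described under MMU-UPDATE REQUESTS above.
 * HYPERVISOR_mmu_update() is assumed to be the usual per-arch hypercall
 * wrapper taking (req, count, success_count, domid); it is declared
 * elsewhere.
 */
static inline int example_mmu_batch(uint64_t pte_maddr, uint64_t new_pte,
				    uint64_t frame_maddr, uint64_t new_pfn)
{
	struct mmu_update req[2];
	int success_count;

	/* ptr[1:0] selects the command; the rest is the machine address. */
	req[0].ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
	req[0].val = new_pte;

	req[1].ptr = frame_maddr | MMU_MACHPHYS_UPDATE;
	req[1].val = new_pfn;

	return HYPERVISOR_mmu_update(req, 2, &success_count, DOMID_SELF);
}
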
245
246/*
247 * Send an array of these to HYPERVISOR_multicall().
248 * NB. The fields are natural register size for this architecture.
249 */
250struct multicall_entry {
251 unsigned long op;
252 long result;
253 unsigned long args[6];
254};
255DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);
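
/*
 * Illustrative sketch only: queueing two hypercalls in one multicall
 * batch. HYPERVISOR_multicall() is assumed to be the per-arch wrapper
 * taking the entry array and a count; the ops and arguments chosen here
 * are arbitrary examples.
 */
static inline int example_multicall(unsigned long new_ss, unsigned long new_esp)
{
	struct multicall_entry mc[2] = { };

	mc[0].op = __HYPERVISOR_fpu_taskswitch;
	mc[0].args[0] = 1;			/* set CR0.TS */

	mc[1].op = __HYPERVISOR_stack_switch;
	mc[1].args[0] = new_ss;
	mc[1].args[1] = new_esp;

	/* On return, each entry's 'result' holds that op's return value. */
	return HYPERVISOR_multicall(mc, 2);
}
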
256
257/*
258 * Event channel endpoints per domain:
259 * 1024 if a long is 32 bits; 4096 if a long is 64 bits.
260 */
261#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
262
263struct vcpu_time_info {
264 /*
265 * Updates to the following values are preceded and followed
266 * by an increment of 'version'. The guest can therefore
267 * detect updates by looking for changes to 'version'. If the
268 * least-significant bit of the version number is set then an
269 * update is in progress and the guest must wait to read a
270 * consistent set of values. The correct way to interact with
271 * the version number is similar to Linux's seqlock: see the
272 * implementations of read_seqbegin/read_seqretry.
273 */
274 uint32_t version;
275 uint32_t pad0;
276 uint64_t tsc_timestamp; /* TSC at last update of time vals. */
277 uint64_t system_time; /* Time, in nanosecs, since boot. */
278 /*
279 * Current system time:
280 * system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul
281 * CPU frequency (Hz):
282 * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
283 */
284 uint32_t tsc_to_system_mul;
285 int8_t tsc_shift;
286 int8_t pad1[3];
287}; /* 32 bytes */
288
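
/*
 * Illustrative sketch only: the seqlock-style read the comment above
 * prescribes. read_tsc() stands in for the raw TSC read, rmb() is the
 * usual read barrier, and tsc_to_system_mul is treated as a 32.32
 * fixed-point multiplier per the formula above; intermediate-overflow
 * handling is omitted for brevity.
 */
static inline uint64_t example_system_time(const volatile struct vcpu_time_info *t,
					   uint64_t (*read_tsc)(void))
{
	uint32_t version, mul;
	uint64_t delta, sys_time;
	int8_t shift;

	do {
		version = t->version;
		rmb();				/* version before payload */
		delta = read_tsc() - t->tsc_timestamp;
		sys_time = t->system_time;
		mul = t->tsc_to_system_mul;
		shift = t->tsc_shift;
		rmb();				/* payload before re-check */
	} while ((version & 1) || version != t->version);

	if (shift >= 0)
		delta <<= shift;
	else
		delta >>= -shift;

	return sys_time + ((delta * mul) >> 32);
}
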
289struct vcpu_info {
290 /*
291 * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
292 * a pending notification for a particular VCPU. It is then cleared
293 * by the guest OS /before/ checking for pending work, thus avoiding
294 * a set-and-check race. Note that the mask is only accessed by Xen
295 * on the CPU that is currently hosting the VCPU. This means that the
296 * pending and mask flags can be updated by the guest without special
297 * synchronisation (i.e., no need for the x86 LOCK prefix).
298 * This may seem suboptimal because if the pending flag is set by
299 * a different CPU then an IPI may be scheduled even when the mask
300 * is set. However, note:
301 * 1. The task of 'interrupt holdoff' is covered by the per-event-
302 * channel mask bits. A 'noisy' event that is continually being
303 * triggered can be masked at source at this very precise
304 * granularity.
305 * 2. The main purpose of the per-VCPU mask is therefore to restrict
306 * reentrant execution: whether for concurrency control, or to
307 * prevent unbounded stack usage. Whatever the purpose, we expect
308 * that the mask will be asserted only for short periods at a time,
309 * and so the likelihood of a 'spurious' IPI is suitably small.
310 * The mask is read before making an event upcall to the guest: a
311 * non-zero mask therefore guarantees that the VCPU will not receive
312 * an upcall activation. The mask is cleared when the VCPU requests
313 * to block: this avoids wakeup-waiting races.
314 */
315 uint8_t evtchn_upcall_pending;
316 uint8_t evtchn_upcall_mask;
317 unsigned long evtchn_pending_sel;
318 struct arch_vcpu_info arch;
319 struct vcpu_time_info time;
320}; /* 64 bytes (x86) */
321
322/*
323 * Xen/kernel shared data -- pointer provided in start_info.
324 * NB. We expect that this struct is smaller than a page.
325 */
326struct shared_info {
327 struct vcpu_info vcpu_info[MAX_VIRT_CPUS];
328
329 /*
330 * A domain can create "event channels" on which it can send and receive
331 * asynchronous event notifications. There are three classes of event that
332 * are delivered by this mechanism:
333 * 1. Bi-directional inter- and intra-domain connections. Domains must
334 * arrange out-of-band to set up a connection (usually by allocating
 335 * an unbound 'listener' port and advertising that via a storage service
336 * such as xenstore).
337 * 2. Physical interrupts. A domain with suitable hardware-access
338 * privileges can bind an event-channel port to a physical interrupt
339 * source.
340 * 3. Virtual interrupts ('events'). A domain can bind an event-channel
341 * port to a virtual interrupt source, such as the virtual-timer
342 * device or the emergency console.
343 *
344 * Event channels are addressed by a "port index". Each channel is
345 * associated with two bits of information:
346 * 1. PENDING -- notifies the domain that there is a pending notification
347 * to be processed. This bit is cleared by the guest.
348 * 2. MASK -- if this bit is clear then a 0->1 transition of PENDING
349 * will cause an asynchronous upcall to be scheduled. This bit is only
350 * updated by the guest. It is read-only within Xen. If a channel
351 * becomes pending while the channel is masked then the 'edge' is lost
352 * (i.e., when the channel is unmasked, the guest must manually handle
353 * pending notifications as no upcall will be scheduled by Xen).
354 *
355 * To expedite scanning of pending notifications, any 0->1 pending
356 * transition on an unmasked channel causes a corresponding bit in a
357 * per-vcpu selector word to be set. Each bit in the selector covers a
358 * 'C long' in the PENDING bitfield array.
359 */
360 unsigned long evtchn_pending[sizeof(unsigned long) * 8];
361 unsigned long evtchn_mask[sizeof(unsigned long) * 8];
362
363 /*
364 * Wallclock time: updated only by control software. Guests should base
365 * their gettimeofday() syscall on this wallclock-base value.
366 */
367 uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */
368 uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */
369 uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */
370
371 struct arch_shared_info arch;
372
373};
374
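
/*
 * Illustrative sketch only: the two-level scan described above. The
 * per-VCPU selector word is drained first, then each covered word of
 * evtchn_pending is scanned against evtchn_mask. xchg(), __ffs() and
 * clear_bit() are ordinary kernel primitives; handle_port() is a
 * hypothetical dispatch callback.
 */
static inline void example_scan_pending(struct shared_info *s,
					struct vcpu_info *v,
					void (*handle_port)(unsigned int port))
{
	unsigned long sel, pending;
	unsigned int word, bit;

	sel = xchg(&v->evtchn_pending_sel, 0);
	while (sel) {
		word = __ffs(sel);
		sel &= ~(1UL << word);

		pending = s->evtchn_pending[word] & ~s->evtchn_mask[word];
		while (pending) {
			bit = __ffs(pending);
			pending &= ~(1UL << bit);

			/* The guest clears PENDING before dispatch (the
			 * in-tree code uses a synchronised bitop here). */
			clear_bit(bit, &s->evtchn_pending[word]);
			handle_port(word * BITS_PER_LONG + bit);
		}
	}
}
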
375/*
376 * Start-of-day memory layout for the initial domain (DOM0):
377 * 1. The domain is started within contiguous virtual-memory region.
378 * 2. The contiguous region begins and ends on an aligned 4MB boundary.
379 * 3. The region start corresponds to the load address of the OS image.
380 * If the load address is not 4MB aligned then the address is rounded down.
 381 * 4. This is the order of bootstrap elements in the initial virtual region:
382 * a. relocated kernel image
383 * b. initial ram disk [mod_start, mod_len]
384 * c. list of allocated page frames [mfn_list, nr_pages]
385 * d. start_info_t structure [register ESI (x86)]
386 * e. bootstrap page tables [pt_base, CR3 (x86)]
387 * f. bootstrap stack [register ESP (x86)]
388 * 5. Bootstrap elements are packed together, but each is 4kB-aligned.
389 * 6. The initial ram disk may be omitted.
390 * 7. The list of page frames forms a contiguous 'pseudo-physical' memory
391 * layout for the domain. In particular, the bootstrap virtual-memory
392 * region is a 1:1 mapping to the first section of the pseudo-physical map.
393 * 8. All bootstrap elements are mapped read-writable for the guest OS. The
394 * only exception is the bootstrap page table, which is mapped read-only.
395 * 9. There is guaranteed to be at least 512kB padding after the final
396 * bootstrap element. If necessary, the bootstrap virtual region is
397 * extended by an extra 4MB to ensure this.
398 */
399
400#define MAX_GUEST_CMDLINE 1024
401struct start_info {
402 /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */
403 char magic[32]; /* "xen-<version>-<platform>". */
404 unsigned long nr_pages; /* Total pages allocated to this domain. */
405 unsigned long shared_info; /* MACHINE address of shared info struct. */
406 uint32_t flags; /* SIF_xxx flags. */
407 unsigned long store_mfn; /* MACHINE page number of shared page. */
408 uint32_t store_evtchn; /* Event channel for store communication. */
409 union {
410 struct {
411 unsigned long mfn; /* MACHINE page number of console page. */
412 uint32_t evtchn; /* Event channel for console page. */
413 } domU;
414 struct {
415 uint32_t info_off; /* Offset of console_info struct. */
416 uint32_t info_size; /* Size of console_info struct from start.*/
417 } dom0;
418 } console;
419 /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */
420 unsigned long pt_base; /* VIRTUAL address of page directory. */
421 unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. */
422 unsigned long mfn_list; /* VIRTUAL address of page-frame list. */
423 unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */
424 unsigned long mod_len; /* Size (bytes) of pre-loaded module. */
425 int8_t cmd_line[MAX_GUEST_CMDLINE];
426};
427
428/* These flags are passed in the 'flags' field of start_info_t. */
429#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
430#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
431
432typedef uint64_t cpumap_t;
433
434typedef uint8_t xen_domain_handle_t[16];
435
436/* Turn a plain number into a C unsigned long constant. */
437#define __mk_unsigned_long(x) x ## UL
438#define mk_unsigned_long(x) __mk_unsigned_long(x)
439
440#else /* __ASSEMBLY__ */
441
442/* In assembly code we cannot use C numeric constant suffixes. */
443#define mk_unsigned_long(x) x
444
445#endif /* !__ASSEMBLY__ */
446
447#endif /* __XEN_PUBLIC_XEN_H__ */
diff --git a/include/xen/page.h b/include/xen/page.h
new file mode 100644
index 000000000000..1df6c1930578
--- /dev/null
+++ b/include/xen/page.h
@@ -0,0 +1,179 @@
1#ifndef __XEN_PAGE_H
2#define __XEN_PAGE_H
3
4#include <linux/pfn.h>
5
6#include <asm/uaccess.h>
7
8#include <xen/features.h>
9
10#ifdef CONFIG_X86_PAE
11/* Xen machine address */
12typedef struct xmaddr {
13 unsigned long long maddr;
14} xmaddr_t;
15
16/* Xen pseudo-physical address */
17typedef struct xpaddr {
18 unsigned long long paddr;
19} xpaddr_t;
20#else
21/* Xen machine address */
22typedef struct xmaddr {
23 unsigned long maddr;
24} xmaddr_t;
25
26/* Xen pseudo-physical address */
27typedef struct xpaddr {
28 unsigned long paddr;
29} xpaddr_t;
30#endif
31
32#define XMADDR(x) ((xmaddr_t) { .maddr = (x) })
33#define XPADDR(x) ((xpaddr_t) { .paddr = (x) })
34
35/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
36#define INVALID_P2M_ENTRY (~0UL)
37#define FOREIGN_FRAME_BIT (1UL<<31)
38#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
39
40extern unsigned long *phys_to_machine_mapping;
41
42static inline unsigned long pfn_to_mfn(unsigned long pfn)
43{
44 if (xen_feature(XENFEAT_auto_translated_physmap))
45 return pfn;
46
47 return phys_to_machine_mapping[(unsigned int)(pfn)] &
48 ~FOREIGN_FRAME_BIT;
49}
50
51static inline int phys_to_machine_mapping_valid(unsigned long pfn)
52{
53 if (xen_feature(XENFEAT_auto_translated_physmap))
54 return 1;
55
56 return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
57}
58
59static inline unsigned long mfn_to_pfn(unsigned long mfn)
60{
61 unsigned long pfn;
62
63 if (xen_feature(XENFEAT_auto_translated_physmap))
64 return mfn;
65
66#if 0
67 if (unlikely((mfn >> machine_to_phys_order) != 0))
68 return max_mapnr;
69#endif
70
71 pfn = 0;
72 /*
73 * The array access can fail (e.g., device space beyond end of RAM).
74 * In such cases it doesn't matter what we return (we return garbage),
75 * but we must handle the fault without crashing!
76 */
77 __get_user(pfn, &machine_to_phys_mapping[mfn]);
78
79 return pfn;
80}
81
82static inline xmaddr_t phys_to_machine(xpaddr_t phys)
83{
84 unsigned offset = phys.paddr & ~PAGE_MASK;
85 return XMADDR(PFN_PHYS((u64)pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
86}
87
88static inline xpaddr_t machine_to_phys(xmaddr_t machine)
89{
90 unsigned offset = machine.maddr & ~PAGE_MASK;
91 return XPADDR(PFN_PHYS((u64)mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
92}
93
94/*
95 * We detect special mappings in one of two ways:
96 * 1. If the MFN is an I/O page then Xen will set the m2p entry
97 * to be outside our maximum possible pseudophys range.
98 * 2. If the MFN belongs to a different domain then we will certainly
99 * not have MFN in our p2m table. Conversely, if the page is ours,
100 * then we'll have p2m(m2p(MFN))==MFN.
101 * If we detect a special mapping then it doesn't have a 'struct page'.
102 * We force !pfn_valid() by returning an out-of-range pointer.
103 *
104 * NB. These checks require that, for any MFN that is not in our reservation,
105 * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
 106 * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
107 * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
108 *
109 * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
110 * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
111 * require. In all the cases we care about, the FOREIGN_FRAME bit is
112 * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
113 */
114static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
115{
116 extern unsigned long max_mapnr;
117 unsigned long pfn = mfn_to_pfn(mfn);
118 if ((pfn < max_mapnr)
119 && !xen_feature(XENFEAT_auto_translated_physmap)
120 && (phys_to_machine_mapping[pfn] != mfn))
121 return max_mapnr; /* force !pfn_valid() */
122 return pfn;
123}
124
125static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
126{
127 if (xen_feature(XENFEAT_auto_translated_physmap)) {
128 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
129 return;
130 }
131 phys_to_machine_mapping[pfn] = mfn;
132}
133
134/* VIRT <-> MACHINE conversion */
135#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
136#define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v))))
137#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
138
139#ifdef CONFIG_X86_PAE
140#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
141 (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
142
143static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
144{
145 pte_t pte;
146
147 pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) |
148 (pgprot_val(pgprot) >> 32);
149 pte.pte_high &= (__supported_pte_mask >> 32);
150 pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot));
151 pte.pte_low &= __supported_pte_mask;
152
153 return pte;
154}
155
156static inline unsigned long long pte_val_ma(pte_t x)
157{
158 return ((unsigned long long)x.pte_high << 32) | x.pte_low;
159}
160#define pmd_val_ma(v) ((v).pmd)
161#define pud_val_ma(v) ((v).pgd.pgd)
162#define __pte_ma(x) ((pte_t) { .pte_low = (x), .pte_high = (x)>>32 } )
163#define __pmd_ma(x) ((pmd_t) { (x) } )
164#else /* !X86_PAE */
165#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
166#define mfn_pte(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
167#define pte_val_ma(x) ((x).pte_low)
168#define pmd_val_ma(v) ((v).pud.pgd.pgd)
169#define __pte_ma(x) ((pte_t) { (x) } )
170#endif /* CONFIG_X86_PAE */
171
172#define pgd_val_ma(x) ((x).pgd)
173
174
175xmaddr_t arbitrary_virt_to_machine(unsigned long address);
176void make_lowmem_page_readonly(void *vaddr);
177void make_lowmem_page_readwrite(void *vaddr);
178
179#endif /* __XEN_PAGE_H */
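
To show how the helpers above compose, here is a hedged sketch (not part of the header): starting from a kernel-virtual buffer, a driver derives the pseudo-physical frame and then the machine frame that Xen and other domains actually address. example_share_page() and its pr_debug() output are hypothetical.

/* Illustrative only: go from a kernel buffer to the frames Xen wants. */
static void example_share_page(void *ring_page)
{
	unsigned long pfn = PFN_DOWN(__pa(ring_page));	/* pseudo-physical */
	unsigned long mfn = pfn_to_mfn(pfn);		/* machine frame */
	xmaddr_t maddr = virt_to_machine(ring_page);	/* full machine addr */

	/* A backend in another domain must be handed the MFN (or a grant
	 * reference to it), never the pseudo-physical or virtual address. */
	pr_debug("pfn %lx -> mfn %lx (maddr %llx)\n",
		 pfn, mfn, (unsigned long long)maddr.maddr);
}
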
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
new file mode 100644
index 000000000000..6f7c290651ae
--- /dev/null
+++ b/include/xen/xenbus.h
@@ -0,0 +1,234 @@
1/******************************************************************************
2 * xenbus.h
3 *
4 * Talks to Xen Store to figure out what devices we have.
5 *
6 * Copyright (C) 2005 Rusty Russell, IBM Corporation
7 * Copyright (C) 2005 XenSource Ltd.
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation; or, when distributed
12 * separately from the Linux kernel or incorporated into other
13 * software packages, subject to the following license:
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining a copy
16 * of this source file (the "Software"), to deal in the Software without
17 * restriction, including without limitation the rights to use, copy, modify,
18 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
19 * and to permit persons to whom the Software is furnished to do so, subject to
20 * the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 * IN THE SOFTWARE.
32 */
33
34#ifndef _XEN_XENBUS_H
35#define _XEN_XENBUS_H
36
37#include <linux/device.h>
38#include <linux/notifier.h>
39#include <linux/mutex.h>
40#include <linux/completion.h>
41#include <linux/init.h>
42#include <xen/interface/xen.h>
43#include <xen/interface/grant_table.h>
44#include <xen/interface/io/xenbus.h>
45#include <xen/interface/io/xs_wire.h>
46
47/* Register callback to watch this node. */
48struct xenbus_watch
49{
50 struct list_head list;
51
52 /* Path being watched. */
53 const char *node;
54
55 /* Callback (executed in a process context with no locks held). */
56 void (*callback)(struct xenbus_watch *,
57 const char **vec, unsigned int len);
58};
59
60
61/* A xenbus device. */
62struct xenbus_device {
63 const char *devicetype;
64 const char *nodename;
65 const char *otherend;
66 int otherend_id;
67 struct xenbus_watch otherend_watch;
68 struct device dev;
69 enum xenbus_state state;
70 struct completion down;
71};
72
73static inline struct xenbus_device *to_xenbus_device(struct device *dev)
74{
75 return container_of(dev, struct xenbus_device, dev);
76}
77
78struct xenbus_device_id
79{
80 /* .../device/<device_type>/<identifier> */
81 char devicetype[32]; /* General class of device. */
82};
83
84/* A xenbus driver. */
85struct xenbus_driver {
86 char *name;
87 struct module *owner;
88 const struct xenbus_device_id *ids;
89 int (*probe)(struct xenbus_device *dev,
90 const struct xenbus_device_id *id);
91 void (*otherend_changed)(struct xenbus_device *dev,
92 enum xenbus_state backend_state);
93 int (*remove)(struct xenbus_device *dev);
94 int (*suspend)(struct xenbus_device *dev);
95 int (*suspend_cancel)(struct xenbus_device *dev);
96 int (*resume)(struct xenbus_device *dev);
97 int (*uevent)(struct xenbus_device *, char **, int, char *, int);
98 struct device_driver driver;
99 int (*read_otherend_details)(struct xenbus_device *dev);
100};
101
102static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv)
103{
104 return container_of(drv, struct xenbus_driver, driver);
105}
106
107int __must_check __xenbus_register_frontend(struct xenbus_driver *drv,
108 struct module *owner,
109 const char *mod_name);
110
111static inline int __must_check
112xenbus_register_frontend(struct xenbus_driver *drv)
113{
114 WARN_ON(drv->owner != THIS_MODULE);
115 return __xenbus_register_frontend(drv, THIS_MODULE, KBUILD_MODNAME);
116}
117
118int __must_check __xenbus_register_backend(struct xenbus_driver *drv,
119 struct module *owner,
120 const char *mod_name);
121static inline int __must_check
122xenbus_register_backend(struct xenbus_driver *drv)
123{
124 WARN_ON(drv->owner != THIS_MODULE);
125 return __xenbus_register_backend(drv, THIS_MODULE, KBUILD_MODNAME);
126}
127
128void xenbus_unregister_driver(struct xenbus_driver *drv);
129
130struct xenbus_transaction
131{
132 u32 id;
133};
134
135/* Nil transaction ID. */
136#define XBT_NIL ((struct xenbus_transaction) { 0 })
137
138int __init xenbus_dev_init(void);
139
140char **xenbus_directory(struct xenbus_transaction t,
141 const char *dir, const char *node, unsigned int *num);
142void *xenbus_read(struct xenbus_transaction t,
143 const char *dir, const char *node, unsigned int *len);
144int xenbus_write(struct xenbus_transaction t,
145 const char *dir, const char *node, const char *string);
146int xenbus_mkdir(struct xenbus_transaction t,
147 const char *dir, const char *node);
148int xenbus_exists(struct xenbus_transaction t,
149 const char *dir, const char *node);
150int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node);
151int xenbus_transaction_start(struct xenbus_transaction *t);
152int xenbus_transaction_end(struct xenbus_transaction t, int abort);
153
154/* Single read and scanf: returns -errno or num scanned if > 0. */
155int xenbus_scanf(struct xenbus_transaction t,
156 const char *dir, const char *node, const char *fmt, ...)
157 __attribute__((format(scanf, 4, 5)));
158
159/* Single printf and write: returns -errno or 0. */
160int xenbus_printf(struct xenbus_transaction t,
161 const char *dir, const char *node, const char *fmt, ...)
162 __attribute__((format(printf, 4, 5)));
163
164/* Generic read function: NULL-terminated triples of name,
165 * sprintf-style type string, and pointer. Returns 0 or errno.*/
166int xenbus_gather(struct xenbus_transaction t, const char *dir, ...);
167
 168/* notifier routines for when the xenstore comes up */
169extern int xenstored_ready;
170int register_xenstore_notifier(struct notifier_block *nb);
171void unregister_xenstore_notifier(struct notifier_block *nb);
172
173int register_xenbus_watch(struct xenbus_watch *watch);
174void unregister_xenbus_watch(struct xenbus_watch *watch);
175void xs_suspend(void);
176void xs_resume(void);
177void xs_suspend_cancel(void);
178
179/* Used by xenbus_dev to borrow kernel's store connection. */
180void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg);
181
182struct work_struct;
183
184/* Prepare for domain suspend: then resume or cancel the suspend. */
185void xenbus_suspend(void);
186void xenbus_resume(void);
187void xenbus_probe(struct work_struct *);
188void xenbus_suspend_cancel(void);
189
190#define XENBUS_IS_ERR_READ(str) ({ \
191 if (!IS_ERR(str) && strlen(str) == 0) { \
192 kfree(str); \
193 str = ERR_PTR(-ERANGE); \
194 } \
195 IS_ERR(str); \
196})
197
198#define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE)
199
200int xenbus_watch_path(struct xenbus_device *dev, const char *path,
201 struct xenbus_watch *watch,
202 void (*callback)(struct xenbus_watch *,
203 const char **, unsigned int));
204int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch,
205 void (*callback)(struct xenbus_watch *,
206 const char **, unsigned int),
207 const char *pathfmt, ...)
208 __attribute__ ((format (printf, 4, 5)));
209
210int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
211int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
212int xenbus_map_ring_valloc(struct xenbus_device *dev,
213 int gnt_ref, void **vaddr);
214int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
215 grant_handle_t *handle, void *vaddr);
216
217int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr);
218int xenbus_unmap_ring(struct xenbus_device *dev,
219 grant_handle_t handle, void *vaddr);
220
221int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
222int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);
223int xenbus_free_evtchn(struct xenbus_device *dev, int port);
224
225enum xenbus_state xenbus_read_driver_state(const char *path);
226
227void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...);
228void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...);
229
230const char *xenbus_strstate(enum xenbus_state state);
231int xenbus_dev_is_online(struct xenbus_device *dev);
232int xenbus_frontend_closed(struct xenbus_device *dev);
233
234#endif /* _XEN_XENBUS_H */
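
A hedged skeleton of a frontend driver built on these declarations; the "example" device type, the "feature-foo" backend key and the examplefront names are hypothetical, and the usual module boilerplate (linux/module.h) is assumed.

/* Illustrative only: minimal frontend driver skeleton. */
static int examplefront_probe(struct xenbus_device *dev,
			      const struct xenbus_device_id *id)
{
	unsigned int feature = 0;

	/* "feature-foo" is a made-up backend property, read for show. */
	if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-foo",
			 "%u", &feature) < 0)
		feature = 0;

	return xenbus_switch_state(dev, XenbusStateInitialised);
}

static void examplefront_otherend_changed(struct xenbus_device *dev,
					  enum xenbus_state backend_state)
{
	if (backend_state == XenbusStateConnected)
		xenbus_switch_state(dev, XenbusStateConnected);
}

static const struct xenbus_device_id examplefront_ids[] = {
	{ "example" },		/* hypothetical device type */
	{ "" }
};

static struct xenbus_driver examplefront = {
	.name			= "examplefront",
	.owner			= THIS_MODULE,
	.ids			= examplefront_ids,
	.probe			= examplefront_probe,
	.otherend_changed	= examplefront_otherend_changed,
};

static int __init examplefront_init(void)
{
	return xenbus_register_frontend(&examplefront);
}
module_init(examplefront_init);
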
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b4796d850140..57e6448b171e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf)
516 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 516 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
517 envp[i] = NULL; 517 envp[i] = NULL;
518 518
519 call_usermodehelper(argv[0], argv, envp, 0); 519 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
520 kfree(pathbuf); 520 kfree(pathbuf);
521} 521}
522 522
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 4d32eb077179..78d365c524ed 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -119,9 +119,10 @@ struct subprocess_info {
119 char **argv; 119 char **argv;
120 char **envp; 120 char **envp;
121 struct key *ring; 121 struct key *ring;
122 int wait; 122 enum umh_wait wait;
123 int retval; 123 int retval;
124 struct file *stdin; 124 struct file *stdin;
125 void (*cleanup)(char **argv, char **envp);
125}; 126};
126 127
127/* 128/*
@@ -180,6 +181,14 @@ static int ____call_usermodehelper(void *data)
180 do_exit(0); 181 do_exit(0);
181} 182}
182 183
184void call_usermodehelper_freeinfo(struct subprocess_info *info)
185{
186 if (info->cleanup)
187 (*info->cleanup)(info->argv, info->envp);
188 kfree(info);
189}
190EXPORT_SYMBOL(call_usermodehelper_freeinfo);
191
183/* Keventd can't block, but this (a child) can. */ 192/* Keventd can't block, but this (a child) can. */
184static int wait_for_helper(void *data) 193static int wait_for_helper(void *data)
185{ 194{
@@ -216,8 +225,8 @@ static int wait_for_helper(void *data)
216 sub_info->retval = ret; 225 sub_info->retval = ret;
217 } 226 }
218 227
219 if (sub_info->wait < 0) 228 if (sub_info->wait == UMH_NO_WAIT)
220 kfree(sub_info); 229 call_usermodehelper_freeinfo(sub_info);
221 else 230 else
222 complete(sub_info->complete); 231 complete(sub_info->complete);
223 return 0; 232 return 0;
@@ -229,34 +238,122 @@ static void __call_usermodehelper(struct work_struct *work)
229 struct subprocess_info *sub_info = 238 struct subprocess_info *sub_info =
230 container_of(work, struct subprocess_info, work); 239 container_of(work, struct subprocess_info, work);
231 pid_t pid; 240 pid_t pid;
232 int wait = sub_info->wait; 241 enum umh_wait wait = sub_info->wait;
233 242
234 /* CLONE_VFORK: wait until the usermode helper has execve'd 243 /* CLONE_VFORK: wait until the usermode helper has execve'd
235 * successfully We need the data structures to stay around 244 * successfully We need the data structures to stay around
236 * until that is done. */ 245 * until that is done. */
237 if (wait) 246 if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT)
238 pid = kernel_thread(wait_for_helper, sub_info, 247 pid = kernel_thread(wait_for_helper, sub_info,
239 CLONE_FS | CLONE_FILES | SIGCHLD); 248 CLONE_FS | CLONE_FILES | SIGCHLD);
240 else 249 else
241 pid = kernel_thread(____call_usermodehelper, sub_info, 250 pid = kernel_thread(____call_usermodehelper, sub_info,
242 CLONE_VFORK | SIGCHLD); 251 CLONE_VFORK | SIGCHLD);
243 252
244 if (wait < 0) 253 switch (wait) {
245 return; 254 case UMH_NO_WAIT:
255 break;
246 256
247 if (pid < 0) { 257 case UMH_WAIT_PROC:
258 if (pid > 0)
259 break;
248 sub_info->retval = pid; 260 sub_info->retval = pid;
261 /* FALLTHROUGH */
262
263 case UMH_WAIT_EXEC:
249 complete(sub_info->complete); 264 complete(sub_info->complete);
250 } else if (!wait) 265 }
251 complete(sub_info->complete); 266}
267
268/**
269 * call_usermodehelper_setup - prepare to call a usermode helper
 270 * @path: path to usermode executable
 271 * @argv: arg vector for process
 272 * @envp: environment for process
273 *
274 * Returns either NULL on allocation failure, or a subprocess_info
275 * structure. This should be passed to call_usermodehelper_exec to
276 * exec the process and free the structure.
277 */
278struct subprocess_info *call_usermodehelper_setup(char *path,
279 char **argv, char **envp)
280{
281 struct subprocess_info *sub_info;
282 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
283 if (!sub_info)
284 goto out;
285
286 INIT_WORK(&sub_info->work, __call_usermodehelper);
287 sub_info->path = path;
288 sub_info->argv = argv;
289 sub_info->envp = envp;
290
291 out:
292 return sub_info;
252} 293}
294EXPORT_SYMBOL(call_usermodehelper_setup);
253 295
254/** 296/**
255 * call_usermodehelper_keys - start a usermode application 297 * call_usermodehelper_setkeys - set the session keys for usermode helper
256 * @path: pathname for the application 298 * @info: a subprocess_info returned by call_usermodehelper_setup
257 * @argv: null-terminated argument list 299 * @session_keyring: the session keyring for the process
258 * @envp: null-terminated environment list 300 */
259 * @session_keyring: session keyring for process (NULL for an empty keyring) 301void call_usermodehelper_setkeys(struct subprocess_info *info,
302 struct key *session_keyring)
303{
304 info->ring = session_keyring;
305}
306EXPORT_SYMBOL(call_usermodehelper_setkeys);
307
308/**
309 * call_usermodehelper_setcleanup - set a cleanup function
310 * @info: a subprocess_info returned by call_usermodehelper_setup
311 * @cleanup: a cleanup function
312 *
 313 * The cleanup function is called just before the subprocess_info is about to
 314 * be freed. This can be used for freeing the argv and envp. The
 315 * function must be runnable in either a process context or the
316 * context in which call_usermodehelper_exec is called.
317 */
318void call_usermodehelper_setcleanup(struct subprocess_info *info,
319 void (*cleanup)(char **argv, char **envp))
320{
321 info->cleanup = cleanup;
322}
323EXPORT_SYMBOL(call_usermodehelper_setcleanup);
324
325/**
326 * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
327 * @sub_info: a subprocess_info returned by call_usermodehelper_setup
328 * @filp: set to the write-end of a pipe
329 *
330 * This constructs a pipe, and sets the read end to be the stdin of the
331 * subprocess, and returns the write-end in *@filp.
332 */
333int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
334 struct file **filp)
335{
336 struct file *f;
337
338 f = create_write_pipe();
339 if (IS_ERR(f))
340 return PTR_ERR(f);
341 *filp = f;
342
343 f = create_read_pipe(f);
344 if (IS_ERR(f)) {
345 free_write_pipe(*filp);
346 return PTR_ERR(f);
347 }
348 sub_info->stdin = f;
349
350 return 0;
351}
352EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
353
354/**
355 * call_usermodehelper_exec - start a usermode application
 356 * @sub_info: information about the subprocess
260 * @wait: wait for the application to finish and return status. 357 * @wait: wait for the application to finish and return status.
261 * when -1 don't wait at all, but you get no useful error back when 358 * when -1 don't wait at all, but you get no useful error back when
262 * the program couldn't be exec'ed. This makes it safe to call 359 * the program couldn't be exec'ed. This makes it safe to call
@@ -265,81 +362,68 @@ static void __call_usermodehelper(struct work_struct *work)
265 * Runs a user-space application. The application is started 362 * Runs a user-space application. The application is started
266 * asynchronously if wait is not set, and runs as a child of keventd. 363 * asynchronously if wait is not set, and runs as a child of keventd.
267 * (ie. it runs with full root capabilities). 364 * (ie. it runs with full root capabilities).
268 *
269 * Must be called from process context. Returns a negative error code
270 * if program was not execed successfully, or 0.
271 */ 365 */
272int call_usermodehelper_keys(char *path, char **argv, char **envp, 366int call_usermodehelper_exec(struct subprocess_info *sub_info,
273 struct key *session_keyring, int wait) 367 enum umh_wait wait)
274{ 368{
275 DECLARE_COMPLETION_ONSTACK(done); 369 DECLARE_COMPLETION_ONSTACK(done);
276 struct subprocess_info *sub_info;
277 int retval; 370 int retval;
278 371
279 if (!khelper_wq) 372 if (sub_info->path[0] == '\0') {
280 return -EBUSY; 373 retval = 0;
281 374 goto out;
282 if (path[0] == '\0') 375 }
283 return 0;
284 376
285 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); 377 if (!khelper_wq) {
286 if (!sub_info) 378 retval = -EBUSY;
287 return -ENOMEM; 379 goto out;
380 }
288 381
289 INIT_WORK(&sub_info->work, __call_usermodehelper);
290 sub_info->complete = &done; 382 sub_info->complete = &done;
291 sub_info->path = path;
292 sub_info->argv = argv;
293 sub_info->envp = envp;
294 sub_info->ring = session_keyring;
295 sub_info->wait = wait; 383 sub_info->wait = wait;
296 384
297 queue_work(khelper_wq, &sub_info->work); 385 queue_work(khelper_wq, &sub_info->work);
298 if (wait < 0) /* task has freed sub_info */ 386 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
299 return 0; 387 return 0;
300 wait_for_completion(&done); 388 wait_for_completion(&done);
301 retval = sub_info->retval; 389 retval = sub_info->retval;
302 kfree(sub_info); 390
391 out:
392 call_usermodehelper_freeinfo(sub_info);
303 return retval; 393 return retval;
304} 394}
305EXPORT_SYMBOL(call_usermodehelper_keys); 395EXPORT_SYMBOL(call_usermodehelper_exec);
306 396
397/**
398 * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
399 * @path: path to usermode executable
400 * @argv: arg vector for process
401 * @envp: environment for process
402 * @filp: set to the write-end of a pipe
403 *
404 * This is a simple wrapper which executes a usermode-helper function
405 * with a pipe as stdin. It is implemented entirely in terms of
406 * lower-level call_usermodehelper_* functions.
407 */
307int call_usermodehelper_pipe(char *path, char **argv, char **envp, 408int call_usermodehelper_pipe(char *path, char **argv, char **envp,
308 struct file **filp) 409 struct file **filp)
309{ 410{
310 DECLARE_COMPLETION(done); 411 struct subprocess_info *sub_info;
311 struct subprocess_info sub_info = { 412 int ret;
312 .work = __WORK_INITIALIZER(sub_info.work,
313 __call_usermodehelper),
314 .complete = &done,
315 .path = path,
316 .argv = argv,
317 .envp = envp,
318 .retval = 0,
319 };
320 struct file *f;
321 413
322 if (!khelper_wq) 414 sub_info = call_usermodehelper_setup(path, argv, envp);
323 return -EBUSY; 415 if (sub_info == NULL)
416 return -ENOMEM;
324 417
325 if (path[0] == '\0') 418 ret = call_usermodehelper_stdinpipe(sub_info, filp);
326 return 0; 419 if (ret < 0)
420 goto out;
327 421
328 f = create_write_pipe(); 422 return call_usermodehelper_exec(sub_info, 1);
329 if (IS_ERR(f))
330 return PTR_ERR(f);
331 *filp = f;
332
333 f = create_read_pipe(f);
334 if (IS_ERR(f)) {
335 free_write_pipe(*filp);
336 return PTR_ERR(f);
337 }
338 sub_info.stdin = f;
339 423
340 queue_work(khelper_wq, &sub_info.work); 424 out:
341 wait_for_completion(&done); 425 call_usermodehelper_freeinfo(sub_info);
342 return sub_info.retval; 426 return ret;
343} 427}
344EXPORT_SYMBOL(call_usermodehelper_pipe); 428EXPORT_SYMBOL(call_usermodehelper_pipe);
345 429
diff --git a/kernel/sys.c b/kernel/sys.c
index 4d141ae3e802..18987c7f6add 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2286,3 +2286,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
2286 } 2286 }
2287 return err ? -EFAULT : 0; 2287 return err ? -EFAULT : 0;
2288} 2288}
2289
2290char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2291
2292static void argv_cleanup(char **argv, char **envp)
2293{
2294 argv_free(argv);
2295}
2296
2297/**
2298 * orderly_poweroff - Trigger an orderly system poweroff
2299 * @force: force poweroff if command execution fails
2300 *
2301 * This may be called from any context to trigger a system shutdown.
2302 * If the orderly shutdown fails, it will force an immediate shutdown.
2303 */
2304int orderly_poweroff(bool force)
2305{
2306 int argc;
2307 char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2308 static char *envp[] = {
2309 "HOME=/",
2310 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2311 NULL
2312 };
2313 int ret = -ENOMEM;
2314 struct subprocess_info *info;
2315
2316 if (argv == NULL) {
2317 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2318 __func__, poweroff_cmd);
2319 goto out;
2320 }
2321
2322 info = call_usermodehelper_setup(argv[0], argv, envp);
2323 if (info == NULL) {
2324 argv_free(argv);
2325 goto out;
2326 }
2327
2328 call_usermodehelper_setcleanup(info, argv_cleanup);
2329
2330 ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
2331
2332 out:
2333 if (ret && force) {
2334 printk(KERN_WARNING "Failed to start orderly shutdown: "
2335 "forcing the issue\n");
2336
2337 /* I guess this should try to kick off some daemon to
2338 sync and poweroff asap. Or not even bother syncing
2339 if we're doing an emergency shutdown? */
2340 emergency_sync();
2341 kernel_power_off();
2342 }
2343
2344 return ret;
2345}
2346EXPORT_SYMBOL_GPL(orderly_poweroff);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7063ebc6db05..44a1d699aad7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -46,6 +46,7 @@
46#include <linux/syscalls.h> 46#include <linux/syscalls.h>
47#include <linux/nfs_fs.h> 47#include <linux/nfs_fs.h>
48#include <linux/acpi.h> 48#include <linux/acpi.h>
49#include <linux/reboot.h>
49 50
50#include <asm/uaccess.h> 51#include <asm/uaccess.h>
51#include <asm/processor.h> 52#include <asm/processor.h>
@@ -705,6 +706,15 @@ static ctl_table kern_table[] = {
705 .proc_handler = &proc_dointvec, 706 .proc_handler = &proc_dointvec,
706 }, 707 },
707#endif 708#endif
709 {
710 .ctl_name = CTL_UNNUMBERED,
711 .procname = "poweroff_cmd",
712 .data = &poweroff_cmd,
713 .maxlen = POWEROFF_CMD_PATH_LEN,
714 .mode = 0644,
715 .proc_handler = &proc_dostring,
716 .strategy = &sysctl_string,
717 },
708 718
709 { .ctl_name = 0 } 719 { .ctl_name = 0 }
710}; 720};
diff --git a/lib/Makefile b/lib/Makefile
index da68b2ca0606..614966387402 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -5,7 +5,7 @@
5lib-y := ctype.o string.o vsprintf.o cmdline.o \ 5lib-y := ctype.o string.o vsprintf.o cmdline.o \
6 rbtree.o radix-tree.o dump_stack.o \ 6 rbtree.o radix-tree.o dump_stack.o \
7 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \ 7 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
8 sha1.o irq_regs.o reciprocal_div.o 8 sha1.o irq_regs.o reciprocal_div.o argv_split.o
9 9
10lib-$(CONFIG_MMU) += ioremap.o 10lib-$(CONFIG_MMU) += ioremap.o
11lib-$(CONFIG_SMP) += cpumask.o 11lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/argv_split.c b/lib/argv_split.c
new file mode 100644
index 000000000000..4096ed42f490
--- /dev/null
+++ b/lib/argv_split.c
@@ -0,0 +1,105 @@
1/*
2 * Helper function for splitting a string into an argv-like array.
3 */
4
5#include <linux/kernel.h>
6#include <linux/ctype.h>
7#include <linux/bug.h>
8
9static const char *skip_sep(const char *cp)
10{
11 while (*cp && isspace(*cp))
12 cp++;
13
14 return cp;
15}
16
17static const char *skip_arg(const char *cp)
18{
19 while (*cp && !isspace(*cp))
20 cp++;
21
22 return cp;
23}
24
25static int count_argc(const char *str)
26{
27 int count = 0;
28
29 while (*str) {
30 str = skip_sep(str);
31 if (*str) {
32 count++;
33 str = skip_arg(str);
34 }
35 }
36
37 return count;
38}
39
40/**
41 * argv_free - free an argv
 42 * @argv: the argument vector to be freed
43 *
44 * Frees an argv and the strings it points to.
45 */
46void argv_free(char **argv)
47{
48 char **p;
49 for (p = argv; *p; p++)
50 kfree(*p);
51
52 kfree(argv);
53}
54EXPORT_SYMBOL(argv_free);
55
56/**
57 * argv_split - split a string at whitespace, returning an argv
58 * @gfp: the GFP mask used to allocate memory
59 * @str: the string to be split
60 * @argcp: returned argument count
61 *
62 * Returns an array of pointers to strings which are split out from
63 * @str. This is performed by strictly splitting on white-space; no
64 * quote processing is performed. Multiple whitespace characters are
65 * considered to be a single argument separator. The returned array
66 * is always NULL-terminated. Returns NULL on memory allocation
67 * failure.
68 */
69char **argv_split(gfp_t gfp, const char *str, int *argcp)
70{
71 int argc = count_argc(str);
72 char **argv = kzalloc(sizeof(*argv) * (argc+1), gfp);
73 char **argvp;
74
75 if (argv == NULL)
76 goto out;
77
78 *argcp = argc;
79 argvp = argv;
80
81 while (*str) {
82 str = skip_sep(str);
83
84 if (*str) {
85 const char *p = str;
86 char *t;
87
88 str = skip_arg(str);
89
90 t = kstrndup(p, str-p, gfp);
91 if (t == NULL)
92 goto fail;
93 *argvp++ = t;
94 }
95 }
96 *argvp = NULL;
97
98 out:
99 return argv;
100
101 fail:
102 argv_free(argv);
103 return NULL;
104}
105EXPORT_SYMBOL(argv_split);
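
A minimal usage sketch for the helpers above; example_argv() and its pr_debug() output are hypothetical.

/* Illustrative only: split a command line, use it, and free it. */
static int example_argv(const char *cmdline)
{
	int argc;
	char **argv = argv_split(GFP_KERNEL, cmdline, &argc);

	if (!argv)
		return -ENOMEM;

	pr_debug("argc=%d argv[0]=%s\n", argc, argc ? argv[0] : "(none)");

	argv_free(argv);
	return 0;
}
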
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 12e311dc664c..bd5ecbbafab1 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -208,7 +208,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
208 argv [0] = uevent_helper; 208 argv [0] = uevent_helper;
209 argv [1] = (char *)subsystem; 209 argv [1] = (char *)subsystem;
210 argv [2] = NULL; 210 argv [2] = NULL;
211 call_usermodehelper (argv[0], argv, envp, 0); 211 call_usermodehelper (argv[0], argv, envp, UMH_WAIT_EXEC);
212 } 212 }
213 213
214exit: 214exit:
diff --git a/mm/util.c b/mm/util.c
index 78f3783bdcc8..bf340d806868 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -6,7 +6,6 @@
6 6
7/** 7/**
8 * kstrdup - allocate space for and copy an existing string 8 * kstrdup - allocate space for and copy an existing string
9 *
10 * @s: the string to duplicate 9 * @s: the string to duplicate
11 * @gfp: the GFP mask used in the kmalloc() call when allocating memory 10 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
12 */ 11 */
@@ -27,6 +26,30 @@ char *kstrdup(const char *s, gfp_t gfp)
27EXPORT_SYMBOL(kstrdup); 26EXPORT_SYMBOL(kstrdup);
28 27
29/** 28/**
29 * kstrndup - allocate space for and copy an existing string
30 * @s: the string to duplicate
31 * @max: read at most @max chars from @s
32 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
33 */
34char *kstrndup(const char *s, size_t max, gfp_t gfp)
35{
36 size_t len;
37 char *buf;
38
39 if (!s)
40 return NULL;
41
42 len = strnlen(s, max);
43 buf = kmalloc_track_caller(len+1, gfp);
44 if (buf) {
45 memcpy(buf, s, len);
46 buf[len] = '\0';
47 }
48 return buf;
49}
50EXPORT_SYMBOL(kstrndup);
51
52/**
30 * kmemdup - duplicate region of memory 53 * kmemdup - duplicate region of memory
31 * 54 *
32 * @src: memory region to duplicate 55 * @src: memory region to duplicate
@@ -80,7 +103,6 @@ EXPORT_SYMBOL(krealloc);
80 103
81/* 104/*
82 * strndup_user - duplicate an existing string from user space 105 * strndup_user - duplicate an existing string from user space
83 *
84 * @s: The string to duplicate 106 * @s: The string to duplicate
85 * @n: Maximum number of bytes to copy, including the trailing NUL. 107 * @n: Maximum number of bytes to copy, including the trailing NUL.
86 */ 108 */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8e05a11155c9..3130c343088f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -767,3 +767,56 @@ EXPORT_SYMBOL(remap_vmalloc_range);
767void __attribute__((weak)) vmalloc_sync_all(void) 767void __attribute__((weak)) vmalloc_sync_all(void)
768{ 768{
769} 769}
770
771
772static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
773{
774 /* apply_to_page_range() does all the hard work. */
775 return 0;
776}
777
778/**
779 * alloc_vm_area - allocate a range of kernel address space
780 * @size: size of the area
781 * @returns: NULL on failure, vm_struct on success
782 *
783 * This function reserves a range of kernel address space, and
784 * allocates pagetables to map that range. No actual mappings
785 * are created. If the kernel address space is not shared
786 * between processes, it syncs the pagetable across all
787 * processes.
788 */
789struct vm_struct *alloc_vm_area(size_t size)
790{
791 struct vm_struct *area;
792
793 area = get_vm_area(size, VM_IOREMAP);
794 if (area == NULL)
795 return NULL;
796
797 /*
798 * This ensures that page tables are constructed for this region
799 * of kernel virtual address space and mapped into init_mm.
800 */
801 if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
802 area->size, f, NULL)) {
803 free_vm_area(area);
804 return NULL;
805 }
806
807 /* Make sure the pagetables are constructed in process kernel
808 mappings */
809 vmalloc_sync_all();
810
811 return area;
812}
813EXPORT_SYMBOL_GPL(alloc_vm_area);
814
815void free_vm_area(struct vm_struct *area)
816{
817 struct vm_struct *ret;
818 ret = remove_vm_area(area->addr);
819 BUG_ON(ret != area);
820 kfree(area);
821}
822EXPORT_SYMBOL_GPL(free_vm_area);
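
A brief usage sketch, assuming a caller such as the Xen ring-mapping code above: it reserves address space here, installs its own PTEs into the returned area, and eventually releases it. example_reserve() and example_release() are hypothetical.

/* Illustrative only: reserve kernel address space without backing pages. */
static void *example_reserve(size_t size, struct vm_struct **out)
{
	struct vm_struct *area = alloc_vm_area(size);

	if (!area)
		return NULL;

	/* area->addr now has live page tables but no mappings; the caller
	 * installs its own PTEs (e.g. hypervisor grant mappings). */
	*out = area;
	return area->addr;
}

static void example_release(struct vm_struct *area)
{
	free_vm_area(area);
}
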
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index a786e7863200..1ea2f86f7683 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -125,7 +125,7 @@ static void br_stp_start(struct net_bridge *br)
125 char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL }; 125 char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL };
126 char *envp[] = { NULL }; 126 char *envp[] = { NULL };
127 127
128 r = call_usermodehelper(BR_STP_PROG, argv, envp, 1); 128 r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
129 if (r == 0) { 129 if (r == 0) {
130 br->stp_enabled = BR_USER_STP; 130 br->stp_enabled = BR_USER_STP;
131 printk(KERN_INFO "%s: userspace STP started\n", br->dev->name); 131 printk(KERN_INFO "%s: userspace STP started\n", br->dev->name);
diff --git a/net/irda/irias_object.c b/net/irda/irias_object.c
index 4adaae242b9e..cf302457097b 100644
--- a/net/irda/irias_object.c
+++ b/net/irda/irias_object.c
@@ -36,39 +36,6 @@ hashbin_t *irias_objects;
36 */ 36 */
37struct ias_value irias_missing = { IAS_MISSING, 0, 0, 0, {0}}; 37struct ias_value irias_missing = { IAS_MISSING, 0, 0, 0, {0}};
38 38
39/*
40 * Function strndup (str, max)
41 *
42 * My own kernel version of strndup!
43 *
44 * Faster, check boundary... Jean II
45 */
46static char *strndup(char *str, size_t max)
47{
48 char *new_str;
49 int len;
50
51 /* Check string */
52 if (str == NULL)
53 return NULL;
54 /* Check length, truncate */
55 len = strlen(str);
56 if(len > max)
57 len = max;
58
59 /* Allocate new string */
60 new_str = kmalloc(len + 1, GFP_ATOMIC);
61 if (new_str == NULL) {
62 IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__);
63 return NULL;
64 }
65
66 /* Copy and truncate */
67 memcpy(new_str, str, len);
68 new_str[len] = '\0';
69
70 return new_str;
71}
72 39
73/* 40/*
74 * Function ias_new_object (name, id) 41 * Function ias_new_object (name, id)
@@ -90,7 +57,7 @@ struct ias_object *irias_new_object( char *name, int id)
90 } 57 }
91 58
92 obj->magic = IAS_OBJECT_MAGIC; 59 obj->magic = IAS_OBJECT_MAGIC;
93 obj->name = strndup(name, IAS_MAX_CLASSNAME); 60 obj->name = kstrndup(name, IAS_MAX_CLASSNAME, GFP_ATOMIC);
94 if (!obj->name) { 61 if (!obj->name) {
95 IRDA_WARNING("%s(), Unable to allocate name!\n", 62 IRDA_WARNING("%s(), Unable to allocate name!\n",
96 __FUNCTION__); 63 __FUNCTION__);
@@ -360,7 +327,7 @@ void irias_add_integer_attrib(struct ias_object *obj, char *name, int value,
360 } 327 }
361 328
362 attrib->magic = IAS_ATTRIB_MAGIC; 329 attrib->magic = IAS_ATTRIB_MAGIC;
363 attrib->name = strndup(name, IAS_MAX_ATTRIBNAME); 330 attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
364 331
365 /* Insert value */ 332 /* Insert value */
366 attrib->value = irias_new_integer_value(value); 333 attrib->value = irias_new_integer_value(value);
@@ -404,7 +371,7 @@ void irias_add_octseq_attrib(struct ias_object *obj, char *name, __u8 *octets,
404 } 371 }
405 372
406 attrib->magic = IAS_ATTRIB_MAGIC; 373 attrib->magic = IAS_ATTRIB_MAGIC;
407 attrib->name = strndup(name, IAS_MAX_ATTRIBNAME); 374 attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
408 375
409 attrib->value = irias_new_octseq_value( octets, len); 376 attrib->value = irias_new_octseq_value( octets, len);
410 if (!attrib->name || !attrib->value) { 377 if (!attrib->name || !attrib->value) {
@@ -446,7 +413,7 @@ void irias_add_string_attrib(struct ias_object *obj, char *name, char *value,
446 } 413 }
447 414
448 attrib->magic = IAS_ATTRIB_MAGIC; 415 attrib->magic = IAS_ATTRIB_MAGIC;
449 attrib->name = strndup(name, IAS_MAX_ATTRIBNAME); 416 attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
450 417
451 attrib->value = irias_new_string_value(value); 418 attrib->value = irias_new_string_value(value);
452 if (!attrib->name || !attrib->value) { 419 if (!attrib->name || !attrib->value) {
@@ -506,7 +473,7 @@ struct ias_value *irias_new_string_value(char *string)
506 473
507 value->type = IAS_STRING; 474 value->type = IAS_STRING;
508 value->charset = CS_ASCII; 475 value->charset = CS_ASCII;
509 value->t.string = strndup(string, IAS_MAX_STRING); 476 value->t.string = kstrndup(string, IAS_MAX_STRING, GFP_ATOMIC);
510 if (!value->t.string) { 477 if (!value->t.string) {
511 IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__); 478 IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__);
512 kfree(value); 479 kfree(value);
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index f573ac189a0a..557500110a13 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -108,7 +108,8 @@ static int call_sbin_request_key(struct key *key,
108 argv[i] = NULL; 108 argv[i] = NULL;
109 109
110 /* do it */ 110 /* do it */
111 ret = call_usermodehelper_keys(argv[0], argv, envp, keyring, 1); 111 ret = call_usermodehelper_keys(argv[0], argv, envp, keyring,
112 UMH_WAIT_PROC);
112 113
113error_link: 114error_link:
114 key_put(keyring); 115 key_put(keyring);