 116 files changed, 15031 insertions(+), 210 deletions(-)
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index c7c9c2a15fab..7a11b905ef49 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -222,6 +222,8 @@ config PARAVIRT
 	  However, when run without a hypervisor the kernel is
 	  theoretically slower.  If in doubt, say N.
 
+source "arch/i386/xen/Kconfig"
+
 config VMI
 	bool "VMI Paravirt-ops support"
 	depends on PARAVIRT
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 181cc29a7c4f..01f0ff0daaf4 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000)	:= -Iinclude/asm-i386/mach-es7000
 mcore-$(CONFIG_X86_ES7000)	:= mach-default
 core-$(CONFIG_X86_ES7000)	:= arch/i386/mach-es7000/
 
+# Xen paravirtualization support
+core-$(CONFIG_XEN)		+= arch/i386/xen/
+
 # default subarch .h files
 mflags-y += -Iinclude/asm-i386/mach-default
 
diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c
index ce4fda261aaf..b0e21c3cee5c 100644
--- a/arch/i386/boot/compressed/relocs.c
+++ b/arch/i386/boot/compressed/relocs.c
@@ -31,6 +31,8 @@ static const char* safe_abs_relocs[] = {
 		"__kernel_rt_sigreturn",
 		"__kernel_sigreturn",
 		"SYSENTER_RETURN",
+		"xen_irq_disable_direct_reloc",
+		"xen_save_fl_direct_reloc",
 };
 
 static int is_safe_abs_reloc(const char* sym_name)
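
For context, is_safe_abs_reloc(), whose signature closes this hunk, checks a candidate symbol against the whitelist extended above; the two xen_*_reloc symbols are absolute on purpose, since they mark instruction sites patched at boot (see the xen_patch() code later in the patch). A rough sketch of the check, not taken verbatim from the file:

	#include <string.h>

	/* Sketch only: mirrors how the table above is consulted. */
	static int is_safe_abs_reloc_sketch(const char *sym_name)
	{
		int i;

		for (i = 0; i < sizeof(safe_abs_relocs)/sizeof(safe_abs_relocs[0]); i++)
			if (strcmp(sym_name, safe_abs_relocs[i]) == 0)
				return 1;	/* whitelisted: no warning emitted */
		return 0;
	}
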
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 27a776c9044d..25f7eb513928 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -17,6 +17,8 @@
 #include <asm/thread_info.h>
 #include <asm/elf.h>
 
+#include <xen/interface/xen.h>
+
 #define DEFINE(sym, val) \
 	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
 
@@ -59,6 +61,7 @@ void foo(void)
 	OFFSET(TI_addr_limit, thread_info, addr_limit);
 	OFFSET(TI_restart_block, thread_info, restart_block);
 	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+	OFFSET(TI_cpu, thread_info, cpu);
 	BLANK();
 
 	OFFSET(GDS_size, Xgt_desc_struct, size);
@@ -115,4 +118,10 @@ void foo(void)
 	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
 	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
 #endif
+
+#ifdef CONFIG_XEN
+	BLANK();
+	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#endif
 }
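
The OFFSET() lines added here feed the kernel's asm-offsets mechanism: this file is only ever compiled to assembly, and the "->" markers that DEFINE() emits are post-processed into #defines usable from assembly (which is how the entry.S hunk below gets at the vcpu_info fields). A self-contained sketch of the trick, with an invented stand-in struct; compile with gcc -S and look for the "->" lines:

	#include <stddef.h>

	/* Toy stand-in for Xen's vcpu_info; field names follow the patch. */
	struct vcpu_info_sketch {
		unsigned char evtchn_upcall_pending;
		unsigned char evtchn_upcall_mask;
	};

	#define DEFINE(sym, val) \
		asm volatile("\n->" #sym " %0 " #val : : "i" (val))

	void foo(void)
	{
		/* emits "->XEN_vcpu_info_mask $1 ..." into the .s output */
		DEFINE(XEN_vcpu_info_mask,
		       offsetof(struct vcpu_info_sketch, evtchn_upcall_mask));
	}

A sed pass over the generated assembly then turns each marker into a line like "#define XEN_vcpu_info_mask 1", so assembly code gets structure offsets without parsing C headers.
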
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 3c3c220488c9..32980b834935 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -1023,6 +1023,91 @@ ENTRY(kernel_thread_helper)
 	CFI_ENDPROC
 ENDPROC(kernel_thread_helper)
 
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+	CFI_STARTPROC
+	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
+	SAVE_ALL
+	TRACE_IRQS_OFF
+
+	/* Check to see if we got the event in the critical
+	   region in xen_iret_direct, after we've reenabled
+	   events and checked for pending events.  This simulates
+	   iret instruction's behaviour where it delivers a
+	   pending interrupt when enabling interrupts. */
+	movl PT_EIP(%esp),%eax
+	cmpl $xen_iret_start_crit,%eax
+	jb   1f
+	cmpl $xen_iret_end_crit,%eax
+	jae  1f
+
+	call xen_iret_crit_fixup
+
+1:	mov %esp, %eax
+	call xen_evtchn_do_upcall
+	jmp  ret_from_intr
+	CFI_ENDPROC
+ENDPROC(xen_hypervisor_callback)
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we fix up by reattempting the load, and zeroing the segment
+# register if the load fails.
+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by maintaining a status value in EAX.
+ENTRY(xen_failsafe_callback)
+	CFI_STARTPROC
+	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	movl $1,%eax
+1:	mov 4(%esp),%ds
+2:	mov 8(%esp),%es
+3:	mov 12(%esp),%fs
+4:	mov 16(%esp),%gs
+	testl %eax,%eax
+	popl %eax
+	CFI_ADJUST_CFA_OFFSET -4
+	lea 16(%esp),%esp
+	CFI_ADJUST_CFA_OFFSET -16
+	jz 5f
+	addl $16,%esp
+	jmp iret_exc		# EAX != 0 => Category 2 (Bad IRET)
+5:	pushl $0		# EAX == 0 => Category 1 (Bad segment)
+	CFI_ADJUST_CFA_OFFSET 4
+	SAVE_ALL
+	jmp ret_from_exception
+	CFI_ENDPROC
+
+.section .fixup,"ax"
+6:	xorl %eax,%eax
+	movl %eax,4(%esp)
+	jmp 1b
+7:	xorl %eax,%eax
+	movl %eax,8(%esp)
+	jmp 2b
+8:	xorl %eax,%eax
+	movl %eax,12(%esp)
+	jmp 3b
+9:	xorl %eax,%eax
+	movl %eax,16(%esp)
+	jmp 4b
+.previous
+.section __ex_table,"a"
+	.align 4
+	.long 1b,6b
+	.long 2b,7b
+	.long 3b,8b
+	.long 4b,9b
+.previous
+ENDPROC(xen_failsafe_callback)
+
+#endif	/* CONFIG_XEN */
+
 .section .rodata,"a"
#include "syscall_table.S"
 
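
For readers less fluent in the fixup machinery above, here is a rough C rendering of how the __ex_table entries pair a potentially-faulting instruction (labels 1 to 4) with its recovery code (labels 6 to 9). The struct and function names are invented for the sketch, but the mechanism is the standard kernel exception-table lookup done from the fault handler:

	/* Sketch only: each entry maps a faulting EIP to a fixup EIP. */
	struct ex_entry {
		unsigned long insn;	/* e.g. address of label 1: */
		unsigned long fixup;	/* e.g. address of label 6: */
	};

	static unsigned long fixup_eip(const struct ex_entry *tbl, unsigned n,
				       unsigned long eip)
	{
		unsigned i;

		for (i = 0; i < n; i++)
			if (tbl[i].insn == eip)
				return tbl[i].fixup;	/* resume at the fixup */
		return 0;	/* no entry: genuine fault */
	}

Each segment-load fixup zeroes %eax along with the saved selector before retrying, so a zero %eax after the four loads means one of them faulted (category 1); if %eax is still 1, none of the loads faulted, so the original fault must have been the IRET itself (category 2) and control goes to iret_exc.
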
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 82714668d43b..7c52b222207e 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -510,7 +510,8 @@ ENTRY(_stext)
 /*
  * BSS section
  */
-.section ".bss.page_aligned","w"
+.section ".bss.page_aligned","wa"
+	.align PAGE_SIZE_asm
 ENTRY(swapper_pg_dir)
 	.fill 1024,4,0
 ENTRY(swapper_pg_pmd)
@@ -538,6 +539,8 @@ fault_msg:
 	.ascii "Int %d: CR2 %p  err %p  EIP %p  CS %p  flags %p\n"
 	.asciz "Stack: %p %p %p %p %p %p %p %p\n"
 
+#include "../xen/xen-head.S"
+
 /*
  * The IDT and GDT 'descriptors' are a strange 48-bit object
  * only used by the lidt and lgdt instructions. They are not
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index faab09abca5e..53f07a8275e3 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -228,6 +228,41 @@ static int __init print_banner(void)
 }
 core_initcall(print_banner);
 
+static struct resource reserve_ioports = {
+	.start = 0,
+	.end = IO_SPACE_LIMIT,
+	.name = "paravirt-ioport",
+	.flags = IORESOURCE_IO | IORESOURCE_BUSY,
+};
+
+static struct resource reserve_iomem = {
+	.start = 0,
+	.end = -1,
+	.name = "paravirt-iomem",
+	.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
+/*
+ * Reserve the whole legacy IO space to prevent any legacy drivers
+ * from wasting time probing for their hardware.  This is a fairly
+ * brute-force approach to disabling all non-virtual drivers.
+ *
+ * Note that this must be called very early to have any effect.
+ */
+int paravirt_disable_iospace(void)
+{
+	int ret;
+
+	ret = request_resource(&ioport_resource, &reserve_ioports);
+	if (ret == 0) {
+		ret = request_resource(&iomem_resource, &reserve_iomem);
+		if (ret)
+			release_resource(&reserve_ioports);
+	}
+
+	return ret;
+}
+
 struct paravirt_ops paravirt_ops = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
@@ -267,7 +302,7 @@ struct paravirt_ops paravirt_ops = {
 	.write_msr = native_write_msr_safe,
 	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
-	.get_scheduled_cycles = native_read_tsc,
+	.sched_clock = native_sched_clock,
 	.get_cpu_khz = native_calculate_cpu_khz,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
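
To make the intent of the new helper concrete, a hypothetical caller (the function name and call site below are illustrative, not from the patch): a hypervisor port would invoke this before any driver probing, after which request_region()/request_mem_region() fail everywhere and legacy probe routines bail out cleanly instead of poking at hardware that is not there:

	/* Sketch: call very early from a paravirt backend's setup code. */
	static void __init example_disable_legacy_io(void)
	{
		int ret = paravirt_disable_iospace();

		if (ret)
			printk(KERN_WARNING
			       "paravirt: cannot reserve I/O space: %d\n", ret);
	}
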
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 2d61e65eeb50..74871d066c2b 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -601,6 +601,8 @@ void __init setup_arch(char **cmdline_p)
 	 * NOTE: at this point the bootmem allocator is fully available.
 	 */
 
+	paravirt_post_allocator_init();
+
 	dmi_scan_machine();
 
 #ifdef CONFIG_X86_GENERICARCH
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 6299c080f6e2..2d35d8502029 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -22,6 +22,7 @@
 
 #include <asm/mtrr.h>
 #include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
 #include <mach_apic.h>
 
 /*
@@ -249,13 +250,13 @@ static unsigned long flush_va;
 static DEFINE_SPINLOCK(tlbstate_lock);
 
 /*
  * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
  *
  * We need to reload %cr3 since the page tables may be going
  * away from under us..
  */
-static inline void leave_mm (unsigned long cpu)
+void leave_mm(unsigned long cpu)
 {
 	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
 		BUG();
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 0b2954534b8e..5910d3fac561 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -148,7 +148,7 @@ void __init smp_alloc_memory(void)
  * a given CPU
  */
 
-static void __cpuinit smp_store_cpu_info(int id)
+void __cpuinit smp_store_cpu_info(int id)
 {
 	struct cpuinfo_x86 *c = cpu_data + id;
 
@@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu)
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
-static inline void
-set_cpu_sibling_map(int cpu)
+void set_cpu_sibling_map(int cpu)
 {
 	int i;
 	struct cpuinfo_x86 *c = cpu_data;
@@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void
-remove_siblinginfo(int cpu)
+void remove_siblinginfo(int cpu)
 {
 	int sibling;
 	struct cpuinfo_x86 *c = cpu_data;
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index ea63a30ca3e8..252f9010f283 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -84,7 +84,7 @@ static inline int check_tsc_unstable(void)
  *
  *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
-static unsigned long cyc2ns_scale __read_mostly;
+unsigned long cyc2ns_scale __read_mostly;
 
 #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
 
@@ -93,15 +93,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz)
 	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
 }
 
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
-}
-
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
-unsigned long long sched_clock(void)
+unsigned long long native_sched_clock(void)
 {
 	unsigned long long this_offset;
 
@@ -118,12 +113,24 @@ unsigned long long sched_clock(void)
 		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 
 	/* read the Time Stamp Counter: */
-	get_scheduled_cycles(this_offset);
+	rdtscll(this_offset);
 
 	/* return the value in ns */
 	return cycles_2_ns(this_offset);
 }
 
+/* We need to define a real function for sched_clock, to override the
+   weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+	return paravirt_sched_clock();
+}
+#else
+unsigned long long sched_clock(void)
+	__attribute__((alias("native_sched_clock")));
+#endif
+
 unsigned long native_calculate_cpu_khz(void)
 {
 	unsigned long long start, end;
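
The cycles_2_ns() helper removed here is still called both in this file and from vmiclock.c below, so it presumably moves to a shared header elsewhere in the patch. Its fixed-point arithmetic is worth spelling out; a self-contained version of the math, using only the values shown in the hunk:

	#define CYC2NS_SCALE_FACTOR 10	/* 2^10, as above */

	/* ns-per-cycle scaled by 2^10, matching set_cyc2ns_scale() above */
	static unsigned long long cyc2ns_sketch(unsigned long long cyc,
						unsigned long cpu_khz)
	{
		unsigned long scale = (1000000UL << CYC2NS_SCALE_FACTOR) / cpu_khz;

		return (cyc * scale) >> CYC2NS_SCALE_FACTOR;
	}

	/* e.g. cpu_khz = 2000000 (a 2 GHz TSC): scale = 512, and
	   1000 cycles -> (1000 * 512) >> 10 = 500 ns, as expected. */

The #ifdef CONFIG_PARAVIRT branch exists because the generic kernel supplies sched_clock() as a weak symbol: defining a real (strong) function overrides it and routes through paravirt_sched_clock(), while the !PARAVIRT build gets the native path at zero cost via the alias attribute.
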
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
index c12720d7cbc5..72042bb7ec94 100644
--- a/arch/i386/kernel/vmi.c
+++ b/arch/i386/kernel/vmi.c
@@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
 }
 #endif
 
-static void vmi_allocate_pt(u32 pfn)
+static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
 {
 	vmi_set_page_type(pfn, VMI_PAGE_L1);
 	vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -891,7 +891,7 @@ static inline int __init activate_vmi(void)
 	paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
 	paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
 #endif
-	paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
+	paravirt_ops.sched_clock = vmi_sched_clock;
 	paravirt_ops.get_cpu_khz = vmi_cpu_khz;
 
 	/* We have true wallclock functions; disable CMOS clock sync */
diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c
index 26a37f8a8762..f9b845f4e692 100644
--- a/arch/i386/kernel/vmiclock.c
+++ b/arch/i386/kernel/vmiclock.c
@@ -64,10 +64,10 @@ int vmi_set_wallclock(unsigned long now)
 	return 0;
 }
 
-/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */
-unsigned long long vmi_get_sched_cycles(void)
+/* paravirt_ops.sched_clock = vmi_sched_clock */
+unsigned long long vmi_sched_clock(void)
 {
-	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
+	return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
 }
 
 /* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index aa87b06c7c82..00f1bc47d3a2 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -88,6 +88,7 @@ SECTIONS
 
   . = ALIGN(4096);
   .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+	*(.data.page_aligned)
 	*(.data.idt)
   }
 
diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S
index d4b5be4f3d5f..271f16a8ca01 100644
--- a/arch/i386/kernel/vsyscall-note.S
+++ b/arch/i386/kernel/vsyscall-note.S
@@ -3,23 +3,40 @@
  * Here we can supply some information useful to userland.
  */
 
-#include <linux/uts.h>
 #include <linux/version.h>
+#include <linux/elfnote.h>
 
-#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type)	\
-	.section name, flags;				\
-	.balign 4;					\
-	.long 1f - 0f;		/* name length */	\
-	.long 3f - 2f;		/* data length */	\
-	.long type;		/* note type */		\
-0:	.asciz vendor;		/* vendor name */	\
-1:	.balign 4;					\
-2:
+/* Ideally this would use UTS_NAME, but using a quoted string here
+   doesn't work.  Remember to change this when changing the
+   kernel's name. */
+ELFNOTE_START(Linux, 0, "a")
+	.long LINUX_VERSION_CODE
+ELFNOTE_END
 
-#define ASM_ELF_NOTE_END				\
-3:	.balign 4;		/* pad out section */	\
-	.previous
+#ifdef CONFIG_XEN
 
-ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0)
-	.long LINUX_VERSION_CODE
-ASM_ELF_NOTE_END
+/*
+ * Add a special note telling glibc's dynamic linker a fake hardware
+ * flavor that it will use to choose the search path for libraries in the
+ * same way it uses real hardware capabilities like "mmx".
+ * We supply "nosegneg" as the fake capability, to indicate that we
+ * do not like negative offsets in instructions using segment overrides,
+ * since we implement those inefficiently.  This makes it possible to
+ * install libraries optimized to avoid those access patterns in someplace
+ * like /lib/i686/tls/nosegneg.  Note that an /etc/ld.so.conf.d/file
+ * corresponding to the bits here is needed to make ldconfig work right.
+ * It should contain:
+ *	hwcap 1 nosegneg
+ * to match the mapping of bit to name that we give here.
+ */
+
+/* Bit used for the pseudo-hwcap for non-negative segments.  We use
+   bit 1 to avoid bugs in some versions of glibc when bit 0 is
+   used; the choice is otherwise arbitrary. */
+#define VDSO_NOTE_NONEGSEG_BIT	1
+
+ELFNOTE_START(GNU, 2, "a")
+	.long 1, 1<<VDSO_NOTE_NONEGSEG_BIT		/* ncaps, mask */
+	.byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg"	/* bit, name */
+ELFNOTE_END
+#endif
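
The ldconfig side that the note above refers to is a one-line configuration fragment. Assuming the file lives somewhere like /etc/ld.so.conf.d/nosegneg.conf (the exact filename is not specified by the patch), it would read:

	hwcap 1 nosegneg

With that in place, ldconfig indexes libraries installed under hwcap-specific paths such as /lib/i686/tls/nosegneg, and glibc's dynamic linker prefers them whenever the vDSO advertises the fake capability bit defined above.
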
diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c
index b4b24e0e45e1..f9d595338159 100644
--- a/arch/i386/mach-voyager/voyager_thread.c
+++ b/arch/i386/mach-voyager/voyager_thread.c
@@ -52,7 +52,7 @@ execute(const char *string)
 		NULL,
 	};
 
-	if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) {
+	if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
 		printk(KERN_ERR "Voyager failed to run \"%s\": %i\n",
 		       string, ret);
 	}
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 7135946d3663..6a68b1ae061c 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
 		pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
 
-		paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
+		paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
 		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
 		BUG_ON(page_table != pte_offset_kernel(pmd, 0));
 	}
@@ -473,6 +473,7 @@ void zap_low_mappings (void)
 
 static int disable_nx __initdata = 0;
 u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
 /*
  * noexec = on|off
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 2eb14a73be9c..37992ffb1633 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
 	address = __pa(address);
 	addr = address & LARGE_PAGE_MASK;
 	pbase = (pte_t *)page_address(base);
-	paravirt_alloc_pt(page_to_pfn(base));
+	paravirt_alloc_pt(&init_mm, page_to_pfn(base));
 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
 		set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
 					   addr == address ? prot : ref_prot));
diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig
new file mode 100644
index 000000000000..9df99e1885a4
--- /dev/null
+++ b/arch/i386/xen/Kconfig
@@ -0,0 +1,11 @@
+#
+# This Kconfig describes xen options
+#
+
+config XEN
+	bool "Enable support for Xen hypervisor"
+	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
+	help
+	  This is the Linux Xen port.  Enabling this will allow the
+	  kernel to boot in a paravirtualized environment under the
+	  Xen hypervisor.
diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile
new file mode 100644
index 000000000000..343df246bd3e
--- /dev/null
+++ b/arch/i386/xen/Makefile
@@ -0,0 +1,4 @@
+obj-y		:= enlighten.o setup.o features.o multicalls.o mmu.o \
+			events.o time.o manage.o xen-asm.o
+
+obj-$(CONFIG_SMP)	+= smp.o
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
new file mode 100644
index 000000000000..9a8c1181c001
--- /dev/null
+++ b/arch/i386/xen/enlighten.c
@@ -0,0 +1,1144 @@
+/*
+ * Core of Xen paravirt_ops implementation.
+ *
+ * This file contains the xen_paravirt_ops structure itself, and the
+ * implementations for:
+ * - privileged instructions
+ * - interrupt flags
+ * - segment operations
+ * - booting and setup
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/preempt.h>
+#include <linux/hardirq.h>
+#include <linux/percpu.h>
+#include <linux/delay.h>
+#include <linux/start_kernel.h>
+#include <linux/sched.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/sched.h>
+#include <xen/features.h>
+#include <xen/page.h>
+
+#include <asm/paravirt.h>
+#include <asm/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/reboot.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+#include "multicalls.h"
+
+EXPORT_SYMBOL_GPL(hypercall_page);
+
+DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+
+DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
+DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DEFINE_PER_CPU(unsigned long, xen_cr3);
+
+struct start_info *xen_start_info;
+EXPORT_SYMBOL_GPL(xen_start_info);
+
+static /* __initdata */ struct shared_info dummy_shared_info;
+
+/*
+ * Point at some empty memory to start with.  We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
+
+/*
+ * Flag to determine whether vcpu info placement is available on all
+ * VCPUs.  We assume it is to start with, and then set it to zero on
+ * the first failure.  This is because it can succeed on some VCPUs
+ * and not others, since it can involve hypervisor memory allocation,
+ * or because the guest failed to guarantee all the appropriate
+ * constraints on all VCPUs (ie buffer can't cross a page boundary).
+ *
+ * Note that any particular CPU may be using a placed vcpu structure,
+ * but we can only optimise if they all are.
+ *
+ * 0: not available, 1: available
+ */
+static int have_vcpu_info_placement = 1;
+
+static void __init xen_vcpu_setup(int cpu)
+{
+	struct vcpu_register_vcpu_info info;
+	int err;
+	struct vcpu_info *vcpup;
+
+	per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+
+	if (!have_vcpu_info_placement)
+		return;		/* already tested, not available */
+
+	vcpup = &per_cpu(xen_vcpu_info, cpu);
+
+	info.mfn = virt_to_mfn(vcpup);
+	info.offset = offset_in_page(vcpup);
+
+	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
+	       cpu, vcpup, info.mfn, info.offset);
+
+	/* Check to see if the hypervisor will put the vcpu_info
+	   structure where we want it, which allows direct access via
+	   a percpu-variable. */
+	err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
+
+	if (err) {
+		printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
+		have_vcpu_info_placement = 0;
+	} else {
+		/* This cpu is using the registered vcpu info, even if
+		   later ones fail to. */
+		per_cpu(xen_vcpu, cpu) = vcpup;
+
+		printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
+		       cpu, vcpup);
+	}
+}
+
+static void __init xen_banner(void)
+{
+	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+	       paravirt_ops.name);
+	printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
+}
+
+static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+		      unsigned int *ecx, unsigned int *edx)
+{
+	unsigned maskedx = ~0;
+
+	/*
+	 * Mask out inconvenient features, to try and disable as many
+	 * unsupported kernel subsystems as possible.
+	 */
+	if (*eax == 1)
+		maskedx = ~((1 << X86_FEATURE_APIC) |  /* disable APIC */
+			    (1 << X86_FEATURE_ACPI) |  /* disable ACPI */
+			    (1 << X86_FEATURE_ACC));   /* thermal monitoring */
+
+	asm(XEN_EMULATE_PREFIX "cpuid"
+		: "=a" (*eax),
+		  "=b" (*ebx),
+		  "=c" (*ecx),
+		  "=d" (*edx)
+		: "0" (*eax), "2" (*ecx));
+	*edx &= maskedx;
+}
+
+static void xen_set_debugreg(int reg, unsigned long val)
+{
+	HYPERVISOR_set_debugreg(reg, val);
+}
+
+static unsigned long xen_get_debugreg(int reg)
+{
+	return HYPERVISOR_get_debugreg(reg);
+}
+
+static unsigned long xen_save_fl(void)
+{
+	struct vcpu_info *vcpu;
+	unsigned long flags;
+
+	vcpu = x86_read_percpu(xen_vcpu);
+
+	/* flag has opposite sense of mask */
+	flags = !vcpu->evtchn_upcall_mask;
+
+	/* convert to IF type flag
+	   -0 -> 0x00000000
+	   -1 -> 0xffffffff
+	*/
+	return (-flags) & X86_EFLAGS_IF;
+}
+
+static void xen_restore_fl(unsigned long flags)
+{
+	struct vcpu_info *vcpu;
+
+	/* convert from IF type flag */
+	flags = !(flags & X86_EFLAGS_IF);
+
+	/* There's a one instruction preempt window here.  We need to
+	   make sure we don't switch CPUs between getting the vcpu
+	   pointer and updating the mask. */
+	preempt_disable();
+	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu->evtchn_upcall_mask = flags;
+	preempt_enable_no_resched();
+
+	/* Doesn't matter if we get preempted here, because any
+	   pending event will get dealt with anyway. */
+
+	if (flags == 0) {
+		preempt_check_resched();
+		barrier(); /* unmask then check (avoid races) */
+		if (unlikely(vcpu->evtchn_upcall_pending))
+			force_evtchn_callback();
+	}
+}
+
+static void xen_irq_disable(void)
+{
+	/* There's a one instruction preempt window here.  We need to
+	   make sure we don't switch CPUs between getting the vcpu
+	   pointer and updating the mask. */
+	preempt_disable();
+	x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+	preempt_enable_no_resched();
+}
+
+static void xen_irq_enable(void)
+{
+	struct vcpu_info *vcpu;
+
+	/* There's a one instruction preempt window here.  We need to
+	   make sure we don't switch CPUs between getting the vcpu
+	   pointer and updating the mask. */
+	preempt_disable();
+	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu->evtchn_upcall_mask = 0;
+	preempt_enable_no_resched();
+
+	/* Doesn't matter if we get preempted here, because any
+	   pending event will get dealt with anyway. */
+
+	barrier(); /* unmask then check (avoid races) */
+	if (unlikely(vcpu->evtchn_upcall_pending))
+		force_evtchn_callback();
+}
+
+static void xen_safe_halt(void)
+{
+	/* Blocking includes an implicit local_irq_enable(). */
+	if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
+		BUG();
+}
+
+static void xen_halt(void)
+{
+	if (irqs_disabled())
+		HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+	else
+		xen_safe_halt();
+}
+
+static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
+{
+	BUG_ON(preemptible());
+
+	switch (mode) {
+	case PARAVIRT_LAZY_NONE:
+		BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
+		break;
+
+	case PARAVIRT_LAZY_MMU:
+	case PARAVIRT_LAZY_CPU:
+		BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
+		break;
+
+	case PARAVIRT_LAZY_FLUSH:
+		/* flush if necessary, but don't change state */
+		if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
+			xen_mc_flush();
+		return;
+	}
+
+	xen_mc_flush();
+	x86_write_percpu(xen_lazy_mode, mode);
+}
+
+static unsigned long xen_store_tr(void)
+{
+	return 0;
+}
+
+static void xen_set_ldt(const void *addr, unsigned entries)
+{
+	unsigned long linear_addr = (unsigned long)addr;
+	struct mmuext_op *op;
+	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = MMUEXT_SET_LDT;
+	if (linear_addr) {
+		/* ldt may be vmalloced, use arbitrary_virt_to_machine */
+		xmaddr_t maddr;
+		maddr = arbitrary_virt_to_machine((unsigned long)addr);
+		linear_addr = (unsigned long)maddr.maddr;
+	}
+	op->arg1.linear_addr = linear_addr;
+	op->arg2.nr_ents = entries;
+
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+	unsigned long *frames;
+	unsigned long va = dtr->address;
+	unsigned int size = dtr->size + 1;
+	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+	int f;
+	struct multicall_space mcs;
+
+	/* A GDT can be up to 64k in size, which corresponds to 8192
+	   8-byte entries, or 16 4k pages.. */
+
+	BUG_ON(size > 65536);
+	BUG_ON(va & ~PAGE_MASK);
+
+	mcs = xen_mc_entry(sizeof(*frames) * pages);
+	frames = mcs.args;
+
+	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
+		frames[f] = virt_to_mfn(va);
+		make_lowmem_page_readonly((void *)va);
+	}
+
+	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
+
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void load_TLS_descriptor(struct thread_struct *t,
+				unsigned int cpu, unsigned int i)
+{
+	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+	xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+	struct multicall_space mc = __xen_mc_entry(0);
+
+	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
+}
+
+static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+	xen_mc_batch();
+
+	load_TLS_descriptor(t, cpu, 0);
+	load_TLS_descriptor(t, cpu, 1);
+	load_TLS_descriptor(t, cpu, 2);
+
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+	/*
+	 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
+	 * it means we're in a context switch, and %gs has just been
+	 * saved.  This means we can zero it out to prevent faults on
+	 * exit from the hypervisor if the next process has no %gs.
+	 * Either way, it has been saved, and the new value will get
+	 * loaded properly.  This will go away as soon as Xen has been
+	 * modified to not save/restore %gs for normal hypercalls.
+	 */
+	if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+		loadsegment(gs, 0);
+}
+
+static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
+				u32 low, u32 high)
+{
+	unsigned long lp = (unsigned long)&dt[entrynum];
+	xmaddr_t mach_lp = virt_to_machine(lp);
+	u64 entry = (u64)high << 32 | low;
+
+	preempt_disable();
+
+	xen_mc_flush();
+	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
+		BUG();
+
+	preempt_enable();
+}
+
+static int cvt_gate_to_trap(int vector, u32 low, u32 high,
+			    struct trap_info *info)
+{
+	u8 type, dpl;
+
+	type = (high >> 8) & 0x1f;
+	dpl = (high >> 13) & 3;
+
+	if (type != 0xf && type != 0xe)
+		return 0;
+
+	info->vector = vector;
+	info->address = (high & 0xffff0000) | (low & 0x0000ffff);
+	info->cs = low >> 16;
+	info->flags = dpl;
+	/* interrupt gates clear IF */
+	if (type == 0xe)
+		info->flags |= 4;
+
+	return 1;
+}
+
+/* Locations of each CPU's IDT */
+static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
+
+/* Set an IDT entry.  If the entry is part of the current IDT, then
+   also update Xen. */
+static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
+				u32 low, u32 high)
+{
+	unsigned long p = (unsigned long)&dt[entrynum];
+	unsigned long start, end;
+
+	preempt_disable();
+
+	start = __get_cpu_var(idt_desc).address;
+	end = start + __get_cpu_var(idt_desc).size + 1;
+
+	xen_mc_flush();
+
+	write_dt_entry(dt, entrynum, low, high);
+
+	if (p >= start && (p + 8) <= end) {
+		struct trap_info info[2];
+
+		info[1].address = 0;
+
+		if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
+			if (HYPERVISOR_set_trap_table(info))
+				BUG();
+	}
+
+	preempt_enable();
+}
+
+static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
+				  struct trap_info *traps)
+{
+	unsigned in, out, count;
+
+	count = (desc->size+1) / 8;
+	BUG_ON(count > 256);
+
+	for (in = out = 0; in < count; in++) {
+		const u32 *entry = (u32 *)(desc->address + in * 8);
+
+		if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+			out++;
+	}
+	traps[out].address = 0;
+}
+
+void xen_copy_trap_info(struct trap_info *traps)
+{
+	const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
+
+	xen_convert_trap_info(desc, traps);
+}
+
+/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
+   hold a spinlock to protect the static traps[] array (static because
+   it avoids allocation, and saves stack space). */
+static void xen_load_idt(const struct Xgt_desc_struct *desc)
+{
+	static DEFINE_SPINLOCK(lock);
+	static struct trap_info traps[257];
+
+	spin_lock(&lock);
+
+	__get_cpu_var(idt_desc) = *desc;
+
+	xen_convert_trap_info(desc, traps);
+
+	xen_mc_flush();
+	if (HYPERVISOR_set_trap_table(traps))
+		BUG();
+
+	spin_unlock(&lock);
+}
+
+/* Write a GDT descriptor entry.  Ignore LDT descriptors, since
+   they're handled differently. */
+static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
+				u32 low, u32 high)
+{
+	preempt_disable();
+
+	switch ((high >> 8) & 0xff) {
+	case DESCTYPE_LDT:
+	case DESCTYPE_TSS:
+		/* ignore */
+		break;
+
+	default: {
+		xmaddr_t maddr = virt_to_machine(&dt[entry]);
+		u64 desc = (u64)high << 32 | low;
+
+		xen_mc_flush();
+		if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
+			BUG();
+	}
+
+	}
+
+	preempt_enable();
+}
+
+static void xen_load_esp0(struct tss_struct *tss,
+			  struct thread_struct *thread)
+{
+	struct multicall_space mcs = xen_mc_entry(0);
+	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_set_iopl_mask(unsigned mask)
+{
+	struct physdev_set_iopl set_iopl;
+
+	/* Force the change at ring 0. */
+	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+}
+
+static void xen_io_delay(void)
+{
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned long xen_apic_read(unsigned long reg)
+{
+	return 0;
+}
+
+static void xen_apic_write(unsigned long reg, unsigned long val)
+{
+	/* Warn to see if there's any stray references */
+	WARN_ON(1);
+}
+#endif
+
+static void xen_flush_tlb(void)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_flush_tlb_single(unsigned long addr)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = MMUEXT_INVLPG_LOCAL;
+	op->arg1.linear_addr = addr & PAGE_MASK;
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
+				 unsigned long va)
+{
+	struct {
+		struct mmuext_op op;
+		cpumask_t mask;
+	} *args;
+	cpumask_t cpumask = *cpus;
+	struct multicall_space mcs;
+
+	/*
+	 * A couple of (to be removed) sanity checks:
+	 *
+	 * - current CPU must not be in mask
+	 * - mask must exist :)
+	 */
+	BUG_ON(cpus_empty(cpumask));
+	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+	BUG_ON(!mm);
+
+	/* If a CPU which we ran on has gone down, OK. */
+	cpus_and(cpumask, cpumask, cpu_online_map);
+	if (cpus_empty(cpumask))
+		return;
+
+	mcs = xen_mc_entry(sizeof(*args));
+	args = mcs.args;
+	args->mask = cpumask;
+	args->op.arg2.vcpumask = &args->mask;
+
+	if (va == TLB_FLUSH_ALL) {
+		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+	} else {
+		args->op.cmd = MMUEXT_INVLPG_MULTI;
+		args->op.arg1.linear_addr = va;
+	}
+
+	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_write_cr2(unsigned long cr2)
+{
+	x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
+}
+
+static unsigned long xen_read_cr2(void)
+{
+	return x86_read_percpu(xen_vcpu)->arch.cr2;
+}
+
+static unsigned long xen_read_cr2_direct(void)
+{
+	return x86_read_percpu(xen_vcpu_info.arch.cr2);
+}
+
+static void xen_write_cr4(unsigned long cr4)
+{
+	/* never allow TSC to be disabled */
+	native_write_cr4(cr4 & ~X86_CR4_TSD);
+}
+
+static unsigned long xen_read_cr3(void)
+{
+	return x86_read_percpu(xen_cr3);
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+	BUG_ON(preemptible());
+
+	if (cr3 == x86_read_percpu(xen_cr3)) {
+		/* just a simple tlb flush */
+		xen_flush_tlb();
+		return;
+	}
+
+	x86_write_percpu(xen_cr3, cr3);
+
+
+	{
+		struct mmuext_op *op;
+		struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+		unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+
+		op = mcs.args;
+		op->cmd = MMUEXT_NEW_BASEPTR;
+		op->arg1.mfn = mfn;
+
+		MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+		xen_mc_issue(PARAVIRT_LAZY_CPU);
+	}
+}
+
+/* Early in boot, while setting up the initial pagetable, assume
+   everything is pinned. */
+static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
+{
+	BUG_ON(mem_map);	/* should only be used early */
+	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* This needs to make sure the new pte page is pinned iff it's being
+   attached to a pinned pagetable. */
+static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
+{
+	struct page *page = pfn_to_page(pfn);
+
+	if (PagePinned(virt_to_page(mm->pgd))) {
+		SetPagePinned(page);
+
+		if (!PageHighMem(page))
+			make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+		else
+			/* make sure there are no stray mappings of
+			   this page */
+			kmap_flush_unused();
+	}
+}
+
+/* This should never happen until we're OK to use struct page */
+static void xen_release_pt(u32 pfn)
+{
+	struct page *page = pfn_to_page(pfn);
+
+	if (PagePinned(page)) {
+		if (!PageHighMem(page))
+			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+	}
+}
+
+#ifdef CONFIG_HIGHPTE
+static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+{
+	pgprot_t prot = PAGE_KERNEL;
+
+	if (PagePinned(page))
+		prot = PAGE_KERNEL_RO;
+
+	if (0 && PageHighMem(page))
+		printk("mapping highpte %lx type %d prot %s\n",
+		       page_to_pfn(page), type,
+		       (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
+
+	return kmap_atomic_prot(page, type, prot);
+}
+#endif
+
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
+	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
+		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+			       pte_val_ma(pte));
+
+	return pte;
+}
+
+/* Init-time set_pte while constructing initial pagetables, which
+   doesn't allow RO pagetable pages to be remapped RW */
+static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+	pte = mask_rw_pte(ptep, pte);
+
+	xen_set_pte(ptep, pte);
+}
+
+static __init void xen_pagetable_setup_start(pgd_t *base)
+{
+	pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
+
+	/* special set_pte for pagetable initialization */
+	paravirt_ops.set_pte = xen_set_pte_init;
+
+	init_mm.pgd = base;
+	/*
+	 * copy top-level of Xen-supplied pagetable into place.  For
+	 * !PAE we can use this as-is, but for PAE it is a stand-in
+	 * while we copy the pmd pages.
+	 */
+	memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
+
+	if (PTRS_PER_PMD > 1) {
+		int i;
+		/*
+		 * For PAE, need to allocate new pmds, rather than
+		 * share Xen's, since Xen doesn't like pmd's being
+		 * shared between address spaces.
+		 */
+		for (i = 0; i < PTRS_PER_PGD; i++) {
+			if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
+				pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+
+				memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
+				       PAGE_SIZE);
+
+				make_lowmem_page_readonly(pmd);
+
+				set_pgd(&base[i], __pgd(1 + __pa(pmd)));
+			} else
+				pgd_clear(&base[i]);
+		}
+	}
+
+	/* make sure zero_page is mapped RO so we can use it in pagetables */
+	make_lowmem_page_readonly(empty_zero_page);
+	make_lowmem_page_readonly(base);
+	/*
+	 * Switch to new pagetable.  This is done before
+	 * pagetable_init has done anything so that the new pages
+	 * added to the table can be prepared properly for Xen.
+	 */
+	xen_write_cr3(__pa(base));
+}
+
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+	/* This will work as long as patching hasn't happened yet
+	   (which it hasn't) */
+	paravirt_ops.alloc_pt = xen_alloc_pt;
+	paravirt_ops.set_pte = xen_set_pte;
+
+	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+		/*
+		 * Create a mapping for the shared info page.
+		 * Should be set_fixmap(), but shared_info is a machine
+		 * address with no corresponding pseudo-phys address.
+		 */
+		set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
+			    PFN_DOWN(xen_start_info->shared_info),
+			    PAGE_KERNEL);
+
+		HYPERVISOR_shared_info =
+			(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
+
+	} else
+		HYPERVISOR_shared_info =
+			(struct shared_info *)__va(xen_start_info->shared_info);
+
+	/* Actually pin the pagetable down, but we can't set PG_pinned
+	   yet because the page structures don't exist yet. */
+	{
+		struct mmuext_op op;
+#ifdef CONFIG_X86_PAE
+		op.cmd = MMUEXT_PIN_L3_TABLE;
+#else
+		op.cmd = MMUEXT_PIN_L2_TABLE;
+#endif
817 | op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); | ||
818 | if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) | ||
819 | BUG(); | ||
820 | } | ||
821 | } | ||
822 | |||
823 | /* This is called once we have the cpu_possible_map */ | ||
824 | void __init xen_setup_vcpu_info_placement(void) | ||
825 | { | ||
826 | int cpu; | ||
827 | |||
828 | for_each_possible_cpu(cpu) | ||
829 | xen_vcpu_setup(cpu); | ||
830 | |||
831 | /* xen_vcpu_setup managed to place the vcpu_info within the | ||
832 | percpu area for all cpus, so make use of it */ | ||
833 | if (have_vcpu_info_placement) { | ||
834 | printk(KERN_INFO "Xen: using vcpu_info placement\n"); | ||
835 | |||
836 | paravirt_ops.save_fl = xen_save_fl_direct; | ||
837 | paravirt_ops.restore_fl = xen_restore_fl_direct; | ||
838 | paravirt_ops.irq_disable = xen_irq_disable_direct; | ||
839 | paravirt_ops.irq_enable = xen_irq_enable_direct; | ||
840 | paravirt_ops.read_cr2 = xen_read_cr2_direct; | ||
841 | paravirt_ops.iret = xen_iret_direct; | ||
842 | } | ||
843 | } | ||
844 | |||
845 | static unsigned xen_patch(u8 type, u16 clobbers, void *insns, unsigned len) | ||
846 | { | ||
847 | char *start, *end, *reloc; | ||
848 | unsigned ret; | ||
849 | |||
850 | start = end = reloc = NULL; | ||
851 | |||
852 | #define SITE(x) \ | ||
853 | case PARAVIRT_PATCH(x): \ | ||
854 | if (have_vcpu_info_placement) { \ | ||
855 | start = (char *)xen_##x##_direct; \ | ||
856 | end = xen_##x##_direct_end; \ | ||
857 | reloc = xen_##x##_direct_reloc; \ | ||
858 | } \ | ||
859 | goto patch_site | ||
860 | |||
861 | switch (type) { | ||
862 | SITE(irq_enable); | ||
863 | SITE(irq_disable); | ||
864 | SITE(save_fl); | ||
865 | SITE(restore_fl); | ||
866 | #undef SITE | ||
867 | |||
868 | patch_site: | ||
869 | if (start == NULL || (end-start) > len) | ||
870 | goto default_patch; | ||
871 | |||
872 | ret = paravirt_patch_insns(insns, len, start, end); | ||
873 | |||
874 | /* Note: because reloc is assigned from something that | ||
875 | appears to be an array, gcc assumes it's non-null, | ||
876 | but doesn't know its relationship with start and | ||
877 | end. */ | ||
878 | if (reloc > start && reloc < end) { | ||
879 | int reloc_off = reloc - start; | ||
880 | long *relocp = (long *)(insns + reloc_off); | ||
881 | long delta = start - (char *)insns; | ||
882 | |||
883 | *relocp += delta; | ||
884 | } | ||
885 | break; | ||
886 | |||
887 | default_patch: | ||
888 | default: | ||
889 | ret = paravirt_patch_default(type, clobbers, insns, len); | ||
890 | break; | ||
891 | } | ||
892 | |||
893 | return ret; | ||
894 | } | ||
895 | |||
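paravirt_patch_insns() itself is not part of this patch. As an assumption about its behavior, inferred from the call above rather than quoted from the kernel, it simply copies the replacement stub into the call site when it fits:

	/* Sketch under assumptions: inline the stub bytes [start, end)
	 * into the patch-site buffer, or report the full length so the
	 * caller keeps the indirect call in place. */
	static unsigned patch_insns_sketch(void *insnbuf, unsigned len,
					   const char *start, const char *end)
	{
		unsigned insn_len = end - start;

		if (start == NULL || insn_len > len)
			return len;	/* doesn't fit: leave the site alone */

		memcpy(insnbuf, start, insn_len);
		return insn_len;
	}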
896 | static const struct paravirt_ops xen_paravirt_ops __initdata = { | ||
897 | .paravirt_enabled = 1, | ||
898 | .shared_kernel_pmd = 0, | ||
899 | |||
900 | .name = "Xen", | ||
901 | .banner = xen_banner, | ||
902 | |||
903 | .patch = xen_patch, | ||
904 | |||
905 | .memory_setup = xen_memory_setup, | ||
906 | .arch_setup = xen_arch_setup, | ||
907 | .init_IRQ = xen_init_IRQ, | ||
908 | .post_allocator_init = xen_mark_init_mm_pinned, | ||
909 | |||
910 | .time_init = xen_time_init, | ||
911 | .set_wallclock = xen_set_wallclock, | ||
912 | .get_wallclock = xen_get_wallclock, | ||
913 | .get_cpu_khz = xen_cpu_khz, | ||
914 | .sched_clock = xen_sched_clock, | ||
915 | |||
916 | .cpuid = xen_cpuid, | ||
917 | |||
918 | .set_debugreg = xen_set_debugreg, | ||
919 | .get_debugreg = xen_get_debugreg, | ||
920 | |||
921 | .clts = native_clts, | ||
922 | |||
923 | .read_cr0 = native_read_cr0, | ||
924 | .write_cr0 = native_write_cr0, | ||
925 | |||
926 | .read_cr2 = xen_read_cr2, | ||
927 | .write_cr2 = xen_write_cr2, | ||
928 | |||
929 | .read_cr3 = xen_read_cr3, | ||
930 | .write_cr3 = xen_write_cr3, | ||
931 | |||
932 | .read_cr4 = native_read_cr4, | ||
933 | .read_cr4_safe = native_read_cr4_safe, | ||
934 | .write_cr4 = xen_write_cr4, | ||
935 | |||
936 | .save_fl = xen_save_fl, | ||
937 | .restore_fl = xen_restore_fl, | ||
938 | .irq_disable = xen_irq_disable, | ||
939 | .irq_enable = xen_irq_enable, | ||
940 | .safe_halt = xen_safe_halt, | ||
941 | .halt = xen_halt, | ||
942 | .wbinvd = native_wbinvd, | ||
943 | |||
944 | .read_msr = native_read_msr_safe, | ||
945 | .write_msr = native_write_msr_safe, | ||
946 | .read_tsc = native_read_tsc, | ||
947 | .read_pmc = native_read_pmc, | ||
948 | |||
949 | .iret = (void *)&hypercall_page[__HYPERVISOR_iret], | ||
950 | .irq_enable_sysexit = NULL, /* never called */ | ||
951 | |||
952 | .load_tr_desc = paravirt_nop, | ||
953 | .set_ldt = xen_set_ldt, | ||
954 | .load_gdt = xen_load_gdt, | ||
955 | .load_idt = xen_load_idt, | ||
956 | .load_tls = xen_load_tls, | ||
957 | |||
958 | .store_gdt = native_store_gdt, | ||
959 | .store_idt = native_store_idt, | ||
960 | .store_tr = xen_store_tr, | ||
961 | |||
962 | .write_ldt_entry = xen_write_ldt_entry, | ||
963 | .write_gdt_entry = xen_write_gdt_entry, | ||
964 | .write_idt_entry = xen_write_idt_entry, | ||
965 | .load_esp0 = xen_load_esp0, | ||
966 | |||
967 | .set_iopl_mask = xen_set_iopl_mask, | ||
968 | .io_delay = xen_io_delay, | ||
969 | |||
970 | #ifdef CONFIG_X86_LOCAL_APIC | ||
971 | .apic_write = xen_apic_write, | ||
972 | .apic_write_atomic = xen_apic_write, | ||
973 | .apic_read = xen_apic_read, | ||
974 | .setup_boot_clock = paravirt_nop, | ||
975 | .setup_secondary_clock = paravirt_nop, | ||
976 | .startup_ipi_hook = paravirt_nop, | ||
977 | #endif | ||
978 | |||
979 | .flush_tlb_user = xen_flush_tlb, | ||
980 | .flush_tlb_kernel = xen_flush_tlb, | ||
981 | .flush_tlb_single = xen_flush_tlb_single, | ||
982 | .flush_tlb_others = xen_flush_tlb_others, | ||
983 | |||
984 | .pte_update = paravirt_nop, | ||
985 | .pte_update_defer = paravirt_nop, | ||
986 | |||
987 | .pagetable_setup_start = xen_pagetable_setup_start, | ||
988 | .pagetable_setup_done = xen_pagetable_setup_done, | ||
989 | |||
990 | .alloc_pt = xen_alloc_pt_init, | ||
991 | .release_pt = xen_release_pt, | ||
992 | .alloc_pd = paravirt_nop, | ||
993 | .alloc_pd_clone = paravirt_nop, | ||
994 | .release_pd = paravirt_nop, | ||
995 | |||
996 | #ifdef CONFIG_HIGHPTE | ||
997 | .kmap_atomic_pte = xen_kmap_atomic_pte, | ||
998 | #endif | ||
999 | |||
1000 | .set_pte = NULL, /* see xen_pagetable_setup_* */ | ||
1001 | .set_pte_at = xen_set_pte_at, | ||
1002 | .set_pmd = xen_set_pmd, | ||
1003 | |||
1004 | .pte_val = xen_pte_val, | ||
1005 | .pgd_val = xen_pgd_val, | ||
1006 | |||
1007 | .make_pte = xen_make_pte, | ||
1008 | .make_pgd = xen_make_pgd, | ||
1009 | |||
1010 | #ifdef CONFIG_X86_PAE | ||
1011 | .set_pte_atomic = xen_set_pte_atomic, | ||
1012 | .set_pte_present = xen_set_pte_at, | ||
1013 | .set_pud = xen_set_pud, | ||
1014 | .pte_clear = xen_pte_clear, | ||
1015 | .pmd_clear = xen_pmd_clear, | ||
1016 | |||
1017 | .make_pmd = xen_make_pmd, | ||
1018 | .pmd_val = xen_pmd_val, | ||
1019 | #endif /* PAE */ | ||
1020 | |||
1021 | .activate_mm = xen_activate_mm, | ||
1022 | .dup_mmap = xen_dup_mmap, | ||
1023 | .exit_mmap = xen_exit_mmap, | ||
1024 | |||
1025 | .set_lazy_mode = xen_set_lazy_mode, | ||
1026 | }; | ||
1027 | |||
1028 | #ifdef CONFIG_SMP | ||
1029 | static const struct smp_ops xen_smp_ops __initdata = { | ||
1030 | .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, | ||
1031 | .smp_prepare_cpus = xen_smp_prepare_cpus, | ||
1032 | .cpu_up = xen_cpu_up, | ||
1033 | .smp_cpus_done = xen_smp_cpus_done, | ||
1034 | |||
1035 | .smp_send_stop = xen_smp_send_stop, | ||
1036 | .smp_send_reschedule = xen_smp_send_reschedule, | ||
1037 | .smp_call_function_mask = xen_smp_call_function_mask, | ||
1038 | }; | ||
1039 | #endif /* CONFIG_SMP */ | ||
1040 | |||
1041 | static void xen_reboot(int reason) | ||
1042 | { | ||
1043 | #ifdef CONFIG_SMP | ||
1044 | smp_send_stop(); | ||
1045 | #endif | ||
1046 | |||
1047 | if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) | ||
1048 | BUG(); | ||
1049 | } | ||
1050 | |||
1051 | static void xen_restart(char *msg) | ||
1052 | { | ||
1053 | xen_reboot(SHUTDOWN_reboot); | ||
1054 | } | ||
1055 | |||
1056 | static void xen_emergency_restart(void) | ||
1057 | { | ||
1058 | xen_reboot(SHUTDOWN_reboot); | ||
1059 | } | ||
1060 | |||
1061 | static void xen_machine_halt(void) | ||
1062 | { | ||
1063 | xen_reboot(SHUTDOWN_poweroff); | ||
1064 | } | ||
1065 | |||
1066 | static void xen_crash_shutdown(struct pt_regs *regs) | ||
1067 | { | ||
1068 | xen_reboot(SHUTDOWN_crash); | ||
1069 | } | ||
1070 | |||
1071 | static const struct machine_ops __initdata xen_machine_ops = { | ||
1072 | .restart = xen_restart, | ||
1073 | .halt = xen_machine_halt, | ||
1074 | .power_off = xen_machine_halt, | ||
1075 | .shutdown = xen_machine_halt, | ||
1076 | .crash_shutdown = xen_crash_shutdown, | ||
1077 | .emergency_restart = xen_emergency_restart, | ||
1078 | }; | ||
1079 | |||
1080 | |||
1081 | /* First C function to be called on Xen boot */ | ||
1082 | asmlinkage void __init xen_start_kernel(void) | ||
1083 | { | ||
1084 | pgd_t *pgd; | ||
1085 | |||
1086 | if (!xen_start_info) | ||
1087 | return; | ||
1088 | |||
1089 | BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); | ||
1090 | |||
1091 | /* Install Xen paravirt ops */ | ||
1092 | paravirt_ops = xen_paravirt_ops; | ||
1093 | machine_ops = xen_machine_ops; | ||
1094 | |||
1095 | #ifdef CONFIG_SMP | ||
1096 | smp_ops = xen_smp_ops; | ||
1097 | #endif | ||
1098 | |||
1099 | xen_setup_features(); | ||
1100 | |||
1101 | /* Get mfn list */ | ||
1102 | if (!xen_feature(XENFEAT_auto_translated_physmap)) | ||
1103 | phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; | ||
1104 | |||
1105 | pgd = (pgd_t *)xen_start_info->pt_base; | ||
1106 | |||
1107 | init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; | ||
1108 | |||
1109 | init_mm.pgd = pgd; /* use the Xen pagetables to start */ | ||
1110 | |||
1111 | /* keep using Xen gdt for now; no urgent need to change it */ | ||
1112 | |||
1113 | x86_write_percpu(xen_cr3, __pa(pgd)); | ||
1114 | |||
1115 | #ifdef CONFIG_SMP | ||
1116 | /* Don't do the full vcpu_info placement stuff until we have the | ||
1117 | cpu_possible_map. */ | ||
1118 | per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; | ||
1119 | #else | ||
1120 | /* May as well do it now, since there's no good time to call | ||
1121 | it later on UP. */ | ||
1122 | xen_setup_vcpu_info_placement(); | ||
1123 | #endif | ||
1124 | |||
1125 | paravirt_ops.kernel_rpl = 1; | ||
1126 | if (xen_feature(XENFEAT_supervisor_mode_kernel)) | ||
1127 | paravirt_ops.kernel_rpl = 0; | ||
1128 | |||
1129 | /* set the limit of our address space */ | ||
1130 | reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); | ||
1131 | |||
1132 | /* set up basic CPUID stuff */ | ||
1133 | cpu_detect(&new_cpu_data); | ||
1134 | new_cpu_data.hard_math = 1; | ||
1135 | new_cpu_data.x86_capability[0] = cpuid_edx(1); | ||
1136 | |||
1137 | /* Poke various useful things into boot_params */ | ||
1138 | LOADER_TYPE = (9 << 4) | 0; | ||
1139 | INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; | ||
1140 | INITRD_SIZE = xen_start_info->mod_len; | ||
1141 | |||
1142 | /* Start the world */ | ||
1143 | start_kernel(); | ||
1144 | } | ||
diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c new file mode 100644 index 000000000000..8904acc20f8c --- /dev/null +++ b/arch/i386/xen/events.c | |||
@@ -0,0 +1,590 @@ | |||
1 | /* | ||
2 | * Xen event channels | ||
3 | * | ||
4 | * Xen models interrupts with abstract event channels. Because each | ||
5 | * domain gets 1024 event channels, but NR_IRQS is not that large, we | ||
6 | * must dynamically map irqs<->event channels. The event channels | ||
7 | * interface with the rest of the kernel by defining a Xen interrupt | ||
8 | * chip. When an event is received, it is mapped to an irq and sent | ||
9 | * through the normal interrupt processing path. | ||
10 | * | ||
11 | * There are four kinds of events which can be mapped to an event | ||
12 | * channel: | ||
13 | * | ||
14 | * 1. Inter-domain notifications. This includes all the virtual | ||
15 | * device events, since they're driven by front-ends in another domain | ||
16 | * (typically dom0). | ||
17 | * 2. VIRQs, typically used for timers. These are per-cpu events. | ||
18 | * 3. IPIs. | ||
19 | * 4. Hardware interrupts. Not supported at present. | ||
20 | * | ||
21 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | ||
22 | */ | ||
23 | |||
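For orientation, a hedged usage sketch of the interface this file exports: bind_virq_to_irqhandler(), defined later in this file, allocates an irq for a VIRQ and wires a handler into the normal interrupt path. The handler and wrapper names here are hypothetical, not part of the patch.

	/* Hypothetical example: bind this CPU's timer VIRQ to a handler
	 * via the interface defined later in this file. */
	static irqreturn_t example_timer_interrupt(int irq, void *dev_id)
	{
		/* ... per-tick work ... */
		return IRQ_HANDLED;
	}

	static int example_bind_timer(void)
	{
		int irq = bind_virq_to_irqhandler(VIRQ_TIMER, smp_processor_id(),
						  example_timer_interrupt,
						  IRQF_DISABLED, "timer", NULL);

		return irq < 0 ? irq : 0;	/* negative value is an errno */
	}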
24 | #include <linux/linkage.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/irq.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/string.h> | ||
29 | |||
30 | #include <asm/ptrace.h> | ||
31 | #include <asm/irq.h> | ||
32 | #include <asm/sync_bitops.h> | ||
33 | #include <asm/xen/hypercall.h> | ||
34 | |||
35 | #include <xen/events.h> | ||
36 | #include <xen/interface/xen.h> | ||
37 | #include <xen/interface/event_channel.h> | ||
38 | |||
39 | #include "xen-ops.h" | ||
40 | |||
41 | /* | ||
42 | * This lock protects updates to the following mapping and reference-count | ||
43 | * arrays. The lock does not need to be acquired to read the mapping tables. | ||
44 | */ | ||
45 | static DEFINE_SPINLOCK(irq_mapping_update_lock); | ||
46 | |||
47 | /* IRQ <-> VIRQ mapping. */ | ||
48 | static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; | ||
49 | |||
50 | /* IRQ <-> IPI mapping */ | ||
51 | static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; | ||
52 | |||
53 | /* Packed IRQ information: binding type, sub-type index, and event channel. */ | ||
54 | struct packed_irq | ||
55 | { | ||
56 | unsigned short evtchn; | ||
57 | unsigned char index; | ||
58 | unsigned char type; | ||
59 | }; | ||
60 | |||
61 | static struct packed_irq irq_info[NR_IRQS]; | ||
62 | |||
63 | /* Binding types. */ | ||
64 | enum { | ||
65 | IRQT_UNBOUND, | ||
66 | IRQT_PIRQ, | ||
67 | IRQT_VIRQ, | ||
68 | IRQT_IPI, | ||
69 | IRQT_EVTCHN | ||
70 | }; | ||
71 | |||
72 | /* Convenient shorthand for packed representation of an unbound IRQ. */ | ||
73 | #define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) | ||
74 | |||
75 | static int evtchn_to_irq[NR_EVENT_CHANNELS] = { | ||
76 | [0 ... NR_EVENT_CHANNELS-1] = -1 | ||
77 | }; | ||
78 | static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; | ||
79 | static u8 cpu_evtchn[NR_EVENT_CHANNELS]; | ||
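For scale: with the 1024 event channels mentioned in the header comment and 32-bit longs on i386, each per-CPU row of cpu_evtchn_mask is 1024/32 = 32 unsigned longs (128 bytes), and cpu_evtchn is a flat 1024-byte channel-to-cpu table.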
80 | |||
81 | /* Reference counts for bindings to IRQs. */ | ||
82 | static int irq_bindcount[NR_IRQS]; | ||
83 | |||
84 | /* Xen will never allocate port zero for any purpose. */ | ||
85 | #define VALID_EVTCHN(chn) ((chn) != 0) | ||
86 | |||
87 | /* | ||
88 | * Force a proper event-channel callback from Xen after clearing the | ||
89 | * callback mask. We do this in a very simple manner, by making a call | ||
90 | * down into Xen. The pending flag will be checked by Xen on return. | ||
91 | */ | ||
92 | void force_evtchn_callback(void) | ||
93 | { | ||
94 | (void)HYPERVISOR_xen_version(0, NULL); | ||
95 | } | ||
96 | EXPORT_SYMBOL_GPL(force_evtchn_callback); | ||
97 | |||
98 | static struct irq_chip xen_dynamic_chip; | ||
99 | |||
100 | /* Constructor for packed IRQ information. */ | ||
101 | static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) | ||
102 | { | ||
103 | return (struct packed_irq) { evtchn, index, type }; | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * Accessors for packed IRQ information. | ||
108 | */ | ||
109 | static inline unsigned int evtchn_from_irq(int irq) | ||
110 | { | ||
111 | return irq_info[irq].evtchn; | ||
112 | } | ||
113 | |||
114 | static inline unsigned int index_from_irq(int irq) | ||
115 | { | ||
116 | return irq_info[irq].index; | ||
117 | } | ||
118 | |||
119 | static inline unsigned int type_from_irq(int irq) | ||
120 | { | ||
121 | return irq_info[irq].type; | ||
122 | } | ||
123 | |||
124 | static inline unsigned long active_evtchns(unsigned int cpu, | ||
125 | struct shared_info *sh, | ||
126 | unsigned int idx) | ||
127 | { | ||
128 | return (sh->evtchn_pending[idx] & | ||
129 | cpu_evtchn_mask[cpu][idx] & | ||
130 | ~sh->evtchn_mask[idx]); | ||
131 | } | ||
132 | |||
133 | static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) | ||
134 | { | ||
135 | int irq = evtchn_to_irq[chn]; | ||
136 | |||
137 | BUG_ON(irq == -1); | ||
138 | #ifdef CONFIG_SMP | ||
139 | irq_desc[irq].affinity = cpumask_of_cpu(cpu); | ||
140 | #endif | ||
141 | |||
142 | __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); | ||
143 | __set_bit(chn, cpu_evtchn_mask[cpu]); | ||
144 | |||
145 | cpu_evtchn[chn] = cpu; | ||
146 | } | ||
147 | |||
148 | static void init_evtchn_cpu_bindings(void) | ||
149 | { | ||
150 | #ifdef CONFIG_SMP | ||
151 | int i; | ||
152 | /* By default all event channels notify CPU#0. */ | ||
153 | for (i = 0; i < NR_IRQS; i++) | ||
154 | irq_desc[i].affinity = cpumask_of_cpu(0); | ||
155 | #endif | ||
156 | |||
157 | memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); | ||
158 | memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); | ||
159 | } | ||
160 | |||
161 | static inline unsigned int cpu_from_evtchn(unsigned int evtchn) | ||
162 | { | ||
163 | return cpu_evtchn[evtchn]; | ||
164 | } | ||
165 | |||
166 | static inline void clear_evtchn(int port) | ||
167 | { | ||
168 | struct shared_info *s = HYPERVISOR_shared_info; | ||
169 | sync_clear_bit(port, &s->evtchn_pending[0]); | ||
170 | } | ||
171 | |||
172 | static inline void set_evtchn(int port) | ||
173 | { | ||
174 | struct shared_info *s = HYPERVISOR_shared_info; | ||
175 | sync_set_bit(port, &s->evtchn_pending[0]); | ||
176 | } | ||
177 | |||
178 | |||
179 | /** | ||
180 | * notify_remote_via_irq - send event to remote end of event channel via irq | ||
181 | * @irq: irq of event channel to send event to | ||
182 | * | ||
183 | * Unlike notify_remote_via_evtchn(), this is safe to use across | ||
184 | * save/restore. Notifications on a broken connection are silently | ||
185 | * dropped. | ||
186 | */ | ||
187 | void notify_remote_via_irq(int irq) | ||
188 | { | ||
189 | int evtchn = evtchn_from_irq(irq); | ||
190 | |||
191 | if (VALID_EVTCHN(evtchn)) | ||
192 | notify_remote_via_evtchn(evtchn); | ||
193 | } | ||
194 | EXPORT_SYMBOL_GPL(notify_remote_via_irq); | ||
195 | |||
196 | static void mask_evtchn(int port) | ||
197 | { | ||
198 | struct shared_info *s = HYPERVISOR_shared_info; | ||
199 | sync_set_bit(port, &s->evtchn_mask[0]); | ||
200 | } | ||
201 | |||
202 | static void unmask_evtchn(int port) | ||
203 | { | ||
204 | struct shared_info *s = HYPERVISOR_shared_info; | ||
205 | unsigned int cpu = get_cpu(); | ||
206 | |||
207 | BUG_ON(!irqs_disabled()); | ||
208 | |||
209 | /* Slow path (hypercall) if this is a non-local port. */ | ||
210 | if (unlikely(cpu != cpu_from_evtchn(port))) { | ||
211 | struct evtchn_unmask unmask = { .port = port }; | ||
212 | (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); | ||
213 | } else { | ||
214 | struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); | ||
215 | |||
216 | sync_clear_bit(port, &s->evtchn_mask[0]); | ||
217 | |||
218 | /* | ||
219 | * The following is basically the equivalent of | ||
220 | * 'hw_resend_irq'. Just like a real IO-APIC we 'lose | ||
221 | * the interrupt edge' if the channel is masked. | ||
222 | */ | ||
223 | if (sync_test_bit(port, &s->evtchn_pending[0]) && | ||
224 | !sync_test_and_set_bit(port / BITS_PER_LONG, | ||
225 | &vcpu_info->evtchn_pending_sel)) | ||
226 | vcpu_info->evtchn_upcall_pending = 1; | ||
227 | } | ||
228 | |||
229 | put_cpu(); | ||
230 | } | ||
231 | |||
232 | static int find_unbound_irq(void) | ||
233 | { | ||
234 | int irq; | ||
235 | |||
236 | /* Only allocate from dynirq range */ | ||
237 | for (irq = 0; irq < NR_IRQS; irq++) | ||
238 | if (irq_bindcount[irq] == 0) | ||
239 | break; | ||
240 | |||
241 | if (irq == NR_IRQS) | ||
242 | panic("No available IRQ to bind to: increase NR_IRQS!\n"); | ||
243 | |||
244 | return irq; | ||
245 | } | ||
246 | |||
247 | int bind_evtchn_to_irq(unsigned int evtchn) | ||
248 | { | ||
249 | int irq; | ||
250 | |||
251 | spin_lock(&irq_mapping_update_lock); | ||
252 | |||
253 | irq = evtchn_to_irq[evtchn]; | ||
254 | |||
255 | if (irq == -1) { | ||
256 | irq = find_unbound_irq(); | ||
257 | |||
258 | dynamic_irq_init(irq); | ||
259 | set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, | ||
260 | handle_level_irq, "event"); | ||
261 | |||
262 | evtchn_to_irq[evtchn] = irq; | ||
263 | irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); | ||
264 | } | ||
265 | |||
266 | irq_bindcount[irq]++; | ||
267 | |||
268 | spin_unlock(&irq_mapping_update_lock); | ||
269 | |||
270 | return irq; | ||
271 | } | ||
272 | EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); | ||
273 | |||
274 | static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) | ||
275 | { | ||
276 | struct evtchn_bind_ipi bind_ipi; | ||
277 | int evtchn, irq; | ||
278 | |||
279 | spin_lock(&irq_mapping_update_lock); | ||
280 | |||
281 | irq = per_cpu(ipi_to_irq, cpu)[ipi]; | ||
282 | if (irq == -1) { | ||
283 | irq = find_unbound_irq(); | ||
284 | if (irq < 0) | ||
285 | goto out; | ||
286 | |||
287 | dynamic_irq_init(irq); | ||
288 | set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, | ||
289 | handle_level_irq, "ipi"); | ||
290 | |||
291 | bind_ipi.vcpu = cpu; | ||
292 | if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, | ||
293 | &bind_ipi) != 0) | ||
294 | BUG(); | ||
295 | evtchn = bind_ipi.port; | ||
296 | |||
297 | evtchn_to_irq[evtchn] = irq; | ||
298 | irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); | ||
299 | |||
300 | per_cpu(ipi_to_irq, cpu)[ipi] = irq; | ||
301 | |||
302 | bind_evtchn_to_cpu(evtchn, cpu); | ||
303 | } | ||
304 | |||
305 | irq_bindcount[irq]++; | ||
306 | |||
307 | out: | ||
308 | spin_unlock(&irq_mapping_update_lock); | ||
309 | return irq; | ||
310 | } | ||
311 | |||
312 | |||
313 | static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) | ||
314 | { | ||
315 | struct evtchn_bind_virq bind_virq; | ||
316 | int evtchn, irq; | ||
317 | |||
318 | spin_lock(&irq_mapping_update_lock); | ||
319 | |||
320 | irq = per_cpu(virq_to_irq, cpu)[virq]; | ||
321 | |||
322 | if (irq == -1) { | ||
323 | bind_virq.virq = virq; | ||
324 | bind_virq.vcpu = cpu; | ||
325 | if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, | ||
326 | &bind_virq) != 0) | ||
327 | BUG(); | ||
328 | evtchn = bind_virq.port; | ||
329 | |||
330 | irq = find_unbound_irq(); | ||
331 | |||
332 | dynamic_irq_init(irq); | ||
333 | set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, | ||
334 | handle_level_irq, "virq"); | ||
335 | |||
336 | evtchn_to_irq[evtchn] = irq; | ||
337 | irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); | ||
338 | |||
339 | per_cpu(virq_to_irq, cpu)[virq] = irq; | ||
340 | |||
341 | bind_evtchn_to_cpu(evtchn, cpu); | ||
342 | } | ||
343 | |||
344 | irq_bindcount[irq]++; | ||
345 | |||
346 | spin_unlock(&irq_mapping_update_lock); | ||
347 | |||
348 | return irq; | ||
349 | } | ||
350 | |||
351 | static void unbind_from_irq(unsigned int irq) | ||
352 | { | ||
353 | struct evtchn_close close; | ||
354 | int evtchn = evtchn_from_irq(irq); | ||
355 | |||
356 | spin_lock(&irq_mapping_update_lock); | ||
357 | |||
358 | if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { | ||
359 | close.port = evtchn; | ||
360 | if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) | ||
361 | BUG(); | ||
362 | |||
363 | switch (type_from_irq(irq)) { | ||
364 | case IRQT_VIRQ: | ||
365 | per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) | ||
366 | [index_from_irq(irq)] = -1; | ||
367 | break; | ||
368 | default: | ||
369 | break; | ||
370 | } | ||
371 | |||
372 | /* Closed ports are implicitly re-bound to VCPU0. */ | ||
373 | bind_evtchn_to_cpu(evtchn, 0); | ||
374 | |||
375 | evtchn_to_irq[evtchn] = -1; | ||
376 | irq_info[irq] = IRQ_UNBOUND; | ||
377 | |||
378 | dynamic_irq_init(irq); | ||
379 | } | ||
380 | |||
381 | spin_unlock(&irq_mapping_update_lock); | ||
382 | } | ||
383 | |||
384 | int bind_evtchn_to_irqhandler(unsigned int evtchn, | ||
385 | irqreturn_t (*handler)(int, void *), | ||
386 | unsigned long irqflags, | ||
387 | const char *devname, void *dev_id) | ||
388 | { | ||
389 | unsigned int irq; | ||
390 | int retval; | ||
391 | |||
392 | irq = bind_evtchn_to_irq(evtchn); | ||
393 | retval = request_irq(irq, handler, irqflags, devname, dev_id); | ||
394 | if (retval != 0) { | ||
395 | unbind_from_irq(irq); | ||
396 | return retval; | ||
397 | } | ||
398 | |||
399 | return irq; | ||
400 | } | ||
401 | EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); | ||
402 | |||
403 | int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, | ||
404 | irqreturn_t (*handler)(int, void *), | ||
405 | unsigned long irqflags, const char *devname, void *dev_id) | ||
406 | { | ||
407 | unsigned int irq; | ||
408 | int retval; | ||
409 | |||
410 | irq = bind_virq_to_irq(virq, cpu); | ||
411 | retval = request_irq(irq, handler, irqflags, devname, dev_id); | ||
412 | if (retval != 0) { | ||
413 | unbind_from_irq(irq); | ||
414 | return retval; | ||
415 | } | ||
416 | |||
417 | return irq; | ||
418 | } | ||
419 | EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); | ||
420 | |||
421 | int bind_ipi_to_irqhandler(enum ipi_vector ipi, | ||
422 | unsigned int cpu, | ||
423 | irq_handler_t handler, | ||
424 | unsigned long irqflags, | ||
425 | const char *devname, | ||
426 | void *dev_id) | ||
427 | { | ||
428 | int irq, retval; | ||
429 | |||
430 | irq = bind_ipi_to_irq(ipi, cpu); | ||
431 | if (irq < 0) | ||
432 | return irq; | ||
433 | |||
434 | retval = request_irq(irq, handler, irqflags, devname, dev_id); | ||
435 | if (retval != 0) { | ||
436 | unbind_from_irq(irq); | ||
437 | return retval; | ||
438 | } | ||
439 | |||
440 | return irq; | ||
441 | } | ||
442 | |||
443 | void unbind_from_irqhandler(unsigned int irq, void *dev_id) | ||
444 | { | ||
445 | free_irq(irq, dev_id); | ||
446 | unbind_from_irq(irq); | ||
447 | } | ||
448 | EXPORT_SYMBOL_GPL(unbind_from_irqhandler); | ||
449 | |||
450 | void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) | ||
451 | { | ||
452 | int irq = per_cpu(ipi_to_irq, cpu)[vector]; | ||
453 | BUG_ON(irq < 0); | ||
454 | notify_remote_via_irq(irq); | ||
455 | } | ||
456 | |||
457 | |||
458 | /* | ||
459 | * Search the CPU's pending-events bitmasks. For each one found, map | ||
460 | * the event number to an irq, and feed it into do_IRQ() for | ||
461 | * handling. | ||
462 | * | ||
463 | * Xen uses a two-level bitmap to speed searching. The first level is | ||
464 | * a bitset of words which contain pending event bits. The second | ||
465 | * level is a bitset of pending events themselves. | ||
466 | */ | ||
467 | fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) | ||
468 | { | ||
469 | int cpu = get_cpu(); | ||
470 | struct shared_info *s = HYPERVISOR_shared_info; | ||
471 | struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); | ||
472 | unsigned long pending_words; | ||
473 | |||
474 | vcpu_info->evtchn_upcall_pending = 0; | ||
475 | |||
476 | /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ | ||
477 | pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); | ||
478 | while (pending_words != 0) { | ||
479 | unsigned long pending_bits; | ||
480 | int word_idx = __ffs(pending_words); | ||
481 | pending_words &= ~(1UL << word_idx); | ||
482 | |||
483 | while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { | ||
484 | int bit_idx = __ffs(pending_bits); | ||
485 | int port = (word_idx * BITS_PER_LONG) + bit_idx; | ||
486 | int irq = evtchn_to_irq[port]; | ||
487 | |||
488 | if (irq != -1) { | ||
489 | regs->orig_eax = ~irq; | ||
490 | do_IRQ(regs); | ||
491 | } | ||
492 | } | ||
493 | } | ||
494 | |||
495 | put_cpu(); | ||
496 | } | ||
497 | |||
498 | /* Rebind an evtchn so that it gets delivered to a specific cpu */ | ||
499 | static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) | ||
500 | { | ||
501 | struct evtchn_bind_vcpu bind_vcpu; | ||
502 | int evtchn = evtchn_from_irq(irq); | ||
503 | |||
504 | if (!VALID_EVTCHN(evtchn)) | ||
505 | return; | ||
506 | |||
507 | /* Send future instances of this interrupt to the target vcpu. */ | ||
508 | bind_vcpu.port = evtchn; | ||
509 | bind_vcpu.vcpu = tcpu; | ||
510 | |||
511 | /* | ||
512 | * If this fails, it usually just indicates that we're dealing with a | ||
513 | * virq or IPI channel, which don't actually need to be rebound. Ignore | ||
514 | * it, but don't do the xenlinux-level rebind in that case. | ||
515 | */ | ||
516 | if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) | ||
517 | bind_evtchn_to_cpu(evtchn, tcpu); | ||
518 | } | ||
519 | |||
520 | |||
521 | static void set_affinity_irq(unsigned irq, cpumask_t dest) | ||
522 | { | ||
523 | unsigned tcpu = first_cpu(dest); | ||
524 | rebind_irq_to_cpu(irq, tcpu); | ||
525 | } | ||
526 | |||
527 | static void enable_dynirq(unsigned int irq) | ||
528 | { | ||
529 | int evtchn = evtchn_from_irq(irq); | ||
530 | |||
531 | if (VALID_EVTCHN(evtchn)) | ||
532 | unmask_evtchn(evtchn); | ||
533 | } | ||
534 | |||
535 | static void disable_dynirq(unsigned int irq) | ||
536 | { | ||
537 | int evtchn = evtchn_from_irq(irq); | ||
538 | |||
539 | if (VALID_EVTCHN(evtchn)) | ||
540 | mask_evtchn(evtchn); | ||
541 | } | ||
542 | |||
543 | static void ack_dynirq(unsigned int irq) | ||
544 | { | ||
545 | int evtchn = evtchn_from_irq(irq); | ||
546 | |||
547 | move_native_irq(irq); | ||
548 | |||
549 | if (VALID_EVTCHN(evtchn)) | ||
550 | clear_evtchn(evtchn); | ||
551 | } | ||
552 | |||
553 | static int retrigger_dynirq(unsigned int irq) | ||
554 | { | ||
555 | int evtchn = evtchn_from_irq(irq); | ||
556 | int ret = 0; | ||
557 | |||
558 | if (VALID_EVTCHN(evtchn)) { | ||
559 | set_evtchn(evtchn); | ||
560 | ret = 1; | ||
561 | } | ||
562 | |||
563 | return ret; | ||
564 | } | ||
565 | |||
566 | static struct irq_chip xen_dynamic_chip __read_mostly = { | ||
567 | .name = "xen-dyn", | ||
568 | .mask = disable_dynirq, | ||
569 | .unmask = enable_dynirq, | ||
570 | .ack = ack_dynirq, | ||
571 | .set_affinity = set_affinity_irq, | ||
572 | .retrigger = retrigger_dynirq, | ||
573 | }; | ||
574 | |||
575 | void __init xen_init_IRQ(void) | ||
576 | { | ||
577 | int i; | ||
578 | |||
579 | init_evtchn_cpu_bindings(); | ||
580 | |||
581 | /* No event channels are 'live' right now. */ | ||
582 | for (i = 0; i < NR_EVENT_CHANNELS; i++) | ||
583 | mask_evtchn(i); | ||
584 | |||
585 | /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ | ||
586 | for (i = 0; i < NR_IRQS; i++) | ||
587 | irq_bindcount[i] = 0; | ||
588 | |||
589 | irq_ctx_init(smp_processor_id()); | ||
590 | } | ||
diff --git a/arch/i386/xen/features.c b/arch/i386/xen/features.c new file mode 100644 index 000000000000..0707714e40d6 --- /dev/null +++ b/arch/i386/xen/features.c | |||
@@ -0,0 +1,29 @@ | |||
1 | /****************************************************************************** | ||
2 | * features.c | ||
3 | * | ||
4 | * Xen feature flags. | ||
5 | * | ||
6 | * Copyright (c) 2006, Ian Campbell, XenSource Inc. | ||
7 | */ | ||
8 | #include <linux/types.h> | ||
9 | #include <linux/cache.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <asm/xen/hypervisor.h> | ||
12 | #include <xen/features.h> | ||
13 | |||
14 | u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; | ||
15 | EXPORT_SYMBOL_GPL(xen_features); | ||
16 | |||
17 | void xen_setup_features(void) | ||
18 | { | ||
19 | struct xen_feature_info fi; | ||
20 | int i, j; | ||
21 | |||
22 | for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { | ||
23 | fi.submap_idx = i; | ||
24 | if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) | ||
25 | break; | ||
26 | for (j = 0; j < 32; j++) | ||
27 | xen_features[i * 32 + j] = !!(fi.submap & 1<<j); | ||
28 | } | ||
29 | } | ||
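The consumer side of this table is the xen_feature() test used throughout the patch. Its definition lives in xen/features.h rather than here; presumably it is just an array lookup along these lines:

	/* Accessor sketch (the real definition is in xen/features.h): */
	static inline int xen_feature(int flag)
	{
		return xen_features[flag];
	}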
diff --git a/arch/i386/xen/manage.c b/arch/i386/xen/manage.c new file mode 100644 index 000000000000..aa7af9e6abc0 --- /dev/null +++ b/arch/i386/xen/manage.c | |||
@@ -0,0 +1,143 @@ | |||
1 | /* | ||
2 | * Handle external requests for shutdown, reboot and sysrq | ||
3 | */ | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/err.h> | ||
6 | #include <linux/reboot.h> | ||
7 | #include <linux/sysrq.h> | ||
8 | |||
9 | #include <xen/xenbus.h> | ||
10 | |||
11 | #define SHUTDOWN_INVALID -1 | ||
12 | #define SHUTDOWN_POWEROFF 0 | ||
13 | #define SHUTDOWN_SUSPEND 2 | ||
14 | /* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only | ||
15 | * report a crash, not be instructed to crash! | ||
16 | * HALT is the same as POWEROFF, as far as we're concerned. The tools use | ||
17 | * the distinction when we return the reason code to them. | ||
18 | */ | ||
19 | #define SHUTDOWN_HALT 4 | ||
20 | |||
21 | /* Ignore multiple shutdown requests. */ | ||
22 | static int shutting_down = SHUTDOWN_INVALID; | ||
23 | |||
24 | static void shutdown_handler(struct xenbus_watch *watch, | ||
25 | const char **vec, unsigned int len) | ||
26 | { | ||
27 | char *str; | ||
28 | struct xenbus_transaction xbt; | ||
29 | int err; | ||
30 | |||
31 | if (shutting_down != SHUTDOWN_INVALID) | ||
32 | return; | ||
33 | |||
34 | again: | ||
35 | err = xenbus_transaction_start(&xbt); | ||
36 | if (err) | ||
37 | return; | ||
38 | |||
39 | str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); | ||
40 | /* Ignore read errors and empty reads. */ | ||
41 | if (XENBUS_IS_ERR_READ(str)) { | ||
42 | xenbus_transaction_end(xbt, 1); | ||
43 | return; | ||
44 | } | ||
45 | |||
46 | xenbus_write(xbt, "control", "shutdown", ""); | ||
47 | |||
48 | err = xenbus_transaction_end(xbt, 0); | ||
49 | if (err == -EAGAIN) { | ||
50 | kfree(str); | ||
51 | goto again; | ||
52 | } | ||
53 | |||
54 | if (strcmp(str, "poweroff") == 0 || | ||
55 | strcmp(str, "halt") == 0) | ||
56 | orderly_poweroff(false); | ||
57 | else if (strcmp(str, "reboot") == 0) | ||
58 | ctrl_alt_del(); | ||
59 | else { | ||
60 | printk(KERN_INFO "Ignoring shutdown request: %s\n", str); | ||
61 | shutting_down = SHUTDOWN_INVALID; | ||
62 | } | ||
63 | |||
64 | kfree(str); | ||
65 | } | ||
66 | |||
67 | static void sysrq_handler(struct xenbus_watch *watch, const char **vec, | ||
68 | unsigned int len) | ||
69 | { | ||
70 | char sysrq_key = '\0'; | ||
71 | struct xenbus_transaction xbt; | ||
72 | int err; | ||
73 | |||
74 | again: | ||
75 | err = xenbus_transaction_start(&xbt); | ||
76 | if (err) | ||
77 | return; | ||
78 | if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { | ||
79 | printk(KERN_ERR "Unable to read sysrq code in " | ||
80 | "control/sysrq\n"); | ||
81 | xenbus_transaction_end(xbt, 1); | ||
82 | return; | ||
83 | } | ||
84 | |||
85 | if (sysrq_key != '\0') | ||
86 | xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); | ||
87 | |||
88 | err = xenbus_transaction_end(xbt, 0); | ||
89 | if (err == -EAGAIN) | ||
90 | goto again; | ||
91 | |||
92 | if (sysrq_key != '\0') | ||
93 | handle_sysrq(sysrq_key, NULL); | ||
94 | } | ||
95 | |||
96 | static struct xenbus_watch shutdown_watch = { | ||
97 | .node = "control/shutdown", | ||
98 | .callback = shutdown_handler | ||
99 | }; | ||
100 | |||
101 | static struct xenbus_watch sysrq_watch = { | ||
102 | .node = "control/sysrq", | ||
103 | .callback = sysrq_handler | ||
104 | }; | ||
105 | |||
106 | static int setup_shutdown_watcher(void) | ||
107 | { | ||
108 | int err; | ||
109 | |||
110 | err = register_xenbus_watch(&shutdown_watch); | ||
111 | if (err) { | ||
112 | printk(KERN_ERR "Failed to set shutdown watcher\n"); | ||
113 | return err; | ||
114 | } | ||
115 | |||
116 | err = register_xenbus_watch(&sysrq_watch); | ||
117 | if (err) { | ||
118 | printk(KERN_ERR "Failed to set sysrq watcher\n"); | ||
119 | return err; | ||
120 | } | ||
121 | |||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | static int shutdown_event(struct notifier_block *notifier, | ||
126 | unsigned long event, | ||
127 | void *data) | ||
128 | { | ||
129 | setup_shutdown_watcher(); | ||
130 | return NOTIFY_DONE; | ||
131 | } | ||
132 | |||
133 | static int __init setup_shutdown_event(void) | ||
134 | { | ||
135 | static struct notifier_block xenstore_notifier = { | ||
136 | .notifier_call = shutdown_event | ||
137 | }; | ||
138 | register_xenstore_notifier(&xenstore_notifier); | ||
139 | |||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | subsys_initcall(setup_shutdown_event); | ||
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c new file mode 100644 index 000000000000..4ae038aa6c24 --- /dev/null +++ b/arch/i386/xen/mmu.c | |||
@@ -0,0 +1,564 @@ | |||
1 | /* | ||
2 | * Xen mmu operations | ||
3 | * | ||
4 | * This file contains the various mmu fetch and update operations. | ||
5 | * The most important job they must perform is the mapping between the | ||
6 | * domain's pfn and the overall machine mfns. | ||
7 | * | ||
8 | * Xen allows guests to directly update the pagetable, in a controlled | ||
9 | * fashion. In other words, the guest modifies the same pagetable | ||
10 | * that the CPU actually uses, which eliminates the overhead of having | ||
11 | * a separate shadow pagetable. | ||
12 | * | ||
13 | * In order to allow this, it falls on the guest domain to map its | ||
14 | * notion of a "physical" pfn - which is just a domain-local page | ||
15 | * frame number - into a real "machine address" which the CPU's MMU can | ||
16 | * use. | ||
17 | * | ||
18 | * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be | ||
19 | * inserted directly into the pagetable. When creating a new | ||
20 | * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, | ||
21 | * when reading the content back with __(pgd|pmd|pte)_val, it converts | ||
22 | * the mfn back into a pfn. | ||
23 | * | ||
24 | * The other constraint is that all pages which make up a pagetable | ||
25 | * must be mapped read-only in the guest. This prevents uncontrolled | ||
26 | * guest updates to the pagetable. Xen strictly enforces this, and | ||
27 | * will disallow any pagetable update which will end up mapping a | ||
28 | * pagetable page RW, and will disallow using any writable page as a | ||
29 | * pagetable. | ||
30 | * | ||
31 | * Naively, when loading %cr3 with the base of a new pagetable, Xen | ||
32 | * would need to validate the whole pagetable before going on. | ||
33 | * Naturally, this is quite slow. The solution is to "pin" a | ||
34 | * pagetable, which enforces all the constraints on the pagetable even | ||
35 | * when it is not actively in use. This means that Xen can be assured | ||
36 | * that it is still valid when you do load it into %cr3, and doesn't | ||
37 | * need to revalidate it. | ||
38 | * | ||
39 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | ||
40 | */ | ||
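The pfn-to-mfn direction described above is backed by the mfn_list table that xen_start_kernel() (earlier in this patch) stores in phys_to_machine_mapping. A minimal sketch of that lookup, assuming the real helpers in xen/page.h work along these lines:

	/* Sketch under assumptions: translate a domain-local pfn to the
	 * machine frame backing it, or pass it through unchanged when
	 * the hypervisor already auto-translates. */
	static inline unsigned long pfn_to_mfn_sketch(unsigned long pfn)
	{
		if (xen_feature(XENFEAT_auto_translated_physmap))
			return pfn;

		return phys_to_machine_mapping[pfn];
	}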
41 | #include <linux/sched.h> | ||
42 | #include <linux/highmem.h> | ||
43 | #include <linux/bug.h> | ||
45 | |||
46 | #include <asm/pgtable.h> | ||
47 | #include <asm/tlbflush.h> | ||
48 | #include <asm/mmu_context.h> | ||
49 | #include <asm/paravirt.h> | ||
50 | |||
51 | #include <asm/xen/hypercall.h> | ||
52 | #include <asm/xen/hypervisor.h> | ||
53 | |||
54 | #include <xen/page.h> | ||
55 | #include <xen/interface/xen.h> | ||
56 | |||
57 | #include "multicalls.h" | ||
58 | #include "mmu.h" | ||
59 | |||
60 | xmaddr_t arbitrary_virt_to_machine(unsigned long address) | ||
61 | { | ||
62 | pte_t *pte = lookup_address(address); | ||
63 | unsigned offset = address & ~PAGE_MASK; | ||
64 | |||
65 | BUG_ON(pte == NULL); | ||
66 | |||
67 | return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); | ||
68 | } | ||
69 | |||
70 | void make_lowmem_page_readonly(void *vaddr) | ||
71 | { | ||
72 | pte_t *pte, ptev; | ||
73 | unsigned long address = (unsigned long)vaddr; | ||
74 | |||
75 | pte = lookup_address(address); | ||
76 | BUG_ON(pte == NULL); | ||
77 | |||
78 | ptev = pte_wrprotect(*pte); | ||
79 | |||
80 | if (HYPERVISOR_update_va_mapping(address, ptev, 0)) | ||
81 | BUG(); | ||
82 | } | ||
83 | |||
84 | void make_lowmem_page_readwrite(void *vaddr) | ||
85 | { | ||
86 | pte_t *pte, ptev; | ||
87 | unsigned long address = (unsigned long)vaddr; | ||
88 | |||
89 | pte = lookup_address(address); | ||
90 | BUG_ON(pte == NULL); | ||
91 | |||
92 | ptev = pte_mkwrite(*pte); | ||
93 | |||
94 | if (HYPERVISOR_update_va_mapping(address, ptev, 0)) | ||
95 | BUG(); | ||
96 | } | ||
97 | |||
98 | |||
99 | void xen_set_pmd(pmd_t *ptr, pmd_t val) | ||
100 | { | ||
101 | struct multicall_space mcs; | ||
102 | struct mmu_update *u; | ||
103 | |||
104 | preempt_disable(); | ||
105 | |||
106 | mcs = xen_mc_entry(sizeof(*u)); | ||
107 | u = mcs.args; | ||
108 | u->ptr = virt_to_machine(ptr).maddr; | ||
109 | u->val = pmd_val_ma(val); | ||
110 | MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); | ||
111 | |||
112 | xen_mc_issue(PARAVIRT_LAZY_MMU); | ||
113 | |||
114 | preempt_enable(); | ||
115 | } | ||
116 | |||
117 | /* | ||
118 | * Associate a virtual page frame with a given machine page frame | ||
119 | * and protection flags for that frame. | ||
120 | */ | ||
121 | void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) | ||
122 | { | ||
123 | pgd_t *pgd; | ||
124 | pud_t *pud; | ||
125 | pmd_t *pmd; | ||
126 | pte_t *pte; | ||
127 | |||
128 | pgd = swapper_pg_dir + pgd_index(vaddr); | ||
129 | if (pgd_none(*pgd)) { | ||
130 | BUG(); | ||
131 | return; | ||
132 | } | ||
133 | pud = pud_offset(pgd, vaddr); | ||
134 | if (pud_none(*pud)) { | ||
135 | BUG(); | ||
136 | return; | ||
137 | } | ||
138 | pmd = pmd_offset(pud, vaddr); | ||
139 | if (pmd_none(*pmd)) { | ||
140 | BUG(); | ||
141 | return; | ||
142 | } | ||
143 | pte = pte_offset_kernel(pmd, vaddr); | ||
144 | /* <mfn,flags> stored as-is, to permit clearing entries */ | ||
145 | xen_set_pte(pte, mfn_pte(mfn, flags)); | ||
146 | |||
147 | /* | ||
148 | * It's enough to flush this one mapping. | ||
149 | * (PGE mappings get flushed as well) | ||
150 | */ | ||
151 | __flush_tlb_one(vaddr); | ||
152 | } | ||
153 | |||
154 | void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | ||
155 | pte_t *ptep, pte_t pteval) | ||
156 | { | ||
157 | if (mm == current->mm || mm == &init_mm) { | ||
158 | if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { | ||
159 | struct multicall_space mcs; | ||
160 | mcs = xen_mc_entry(0); | ||
161 | |||
162 | MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); | ||
163 | xen_mc_issue(PARAVIRT_LAZY_MMU); | ||
164 | return; | ||
165 | } else if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) | ||
166 | return; | ||
168 | } | ||
169 | xen_set_pte(ptep, pteval); | ||
170 | } | ||
171 | |||
172 | #ifdef CONFIG_X86_PAE | ||
173 | void xen_set_pud(pud_t *ptr, pud_t val) | ||
174 | { | ||
175 | struct multicall_space mcs; | ||
176 | struct mmu_update *u; | ||
177 | |||
178 | preempt_disable(); | ||
179 | |||
180 | mcs = xen_mc_entry(sizeof(*u)); | ||
181 | u = mcs.args; | ||
182 | u->ptr = virt_to_machine(ptr).maddr; | ||
183 | u->val = pud_val_ma(val); | ||
184 | MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); | ||
185 | |||
186 | xen_mc_issue(PARAVIRT_LAZY_MMU); | ||
187 | |||
188 | preempt_enable(); | ||
189 | } | ||
190 | |||
191 | void xen_set_pte(pte_t *ptep, pte_t pte) | ||
192 | { | ||
193 | ptep->pte_high = pte.pte_high; | ||
194 | smp_wmb(); | ||
195 | ptep->pte_low = pte.pte_low; | ||
196 | } | ||
197 | |||
198 | void xen_set_pte_atomic(pte_t *ptep, pte_t pte) | ||
199 | { | ||
200 | set_64bit((u64 *)ptep, pte_val_ma(pte)); | ||
201 | } | ||
202 | |||
203 | void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | ||
204 | { | ||
205 | ptep->pte_low = 0; | ||
206 | smp_wmb(); /* make sure low gets written first */ | ||
207 | ptep->pte_high = 0; | ||
208 | } | ||
209 | |||
210 | void xen_pmd_clear(pmd_t *pmdp) | ||
211 | { | ||
212 | xen_set_pmd(pmdp, __pmd(0)); | ||
213 | } | ||
214 | |||
215 | unsigned long long xen_pte_val(pte_t pte) | ||
216 | { | ||
217 | unsigned long long ret = 0; | ||
218 | |||
219 | if (pte.pte_low) { | ||
220 | ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; | ||
221 | ret = machine_to_phys(XMADDR(ret)).paddr | 1; | ||
222 | } | ||
223 | |||
224 | return ret; | ||
225 | } | ||
226 | |||
227 | unsigned long long xen_pmd_val(pmd_t pmd) | ||
228 | { | ||
229 | unsigned long long ret = pmd.pmd; | ||
230 | if (ret) | ||
231 | ret = machine_to_phys(XMADDR(ret)).paddr | 1; | ||
232 | return ret; | ||
233 | } | ||
234 | |||
235 | unsigned long long xen_pgd_val(pgd_t pgd) | ||
236 | { | ||
237 | unsigned long long ret = pgd.pgd; | ||
238 | if (ret) | ||
239 | ret = machine_to_phys(XMADDR(ret)).paddr | 1; | ||
240 | return ret; | ||
241 | } | ||
242 | |||
243 | pte_t xen_make_pte(unsigned long long pte) | ||
244 | { | ||
245 | if (pte & 1) | ||
246 | pte = phys_to_machine(XPADDR(pte)).maddr; | ||
247 | |||
248 | return (pte_t){ pte, pte >> 32 }; | ||
249 | } | ||
250 | |||
251 | pmd_t xen_make_pmd(unsigned long long pmd) | ||
252 | { | ||
253 | if (pmd & 1) | ||
254 | pmd = phys_to_machine(XPADDR(pmd)).maddr; | ||
255 | |||
256 | return (pmd_t){ pmd }; | ||
257 | } | ||
258 | |||
259 | pgd_t xen_make_pgd(unsigned long long pgd) | ||
260 | { | ||
261 | if (pgd & _PAGE_PRESENT) | ||
262 | pgd = phys_to_machine(XPADDR(pgd)).maddr; | ||
263 | |||
264 | return (pgd_t){ pgd }; | ||
265 | } | ||
266 | #else /* !PAE */ | ||
267 | void xen_set_pte(pte_t *ptep, pte_t pte) | ||
268 | { | ||
269 | *ptep = pte; | ||
270 | } | ||
271 | |||
272 | unsigned long xen_pte_val(pte_t pte) | ||
273 | { | ||
274 | unsigned long ret = pte.pte_low; | ||
275 | |||
276 | if (ret & _PAGE_PRESENT) | ||
277 | ret = machine_to_phys(XMADDR(ret)).paddr; | ||
278 | |||
279 | return ret; | ||
280 | } | ||
281 | |||
282 | unsigned long xen_pgd_val(pgd_t pgd) | ||
283 | { | ||
284 | unsigned long ret = pgd.pgd; | ||
285 | if (ret) | ||
286 | ret = machine_to_phys(XMADDR(ret)).paddr | 1; | ||
287 | return ret; | ||
288 | } | ||
289 | |||
290 | pte_t xen_make_pte(unsigned long pte) | ||
291 | { | ||
292 | if (pte & _PAGE_PRESENT) | ||
293 | pte = phys_to_machine(XPADDR(pte)).maddr; | ||
294 | |||
295 | return (pte_t){ pte }; | ||
296 | } | ||
297 | |||
298 | pgd_t xen_make_pgd(unsigned long pgd) | ||
299 | { | ||
300 | if (pgd & _PAGE_PRESENT) | ||
301 | pgd = phys_to_machine(XPADDR(pgd)).maddr; | ||
302 | |||
303 | return (pgd_t){ pgd }; | ||
304 | } | ||
305 | #endif /* CONFIG_X86_PAE */ | ||
306 | |||
307 | |||
308 | |||
309 | /* | ||
310 | (Yet another) pagetable walker. This one is intended for pinning a | ||
311 | pagetable. This means that it walks a pagetable and calls the | ||
312 | callback function on each page it finds making up the page table, | ||
313 | at every level. It walks the entire pagetable, but it only bothers | ||
314 | pinning pte pages which are below pte_limit. In the normal case | ||
315 | this will be TASK_SIZE, but at boot we need to pin up to | ||
316 | FIXADDR_TOP. But the important bit is that we don't pin beyond | ||
317 | there, because then we start getting into Xen's ptes. | ||
318 | */ | ||
319 | static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), | ||
320 | unsigned long limit) | ||
321 | { | ||
322 | pgd_t *pgd = pgd_base; | ||
323 | int flush = 0; | ||
324 | unsigned long addr = 0; | ||
325 | unsigned long pgd_next; | ||
326 | |||
327 | BUG_ON(limit > FIXADDR_TOP); | ||
328 | |||
329 | if (xen_feature(XENFEAT_auto_translated_physmap)) | ||
330 | return 0; | ||
331 | |||
332 | for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { | ||
333 | pud_t *pud; | ||
334 | unsigned long pud_limit, pud_next; | ||
335 | |||
336 | pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); | ||
337 | |||
338 | if (!pgd_val(*pgd)) | ||
339 | continue; | ||
340 | |||
341 | pud = pud_offset(pgd, 0); | ||
342 | |||
343 | if (PTRS_PER_PUD > 1) /* not folded */ | ||
344 | flush |= (*func)(virt_to_page(pud), 0); | ||
345 | |||
346 | for (; addr != pud_limit; pud++, addr = pud_next) { | ||
347 | pmd_t *pmd; | ||
348 | unsigned long pmd_limit; | ||
349 | |||
350 | pud_next = pud_addr_end(addr, pud_limit); | ||
351 | |||
352 | if (pud_next < limit) | ||
353 | pmd_limit = pud_next; | ||
354 | else | ||
355 | pmd_limit = limit; | ||
356 | |||
357 | if (pud_none(*pud)) | ||
358 | continue; | ||
359 | |||
360 | pmd = pmd_offset(pud, 0); | ||
361 | |||
362 | if (PTRS_PER_PMD > 1) /* not folded */ | ||
363 | flush |= (*func)(virt_to_page(pmd), 0); | ||
364 | |||
365 | for (; addr != pmd_limit; pmd++) { | ||
366 | addr += (PAGE_SIZE * PTRS_PER_PTE); | ||
367 | if ((pmd_limit-1) < (addr-1)) { | ||
368 | addr = pmd_limit; | ||
369 | break; | ||
370 | } | ||
371 | |||
372 | if (pmd_none(*pmd)) | ||
373 | continue; | ||
374 | |||
375 | flush |= (*func)(pmd_page(*pmd), 0); | ||
376 | } | ||
377 | } | ||
378 | } | ||
379 | |||
380 | flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); | ||
381 | |||
382 | return flush; | ||
383 | } | ||
384 | |||
385 | static int pin_page(struct page *page, unsigned flags) | ||
386 | { | ||
387 | unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); | ||
388 | int flush; | ||
389 | |||
390 | if (pgfl) | ||
391 | flush = 0; /* already pinned */ | ||
392 | else if (PageHighMem(page)) | ||
393 | /* kmaps need flushing if we found an unpinned | ||
394 | highpage */ | ||
395 | flush = 1; | ||
396 | else { | ||
397 | void *pt = lowmem_page_address(page); | ||
398 | unsigned long pfn = page_to_pfn(page); | ||
399 | struct multicall_space mcs = __xen_mc_entry(0); | ||
400 | |||
401 | flush = 0; | ||
402 | |||
403 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | ||
404 | pfn_pte(pfn, PAGE_KERNEL_RO), | ||
405 | flags); | ||
406 | } | ||
407 | |||
408 | return flush; | ||
409 | } | ||
410 | |||
411 | /* This is called just after a mm has been created, but it has not | ||
412 | been used yet. We need to make sure that its pagetable is all | ||
413 | read-only, and can be pinned. */ | ||
414 | void xen_pgd_pin(pgd_t *pgd) | ||
415 | { | ||
416 | struct multicall_space mcs; | ||
417 | struct mmuext_op *op; | ||
418 | |||
419 | xen_mc_batch(); | ||
420 | |||
421 | if (pgd_walk(pgd, pin_page, TASK_SIZE)) { | ||
422 | /* re-enable interrupts for kmap_flush_unused */ | ||
423 | xen_mc_issue(0); | ||
424 | kmap_flush_unused(); | ||
425 | xen_mc_batch(); | ||
426 | } | ||
427 | |||
428 | mcs = __xen_mc_entry(sizeof(*op)); | ||
429 | op = mcs.args; | ||
430 | |||
431 | #ifdef CONFIG_X86_PAE | ||
432 | op->cmd = MMUEXT_PIN_L3_TABLE; | ||
433 | #else | ||
434 | op->cmd = MMUEXT_PIN_L2_TABLE; | ||
435 | #endif | ||
436 | op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); | ||
437 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | ||
438 | |||
439 | xen_mc_issue(0); | ||
440 | } | ||
441 | |||
442 | /* The init_mm pagetable is really pinned as soon as it's created, but | ||
443 | that's before we have page structures to store the bits. So do all | ||
444 | the book-keeping now. */ | ||
445 | static __init int mark_pinned(struct page *page, unsigned flags) | ||
446 | { | ||
447 | SetPagePinned(page); | ||
448 | return 0; | ||
449 | } | ||
450 | |||
451 | void __init xen_mark_init_mm_pinned(void) | ||
452 | { | ||
453 | pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); | ||
454 | } | ||
455 | |||
456 | static int unpin_page(struct page *page, unsigned flags) | ||
457 | { | ||
458 | unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); | ||
459 | |||
460 | if (pgfl && !PageHighMem(page)) { | ||
461 | void *pt = lowmem_page_address(page); | ||
462 | unsigned long pfn = page_to_pfn(page); | ||
463 | struct multicall_space mcs = __xen_mc_entry(0); | ||
464 | |||
465 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | ||
466 | pfn_pte(pfn, PAGE_KERNEL), | ||
467 | flags); | ||
468 | } | ||
469 | |||
470 | return 0; /* never need to flush on unpin */ | ||
471 | } | ||
472 | |||
473 | /* Release a pagetable's pages back as normal RW */ | ||
474 | static void xen_pgd_unpin(pgd_t *pgd) | ||
475 | { | ||
476 | struct mmuext_op *op; | ||
477 | struct multicall_space mcs; | ||
478 | |||
479 | xen_mc_batch(); | ||
480 | |||
481 | mcs = __xen_mc_entry(sizeof(*op)); | ||
482 | |||
483 | op = mcs.args; | ||
484 | op->cmd = MMUEXT_UNPIN_TABLE; | ||
485 | op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); | ||
486 | |||
487 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | ||
488 | |||
489 | pgd_walk(pgd, unpin_page, TASK_SIZE); | ||
490 | |||
491 | xen_mc_issue(0); | ||
492 | } | ||
493 | |||
494 | void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) | ||
495 | { | ||
496 | spin_lock(&next->page_table_lock); | ||
497 | xen_pgd_pin(next->pgd); | ||
498 | spin_unlock(&next->page_table_lock); | ||
499 | } | ||
500 | |||
501 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | ||
502 | { | ||
503 | spin_lock(&mm->page_table_lock); | ||
504 | xen_pgd_pin(mm->pgd); | ||
505 | spin_unlock(&mm->page_table_lock); | ||
506 | } | ||
507 | |||
508 | |||
509 | #ifdef CONFIG_SMP | ||
510 | /* Another cpu may still have its %cr3 pointing at the pagetable, so | ||
511 | we need to repoint it somewhere else before we can unpin it. */ | ||
512 | static void drop_other_mm_ref(void *info) | ||
513 | { | ||
514 | struct mm_struct *mm = info; | ||
515 | |||
516 | if (__get_cpu_var(cpu_tlbstate).active_mm == mm) | ||
517 | leave_mm(smp_processor_id()); | ||
518 | } | ||
519 | |||
520 | static void drop_mm_ref(struct mm_struct *mm) | ||
521 | { | ||
522 | if (current->active_mm == mm) { | ||
523 | if (current->mm == mm) | ||
524 | load_cr3(swapper_pg_dir); | ||
525 | else | ||
526 | leave_mm(smp_processor_id()); | ||
527 | } | ||
528 | |||
529 | if (!cpus_empty(mm->cpu_vm_mask)) | ||
530 | xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, | ||
531 | mm, 1); | ||
532 | } | ||
533 | #else | ||
534 | static void drop_mm_ref(struct mm_struct *mm) | ||
535 | { | ||
536 | if (current->active_mm == mm) | ||
537 | load_cr3(swapper_pg_dir); | ||
538 | } | ||
539 | #endif | ||
540 | |||
541 | /* | ||
542 | * While a process runs, Xen pins its pagetables, which means that the | ||
543 | * hypervisor forces it to be read-only, and it controls all updates | ||
544 | * to it. This means that all pagetable updates have to go via the | ||
545 | * hypervisor, which is moderately expensive. | ||
546 | * | ||
547 | * Since we're pulling the pagetable down, we switch to init_mm, | ||
548 | * unpin the old process pagetable and mark it all read-write, which | ||
549 | * allows further operations on it to be simple memory accesses. | ||
550 | * | ||
551 | * The only subtle point is that another CPU may be still using the | ||
552 | * pagetable because of lazy tlb flushing. This means we need to | ||
553 | * switch all CPUs off this pagetable before we can unpin it. | ||
554 | */ | ||
555 | void xen_exit_mmap(struct mm_struct *mm) | ||
556 | { | ||
557 | get_cpu(); /* make sure we don't move around */ | ||
558 | drop_mm_ref(mm); | ||
559 | put_cpu(); | ||
560 | |||
561 | spin_lock(&mm->page_table_lock); | ||
562 | xen_pgd_unpin(mm->pgd); | ||
563 | spin_unlock(&mm->page_table_lock); | ||
564 | } | ||
diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h new file mode 100644 index 000000000000..c9ff27f3ac3a --- /dev/null +++ b/arch/i386/xen/mmu.h | |||
@@ -0,0 +1,60 @@ | |||
1 | #ifndef _XEN_MMU_H | ||
2 | #define _XEN_MMU_H | ||
3 | #include <linux/linkage.h> | ||
4 | #include <asm/page.h> | ||
5 | |||
6 | /* | ||
7 | * Page-directory addresses above 4GB do not fit into architectural %cr3. | ||
8 | * When accessing %cr3, or equivalent field in vcpu_guest_context, guests | ||
9 | * must use the following accessor macros to pack/unpack valid MFNs. | ||
10 | * | ||
11 | * Note that Xen is using the fact that the pagetable base is always | ||
12 | * page-aligned, and putting the 12 MSB of the address into the 12 LSB | ||
13 | * of cr3. | ||
14 | */ | ||
15 | #define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) | ||
16 | #define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) | ||
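Worked example of the rotation: for mfn 0x1234567 (a machine address above 4GB), xen_pfn_to_cr3() yields (0x1234567 << 12) | (0x1234567 >> 20) = 0x34567000 | 0x12 = 0x34567012 in 32-bit arithmetic. The overflowing high bits land in cr3's always-zero low 12 bits, and xen_cr3_to_pfn(0x34567012) rotates them back to 0x1234567.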
17 | |||
18 | |||
19 | void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags); | ||
20 | |||
21 | void xen_set_pte(pte_t *ptep, pte_t pteval); | ||
22 | void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | ||
23 | pte_t *ptep, pte_t pteval); | ||
24 | void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); | ||
25 | |||
26 | void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); | ||
27 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); | ||
28 | void xen_exit_mmap(struct mm_struct *mm); | ||
29 | |||
30 | void xen_pgd_pin(pgd_t *pgd); | ||
31 | /* xen_pgd_unpin() is static in mmu.c */ | ||
32 | |||
33 | #ifdef CONFIG_X86_PAE | ||
34 | unsigned long long xen_pte_val(pte_t); | ||
35 | unsigned long long xen_pmd_val(pmd_t); | ||
36 | unsigned long long xen_pgd_val(pgd_t); | ||
37 | |||
38 | pte_t xen_make_pte(unsigned long long); | ||
39 | pmd_t xen_make_pmd(unsigned long long); | ||
40 | pgd_t xen_make_pgd(unsigned long long); | ||
41 | |||
44 | void xen_set_pte_atomic(pte_t *ptep, pte_t pte); | ||
45 | void xen_set_pud(pud_t *ptr, pud_t val); | ||
46 | void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); | ||
47 | void xen_pmd_clear(pmd_t *pmdp); | ||
48 | |||
49 | |||
50 | #else | ||
51 | unsigned long xen_pte_val(pte_t); | ||
52 | unsigned long xen_pmd_val(pmd_t); | ||
53 | unsigned long xen_pgd_val(pgd_t); | ||
54 | |||
55 | pte_t xen_make_pte(unsigned long); | ||
56 | pmd_t xen_make_pmd(unsigned long); | ||
57 | pgd_t xen_make_pgd(unsigned long); | ||
58 | #endif | ||
59 | |||
60 | #endif /* _XEN_MMU_H */ | ||
diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c new file mode 100644 index 000000000000..c837e8e463db --- /dev/null +++ b/arch/i386/xen/multicalls.c | |||
@@ -0,0 +1,90 @@ | |||
1 | /* | ||
2 | * Xen hypercall batching. | ||
3 | * | ||
4 | * Xen allows multiple hypercalls to be issued at once, using the | ||
5 | * multicall interface. This allows the cost of trapping into the | ||
6 | * hypervisor to be amortized over several calls. | ||
7 | * | ||
8 | * This file implements a simple interface for multicalls. There's a | ||
9 | * per-cpu buffer of outstanding multicalls. When you want to queue a | ||
10 | * multicall for issuing, you can allocate a multicall slot for the | ||
11 | * call and its arguments, along with storage for any data the | ||
12 | * arguments point to (for passing pointers to structures, | ||
13 | * etc). When the multicall is actually issued, all the space for the | ||
14 | * commands and allocated memory is freed for reuse. | ||
15 | * | ||
16 | * Multicalls are flushed whenever any of the buffers get full, or | ||
17 | * when explicitly requested. There's no way to get per-multicall | ||
18 | * return results back. It will BUG if any of the multicalls fail. | ||
19 | * | ||
20 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | ||
21 | */ | ||
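A representative caller, modelled directly on xen_set_pmd() in mmu.c above: reserve an entry plus argument space, fill both in, then issue, which flushes immediately unless the matching lazy mode is active.

	/* Mirrors xen_set_pmd() in mmu.c: queue one mmu_update and issue. */
	static void example_queue_pmd_update(pmd_t *ptr, pmd_t val)
	{
		struct multicall_space mcs;
		struct mmu_update *u;

		mcs = xen_mc_entry(sizeof(*u));	/* starts a batch, irqs off */
		u = mcs.args;			/* scratch space for the args */
		u->ptr = virt_to_machine(ptr).maddr;
		u->val = pmd_val_ma(val);
		MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
		xen_mc_issue(PARAVIRT_LAZY_MMU); /* flush now unless lazy-MMU */
	}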
22 | #include <linux/percpu.h> | ||
23 | #include <linux/hardirq.h> | ||
24 | |||
25 | #include <asm/xen/hypercall.h> | ||
26 | |||
27 | #include "multicalls.h" | ||
28 | |||
29 | #define MC_BATCH 32 | ||
30 | #define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) | ||
31 | |||
32 | struct mc_buffer { | ||
33 | struct multicall_entry entries[MC_BATCH]; | ||
34 | u64 args[MC_ARGS]; | ||
35 | unsigned mcidx, argidx; | ||
36 | }; | ||
37 | |||
38 | static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); | ||
39 | DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); | ||
40 | |||
41 | void xen_mc_flush(void) | ||
42 | { | ||
43 | struct mc_buffer *b = &__get_cpu_var(mc_buffer); | ||
44 | int ret = 0; | ||
45 | unsigned long flags; | ||
46 | |||
47 | BUG_ON(preemptible()); | ||
48 | |||
49 | /* Disable interrupts in case someone comes in and queues | ||
50 | something in the middle */ | ||
51 | local_irq_save(flags); | ||
52 | |||
53 | if (b->mcidx) { | ||
54 | int i; | ||
55 | |||
56 | if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) | ||
57 | BUG(); | ||
58 | for (i = 0; i < b->mcidx; i++) | ||
59 | if (b->entries[i].result < 0) | ||
60 | ret++; | ||
61 | b->mcidx = 0; | ||
62 | b->argidx = 0; | ||
63 | } else | ||
64 | BUG_ON(b->argidx != 0); | ||
65 | |||
66 | local_irq_restore(flags); | ||
67 | |||
68 | BUG_ON(ret); | ||
69 | } | ||
70 | |||
71 | struct multicall_space __xen_mc_entry(size_t args) | ||
72 | { | ||
73 | struct mc_buffer *b = &__get_cpu_var(mc_buffer); | ||
74 | struct multicall_space ret; | ||
75 | unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); | ||
76 | |||
77 | BUG_ON(preemptible()); | ||
78 | BUG_ON(argspace > MC_ARGS); | ||
79 | |||
80 | if (b->mcidx == MC_BATCH || | ||
81 | (b->argidx + argspace) > MC_ARGS) | ||
82 | xen_mc_flush(); | ||
83 | |||
84 | ret.mc = &b->entries[b->mcidx]; | ||
85 | b->mcidx++; | ||
86 | ret.args = &b->args[b->argidx]; | ||
87 | b->argidx += argspace; | ||
88 | |||
89 | return ret; | ||
90 | } | ||
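A hedged usage sketch of the interface described above (not part of the patch; it assumes the MULTI_mmu_update wrapper, struct mmu_update and DOMID_SELF from the Xen interface headers, plus the paravirt lazy-MMU mode constant):

	/* Queue one pagetable update, batching it if we're in lazy-MMU mode. */
	static void queue_mmu_update(unsigned long ptr, unsigned long val)
	{
		struct multicall_space mcs;
		struct mmu_update *u;

		mcs = xen_mc_entry(sizeof(*u));		/* may flush a full buffer first */
		u = mcs.args;
		u->ptr = ptr;
		u->val = val;

		MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);

		xen_mc_issue(PARAVIRT_LAZY_MMU);	/* flushes unless lazily batching */
	}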
diff --git a/arch/i386/xen/multicalls.h b/arch/i386/xen/multicalls.h new file mode 100644 index 000000000000..e6f7530b156c --- /dev/null +++ b/arch/i386/xen/multicalls.h | |||
@@ -0,0 +1,45 @@ | |||
1 | #ifndef _XEN_MULTICALLS_H | ||
2 | #define _XEN_MULTICALLS_H | ||
3 | |||
4 | #include "xen-ops.h" | ||
5 | |||
6 | /* Multicalls */ | ||
7 | struct multicall_space | ||
8 | { | ||
9 | struct multicall_entry *mc; | ||
10 | void *args; | ||
11 | }; | ||
12 | |||
13 | /* Allocate room for a multicall and its args */ | ||
14 | struct multicall_space __xen_mc_entry(size_t args); | ||
15 | |||
16 | DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags); | ||
17 | |||
18 | /* Call to start a batch of multiple __xen_mc_entry()s. Must be | ||
19 | paired with xen_mc_issue() */ | ||
20 | static inline void xen_mc_batch(void) | ||
21 | { | ||
22 | /* need to disable interrupts until this entry is complete */ | ||
23 | local_irq_save(__get_cpu_var(xen_mc_irq_flags)); | ||
24 | } | ||
25 | |||
26 | static inline struct multicall_space xen_mc_entry(size_t args) | ||
27 | { | ||
28 | xen_mc_batch(); | ||
29 | return __xen_mc_entry(args); | ||
30 | } | ||
31 | |||
32 | /* Flush all pending multicalls */ | ||
33 | void xen_mc_flush(void); | ||
34 | |||
35 | /* Issue a multicall if we're not in a lazy mode */ | ||
36 | static inline void xen_mc_issue(unsigned mode) | ||
37 | { | ||
38 | if ((xen_get_lazy_mode() & mode) == 0) | ||
39 | xen_mc_flush(); | ||
40 | |||
41 | /* restore flags saved in xen_mc_batch */ | ||
42 | local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); | ||
43 | } | ||
44 | |||
45 | #endif /* _XEN_MULTICALLS_H */ | ||
diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c new file mode 100644 index 000000000000..2fe6eac510f0 --- /dev/null +++ b/arch/i386/xen/setup.c | |||
@@ -0,0 +1,96 @@ | |||
1 | /* | ||
2 | * Machine specific setup for xen | ||
3 | * | ||
4 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | ||
5 | */ | ||
6 | |||
7 | #include <linux/module.h> | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/mm.h> | ||
10 | #include <linux/pm.h> | ||
11 | |||
12 | #include <asm/elf.h> | ||
13 | #include <asm/e820.h> | ||
14 | #include <asm/setup.h> | ||
15 | #include <asm/xen/hypervisor.h> | ||
16 | #include <asm/xen/hypercall.h> | ||
17 | |||
18 | #include <xen/interface/physdev.h> | ||
19 | #include <xen/features.h> | ||
20 | |||
21 | #include "xen-ops.h" | ||
22 | |||
23 | /* These are code, but not functions. Defined in entry.S */ | ||
24 | extern const char xen_hypervisor_callback[]; | ||
25 | extern const char xen_failsafe_callback[]; | ||
26 | |||
27 | unsigned long *phys_to_machine_mapping; | ||
28 | EXPORT_SYMBOL(phys_to_machine_mapping); | ||
29 | |||
30 | /** | ||
31 | * xen_memory_setup - Hook for machine specific memory setup. | ||
32 | **/ | ||
33 | |||
34 | char * __init xen_memory_setup(void) | ||
35 | { | ||
36 | unsigned long max_pfn = xen_start_info->nr_pages; | ||
37 | |||
38 | e820.nr_map = 0; | ||
39 | add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM); | ||
40 | |||
41 | return "Xen"; | ||
42 | } | ||
43 | |||
44 | static void xen_idle(void) | ||
45 | { | ||
46 | local_irq_disable(); | ||
47 | |||
48 | if (need_resched()) | ||
49 | local_irq_enable(); | ||
50 | else { | ||
51 | current_thread_info()->status &= ~TS_POLLING; | ||
52 | smp_mb__after_clear_bit(); | ||
53 | safe_halt(); | ||
54 | current_thread_info()->status |= TS_POLLING; | ||
55 | } | ||
56 | } | ||
57 | |||
58 | void __init xen_arch_setup(void) | ||
59 | { | ||
60 | struct physdev_set_iopl set_iopl; | ||
61 | int rc; | ||
62 | |||
63 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); | ||
64 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); | ||
65 | |||
66 | if (!xen_feature(XENFEAT_auto_translated_physmap)) | ||
67 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); | ||
68 | |||
69 | HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, | ||
70 | __KERNEL_CS, (unsigned long)xen_failsafe_callback); | ||
71 | |||
72 | set_iopl.iopl = 1; | ||
73 | rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); | ||
74 | if (rc != 0) | ||
75 | printk(KERN_INFO "physdev_op failed %d\n", rc); | ||
76 | |||
77 | #ifdef CONFIG_ACPI | ||
78 | if (!(xen_start_info->flags & SIF_INITDOMAIN)) { | ||
79 | printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); | ||
80 | disable_acpi(); | ||
81 | } | ||
82 | #endif | ||
83 | |||
84 | memcpy(boot_command_line, xen_start_info->cmd_line, | ||
85 | MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? | ||
86 | COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); | ||
87 | |||
88 | pm_idle = xen_idle; | ||
89 | |||
90 | #ifdef CONFIG_SMP | ||
91 | /* fill cpu_possible_map with all available cpus */ | ||
92 | xen_fill_possible_map(); | ||
93 | #endif | ||
94 | |||
95 | paravirt_disable_iospace(); | ||
96 | } | ||
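The command-line copy above open-codes a min(); an equivalent formulation, shown for clarity only (assuming min() from linux/kernel.h):

	memcpy(boot_command_line, xen_start_info->cmd_line,
	       min(MAX_GUEST_CMDLINE, COMMAND_LINE_SIZE));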
diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c new file mode 100644 index 000000000000..557b8e24706a --- /dev/null +++ b/arch/i386/xen/smp.c | |||
@@ -0,0 +1,404 @@ | |||
1 | /* | ||
2 | * Xen SMP support | ||
3 | * | ||
4 | * This file implements the Xen versions of smp_ops. SMP under Xen is | ||
5 | * very straightforward. Bringing a CPU up is simply a matter of | ||
6 | * loading its initial context and setting it running. | ||
7 | * | ||
8 | * IPIs are handled through the Xen event mechanism. | ||
9 | * | ||
10 | * Because virtual CPUs can be scheduled onto any real CPU, there's no | ||
11 | * useful topology information for the kernel to make use of. As a | ||
12 | * result, all CPUs are treated as if they're single-core and | ||
13 | * single-threaded. | ||
14 | * | ||
15 | * This does not handle HOTPLUG_CPU yet. | ||
16 | */ | ||
17 | #include <linux/sched.h> | ||
18 | #include <linux/err.h> | ||
19 | #include <linux/smp.h> | ||
20 | |||
21 | #include <asm/paravirt.h> | ||
22 | #include <asm/desc.h> | ||
23 | #include <asm/pgtable.h> | ||
24 | #include <asm/cpu.h> | ||
25 | |||
26 | #include <xen/interface/xen.h> | ||
27 | #include <xen/interface/vcpu.h> | ||
28 | |||
29 | #include <asm/xen/interface.h> | ||
30 | #include <asm/xen/hypercall.h> | ||
31 | |||
32 | #include <xen/page.h> | ||
33 | #include <xen/events.h> | ||
34 | |||
35 | #include "xen-ops.h" | ||
36 | #include "mmu.h" | ||
37 | |||
38 | static cpumask_t cpu_initialized_map; | ||
39 | static DEFINE_PER_CPU(int, resched_irq); | ||
40 | static DEFINE_PER_CPU(int, callfunc_irq); | ||
41 | |||
42 | /* | ||
43 | * Structure and data for smp_call_function(). This is designed to minimise | ||
44 | * static memory requirements. It also looks cleaner. | ||
45 | */ | ||
46 | static DEFINE_SPINLOCK(call_lock); | ||
47 | |||
48 | struct call_data_struct { | ||
49 | void (*func) (void *info); | ||
50 | void *info; | ||
51 | atomic_t started; | ||
52 | atomic_t finished; | ||
53 | int wait; | ||
54 | }; | ||
55 | |||
56 | static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); | ||
57 | |||
58 | static struct call_data_struct *call_data; | ||
59 | |||
60 | /* | ||
61 | * Reschedule callback. Nothing to do, | ||
62 | * all the work is done automatically when | ||
63 | * we return from the interrupt. | ||
64 | */ | ||
65 | static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) | ||
66 | { | ||
67 | return IRQ_HANDLED; | ||
68 | } | ||
69 | |||
70 | static __cpuinit void cpu_bringup_and_idle(void) | ||
71 | { | ||
72 | int cpu = smp_processor_id(); | ||
73 | |||
74 | cpu_init(); | ||
75 | |||
76 | preempt_disable(); | ||
77 | per_cpu(cpu_state, cpu) = CPU_ONLINE; | ||
78 | |||
79 | xen_setup_cpu_clockevents(); | ||
80 | |||
81 | /* We can take interrupts now: we're officially "up". */ | ||
82 | local_irq_enable(); | ||
83 | |||
84 | wmb(); /* make sure everything is out */ | ||
85 | cpu_idle(); | ||
86 | } | ||
87 | |||
88 | static int xen_smp_intr_init(unsigned int cpu) | ||
89 | { | ||
90 | int rc; | ||
91 | const char *resched_name, *callfunc_name; | ||
92 | |||
93 | per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; | ||
94 | |||
95 | resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); | ||
96 | rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, | ||
97 | cpu, | ||
98 | xen_reschedule_interrupt, | ||
99 | IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, | ||
100 | resched_name, | ||
101 | NULL); | ||
102 | if (rc < 0) | ||
103 | goto fail; | ||
104 | per_cpu(resched_irq, cpu) = rc; | ||
105 | |||
106 | callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); | ||
107 | rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, | ||
108 | cpu, | ||
109 | xen_call_function_interrupt, | ||
110 | IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, | ||
111 | callfunc_name, | ||
112 | NULL); | ||
113 | if (rc < 0) | ||
114 | goto fail; | ||
115 | per_cpu(callfunc_irq, cpu) = rc; | ||
116 | |||
117 | return 0; | ||
118 | |||
119 | fail: | ||
120 | if (per_cpu(resched_irq, cpu) >= 0) | ||
121 | unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); | ||
122 | if (per_cpu(callfunc_irq, cpu) >= 0) | ||
123 | unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); | ||
124 | return rc; | ||
125 | } | ||
126 | |||
127 | void __init xen_fill_possible_map(void) | ||
128 | { | ||
129 | int i, rc; | ||
130 | |||
131 | for (i = 0; i < NR_CPUS; i++) { | ||
132 | rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); | ||
133 | if (rc >= 0) | ||
134 | cpu_set(i, cpu_possible_map); | ||
135 | } | ||
136 | } | ||
137 | |||
138 | void __init xen_smp_prepare_boot_cpu(void) | ||
139 | { | ||
140 | int cpu; | ||
141 | |||
142 | BUG_ON(smp_processor_id() != 0); | ||
143 | native_smp_prepare_boot_cpu(); | ||
144 | |||
145 | /* We've switched to the "real" per-cpu gdt, so make sure the | ||
146 | old memory can be recycled */ | ||
147 | make_lowmem_page_readwrite(&per_cpu__gdt_page); | ||
148 | |||
149 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
150 | cpus_clear(cpu_sibling_map[cpu]); | ||
151 | cpus_clear(cpu_core_map[cpu]); | ||
152 | } | ||
153 | |||
154 | xen_setup_vcpu_info_placement(); | ||
155 | } | ||
156 | |||
157 | void __init xen_smp_prepare_cpus(unsigned int max_cpus) | ||
158 | { | ||
159 | unsigned cpu; | ||
160 | |||
161 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
162 | cpus_clear(cpu_sibling_map[cpu]); | ||
163 | cpus_clear(cpu_core_map[cpu]); | ||
164 | } | ||
165 | |||
166 | smp_store_cpu_info(0); | ||
167 | set_cpu_sibling_map(0); | ||
168 | |||
169 | if (xen_smp_intr_init(0)) | ||
170 | BUG(); | ||
171 | |||
172 | cpu_initialized_map = cpumask_of_cpu(0); | ||
173 | |||
174 | /* Restrict the possible_map according to max_cpus. */ | ||
175 | while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { | ||
176 | for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--) | ||
177 | continue; | ||
178 | cpu_clear(cpu, cpu_possible_map); | ||
179 | } | ||
180 | |||
181 | for_each_possible_cpu (cpu) { | ||
182 | struct task_struct *idle; | ||
183 | |||
184 | if (cpu == 0) | ||
185 | continue; | ||
186 | |||
187 | idle = fork_idle(cpu); | ||
188 | if (IS_ERR(idle)) | ||
189 | panic("failed fork for CPU %d", cpu); | ||
190 | |||
191 | cpu_set(cpu, cpu_present_map); | ||
192 | } | ||
193 | |||
194 | //init_xenbus_allowed_cpumask(); | ||
195 | } | ||
196 | |||
197 | static __cpuinit int | ||
198 | cpu_initialize_context(unsigned int cpu, struct task_struct *idle) | ||
199 | { | ||
200 | struct vcpu_guest_context *ctxt; | ||
201 | struct gdt_page *gdt = &per_cpu(gdt_page, cpu); | ||
202 | |||
203 | if (cpu_test_and_set(cpu, cpu_initialized_map)) | ||
204 | return 0; | ||
205 | |||
206 | ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); | ||
207 | if (ctxt == NULL) | ||
208 | return -ENOMEM; | ||
209 | |||
210 | ctxt->flags = VGCF_IN_KERNEL; | ||
211 | ctxt->user_regs.ds = __USER_DS; | ||
212 | ctxt->user_regs.es = __USER_DS; | ||
213 | ctxt->user_regs.fs = __KERNEL_PERCPU; | ||
214 | ctxt->user_regs.gs = 0; | ||
215 | ctxt->user_regs.ss = __KERNEL_DS; | ||
216 | ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; | ||
217 | ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ | ||
218 | |||
219 | memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); | ||
220 | |||
221 | xen_copy_trap_info(ctxt->trap_ctxt); | ||
222 | |||
223 | ctxt->ldt_ents = 0; | ||
224 | |||
225 | BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); | ||
226 | make_lowmem_page_readonly(gdt->gdt); | ||
227 | |||
228 | ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); | ||
229 | ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); | ||
230 | |||
231 | ctxt->user_regs.cs = __KERNEL_CS; | ||
232 | ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); | ||
233 | |||
234 | ctxt->kernel_ss = __KERNEL_DS; | ||
235 | ctxt->kernel_sp = idle->thread.esp0; | ||
236 | |||
237 | ctxt->event_callback_cs = __KERNEL_CS; | ||
238 | ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; | ||
239 | ctxt->failsafe_callback_cs = __KERNEL_CS; | ||
240 | ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; | ||
241 | |||
242 | per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); | ||
243 | ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); | ||
244 | |||
245 | if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) | ||
246 | BUG(); | ||
247 | |||
248 | kfree(ctxt); | ||
249 | return 0; | ||
250 | } | ||
251 | |||
252 | int __cpuinit xen_cpu_up(unsigned int cpu) | ||
253 | { | ||
254 | struct task_struct *idle = idle_task(cpu); | ||
255 | int rc; | ||
256 | |||
257 | #if 0 | ||
258 | rc = cpu_up_check(cpu); | ||
259 | if (rc) | ||
260 | return rc; | ||
261 | #endif | ||
262 | |||
263 | init_gdt(cpu); | ||
264 | per_cpu(current_task, cpu) = idle; | ||
265 | irq_ctx_init(cpu); | ||
266 | xen_setup_timer(cpu); | ||
267 | |||
268 | /* make sure interrupts start blocked */ | ||
269 | per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; | ||
270 | |||
271 | rc = cpu_initialize_context(cpu, idle); | ||
272 | if (rc) | ||
273 | return rc; | ||
274 | |||
275 | if (num_online_cpus() == 1) | ||
276 | alternatives_smp_switch(1); | ||
277 | |||
278 | rc = xen_smp_intr_init(cpu); | ||
279 | if (rc) | ||
280 | return rc; | ||
281 | |||
282 | smp_store_cpu_info(cpu); | ||
283 | set_cpu_sibling_map(cpu); | ||
284 | /* This must be done before setting cpu_online_map */ | ||
285 | wmb(); | ||
286 | |||
287 | cpu_set(cpu, cpu_online_map); | ||
288 | |||
289 | rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); | ||
290 | BUG_ON(rc); | ||
291 | |||
292 | return 0; | ||
293 | } | ||
294 | |||
295 | void xen_smp_cpus_done(unsigned int max_cpus) | ||
296 | { | ||
297 | } | ||
298 | |||
299 | static void stop_self(void *v) | ||
300 | { | ||
301 | int cpu = smp_processor_id(); | ||
302 | |||
303 | /* make sure we're not pinning something down */ | ||
304 | load_cr3(swapper_pg_dir); | ||
305 | /* should set up a minimal gdt */ | ||
306 | |||
307 | HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); | ||
308 | BUG(); | ||
309 | } | ||
310 | |||
311 | void xen_smp_send_stop(void) | ||
312 | { | ||
313 | smp_call_function(stop_self, NULL, 0, 0); | ||
314 | } | ||
315 | |||
316 | void xen_smp_send_reschedule(int cpu) | ||
317 | { | ||
318 | xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); | ||
319 | } | ||
320 | |||
321 | |||
322 | static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) | ||
323 | { | ||
324 | unsigned cpu; | ||
325 | |||
326 | cpus_and(mask, mask, cpu_online_map); | ||
327 | |||
328 | for_each_cpu_mask(cpu, mask) | ||
329 | xen_send_IPI_one(cpu, vector); | ||
330 | } | ||
331 | |||
332 | static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) | ||
333 | { | ||
334 | void (*func) (void *info) = call_data->func; | ||
335 | void *info = call_data->info; | ||
336 | int wait = call_data->wait; | ||
337 | |||
338 | /* | ||
339 | * Notify initiating CPU that I've grabbed the data and am | ||
340 | * about to execute the function | ||
341 | */ | ||
342 | mb(); | ||
343 | atomic_inc(&call_data->started); | ||
344 | /* | ||
345 | * At this point the info structure may be out of scope unless wait==1 | ||
346 | */ | ||
347 | irq_enter(); | ||
348 | (*func)(info); | ||
349 | irq_exit(); | ||
350 | |||
351 | if (wait) { | ||
352 | mb(); /* commit everything before setting finished */ | ||
353 | atomic_inc(&call_data->finished); | ||
354 | } | ||
355 | |||
356 | return IRQ_HANDLED; | ||
357 | } | ||
358 | |||
359 | int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), | ||
360 | void *info, int wait) | ||
361 | { | ||
362 | struct call_data_struct data; | ||
363 | int cpus; | ||
364 | |||
365 | /* Holding any lock stops cpus from going down. */ | ||
366 | spin_lock(&call_lock); | ||
367 | |||
368 | cpu_clear(smp_processor_id(), mask); | ||
369 | |||
370 | cpus = cpus_weight(mask); | ||
371 | if (!cpus) { | ||
372 | spin_unlock(&call_lock); | ||
373 | return 0; | ||
374 | } | ||
375 | |||
376 | /* Can deadlock when called with interrupts disabled */ | ||
377 | WARN_ON(irqs_disabled()); | ||
378 | |||
379 | data.func = func; | ||
380 | data.info = info; | ||
381 | atomic_set(&data.started, 0); | ||
382 | data.wait = wait; | ||
383 | if (wait) | ||
384 | atomic_set(&data.finished, 0); | ||
385 | |||
386 | call_data = &data; | ||
387 | mb(); /* write everything before IPI */ | ||
388 | |||
389 | /* Send a message to other CPUs and wait for them to respond */ | ||
390 | xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); | ||
391 | |||
392 | /* Make sure other vcpus get a chance to run. | ||
393 | XXX too severe? Maybe we should check the other CPUs' states? */ | ||
394 | HYPERVISOR_sched_op(SCHEDOP_yield, 0); | ||
395 | |||
396 | /* Wait for response */ | ||
397 | while (atomic_read(&data.started) != cpus || | ||
398 | (wait && atomic_read(&data.finished) != cpus)) | ||
399 | cpu_relax(); | ||
400 | |||
401 | spin_unlock(&call_lock); | ||
402 | |||
403 | return 0; | ||
404 | } | ||
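As a usage sketch (illustrative only, with a hypothetical callback): run a function on all other online CPUs and wait. The mask variant strips the calling CPU itself, and with wait=1 it spins until every target has finished; per the WARN_ON above it must be called with interrupts enabled, from a non-preemptible context.

	static void remote_hello(void *unused)
	{
		printk(KERN_DEBUG "callfunc IPI on CPU %d\n", smp_processor_id());
	}

	/* ... later, interrupts on, preemption off: */
	xen_smp_call_function_mask(cpu_online_map, remote_hello, NULL, 1);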
diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c new file mode 100644 index 000000000000..51fdabf1fd4d --- /dev/null +++ b/arch/i386/xen/time.c | |||
@@ -0,0 +1,590 @@ | |||
1 | /* | ||
2 | * Xen time implementation. | ||
3 | * | ||
4 | * This is implemented in terms of a clocksource driver which uses | ||
5 | * the hypervisor clock as a nanosecond timebase, and a clockevent | ||
6 | * driver which uses the hypervisor's timer mechanism. | ||
7 | * | ||
8 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | ||
9 | */ | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/interrupt.h> | ||
12 | #include <linux/clocksource.h> | ||
13 | #include <linux/clockchips.h> | ||
14 | #include <linux/kernel_stat.h> | ||
15 | |||
16 | #include <asm/xen/hypervisor.h> | ||
17 | #include <asm/xen/hypercall.h> | ||
18 | |||
19 | #include <xen/events.h> | ||
20 | #include <xen/interface/xen.h> | ||
21 | #include <xen/interface/vcpu.h> | ||
22 | |||
23 | #include "xen-ops.h" | ||
24 | |||
25 | #define XEN_SHIFT 22 | ||
26 | |||
27 | /* Xen may fire a timer up to this many ns early */ | ||
28 | #define TIMER_SLOP 100000 | ||
29 | #define NS_PER_TICK (1000000000LL / HZ) | ||
30 | |||
31 | static cycle_t xen_clocksource_read(void); | ||
32 | |||
33 | /* These are periodically updated in shared_info, and then copied here. */ | ||
34 | struct shadow_time_info { | ||
35 | u64 tsc_timestamp; /* TSC at last update of time vals. */ | ||
36 | u64 system_timestamp; /* Time, in nanosecs, since boot. */ | ||
37 | u32 tsc_to_nsec_mul; | ||
38 | int tsc_shift; | ||
39 | u32 version; | ||
40 | }; | ||
41 | |||
42 | static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); | ||
43 | |||
44 | /* runstate info updated by Xen */ | ||
45 | static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); | ||
46 | |||
47 | /* snapshots of runstate info */ | ||
48 | static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); | ||
49 | |||
50 | /* unused ns of stolen and blocked time */ | ||
51 | static DEFINE_PER_CPU(u64, residual_stolen); | ||
52 | static DEFINE_PER_CPU(u64, residual_blocked); | ||
53 | |||
54 | /* return a consistent snapshot of a 64-bit time/counter value */ | ||
55 | static u64 get64(const u64 *p) | ||
56 | { | ||
57 | u64 ret; | ||
58 | |||
59 | if (BITS_PER_LONG < 64) { | ||
60 | u32 *p32 = (u32 *)p; | ||
61 | u32 h, l; | ||
62 | |||
63 | /* | ||
64 | * Read high then low, and then make sure high is | ||
65 | * still the same; this will only loop if low wraps | ||
66 | * and carries into high. | ||
67 | * XXX some clean way to make this endian-proof? | ||
68 | */ | ||
69 | do { | ||
70 | h = p32[1]; | ||
71 | barrier(); | ||
72 | l = p32[0]; | ||
73 | barrier(); | ||
74 | } while (p32[1] != h); | ||
75 | |||
76 | ret = (((u64)h) << 32) | l; | ||
77 | } else | ||
78 | ret = *p; | ||
79 | |||
80 | return ret; | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * Runstate accounting | ||
85 | */ | ||
86 | static void get_runstate_snapshot(struct vcpu_runstate_info *res) | ||
87 | { | ||
88 | u64 state_time; | ||
89 | struct vcpu_runstate_info *state; | ||
90 | |||
91 | BUG_ON(preemptible()); | ||
92 | |||
93 | state = &__get_cpu_var(runstate); | ||
94 | |||
95 | /* | ||
96 | * The runstate info is always updated by the hypervisor on | ||
97 | * the current CPU, so there's no need to use anything | ||
98 | * stronger than a compiler barrier when fetching it. | ||
99 | */ | ||
100 | do { | ||
101 | state_time = get64(&state->state_entry_time); | ||
102 | barrier(); | ||
103 | *res = *state; | ||
104 | barrier(); | ||
105 | } while (get64(&state->state_entry_time) != state_time); | ||
106 | } | ||
107 | |||
108 | static void setup_runstate_info(int cpu) | ||
109 | { | ||
110 | struct vcpu_register_runstate_memory_area area; | ||
111 | |||
112 | area.addr.v = &per_cpu(runstate, cpu); | ||
113 | |||
114 | if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, | ||
115 | cpu, &area)) | ||
116 | BUG(); | ||
117 | } | ||
118 | |||
119 | static void do_stolen_accounting(void) | ||
120 | { | ||
121 | struct vcpu_runstate_info state; | ||
122 | struct vcpu_runstate_info *snap; | ||
123 | s64 blocked, runnable, offline, stolen; | ||
124 | cputime_t ticks; | ||
125 | |||
126 | get_runstate_snapshot(&state); | ||
127 | |||
128 | WARN_ON(state.state != RUNSTATE_running); | ||
129 | |||
130 | snap = &__get_cpu_var(runstate_snapshot); | ||
131 | |||
132 | /* work out how much time the VCPU has not been runn*ing* */ | ||
133 | blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; | ||
134 | runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; | ||
135 | offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; | ||
136 | |||
137 | *snap = state; | ||
138 | |||
139 | /* Add the appropriate number of ticks of stolen time, | ||
140 | including any left-overs from last time. Passing NULL to | ||
141 | account_steal_time accounts the time as stolen. */ | ||
142 | stolen = runnable + offline + __get_cpu_var(residual_stolen); | ||
143 | |||
144 | if (stolen < 0) | ||
145 | stolen = 0; | ||
146 | |||
147 | ticks = 0; | ||
148 | while (stolen >= NS_PER_TICK) { | ||
149 | ticks++; | ||
150 | stolen -= NS_PER_TICK; | ||
151 | } | ||
152 | __get_cpu_var(residual_stolen) = stolen; | ||
153 | account_steal_time(NULL, ticks); | ||
154 | |||
155 | /* Add the appropriate number of ticks of blocked time, | ||
156 | including any left-overs from last time. Passing idle to | ||
157 | account_steal_time accounts the time as idle/wait. */ | ||
158 | blocked += __get_cpu_var(residual_blocked); | ||
159 | |||
160 | if (blocked < 0) | ||
161 | blocked = 0; | ||
162 | |||
163 | ticks = 0; | ||
164 | while (blocked >= NS_PER_TICK) { | ||
165 | ticks++; | ||
166 | blocked -= NS_PER_TICK; | ||
167 | } | ||
168 | __get_cpu_var(residual_blocked) = blocked; | ||
169 | account_steal_time(idle_task(smp_processor_id()), ticks); | ||
170 | } | ||
171 | |||
172 | /* | ||
173 | * Xen sched_clock implementation. Returns the number of unstolen | ||
174 | * nanoseconds, i.e. the nanoseconds the VCPU has spent in the | ||
175 | * RUNNING and BLOCKED states. | ||
176 | */ | ||
177 | unsigned long long xen_sched_clock(void) | ||
178 | { | ||
179 | struct vcpu_runstate_info state; | ||
180 | cycle_t now; | ||
181 | u64 ret; | ||
182 | s64 offset; | ||
183 | |||
184 | /* | ||
185 | * Ideally sched_clock should be called on a per-cpu basis | ||
186 | * anyway, so preemption should already be disabled, but that's | ||
187 | * not the case at the moment. | ||
188 | */ | ||
189 | preempt_disable(); | ||
190 | |||
191 | now = xen_clocksource_read(); | ||
192 | |||
193 | get_runstate_snapshot(&state); | ||
194 | |||
195 | WARN_ON(state.state != RUNSTATE_running); | ||
196 | |||
197 | offset = now - state.state_entry_time; | ||
198 | if (offset < 0) | ||
199 | offset = 0; | ||
200 | |||
201 | ret = state.time[RUNSTATE_blocked] + | ||
202 | state.time[RUNSTATE_running] + | ||
203 | offset; | ||
204 | |||
205 | preempt_enable(); | ||
206 | |||
207 | return ret; | ||
208 | } | ||
209 | |||
210 | |||
211 | /* Get the CPU speed from Xen */ | ||
212 | unsigned long xen_cpu_khz(void) | ||
213 | { | ||
214 | u64 cpu_khz = 1000000ULL << 32; | ||
215 | const struct vcpu_time_info *info = | ||
216 | &HYPERVISOR_shared_info->vcpu_info[0].time; | ||
217 | |||
218 | do_div(cpu_khz, info->tsc_to_system_mul); | ||
219 | if (info->tsc_shift < 0) | ||
220 | cpu_khz <<= -info->tsc_shift; | ||
221 | else | ||
222 | cpu_khz >>= info->tsc_shift; | ||
223 | |||
224 | return cpu_khz; | ||
225 | } | ||
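The inversion can be sanity-checked with round numbers: Xen defines tsc_to_system_mul so that, after the tsc_shift adjustment, ns = (tsc * mul) >> 32. For a hypothetical 2 GHz TSC with tsc_shift == 0, each tick is 0.5 ns, so mul = 0.5 * 2^32 = 2^31, and the code above yields cpu_khz = (10^6 * 2^32) / 2^31 = 2,000,000 kHz, i.e. 2 GHz.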
226 | |||
227 | /* | ||
228 | * Reads a consistent set of time-base values from Xen, into a shadow data | ||
229 | * area. | ||
230 | */ | ||
231 | static unsigned get_time_values_from_xen(void) | ||
232 | { | ||
233 | struct vcpu_time_info *src; | ||
234 | struct shadow_time_info *dst; | ||
235 | |||
236 | /* src is shared memory with the hypervisor, so we need to | ||
237 | make sure we get a consistent snapshot, even in the face of | ||
238 | being preempted. */ | ||
239 | src = &__get_cpu_var(xen_vcpu)->time; | ||
240 | dst = &__get_cpu_var(shadow_time); | ||
241 | |||
242 | do { | ||
243 | dst->version = src->version; | ||
244 | rmb(); /* fetch version before data */ | ||
245 | dst->tsc_timestamp = src->tsc_timestamp; | ||
246 | dst->system_timestamp = src->system_time; | ||
247 | dst->tsc_to_nsec_mul = src->tsc_to_system_mul; | ||
248 | dst->tsc_shift = src->tsc_shift; | ||
249 | rmb(); /* test version after fetching data */ | ||
250 | } while ((src->version & 1) | (dst->version ^ src->version)); | ||
251 | |||
252 | return dst->version; | ||
253 | } | ||
254 | |||
255 | /* | ||
256 | * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction, | ||
257 | * yielding a 64-bit result. | ||
258 | */ | ||
259 | static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) | ||
260 | { | ||
261 | u64 product; | ||
262 | #ifdef __i386__ | ||
263 | u32 tmp1, tmp2; | ||
264 | #endif | ||
265 | |||
266 | if (shift < 0) | ||
267 | delta >>= -shift; | ||
268 | else | ||
269 | delta <<= shift; | ||
270 | |||
271 | #ifdef __i386__ | ||
272 | __asm__ ( | ||
273 | "mul %5 ; " | ||
274 | "mov %4,%%eax ; " | ||
275 | "mov %%edx,%4 ; " | ||
276 | "mul %5 ; " | ||
277 | "xor %5,%5 ; " | ||
278 | "add %4,%%eax ; " | ||
279 | "adc %5,%%edx ; " | ||
280 | : "=A" (product), "=r" (tmp1), "=r" (tmp2) | ||
281 | : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); | ||
282 | #elif __x86_64__ | ||
283 | __asm__ ( | ||
284 | "mul %%rdx ; shrd $32,%%rdx,%%rax" | ||
285 | : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); | ||
286 | #else | ||
287 | #error implement me! | ||
288 | #endif | ||
289 | |||
290 | return product; | ||
291 | } | ||
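A portable C rendering of the same computation (a sketch only; the asm builds the 64x32 product from two 32x32->64 halves to avoid a full 64x64 multiply on i386). The kernel u32/u64 types are assumed:

	static inline u64 scale_delta_c(u64 delta, u32 mul_frac, int shift)
	{
		u32 lo, hi;

		if (shift < 0)
			delta >>= -shift;
		else
			delta <<= shift;

		lo = (u32)delta;
		hi = (u32)(delta >> 32);

		/* (hi*2^32 + lo) * mul_frac / 2^32
		   == hi*mul_frac + ((lo*mul_frac) >> 32),
		   truncated to 64 bits just like the asm */
		return ((u64)lo * mul_frac >> 32) + (u64)hi * mul_frac;
	}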
292 | |||
293 | static u64 get_nsec_offset(struct shadow_time_info *shadow) | ||
294 | { | ||
295 | u64 now, delta; | ||
296 | now = native_read_tsc(); | ||
297 | delta = now - shadow->tsc_timestamp; | ||
298 | return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); | ||
299 | } | ||
300 | |||
301 | static cycle_t xen_clocksource_read(void) | ||
302 | { | ||
303 | struct shadow_time_info *shadow = &get_cpu_var(shadow_time); | ||
304 | cycle_t ret; | ||
305 | unsigned version; | ||
306 | |||
307 | do { | ||
308 | version = get_time_values_from_xen(); | ||
309 | barrier(); | ||
310 | ret = shadow->system_timestamp + get_nsec_offset(shadow); | ||
311 | barrier(); | ||
312 | } while (version != __get_cpu_var(xen_vcpu)->time.version); | ||
313 | |||
314 | put_cpu_var(shadow_time); | ||
315 | |||
316 | return ret; | ||
317 | } | ||
318 | |||
319 | static void xen_read_wallclock(struct timespec *ts) | ||
320 | { | ||
321 | const struct shared_info *s = HYPERVISOR_shared_info; | ||
322 | u32 version; | ||
323 | u64 delta; | ||
324 | struct timespec now; | ||
325 | |||
326 | /* get wallclock at system boot */ | ||
327 | do { | ||
328 | version = s->wc_version; | ||
329 | rmb(); /* fetch version before time */ | ||
330 | now.tv_sec = s->wc_sec; | ||
331 | now.tv_nsec = s->wc_nsec; | ||
332 | rmb(); /* fetch time before checking version */ | ||
333 | } while ((s->wc_version & 1) | (version ^ s->wc_version)); | ||
334 | |||
335 | delta = xen_clocksource_read(); /* time since system boot */ | ||
336 | delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; | ||
337 | |||
338 | now.tv_nsec = do_div(delta, NSEC_PER_SEC); | ||
339 | now.tv_sec = delta; | ||
340 | |||
341 | set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); | ||
342 | } | ||
343 | |||
344 | unsigned long xen_get_wallclock(void) | ||
345 | { | ||
346 | struct timespec ts; | ||
347 | |||
348 | xen_read_wallclock(&ts); | ||
349 | |||
350 | return ts.tv_sec; | ||
351 | } | ||
352 | |||
353 | int xen_set_wallclock(unsigned long now) | ||
354 | { | ||
355 | /* do nothing for domU */ | ||
356 | return -1; | ||
357 | } | ||
358 | |||
359 | static struct clocksource xen_clocksource __read_mostly = { | ||
360 | .name = "xen", | ||
361 | .rating = 400, | ||
362 | .read = xen_clocksource_read, | ||
363 | .mask = ~0, | ||
364 | .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */ | ||
365 | .shift = XEN_SHIFT, | ||
366 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | ||
367 | }; | ||
368 | |||
369 | /* | ||
370 | Xen clockevent implementation | ||
371 | |||
372 | Xen has two clockevent implementations: | ||
373 | |||
374 | The old timer_op one works with all released versions of Xen prior | ||
375 | to version 3.0.4. That interface provides a single-shot timer | ||
376 | with nanosecond resolution. However, sharing the same event | ||
377 | channel there is a 100Hz tick which is delivered while the | ||
378 | vcpu is running. We don't care about or use this tick, but it will | ||
379 | cause the core time code to think the timer fired too soon, and | ||
380 | will end up resetting it each time. It could be filtered, but | ||
381 | doing so has complications when the ktime clocksource is not yet | ||
382 | the xen clocksource (ie, at boot time). | ||
383 | |||
384 | The new vcpu_op-based timer interface allows the tick timer period | ||
385 | to be changed or turned off. The tick timer is not useful as a | ||
386 | periodic timer because events are only delivered to running vcpus. | ||
387 | The one-shot timer can report when a timeout is in the past, so | ||
388 | set_next_event is capable of returning -ETIME when appropriate. | ||
389 | This interface is used when available. | ||
390 | */ | ||
391 | |||
392 | |||
393 | /* | ||
394 | Get an absolute timeout in the hypervisor's timebase. In theory we | ||
395 | could maintain an offset between the kernel's time and the hypervisor's | ||
396 | time, and apply that to the kernel's absolute timeout. Unfortunately the | ||
397 | hypervisor and kernel times can drift even if the kernel is using | ||
398 | the Xen clocksource, because ntp can warp the kernel's clocksource. | ||
399 | */ | ||
400 | static s64 get_abs_timeout(unsigned long delta) | ||
401 | { | ||
402 | return xen_clocksource_read() + delta; | ||
403 | } | ||
404 | |||
405 | static void xen_timerop_set_mode(enum clock_event_mode mode, | ||
406 | struct clock_event_device *evt) | ||
407 | { | ||
408 | switch (mode) { | ||
409 | case CLOCK_EVT_MODE_PERIODIC: | ||
410 | /* unsupported */ | ||
411 | WARN_ON(1); | ||
412 | break; | ||
413 | |||
414 | case CLOCK_EVT_MODE_ONESHOT: | ||
415 | break; | ||
416 | |||
417 | case CLOCK_EVT_MODE_UNUSED: | ||
418 | case CLOCK_EVT_MODE_SHUTDOWN: | ||
419 | HYPERVISOR_set_timer_op(0); /* cancel timeout */ | ||
420 | break; | ||
421 | } | ||
422 | } | ||
423 | |||
424 | static int xen_timerop_set_next_event(unsigned long delta, | ||
425 | struct clock_event_device *evt) | ||
426 | { | ||
427 | WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); | ||
428 | |||
429 | if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0) | ||
430 | BUG(); | ||
431 | |||
432 | /* We may have missed the deadline, but there's no real way of | ||
433 | knowing for sure. If the event was in the past, then we'll | ||
434 | get an immediate interrupt. */ | ||
435 | |||
436 | return 0; | ||
437 | } | ||
438 | |||
439 | static const struct clock_event_device xen_timerop_clockevent = { | ||
440 | .name = "xen", | ||
441 | .features = CLOCK_EVT_FEAT_ONESHOT, | ||
442 | |||
443 | .max_delta_ns = 0xffffffff, | ||
444 | .min_delta_ns = TIMER_SLOP, | ||
445 | |||
446 | .mult = 1, | ||
447 | .shift = 0, | ||
448 | .rating = 500, | ||
449 | |||
450 | .set_mode = xen_timerop_set_mode, | ||
451 | .set_next_event = xen_timerop_set_next_event, | ||
452 | }; | ||
453 | |||
454 | |||
455 | |||
456 | static void xen_vcpuop_set_mode(enum clock_event_mode mode, | ||
457 | struct clock_event_device *evt) | ||
458 | { | ||
459 | int cpu = smp_processor_id(); | ||
460 | |||
461 | switch (mode) { | ||
462 | case CLOCK_EVT_MODE_PERIODIC: | ||
463 | WARN_ON(1); /* unsupported */ | ||
464 | break; | ||
465 | |||
466 | case CLOCK_EVT_MODE_ONESHOT: | ||
467 | if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) | ||
468 | BUG(); | ||
469 | break; | ||
470 | |||
471 | case CLOCK_EVT_MODE_UNUSED: | ||
472 | case CLOCK_EVT_MODE_SHUTDOWN: | ||
473 | if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) || | ||
474 | HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) | ||
475 | BUG(); | ||
476 | break; | ||
477 | } | ||
478 | } | ||
479 | |||
480 | static int xen_vcpuop_set_next_event(unsigned long delta, | ||
481 | struct clock_event_device *evt) | ||
482 | { | ||
483 | int cpu = smp_processor_id(); | ||
484 | struct vcpu_set_singleshot_timer single; | ||
485 | int ret; | ||
486 | |||
487 | WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); | ||
488 | |||
489 | single.timeout_abs_ns = get_abs_timeout(delta); | ||
490 | single.flags = VCPU_SSHOTTMR_future; | ||
491 | |||
492 | ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single); | ||
493 | |||
494 | BUG_ON(ret != 0 && ret != -ETIME); | ||
495 | |||
496 | return ret; | ||
497 | } | ||
498 | |||
499 | static const struct clock_event_device xen_vcpuop_clockevent = { | ||
500 | .name = "xen", | ||
501 | .features = CLOCK_EVT_FEAT_ONESHOT, | ||
502 | |||
503 | .max_delta_ns = 0xffffffff, | ||
504 | .min_delta_ns = TIMER_SLOP, | ||
505 | |||
506 | .mult = 1, | ||
507 | .shift = 0, | ||
508 | .rating = 500, | ||
509 | |||
510 | .set_mode = xen_vcpuop_set_mode, | ||
511 | .set_next_event = xen_vcpuop_set_next_event, | ||
512 | }; | ||
513 | |||
514 | static const struct clock_event_device *xen_clockevent = | ||
515 | &xen_timerop_clockevent; | ||
516 | static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events); | ||
517 | |||
518 | static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) | ||
519 | { | ||
520 | struct clock_event_device *evt = &__get_cpu_var(xen_clock_events); | ||
521 | irqreturn_t ret; | ||
522 | |||
523 | ret = IRQ_NONE; | ||
524 | if (evt->event_handler) { | ||
525 | evt->event_handler(evt); | ||
526 | ret = IRQ_HANDLED; | ||
527 | } | ||
528 | |||
529 | do_stolen_accounting(); | ||
530 | |||
531 | return ret; | ||
532 | } | ||
533 | |||
534 | void xen_setup_timer(int cpu) | ||
535 | { | ||
536 | const char *name; | ||
537 | struct clock_event_device *evt; | ||
538 | int irq; | ||
539 | |||
540 | printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu); | ||
541 | |||
542 | name = kasprintf(GFP_KERNEL, "timer%d", cpu); | ||
543 | if (!name) | ||
544 | name = "<timer kasprintf failed>"; | ||
545 | |||
546 | irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, | ||
547 | IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, | ||
548 | name, NULL); | ||
549 | |||
550 | evt = &per_cpu(xen_clock_events, cpu); | ||
551 | memcpy(evt, xen_clockevent, sizeof(*evt)); | ||
552 | |||
553 | evt->cpumask = cpumask_of_cpu(cpu); | ||
554 | evt->irq = irq; | ||
555 | |||
556 | setup_runstate_info(cpu); | ||
557 | } | ||
558 | |||
559 | void xen_setup_cpu_clockevents(void) | ||
560 | { | ||
561 | BUG_ON(preemptible()); | ||
562 | |||
563 | clockevents_register_device(&__get_cpu_var(xen_clock_events)); | ||
564 | } | ||
565 | |||
566 | __init void xen_time_init(void) | ||
567 | { | ||
568 | int cpu = smp_processor_id(); | ||
569 | |||
570 | get_time_values_from_xen(); | ||
571 | |||
572 | clocksource_register(&xen_clocksource); | ||
573 | |||
574 | if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { | ||
575 | /* Successfully turned off 100Hz tick, so we have the | ||
576 | vcpuop-based timer interface */ | ||
577 | printk(KERN_DEBUG "Xen: using vcpuop timer interface\n"); | ||
578 | xen_clockevent = &xen_vcpuop_clockevent; | ||
579 | } | ||
580 | |||
581 | /* Set initial system time with full resolution */ | ||
582 | xen_read_wallclock(&xtime); | ||
583 | set_normalized_timespec(&wall_to_monotonic, | ||
584 | -xtime.tv_sec, -xtime.tv_nsec); | ||
585 | |||
586 | tsc_disable = 0; | ||
587 | |||
588 | xen_setup_timer(cpu); | ||
589 | xen_setup_cpu_clockevents(); | ||
590 | } | ||
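The same writer-versioned snapshot idiom appears three times above (get_time_values_from_xen, get_runstate_snapshot and xen_read_wallclock). Sketched generically over a made-up structure: the writer bumps ->version before and after an update, so an odd value means an update is in flight and a changed value means our copy is stale.

	struct versioned {
		u32 version;	/* odd while the writer is mid-update */
		u64 payload;
	};

	static u64 read_snapshot(const struct versioned *src)
	{
		u32 ver;
		u64 val;

		do {
			ver = src->version;
			rmb();		/* fetch version before data */
			val = src->payload;
			rmb();		/* fetch data before re-checking version */
		} while ((src->version & 1) | (ver ^ src->version));

		return val;
	}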
diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S new file mode 100644 index 000000000000..1a43b60c0c62 --- /dev/null +++ b/arch/i386/xen/xen-asm.S | |||
@@ -0,0 +1,291 @@ | |||
1 | /* | ||
2 | Asm versions of Xen pv-ops, suitable for either direct use or inlining. | ||
3 | The inline versions are the same as the direct-use versions, with the | ||
4 | pre- and post-amble chopped off. | ||
5 | |||
6 | This code is written for size rather than absolute efficiency, | ||
7 | with a view to being able to inline as much as possible. | ||
8 | |||
9 | We only bother with direct forms (ie, vcpu in pda) of the operations | ||
10 | here; the indirect forms are better handled in C, since they're | ||
11 | generally too large to inline anyway. | ||
12 | */ | ||
13 | |||
14 | #include <linux/linkage.h> | ||
15 | |||
16 | #include <asm/asm-offsets.h> | ||
17 | #include <asm/thread_info.h> | ||
18 | #include <asm/percpu.h> | ||
19 | #include <asm/processor-flags.h> | ||
20 | #include <asm/segment.h> | ||
21 | |||
22 | #include <xen/interface/xen.h> | ||
23 | |||
24 | #define RELOC(x, v) .globl x##_reloc; x##_reloc=v | ||
25 | #define ENDPATCH(x) .globl x##_end; x##_end=. | ||
26 | |||
27 | /* Pseudo-flag used for virtual NMI, which we don't implement yet */ | ||
28 | #define XEN_EFLAGS_NMI 0x80000000 | ||
29 | |||
30 | /* | ||
31 | Enable events. This clears the event mask and tests the pending | ||
32 | event status with a single 'and' operation. If there are pending | ||
33 | events, then enter the hypervisor to get them handled. | ||
34 | */ | ||
35 | ENTRY(xen_irq_enable_direct) | ||
36 | /* Clear mask and test pending */ | ||
37 | andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending | ||
38 | /* Preemption here doesn't matter because it will deal with | ||
39 | any pending interrupts. The pending check may end up being | ||
40 | run on the wrong CPU, but that doesn't hurt. */ | ||
41 | jz 1f | ||
42 | 2: call check_events | ||
43 | 1: | ||
44 | ENDPATCH(xen_irq_enable_direct) | ||
45 | ret | ||
46 | ENDPROC(xen_irq_enable_direct) | ||
47 | RELOC(xen_irq_enable_direct, 2b+1) | ||
48 | |||
49 | |||
50 | /* | ||
51 | Disabling events is simply a matter of making the event mask | ||
52 | non-zero. | ||
53 | */ | ||
54 | ENTRY(xen_irq_disable_direct) | ||
55 | movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask | ||
56 | ENDPATCH(xen_irq_disable_direct) | ||
57 | ret | ||
58 | ENDPROC(xen_irq_disable_direct) | ||
59 | RELOC(xen_irq_disable_direct, 0) | ||
60 | |||
61 | /* | ||
62 | (xen_)save_fl is used to get the current interrupt enable status. | ||
63 | Callers expect the status to be in X86_EFLAGS_IF, and other bits | ||
64 | may be set in the return value. We take advantage of this by | ||
65 | making sure that X86_EFLAGS_IF has the right value (and other bits | ||
66 | in that byte are 0), but other bits in the return value are | ||
67 | undefined. We need to toggle the state of the bit, because | ||
68 | Xen and x86 use opposite senses (mask vs enable). | ||
69 | */ | ||
70 | ENTRY(xen_save_fl_direct) | ||
71 | testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask | ||
72 | setz %ah | ||
73 | addb %ah,%ah | ||
74 | ENDPATCH(xen_save_fl_direct) | ||
75 | ret | ||
76 | ENDPROC(xen_save_fl_direct) | ||
77 | RELOC(xen_save_fl_direct, 0) | ||
78 | |||
79 | |||
80 | /* | ||
81 | In principle the caller should be passing us a value returned | ||
82 | from xen_save_fl_direct, but for robustness' sake we test only | ||
83 | the X86_EFLAGS_IF flag rather than the whole byte. After | ||
84 | setting the interrupt mask state, it checks for unmasked | ||
85 | pending events and enters the hypervisor to get them delivered | ||
86 | if so. | ||
87 | */ | ||
88 | ENTRY(xen_restore_fl_direct) | ||
89 | testb $X86_EFLAGS_IF>>8, %ah | ||
90 | setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask | ||
91 | /* Preemption here doesn't matter because it will deal with | ||
92 | any pending interrupts. The pending check may end up being | ||
93 | run on the wrong CPU, but that doesn't hurt. */ | ||
94 | |||
95 | /* check for unmasked and pending */ | ||
96 | cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending | ||
97 | jz 1f | ||
98 | 2: call check_events | ||
99 | 1: | ||
100 | ENDPATCH(xen_restore_fl_direct) | ||
101 | ret | ||
102 | ENDPROC(xen_restore_fl_direct) | ||
103 | RELOC(xen_restore_fl_direct, 2b+1) | ||
104 | |||
105 | /* | ||
106 | This is run where a normal iret would be run, with the same stack setup: | ||
107 | 8: eflags | ||
108 | 4: cs | ||
109 | esp-> 0: eip | ||
110 | |||
111 | This attempts to make sure that any pending events are dealt | ||
112 | with on return to usermode, but there is a small window in | ||
113 | which an event can happen just before entering usermode. If | ||
114 | the nested interrupt ends up setting one of the TIF_WORK_MASK | ||
115 | pending work flags, they will not be tested again before | ||
116 | returning to usermode. This means that a process can end up | ||
117 | with pending work, which will be unprocessed until the process | ||
118 | enters and leaves the kernel again, which could be an | ||
119 | unbounded amount of time. This means that a pending signal or | ||
120 | reschedule event could be indefinitely delayed. | ||
121 | |||
122 | The fix is to notice a nested interrupt in the critical | ||
123 | window, and if one occurs, then fold the nested interrupt into | ||
124 | the current interrupt stack frame, and re-process it | ||
125 | iteratively rather than recursively. This means that it will | ||
126 | exit via the normal path, and all pending work will be dealt | ||
127 | with appropriately. | ||
128 | |||
129 | Because the nested interrupt handler needs to deal with the | ||
130 | current stack state in whatever form it's in, we keep things | ||
131 | simple by only using a single register which is pushed/popped | ||
132 | on the stack. | ||
133 | |||
134 | Non-direct iret could be done in the same way, but it would | ||
135 | require an annoying amount of code duplication. We'll assume | ||
136 | that direct mode will be the common case once the hypervisor | ||
137 | support becomes commonplace. | ||
138 | */ | ||
139 | ENTRY(xen_iret_direct) | ||
140 | /* test eflags for special cases */ | ||
141 | testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) | ||
142 | jnz hyper_iret | ||
143 | |||
144 | push %eax | ||
145 | ESP_OFFSET=4 # bytes pushed onto stack | ||
146 | |||
147 | /* Store vcpu_info pointer for easy access. Do it this | ||
148 | way to avoid having to reload %fs */ | ||
149 | #ifdef CONFIG_SMP | ||
150 | GET_THREAD_INFO(%eax) | ||
151 | movl TI_cpu(%eax),%eax | ||
152 | movl __per_cpu_offset(,%eax,4),%eax | ||
153 | lea per_cpu__xen_vcpu_info(%eax),%eax | ||
154 | #else | ||
155 | movl $per_cpu__xen_vcpu_info, %eax | ||
156 | #endif | ||
157 | |||
158 | /* check IF state we're restoring */ | ||
159 | testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) | ||
160 | |||
161 | /* Maybe enable events. Once this happens we could get a | ||
162 | recursive event, so the critical region starts immediately | ||
163 | afterwards. However, if that happens we don't end up | ||
164 | resuming the code, so we don't have to be worried about | ||
165 | being preempted to another CPU. */ | ||
166 | setz XEN_vcpu_info_mask(%eax) | ||
167 | xen_iret_start_crit: | ||
168 | |||
169 | /* check for unmasked and pending */ | ||
170 | cmpw $0x0001, XEN_vcpu_info_pending(%eax) | ||
171 | |||
172 | /* If there's something pending, mask events again so we | ||
173 | can jump back into xen_hypervisor_callback */ | ||
174 | sete XEN_vcpu_info_mask(%eax) | ||
175 | |||
176 | popl %eax | ||
177 | |||
178 | /* From this point on the registers are restored and the stack | ||
179 | updated, so we don't need to worry about it if we're preempted */ | ||
180 | iret_restore_end: | ||
181 | |||
182 | /* Jump to xen_hypervisor_callback after fixing up the stack. | ||
183 | Events are masked, so jumping out of the critical | ||
184 | region is OK. */ | ||
185 | je xen_hypervisor_callback | ||
186 | |||
187 | iret | ||
188 | xen_iret_end_crit: | ||
189 | |||
190 | hyper_iret: | ||
191 | /* put this out of line since it's very rarely used */ | ||
192 | jmp hypercall_page + __HYPERVISOR_iret * 32 | ||
193 | |||
194 | .globl xen_iret_start_crit, xen_iret_end_crit | ||
195 | |||
196 | /* | ||
197 | This is called by xen_hypervisor_callback in entry.S when it sees | ||
198 | that the EIP at the time of interrupt was between xen_iret_start_crit | ||
199 | and xen_iret_end_crit. We're passed the EIP in %eax so we can do | ||
200 | a more refined determination of what to do. | ||
201 | |||
202 | The stack format at this point is: | ||
203 | ---------------- | ||
204 | ss : (ss/esp may be present if we came from usermode) | ||
205 | esp : | ||
206 | eflags } outer exception info | ||
207 | cs } | ||
208 | eip } | ||
209 | ---------------- <- edi (copy dest) | ||
210 | eax : outer eax if it hasn't been restored | ||
211 | ---------------- | ||
212 | eflags } nested exception info | ||
213 | cs } (no ss/esp because we're nested | ||
214 | eip } from the same ring) | ||
215 | orig_eax }<- esi (copy src) | ||
216 | - - - - - - - - | ||
217 | fs } | ||
218 | es } | ||
219 | ds } SAVE_ALL state | ||
220 | eax } | ||
221 | : : | ||
222 | ebx } | ||
223 | ---------------- | ||
224 | return addr <- esp | ||
225 | ---------------- | ||
226 | |||
227 | In order to deliver the nested exception properly, we need to shift | ||
228 | everything from the return addr up to the error code so it | ||
229 | sits just under the outer exception info. This means that when we | ||
230 | handle the exception, we do it in the context of the outer exception | ||
231 | rather than starting a new one. | ||
232 | |||
233 | The only caveat is that if the outer eax hasn't been | ||
234 | restored yet (ie, it's still on stack), we need to insert | ||
235 | its value into the SAVE_ALL state before going on, since | ||
236 | it's usermode state which we eventually need to restore. | ||
237 | */ | ||
238 | ENTRY(xen_iret_crit_fixup) | ||
239 | /* offsets +4 for return address */ | ||
240 | |||
241 | /* | ||
242 | Paranoia: Make sure we're really coming from kernel space. | ||
243 | One could imagine a case where userspace jumps into the | ||
244 | critical range address, but just before the CPU delivers a GP, | ||
245 | it decides to deliver an interrupt instead. Unlikely? | ||
246 | Definitely. Easy to avoid? Yes. The Intel documents | ||
247 | explicitly say that the reported EIP for a bad jump is the | ||
248 | jump instruction itself, not the destination, but some virtual | ||
249 | environments get this wrong. | ||
250 | */ | ||
251 | movl PT_CS+4(%esp), %ecx | ||
252 | andl $SEGMENT_RPL_MASK, %ecx | ||
253 | cmpl $USER_RPL, %ecx | ||
254 | je 2f | ||
255 | |||
256 | lea PT_ORIG_EAX+4(%esp), %esi | ||
257 | lea PT_EFLAGS+4(%esp), %edi | ||
258 | |||
259 | /* If eip is before iret_restore_end then stack | ||
260 | hasn't been restored yet. */ | ||
261 | cmp $iret_restore_end, %eax | ||
262 | jae 1f | ||
263 | |||
264 | movl 0+4(%edi),%eax /* copy EAX */ | ||
265 | movl %eax, PT_EAX+4(%esp) | ||
266 | |||
267 | lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ | ||
268 | |||
269 | /* set up the copy */ | ||
270 | 1: std | ||
271 | mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ | ||
272 | rep movsl | ||
273 | cld | ||
274 | |||
275 | lea 4(%edi),%esp /* point esp to new frame */ | ||
276 | 2: ret | ||
277 | |||
278 | |||
279 | /* | ||
280 | Force an event check by making a hypercall, | ||
281 | but preserve regs before making the call. | ||
282 | */ | ||
283 | check_events: | ||
284 | push %eax | ||
285 | push %ecx | ||
286 | push %edx | ||
287 | call force_evtchn_callback | ||
288 | pop %edx | ||
289 | pop %ecx | ||
290 | pop %eax | ||
291 | ret | ||
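Rough C equivalents of the save/restore flag operations above, for reference (a sketch only; the real versions are asm precisely so the patching machinery can inline them). They assume the vcpu_info layout from xen/interface/xen.h; note the inverted sense of Xen's mask byte versus the x86 IF bit:

	static unsigned long xen_save_fl_sketch(void)
	{
		struct vcpu_info *vcpu = x86_read_percpu(xen_vcpu);

		/* mask set == events off == IF clear */
		return vcpu->evtchn_upcall_mask ? 0 : X86_EFLAGS_IF;
	}

	static void xen_restore_fl_sketch(unsigned long flags)
	{
		struct vcpu_info *vcpu = x86_read_percpu(xen_vcpu);

		vcpu->evtchn_upcall_mask = (flags & X86_EFLAGS_IF) ? 0 : 1;
		barrier();	/* unmask before testing for pending events */
		if ((flags & X86_EFLAGS_IF) && vcpu->evtchn_upcall_pending)
			force_evtchn_callback();
	}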
diff --git a/arch/i386/xen/xen-head.S b/arch/i386/xen/xen-head.S new file mode 100644 index 000000000000..2998d55a0017 --- /dev/null +++ b/arch/i386/xen/xen-head.S | |||
@@ -0,0 +1,36 @@ | |||
1 | /* Xen-specific pieces of head.S, intended to be included in the right | ||
2 | place in head.S */ | ||
3 | |||
4 | #ifdef CONFIG_XEN | ||
5 | |||
6 | #include <linux/elfnote.h> | ||
7 | #include <asm/boot.h> | ||
8 | #include <xen/interface/elfnote.h> | ||
9 | |||
10 | ENTRY(startup_xen) | ||
11 | movl %esi,xen_start_info | ||
12 | cld | ||
13 | movl $(init_thread_union+THREAD_SIZE),%esp | ||
14 | jmp xen_start_kernel | ||
15 | |||
16 | .pushsection ".bss.page_aligned" | ||
17 | .align PAGE_SIZE_asm | ||
18 | ENTRY(hypercall_page) | ||
19 | .skip 0x1000 | ||
20 | .popsection | ||
21 | |||
22 | ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") | ||
23 | ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") | ||
24 | ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") | ||
25 | ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) | ||
26 | ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) | ||
27 | ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) | ||
28 | ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") | ||
29 | #ifdef CONFIG_X86_PAE | ||
30 | ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") | ||
31 | #else | ||
32 | ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") | ||
33 | #endif | ||
34 | ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") | ||
35 | |||
36 | #endif /* CONFIG_XEN */ | ||
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h new file mode 100644 index 000000000000..b9aaea45f07f --- /dev/null +++ b/arch/i386/xen/xen-ops.h | |||
@@ -0,0 +1,71 @@ | |||
1 | #ifndef XEN_OPS_H | ||
2 | #define XEN_OPS_H | ||
3 | |||
4 | #include <linux/init.h> | ||
5 | |||
6 | /* These are code, but not functions. Defined in entry.S */ | ||
7 | extern const char xen_hypervisor_callback[]; | ||
8 | extern const char xen_failsafe_callback[]; | ||
9 | |||
10 | void xen_copy_trap_info(struct trap_info *traps); | ||
11 | |||
12 | DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); | ||
13 | DECLARE_PER_CPU(unsigned long, xen_cr3); | ||
14 | |||
15 | extern struct start_info *xen_start_info; | ||
16 | extern struct shared_info *HYPERVISOR_shared_info; | ||
17 | |||
18 | char * __init xen_memory_setup(void); | ||
19 | void __init xen_arch_setup(void); | ||
20 | void __init xen_init_IRQ(void); | ||
21 | |||
22 | void xen_setup_timer(int cpu); | ||
23 | void xen_setup_cpu_clockevents(void); | ||
24 | unsigned long xen_cpu_khz(void); | ||
25 | void __init xen_time_init(void); | ||
26 | unsigned long xen_get_wallclock(void); | ||
27 | int xen_set_wallclock(unsigned long time); | ||
28 | unsigned long long xen_sched_clock(void); | ||
29 | |||
30 | void xen_mark_init_mm_pinned(void); | ||
31 | |||
32 | DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); | ||
33 | |||
34 | static inline unsigned xen_get_lazy_mode(void) | ||
35 | { | ||
36 | return x86_read_percpu(xen_lazy_mode); | ||
37 | } | ||
38 | |||
39 | void __init xen_fill_possible_map(void); | ||
40 | |||
41 | void __init xen_setup_vcpu_info_placement(void); | ||
42 | void xen_smp_prepare_boot_cpu(void); | ||
43 | void xen_smp_prepare_cpus(unsigned int max_cpus); | ||
44 | int xen_cpu_up(unsigned int cpu); | ||
45 | void xen_smp_cpus_done(unsigned int max_cpus); | ||
46 | |||
47 | void xen_smp_send_stop(void); | ||
48 | void xen_smp_send_reschedule(int cpu); | ||
49 | int xen_smp_call_function(void (*func)(void *info), void *info, int nonatomic, | ||
50 | int wait); | ||
51 | int xen_smp_call_function_single(int cpu, void (*func)(void *info), void *info, | ||
52 | int nonatomic, int wait); | ||
53 | |||
54 | int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), | ||
55 | void *info, int wait); | ||
56 | |||
57 | |||
58 | /* Declare an asm function, along with symbols needed to make it | ||
59 | inlineable */ | ||
60 | #define DECL_ASM(ret, name, ...) \ | ||
61 | ret name(__VA_ARGS__); \ | ||
62 | extern char name##_end[]; \ | ||
63 | extern char name##_reloc[] | ||
64 | |||
65 | DECL_ASM(void, xen_irq_enable_direct, void); | ||
66 | DECL_ASM(void, xen_irq_disable_direct, void); | ||
67 | DECL_ASM(unsigned long, xen_save_fl_direct, void); | ||
68 | DECL_ASM(void, xen_restore_fl_direct, unsigned long); | ||
69 | |||
70 | void xen_iret_direct(void); | ||
71 | #endif /* XEN_OPS_H */ | ||
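For reference, DECL_ASM(unsigned long, xen_save_fl_direct, void); expands (the use site's semicolon closing the last declaration) to:

	unsigned long xen_save_fl_direct(void);
	extern char xen_save_fl_direct_end[];	/* defined by ENDPATCH() in xen-asm.S */
	extern char xen_save_fl_direct_reloc[];	/* defined by RELOC() in xen-asm.S */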
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index 296d2b0c5d88..fd9aff3f3890 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <asm/io.h> | 6 | #include <asm/io.h> |
7 | #include <asm/processor.h> | 7 | #include <asm/processor.h> |
8 | #include <asm/fcntl.h> | 8 | #include <asm/fcntl.h> |
9 | #include <xen/hvc-console.h> | ||
9 | 10 | ||
10 | /* Simple VGA output */ | 11 | /* Simple VGA output */ |
11 | 12 | ||
@@ -242,6 +243,10 @@ static int __init setup_early_printk(char *buf) | |||
242 | simnow_init(buf + 6); | 243 | simnow_init(buf + 6); |
243 | early_console = &simnow_console; | 244 | early_console = &simnow_console; |
244 | keep_early = 1; | 245 | keep_early = 1; |
246 | #ifdef CONFIG_HVC_XEN | ||
247 | } else if (!strncmp(buf, "xen", 3)) { | ||
248 | early_console = &xenboot_console; | ||
249 | #endif | ||
245 | } | 250 | } |
246 | 251 | ||
247 | if (keep_early) | 252 | if (keep_early) |
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index aa1d15991794..f3fb8174559e 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c | |||
@@ -174,7 +174,7 @@ static void do_mce_trigger(void) | |||
174 | if (events != atomic_read(&mce_logged) && trigger[0]) { | 174 | if (events != atomic_read(&mce_logged) && trigger[0]) { |
175 | /* Small race window, but should be harmless. */ | 175 | /* Small race window, but should be harmless. */ |
176 | atomic_set(&mce_logged, events); | 176 | atomic_set(&mce_logged, events); |
177 | call_usermodehelper(trigger, trigger_argv, NULL, -1); | 177 | call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); |
178 | } | 178 | } |
179 | } | 179 | } |
180 | 180 | ||
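This hunk is one of several in the series replacing the bare -1/0/1 wait
arguments of call_usermodehelper() with named constants. Assuming the new
constants keep the old magic values, the mapping is:

	enum umh_wait {
		UMH_NO_WAIT   = -1,	/* don't wait for the helper at all */
		UMH_WAIT_EXEC =  0,	/* wait for the exec, but not the process */
		UMH_WAIT_PROC =  1,	/* wait for the helper process to complete */
	};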
diff --git a/drivers/Makefile b/drivers/Makefile index 503d82569449..6d9d7fab77f5 100644 --- a/drivers/Makefile +++ b/drivers/Makefile | |||
@@ -15,6 +15,8 @@ obj-$(CONFIG_ACPI) += acpi/ | |||
15 | obj-$(CONFIG_PNP) += pnp/ | 15 | obj-$(CONFIG_PNP) += pnp/ |
16 | obj-$(CONFIG_ARM_AMBA) += amba/ | 16 | obj-$(CONFIG_ARM_AMBA) += amba/ |
17 | 17 | ||
18 | obj-$(CONFIG_XEN) += xen/ | ||
19 | |||
18 | # char/ comes before serial/ etc so that the VT console is the boot-time | 20 | # char/ comes before serial/ etc so that the VT console is the boot-time |
19 | # default. | 21 | # default. |
20 | obj-y += char/ | 22 | obj-y += char/ |
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 88a6fc7fd271..58f1338981bc 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/jiffies.h> | 40 | #include <linux/jiffies.h> |
41 | #include <linux/kmod.h> | 41 | #include <linux/kmod.h> |
42 | #include <linux/seq_file.h> | 42 | #include <linux/seq_file.h> |
43 | #include <linux/reboot.h> | ||
43 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
44 | 45 | ||
45 | #include <acpi/acpi_bus.h> | 46 | #include <acpi/acpi_bus.h> |
@@ -59,7 +60,6 @@ | |||
59 | #define ACPI_THERMAL_NOTIFY_CRITICAL 0xF0 | 60 | #define ACPI_THERMAL_NOTIFY_CRITICAL 0xF0 |
60 | #define ACPI_THERMAL_NOTIFY_HOT 0xF1 | 61 | #define ACPI_THERMAL_NOTIFY_HOT 0xF1 |
61 | #define ACPI_THERMAL_MODE_ACTIVE 0x00 | 62 | #define ACPI_THERMAL_MODE_ACTIVE 0x00 |
62 | #define ACPI_THERMAL_PATH_POWEROFF "/sbin/poweroff" | ||
63 | 63 | ||
64 | #define ACPI_THERMAL_MAX_ACTIVE 10 | 64 | #define ACPI_THERMAL_MAX_ACTIVE 10 |
65 | #define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65 | 65 | #define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65 |
@@ -419,26 +419,6 @@ static int acpi_thermal_get_devices(struct acpi_thermal *tz) | |||
419 | return 0; | 419 | return 0; |
420 | } | 420 | } |
421 | 421 | ||
422 | static int acpi_thermal_call_usermode(char *path) | ||
423 | { | ||
424 | char *argv[2] = { NULL, NULL }; | ||
425 | char *envp[3] = { NULL, NULL, NULL }; | ||
426 | |||
427 | |||
428 | if (!path) | ||
429 | return -EINVAL; | ||
430 | |||
431 | argv[0] = path; | ||
432 | |||
433 | /* minimal command environment */ | ||
434 | envp[0] = "HOME=/"; | ||
435 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | ||
436 | |||
437 | call_usermodehelper(argv[0], argv, envp, 0); | ||
438 | |||
439 | return 0; | ||
440 | } | ||
441 | |||
442 | static int acpi_thermal_critical(struct acpi_thermal *tz) | 422 | static int acpi_thermal_critical(struct acpi_thermal *tz) |
443 | { | 423 | { |
444 | if (!tz || !tz->trips.critical.flags.valid) | 424 | if (!tz || !tz->trips.critical.flags.valid) |
@@ -456,7 +436,7 @@ static int acpi_thermal_critical(struct acpi_thermal *tz) | |||
456 | acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL, | 436 | acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL, |
457 | tz->trips.critical.flags.enabled); | 437 | tz->trips.critical.flags.enabled); |
458 | 438 | ||
459 | acpi_thermal_call_usermode(ACPI_THERMAL_PATH_POWEROFF); | 439 | orderly_poweroff(true); |
460 | 440 | ||
461 | return 0; | 441 | return 0; |
462 | } | 442 | } |
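orderly_poweroff() replaces the open-coded usermode-helper invocation deleted
above. As called here with force = true, it runs the system's configured
userspace poweroff command and, if that fails, is expected to fall back to an
immediate kernel-initiated poweroff, so a critical thermal trip can never be
silently ignored.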
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 8f65b88cf711..a4a311992408 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig | |||
@@ -427,4 +427,13 @@ config XILINX_SYSACE | |||
427 | help | 427 | help |
428 | Include support for the Xilinx SystemACE CompactFlash interface | 428 | Include support for the Xilinx SystemACE CompactFlash interface |
429 | 429 | ||
430 | config XEN_BLKDEV_FRONTEND | ||
431 | tristate "Xen virtual block device support" | ||
432 | depends on XEN | ||
433 | default y | ||
434 | help | ||
435 | This driver implements the front-end of the Xen virtual | ||
436 | block device driver. It communicates with a back-end driver | ||
437 | in another domain which drives the actual block device. | ||
438 | |||
430 | endif # BLK_DEV | 439 | endif # BLK_DEV |
diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 9ee08ab4ffa8..3e31532df0ed 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile | |||
@@ -29,3 +29,4 @@ obj-$(CONFIG_VIODASD) += viodasd.o | |||
29 | obj-$(CONFIG_BLK_DEV_SX8) += sx8.o | 29 | obj-$(CONFIG_BLK_DEV_SX8) += sx8.o |
30 | obj-$(CONFIG_BLK_DEV_UB) += ub.o | 30 | obj-$(CONFIG_BLK_DEV_UB) += ub.o |
31 | 31 | ||
32 | obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o | ||
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c new file mode 100644 index 000000000000..6746c29181f8 --- /dev/null +++ b/drivers/block/xen-blkfront.c | |||
@@ -0,0 +1,988 @@ | |||
1 | /* | ||
2 | * blkfront.c | ||
3 | * | ||
4 | * XenLinux virtual block device driver. | ||
5 | * | ||
6 | * Copyright (c) 2003-2004, Keir Fraser & Steve Hand | ||
7 | * Modifications by Mark A. Williamson are (c) Intel Research Cambridge | ||
8 | * Copyright (c) 2004, Christian Limpach | ||
9 | * Copyright (c) 2004, Andrew Warfield | ||
10 | * Copyright (c) 2005, Christopher Clark | ||
11 | * Copyright (c) 2005, XenSource Ltd | ||
12 | * | ||
13 | * This program is free software; you can redistribute it and/or | ||
14 | * modify it under the terms of the GNU General Public License version 2 | ||
15 | * as published by the Free Software Foundation; or, when distributed | ||
16 | * separately from the Linux kernel or incorporated into other | ||
17 | * software packages, subject to the following license: | ||
18 | * | ||
19 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
20 | * of this source file (the "Software"), to deal in the Software without | ||
21 | * restriction, including without limitation the rights to use, copy, modify, | ||
22 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
23 | * and to permit persons to whom the Software is furnished to do so, subject to | ||
24 | * the following conditions: | ||
25 | * | ||
26 | * The above copyright notice and this permission notice shall be included in | ||
27 | * all copies or substantial portions of the Software. | ||
28 | * | ||
29 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
30 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
31 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
32 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
33 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
34 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
35 | * IN THE SOFTWARE. | ||
36 | */ | ||
37 | |||
38 | #include <linux/interrupt.h> | ||
39 | #include <linux/blkdev.h> | ||
40 | #include <linux/module.h> | ||
41 | |||
42 | #include <xen/xenbus.h> | ||
43 | #include <xen/grant_table.h> | ||
44 | #include <xen/events.h> | ||
45 | #include <xen/page.h> | ||
46 | |||
47 | #include <xen/interface/grant_table.h> | ||
48 | #include <xen/interface/io/blkif.h> | ||
49 | |||
50 | #include <asm/xen/hypervisor.h> | ||
51 | |||
52 | enum blkif_state { | ||
53 | BLKIF_STATE_DISCONNECTED, | ||
54 | BLKIF_STATE_CONNECTED, | ||
55 | BLKIF_STATE_SUSPENDED, | ||
56 | }; | ||
57 | |||
58 | struct blk_shadow { | ||
59 | struct blkif_request req; | ||
60 | unsigned long request; | ||
61 | unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
62 | }; | ||
63 | |||
64 | static struct block_device_operations xlvbd_block_fops; | ||
65 | |||
66 | #define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE) | ||
67 | |||
68 | /* | ||
69 | * We have one of these per vbd, whether ide, scsi or 'other'. They | ||
70 | * hang in private_data off the gendisk structure. We may end up | ||
71 | * putting all kinds of interesting stuff here :-) | ||
72 | */ | ||
73 | struct blkfront_info | ||
74 | { | ||
75 | struct xenbus_device *xbdev; | ||
76 | dev_t dev; | ||
77 | struct gendisk *gd; | ||
78 | int vdevice; | ||
79 | blkif_vdev_t handle; | ||
80 | enum blkif_state connected; | ||
81 | int ring_ref; | ||
82 | struct blkif_front_ring ring; | ||
83 | unsigned int evtchn, irq; | ||
84 | struct request_queue *rq; | ||
85 | struct work_struct work; | ||
86 | struct gnttab_free_callback callback; | ||
87 | struct blk_shadow shadow[BLK_RING_SIZE]; | ||
88 | unsigned long shadow_free; | ||
89 | int feature_barrier; | ||
90 | |||
91 | /** | ||
92 | * The number of people holding this device open. We won't allow a | ||
93 | * hot-unplug unless this is 0. | ||
94 | */ | ||
95 | int users; | ||
96 | }; | ||
97 | |||
98 | static DEFINE_SPINLOCK(blkif_io_lock); | ||
99 | |||
100 | #define MAXIMUM_OUTSTANDING_BLOCK_REQS \ | ||
101 | (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) | ||
102 | #define GRANT_INVALID_REF 0 | ||
103 | |||
104 | #define PARTS_PER_DISK 16 | ||
105 | |||
106 | #define BLKIF_MAJOR(dev) ((dev)>>8) | ||
107 | #define BLKIF_MINOR(dev) ((dev) & 0xff) | ||
108 | |||
109 | #define DEV_NAME "xvd" /* name in /dev */ | ||
110 | |||
111 | /* Information about our VBDs. */ | ||
112 | #define MAX_VBDS 64 | ||
113 | static LIST_HEAD(vbds_list); | ||
114 | |||
115 | static int get_id_from_freelist(struct blkfront_info *info) | ||
116 | { | ||
117 | unsigned long free = info->shadow_free; | ||
118 | BUG_ON(free > BLK_RING_SIZE); | ||
119 | info->shadow_free = info->shadow[free].req.id; | ||
120 | info->shadow[free].req.id = 0x0fffffee; /* debug */ | ||
121 | return free; | ||
122 | } | ||
123 | |||
124 | static void add_id_to_freelist(struct blkfront_info *info, | ||
125 | unsigned long id) | ||
126 | { | ||
127 | info->shadow[id].req.id = info->shadow_free; | ||
128 | info->shadow[id].request = 0; | ||
129 | info->shadow_free = id; | ||
130 | } | ||
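/* Note: the free list is threaded through the shadow entries themselves:
 * a free entry's req.id holds the index of the next free entry, and
 * info->shadow_free holds the head, so both operations above are O(1)
 * with no additional storage. */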
131 | |||
132 | static void blkif_restart_queue_callback(void *arg) | ||
133 | { | ||
134 | struct blkfront_info *info = (struct blkfront_info *)arg; | ||
135 | schedule_work(&info->work); | ||
136 | } | ||
137 | |||
138 | /* | ||
139 | * blkif_queue_request | ||
140 | * | ||
141 | * Translate a struct request into shared-ring slots plus grant | ||
142 | * references and queue it for the backend. | ||
143 | * | ||
144 | * Returns 0 if the request was queued, or 1 if the caller should | ||
145 | * requeue it and stop the queue (ring not connected, ring full, or | ||
146 | * no grant references available). | ||
147 | */ | ||
148 | static int blkif_queue_request(struct request *req) | ||
149 | { | ||
150 | struct blkfront_info *info = req->rq_disk->private_data; | ||
151 | unsigned long buffer_mfn; | ||
152 | struct blkif_request *ring_req; | ||
153 | struct bio *bio; | ||
154 | struct bio_vec *bvec; | ||
155 | int idx; | ||
156 | unsigned long id; | ||
157 | unsigned int fsect, lsect; | ||
158 | int ref; | ||
159 | grant_ref_t gref_head; | ||
160 | |||
161 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) | ||
162 | return 1; | ||
163 | |||
164 | if (gnttab_alloc_grant_references( | ||
165 | BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) { | ||
166 | gnttab_request_free_callback( | ||
167 | &info->callback, | ||
168 | blkif_restart_queue_callback, | ||
169 | info, | ||
170 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
171 | return 1; | ||
172 | } | ||
173 | |||
174 | /* Fill out a communications ring structure. */ | ||
175 | ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); | ||
176 | id = get_id_from_freelist(info); | ||
177 | info->shadow[id].request = (unsigned long)req; | ||
178 | |||
179 | ring_req->id = id; | ||
180 | ring_req->sector_number = (blkif_sector_t)req->sector; | ||
181 | ring_req->handle = info->handle; | ||
182 | |||
183 | ring_req->operation = rq_data_dir(req) ? | ||
184 | BLKIF_OP_WRITE : BLKIF_OP_READ; | ||
185 | if (blk_barrier_rq(req)) | ||
186 | ring_req->operation = BLKIF_OP_WRITE_BARRIER; | ||
187 | |||
188 | ring_req->nr_segments = 0; | ||
189 | rq_for_each_bio (bio, req) { | ||
190 | bio_for_each_segment (bvec, bio, idx) { | ||
191 | BUG_ON(ring_req->nr_segments | ||
192 | == BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
193 | buffer_mfn = pfn_to_mfn(page_to_pfn(bvec->bv_page)); | ||
194 | fsect = bvec->bv_offset >> 9; | ||
195 | lsect = fsect + (bvec->bv_len >> 9) - 1; | ||
196 | /* install a grant reference. */ | ||
197 | ref = gnttab_claim_grant_reference(&gref_head); | ||
198 | BUG_ON(ref == -ENOSPC); | ||
199 | |||
200 | gnttab_grant_foreign_access_ref( | ||
201 | ref, | ||
202 | info->xbdev->otherend_id, | ||
203 | buffer_mfn, | ||
204 | rq_data_dir(req) ); | ||
205 | |||
206 | info->shadow[id].frame[ring_req->nr_segments] = | ||
207 | mfn_to_pfn(buffer_mfn); | ||
208 | |||
209 | ring_req->seg[ring_req->nr_segments] = | ||
210 | (struct blkif_request_segment) { | ||
211 | .gref = ref, | ||
212 | .first_sect = fsect, | ||
213 | .last_sect = lsect }; | ||
214 | |||
215 | ring_req->nr_segments++; | ||
216 | } | ||
217 | } | ||
218 | |||
219 | info->ring.req_prod_pvt++; | ||
220 | |||
221 | /* Keep a private copy so we can reissue requests when recovering. */ | ||
222 | info->shadow[id].req = *ring_req; | ||
223 | |||
224 | gnttab_free_grant_references(gref_head); | ||
225 | |||
226 | return 0; | ||
227 | } | ||
228 | |||
229 | |||
230 | static inline void flush_requests(struct blkfront_info *info) | ||
231 | { | ||
232 | int notify; | ||
233 | |||
234 | RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); | ||
235 | |||
236 | if (notify) | ||
237 | notify_remote_via_irq(info->irq); | ||
238 | } | ||
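/* RING_PUSH_REQUESTS_AND_CHECK_NOTIFY publishes req_prod and sets 'notify'
 * only if the backend may have stopped polling before the new requests
 * became visible (the macro compares the producer advance against the
 * backend's advertised req_event), so most submissions avoid a hypercall. */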
239 | |||
240 | /* | ||
241 | * do_blkif_request | ||
242 | * read a block; request is in a request queue | ||
243 | */ | ||
244 | static void do_blkif_request(request_queue_t *rq) | ||
245 | { | ||
246 | struct blkfront_info *info = NULL; | ||
247 | struct request *req; | ||
248 | int queued; | ||
249 | |||
250 | pr_debug("Entered do_blkif_request\n"); | ||
251 | |||
252 | queued = 0; | ||
253 | |||
254 | while ((req = elv_next_request(rq)) != NULL) { | ||
255 | info = req->rq_disk->private_data; | ||
256 | if (!blk_fs_request(req)) { | ||
257 | end_request(req, 0); | ||
258 | continue; | ||
259 | } | ||
260 | |||
261 | if (RING_FULL(&info->ring)) | ||
262 | goto wait; | ||
263 | |||
264 | pr_debug("do_blk_req %p: cmd %p, sec %lx, " | ||
265 | "(%u/%li) buffer:%p [%s]\n", | ||
266 | req, req->cmd, (unsigned long)req->sector, | ||
267 | req->current_nr_sectors, | ||
268 | req->nr_sectors, req->buffer, | ||
269 | rq_data_dir(req) ? "write" : "read"); | ||
270 | |||
271 | |||
272 | blkdev_dequeue_request(req); | ||
273 | if (blkif_queue_request(req)) { | ||
274 | blk_requeue_request(rq, req); | ||
275 | wait: | ||
276 | /* Avoid pointless unplugs. */ | ||
277 | blk_stop_queue(rq); | ||
278 | break; | ||
279 | } | ||
280 | |||
281 | queued++; | ||
282 | } | ||
283 | |||
284 | if (queued != 0) | ||
285 | flush_requests(info); | ||
286 | } | ||
287 | |||
288 | static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) | ||
289 | { | ||
290 | request_queue_t *rq; | ||
291 | |||
292 | rq = blk_init_queue(do_blkif_request, &blkif_io_lock); | ||
293 | if (rq == NULL) | ||
294 | return -1; | ||
295 | |||
296 | elevator_init(rq, "noop"); | ||
297 | |||
298 | /* Hard sector size and max sectors impersonate the equiv. hardware. */ | ||
299 | blk_queue_hardsect_size(rq, sector_size); | ||
300 | blk_queue_max_sectors(rq, 512); | ||
301 | |||
302 | /* Each segment in a request is up to an aligned page in size. */ | ||
303 | blk_queue_segment_boundary(rq, PAGE_SIZE - 1); | ||
304 | blk_queue_max_segment_size(rq, PAGE_SIZE); | ||
305 | |||
306 | /* Ensure a merged request will fit in a single I/O ring slot. */ | ||
307 | blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
308 | blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
309 | |||
310 | /* Make sure buffer addresses are sector-aligned. */ | ||
311 | blk_queue_dma_alignment(rq, 511); | ||
312 | |||
313 | gd->queue = rq; | ||
314 | |||
315 | return 0; | ||
316 | } | ||
317 | |||
318 | |||
319 | static int xlvbd_barrier(struct blkfront_info *info) | ||
320 | { | ||
321 | int err; | ||
322 | |||
323 | err = blk_queue_ordered(info->rq, | ||
324 | info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE, | ||
325 | NULL); | ||
326 | |||
327 | if (err) | ||
328 | return err; | ||
329 | |||
330 | printk(KERN_INFO "blkfront: %s: barriers %s\n", | ||
331 | info->gd->disk_name, | ||
332 | info->feature_barrier ? "enabled" : "disabled"); | ||
333 | return 0; | ||
334 | } | ||
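/* QUEUE_ORDERED_DRAIN asks the block layer to order a barrier by draining
 * previously queued requests around it, rather than using device-side
 * ordered tags or cache flushes; QUEUE_ORDERED_NONE turns barrier support
 * back off when the backend reports it cannot honour barriers. */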
335 | |||
336 | |||
337 | static int xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, | ||
338 | int vdevice, u16 vdisk_info, u16 sector_size, | ||
339 | struct blkfront_info *info) | ||
340 | { | ||
341 | struct gendisk *gd; | ||
342 | int nr_minors = 1; | ||
343 | int err = -ENODEV; | ||
344 | |||
345 | BUG_ON(info->gd != NULL); | ||
346 | BUG_ON(info->rq != NULL); | ||
347 | |||
348 | if ((minor % PARTS_PER_DISK) == 0) | ||
349 | nr_minors = PARTS_PER_DISK; | ||
350 | |||
351 | gd = alloc_disk(nr_minors); | ||
352 | if (gd == NULL) | ||
353 | goto out; | ||
354 | |||
355 | if (nr_minors > 1) | ||
356 | sprintf(gd->disk_name, "%s%c", DEV_NAME, | ||
357 | 'a' + minor / PARTS_PER_DISK); | ||
358 | else | ||
359 | sprintf(gd->disk_name, "%s%c%d", DEV_NAME, | ||
360 | 'a' + minor / PARTS_PER_DISK, | ||
361 | minor % PARTS_PER_DISK); | ||
362 | |||
363 | gd->major = XENVBD_MAJOR; | ||
364 | gd->first_minor = minor; | ||
365 | gd->fops = &xlvbd_block_fops; | ||
366 | gd->private_data = info; | ||
367 | gd->driverfs_dev = &(info->xbdev->dev); | ||
368 | set_capacity(gd, capacity); | ||
369 | |||
370 | if (xlvbd_init_blk_queue(gd, sector_size)) { | ||
371 | del_gendisk(gd); | ||
372 | goto out; | ||
373 | } | ||
374 | |||
375 | info->rq = gd->queue; | ||
376 | info->gd = gd; | ||
377 | |||
378 | if (info->feature_barrier) | ||
379 | xlvbd_barrier(info); | ||
380 | |||
381 | if (vdisk_info & VDISK_READONLY) | ||
382 | set_disk_ro(gd, 1); | ||
383 | |||
384 | if (vdisk_info & VDISK_REMOVABLE) | ||
385 | gd->flags |= GENHD_FL_REMOVABLE; | ||
386 | |||
387 | if (vdisk_info & VDISK_CDROM) | ||
388 | gd->flags |= GENHD_FL_CD; | ||
389 | |||
390 | return 0; | ||
391 | |||
392 | out: | ||
393 | return err; | ||
394 | } | ||
395 | |||
396 | static void kick_pending_request_queues(struct blkfront_info *info) | ||
397 | { | ||
398 | if (!RING_FULL(&info->ring)) { | ||
399 | /* Re-enable calldowns. */ | ||
400 | blk_start_queue(info->rq); | ||
401 | /* Kick things off immediately. */ | ||
402 | do_blkif_request(info->rq); | ||
403 | } | ||
404 | } | ||
405 | |||
406 | static void blkif_restart_queue(struct work_struct *work) | ||
407 | { | ||
408 | struct blkfront_info *info = container_of(work, struct blkfront_info, work); | ||
409 | |||
410 | spin_lock_irq(&blkif_io_lock); | ||
411 | if (info->connected == BLKIF_STATE_CONNECTED) | ||
412 | kick_pending_request_queues(info); | ||
413 | spin_unlock_irq(&blkif_io_lock); | ||
414 | } | ||
415 | |||
416 | static void blkif_free(struct blkfront_info *info, int suspend) | ||
417 | { | ||
418 | /* Prevent new requests being issued until we fix things up. */ | ||
419 | spin_lock_irq(&blkif_io_lock); | ||
420 | info->connected = suspend ? | ||
421 | BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; | ||
422 | /* No more blkif_request(). */ | ||
423 | if (info->rq) | ||
424 | blk_stop_queue(info->rq); | ||
425 | /* No more gnttab callback work. */ | ||
426 | gnttab_cancel_free_callback(&info->callback); | ||
427 | spin_unlock_irq(&blkif_io_lock); | ||
428 | |||
429 | /* Flush gnttab callback work. Must be done with no locks held. */ | ||
430 | flush_scheduled_work(); | ||
431 | |||
432 | /* Free resources associated with old device channel. */ | ||
433 | if (info->ring_ref != GRANT_INVALID_REF) { | ||
434 | gnttab_end_foreign_access(info->ring_ref, 0, | ||
435 | (unsigned long)info->ring.sring); | ||
436 | info->ring_ref = GRANT_INVALID_REF; | ||
437 | info->ring.sring = NULL; | ||
438 | } | ||
439 | if (info->irq) | ||
440 | unbind_from_irqhandler(info->irq, info); | ||
441 | info->evtchn = info->irq = 0; | ||
442 | |||
443 | } | ||
444 | |||
445 | static void blkif_completion(struct blk_shadow *s) | ||
446 | { | ||
447 | int i; | ||
448 | for (i = 0; i < s->req.nr_segments; i++) | ||
449 | gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL); | ||
450 | } | ||
451 | |||
452 | static irqreturn_t blkif_interrupt(int irq, void *dev_id) | ||
453 | { | ||
454 | struct request *req; | ||
455 | struct blkif_response *bret; | ||
456 | RING_IDX i, rp; | ||
457 | unsigned long flags; | ||
458 | struct blkfront_info *info = (struct blkfront_info *)dev_id; | ||
459 | int uptodate; | ||
460 | |||
461 | spin_lock_irqsave(&blkif_io_lock, flags); | ||
462 | |||
463 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { | ||
464 | spin_unlock_irqrestore(&blkif_io_lock, flags); | ||
465 | return IRQ_HANDLED; | ||
466 | } | ||
467 | |||
468 | again: | ||
469 | rp = info->ring.sring->rsp_prod; | ||
470 | rmb(); /* Ensure we see queued responses up to 'rp'. */ | ||
471 | |||
472 | for (i = info->ring.rsp_cons; i != rp; i++) { | ||
473 | unsigned long id; | ||
474 | int ret; | ||
475 | |||
476 | bret = RING_GET_RESPONSE(&info->ring, i); | ||
477 | id = bret->id; | ||
478 | req = (struct request *)info->shadow[id].request; | ||
479 | |||
480 | blkif_completion(&info->shadow[id]); | ||
481 | |||
482 | add_id_to_freelist(info, id); | ||
483 | |||
484 | uptodate = (bret->status == BLKIF_RSP_OKAY); | ||
485 | switch (bret->operation) { | ||
486 | case BLKIF_OP_WRITE_BARRIER: | ||
487 | if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { | ||
488 | printk(KERN_WARNING "blkfront: %s: write barrier op failed\n", | ||
489 | info->gd->disk_name); | ||
490 | uptodate = -EOPNOTSUPP; | ||
491 | info->feature_barrier = 0; | ||
492 | xlvbd_barrier(info); | ||
493 | } | ||
494 | /* fall through */ | ||
495 | case BLKIF_OP_READ: | ||
496 | case BLKIF_OP_WRITE: | ||
497 | if (unlikely(bret->status != BLKIF_RSP_OKAY)) | ||
498 | dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " | ||
499 | "request: %x\n", bret->status); | ||
500 | |||
501 | ret = end_that_request_first(req, uptodate, | ||
502 | req->hard_nr_sectors); | ||
503 | BUG_ON(ret); | ||
504 | end_that_request_last(req, uptodate); | ||
505 | break; | ||
506 | default: | ||
507 | BUG(); | ||
508 | } | ||
509 | } | ||
510 | |||
511 | info->ring.rsp_cons = i; | ||
512 | |||
513 | if (i != info->ring.req_prod_pvt) { | ||
514 | int more_to_do; | ||
515 | RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); | ||
516 | if (more_to_do) | ||
517 | goto again; | ||
518 | } else | ||
519 | info->ring.sring->rsp_event = i + 1; | ||
520 | |||
521 | kick_pending_request_queues(info); | ||
522 | |||
523 | spin_unlock_irqrestore(&blkif_io_lock, flags); | ||
524 | |||
525 | return IRQ_HANDLED; | ||
526 | } | ||
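/* The 'again' loop plus RING_FINAL_CHECK_FOR_RESPONSES closes the race in
 * which the backend posts a response after our scan but before rsp_event
 * is re-armed; without the re-check that response would not raise another
 * event until unrelated traffic arrived. */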
527 | |||
528 | |||
529 | static int setup_blkring(struct xenbus_device *dev, | ||
530 | struct blkfront_info *info) | ||
531 | { | ||
532 | struct blkif_sring *sring; | ||
533 | int err; | ||
534 | |||
535 | info->ring_ref = GRANT_INVALID_REF; | ||
536 | |||
537 | sring = (struct blkif_sring *)__get_free_page(GFP_KERNEL); | ||
538 | if (!sring) { | ||
539 | xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); | ||
540 | return -ENOMEM; | ||
541 | } | ||
542 | SHARED_RING_INIT(sring); | ||
543 | FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); | ||
544 | |||
545 | err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); | ||
546 | if (err < 0) { | ||
547 | free_page((unsigned long)sring); | ||
548 | info->ring.sring = NULL; | ||
549 | goto fail; | ||
550 | } | ||
551 | info->ring_ref = err; | ||
552 | |||
553 | err = xenbus_alloc_evtchn(dev, &info->evtchn); | ||
554 | if (err) | ||
555 | goto fail; | ||
556 | |||
557 | err = bind_evtchn_to_irqhandler(info->evtchn, | ||
558 | blkif_interrupt, | ||
559 | IRQF_SAMPLE_RANDOM, "blkif", info); | ||
560 | if (err <= 0) { | ||
561 | xenbus_dev_fatal(dev, err, | ||
562 | "bind_evtchn_to_irqhandler failed"); | ||
563 | goto fail; | ||
564 | } | ||
565 | info->irq = err; | ||
566 | |||
567 | return 0; | ||
568 | fail: | ||
569 | blkif_free(info, 0); | ||
570 | return err; | ||
571 | } | ||
572 | |||
573 | |||
574 | /* Common code used when first setting up, and when resuming. */ | ||
575 | static int talk_to_backend(struct xenbus_device *dev, | ||
576 | struct blkfront_info *info) | ||
577 | { | ||
578 | const char *message = NULL; | ||
579 | struct xenbus_transaction xbt; | ||
580 | int err; | ||
581 | |||
582 | /* Create shared ring, alloc event channel. */ | ||
583 | err = setup_blkring(dev, info); | ||
584 | if (err) | ||
585 | goto out; | ||
586 | |||
587 | again: | ||
588 | err = xenbus_transaction_start(&xbt); | ||
589 | if (err) { | ||
590 | xenbus_dev_fatal(dev, err, "starting transaction"); | ||
591 | goto destroy_blkring; | ||
592 | } | ||
593 | |||
594 | err = xenbus_printf(xbt, dev->nodename, | ||
595 | "ring-ref", "%u", info->ring_ref); | ||
596 | if (err) { | ||
597 | message = "writing ring-ref"; | ||
598 | goto abort_transaction; | ||
599 | } | ||
600 | err = xenbus_printf(xbt, dev->nodename, | ||
601 | "event-channel", "%u", info->evtchn); | ||
602 | if (err) { | ||
603 | message = "writing event-channel"; | ||
604 | goto abort_transaction; | ||
605 | } | ||
606 | |||
607 | err = xenbus_transaction_end(xbt, 0); | ||
608 | if (err) { | ||
609 | if (err == -EAGAIN) | ||
610 | goto again; | ||
611 | xenbus_dev_fatal(dev, err, "completing transaction"); | ||
612 | goto destroy_blkring; | ||
613 | } | ||
614 | |||
615 | xenbus_switch_state(dev, XenbusStateInitialised); | ||
616 | |||
617 | return 0; | ||
618 | |||
619 | abort_transaction: | ||
620 | xenbus_transaction_end(xbt, 1); | ||
621 | if (message) | ||
622 | xenbus_dev_fatal(dev, err, "%s", message); | ||
623 | destroy_blkring: | ||
624 | blkif_free(info, 0); | ||
625 | out: | ||
626 | return err; | ||
627 | } | ||
628 | |||
629 | |||
630 | /** | ||
631 | * Entry point to this code when a new device is created. Allocate the basic | ||
632 | * structures and the ring buffer for communication with the backend, and | ||
633 | * inform the backend of the appropriate details for those. Switch to | ||
634 | * Initialised state. | ||
635 | */ | ||
636 | static int blkfront_probe(struct xenbus_device *dev, | ||
637 | const struct xenbus_device_id *id) | ||
638 | { | ||
639 | int err, vdevice, i; | ||
640 | struct blkfront_info *info; | ||
641 | |||
642 | /* FIXME: Use dynamic device id if this is not set. */ | ||
643 | err = xenbus_scanf(XBT_NIL, dev->nodename, | ||
644 | "virtual-device", "%i", &vdevice); | ||
645 | if (err != 1) { | ||
646 | xenbus_dev_fatal(dev, err, "reading virtual-device"); | ||
647 | return err; | ||
648 | } | ||
649 | |||
650 | info = kzalloc(sizeof(*info), GFP_KERNEL); | ||
651 | if (!info) { | ||
652 | xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); | ||
653 | return -ENOMEM; | ||
654 | } | ||
655 | |||
656 | info->xbdev = dev; | ||
657 | info->vdevice = vdevice; | ||
658 | info->connected = BLKIF_STATE_DISCONNECTED; | ||
659 | INIT_WORK(&info->work, blkif_restart_queue); | ||
660 | |||
661 | for (i = 0; i < BLK_RING_SIZE; i++) | ||
662 | info->shadow[i].req.id = i+1; | ||
663 | info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; | ||
664 | |||
665 | /* Front end dir is a number, which is used as the id. */ | ||
666 | info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); | ||
667 | dev->dev.driver_data = info; | ||
668 | |||
669 | err = talk_to_backend(dev, info); | ||
670 | if (err) { | ||
671 | kfree(info); | ||
672 | dev->dev.driver_data = NULL; | ||
673 | return err; | ||
674 | } | ||
675 | |||
676 | return 0; | ||
677 | } | ||
678 | |||
679 | |||
680 | static int blkif_recover(struct blkfront_info *info) | ||
681 | { | ||
682 | int i; | ||
683 | struct blkif_request *req; | ||
684 | struct blk_shadow *copy; | ||
685 | int j; | ||
686 | |||
687 | /* Stage 1: Make a safe copy of the shadow state. */ | ||
688 | copy = kmalloc(sizeof(info->shadow), GFP_KERNEL); | ||
689 | if (!copy) | ||
690 | return -ENOMEM; | ||
691 | memcpy(copy, info->shadow, sizeof(info->shadow)); | ||
692 | |||
693 | /* Stage 2: Set up free list. */ | ||
694 | memset(&info->shadow, 0, sizeof(info->shadow)); | ||
695 | for (i = 0; i < BLK_RING_SIZE; i++) | ||
696 | info->shadow[i].req.id = i+1; | ||
697 | info->shadow_free = info->ring.req_prod_pvt; | ||
698 | info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; | ||
699 | |||
700 | /* Stage 3: Find pending requests and requeue them. */ | ||
701 | for (i = 0; i < BLK_RING_SIZE; i++) { | ||
702 | /* Not in use? */ | ||
703 | if (copy[i].request == 0) | ||
704 | continue; | ||
705 | |||
706 | /* Grab a request slot and copy shadow state into it. */ | ||
707 | req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); | ||
708 | *req = copy[i].req; | ||
709 | |||
710 | /* We get a new request id, and must reset the shadow state. */ | ||
711 | req->id = get_id_from_freelist(info); | ||
712 | memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i])); | ||
713 | |||
714 | /* Rewrite any grant references invalidated by susp/resume. */ | ||
715 | for (j = 0; j < req->nr_segments; j++) | ||
716 | gnttab_grant_foreign_access_ref( | ||
717 | req->seg[j].gref, | ||
718 | info->xbdev->otherend_id, | ||
719 | pfn_to_mfn(info->shadow[req->id].frame[j]), | ||
720 | rq_data_dir( | ||
721 | (struct request *) | ||
722 | info->shadow[req->id].request)); | ||
723 | info->shadow[req->id].req = *req; | ||
724 | |||
725 | info->ring.req_prod_pvt++; | ||
726 | } | ||
727 | |||
728 | kfree(copy); | ||
729 | |||
730 | xenbus_switch_state(info->xbdev, XenbusStateConnected); | ||
731 | |||
732 | spin_lock_irq(&blkif_io_lock); | ||
733 | |||
734 | /* Now safe for us to use the shared ring */ | ||
735 | info->connected = BLKIF_STATE_CONNECTED; | ||
736 | |||
737 | /* Send off requeued requests */ | ||
738 | flush_requests(info); | ||
739 | |||
740 | /* Kick any other new requests queued since we resumed */ | ||
741 | kick_pending_request_queues(info); | ||
742 | |||
743 | spin_unlock_irq(&blkif_io_lock); | ||
744 | |||
745 | return 0; | ||
746 | } | ||
747 | |||
748 | /** | ||
749 | * We are reconnecting to the backend, due to a suspend/resume, or a backend | ||
750 | * driver restart. We tear down our blkif structure and recreate it, but | ||
751 | * leave the device-layer structures intact so that this is transparent to the | ||
752 | * rest of the kernel. | ||
753 | */ | ||
754 | static int blkfront_resume(struct xenbus_device *dev) | ||
755 | { | ||
756 | struct blkfront_info *info = dev->dev.driver_data; | ||
757 | int err; | ||
758 | |||
759 | dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename); | ||
760 | |||
761 | blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); | ||
762 | |||
763 | err = talk_to_backend(dev, info); | ||
764 | if (info->connected == BLKIF_STATE_SUSPENDED && !err) | ||
765 | err = blkif_recover(info); | ||
766 | |||
767 | return err; | ||
768 | } | ||
769 | |||
770 | |||
771 | /* | ||
772 | * Invoked when the backend is finally 'ready' (and has produced | ||
773 | * the details about the physical device - #sectors, size, etc). | ||
774 | */ | ||
775 | static void blkfront_connect(struct blkfront_info *info) | ||
776 | { | ||
777 | unsigned long long sectors; | ||
778 | unsigned long sector_size; | ||
779 | unsigned int binfo; | ||
780 | int err; | ||
781 | |||
782 | if ((info->connected == BLKIF_STATE_CONNECTED) || | ||
783 | (info->connected == BLKIF_STATE_SUSPENDED)) | ||
784 | return; | ||
785 | |||
786 | dev_dbg(&info->xbdev->dev, "%s:%s.\n", | ||
787 | __func__, info->xbdev->otherend); | ||
788 | |||
789 | err = xenbus_gather(XBT_NIL, info->xbdev->otherend, | ||
790 | "sectors", "%llu", §ors, | ||
791 | "info", "%u", &binfo, | ||
792 | "sector-size", "%lu", §or_size, | ||
793 | NULL); | ||
794 | if (err) { | ||
795 | xenbus_dev_fatal(info->xbdev, err, | ||
796 | "reading backend fields at %s", | ||
797 | info->xbdev->otherend); | ||
798 | return; | ||
799 | } | ||
800 | |||
801 | err = xenbus_gather(XBT_NIL, info->xbdev->otherend, | ||
802 | "feature-barrier", "%lu", &info->feature_barrier, | ||
803 | NULL); | ||
804 | if (err) | ||
805 | info->feature_barrier = 0; | ||
806 | |||
807 | err = xlvbd_alloc_gendisk(BLKIF_MINOR(info->vdevice), | ||
808 | sectors, info->vdevice, | ||
809 | binfo, sector_size, info); | ||
810 | if (err) { | ||
811 | xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", | ||
812 | info->xbdev->otherend); | ||
813 | return; | ||
814 | } | ||
815 | |||
816 | xenbus_switch_state(info->xbdev, XenbusStateConnected); | ||
817 | |||
818 | /* Kick pending requests. */ | ||
819 | spin_lock_irq(&blkif_io_lock); | ||
820 | info->connected = BLKIF_STATE_CONNECTED; | ||
821 | kick_pending_request_queues(info); | ||
822 | spin_unlock_irq(&blkif_io_lock); | ||
823 | |||
824 | add_disk(info->gd); | ||
825 | } | ||
826 | |||
827 | /** | ||
828 | * Handle the change of state of the backend to Closing. We must delete our | ||
829 | * device-layer structures now, to ensure that writes are flushed through to | ||
830 | * the backend. Once this is done, we can switch to Closed in | ||
831 | * acknowledgement. | ||
832 | */ | ||
833 | static void blkfront_closing(struct xenbus_device *dev) | ||
834 | { | ||
835 | struct blkfront_info *info = dev->dev.driver_data; | ||
836 | unsigned long flags; | ||
837 | |||
838 | dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename); | ||
839 | |||
840 | if (info->rq == NULL) | ||
841 | goto out; | ||
842 | |||
843 | spin_lock_irqsave(&blkif_io_lock, flags); | ||
844 | |||
845 | del_gendisk(info->gd); | ||
846 | |||
847 | /* No more blkif_request(). */ | ||
848 | blk_stop_queue(info->rq); | ||
849 | |||
850 | /* No more gnttab callback work. */ | ||
851 | gnttab_cancel_free_callback(&info->callback); | ||
852 | spin_unlock_irqrestore(&blkif_io_lock, flags); | ||
853 | |||
854 | /* Flush gnttab callback work. Must be done with no locks held. */ | ||
855 | flush_scheduled_work(); | ||
856 | |||
857 | blk_cleanup_queue(info->rq); | ||
858 | info->rq = NULL; | ||
859 | |||
860 | out: | ||
861 | xenbus_frontend_closed(dev); | ||
862 | } | ||
863 | |||
864 | /** | ||
865 | * Callback received when the backend's state changes. | ||
866 | */ | ||
867 | static void backend_changed(struct xenbus_device *dev, | ||
868 | enum xenbus_state backend_state) | ||
869 | { | ||
870 | struct blkfront_info *info = dev->dev.driver_data; | ||
871 | struct block_device *bd; | ||
872 | |||
873 | dev_dbg(&dev->dev, "blkfront:backend_changed.\n"); | ||
874 | |||
875 | switch (backend_state) { | ||
876 | case XenbusStateInitialising: | ||
877 | case XenbusStateInitWait: | ||
878 | case XenbusStateInitialised: | ||
879 | case XenbusStateUnknown: | ||
880 | case XenbusStateClosed: | ||
881 | break; | ||
882 | |||
883 | case XenbusStateConnected: | ||
884 | blkfront_connect(info); | ||
885 | break; | ||
886 | |||
887 | case XenbusStateClosing: | ||
888 | bd = bdget(info->dev); | ||
889 | if (bd == NULL) | ||
890 | xenbus_dev_fatal(dev, -ENODEV, "bdget failed"); | ||
891 | |||
892 | mutex_lock(&bd->bd_mutex); | ||
893 | if (info->users > 0) | ||
894 | xenbus_dev_error(dev, -EBUSY, | ||
895 | "Device in use; refusing to close"); | ||
896 | else | ||
897 | blkfront_closing(dev); | ||
898 | mutex_unlock(&bd->bd_mutex); | ||
899 | bdput(bd); | ||
900 | break; | ||
901 | } | ||
902 | } | ||
903 | |||
904 | static int blkfront_remove(struct xenbus_device *dev) | ||
905 | { | ||
906 | struct blkfront_info *info = dev->dev.driver_data; | ||
907 | |||
908 | dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename); | ||
909 | |||
910 | blkif_free(info, 0); | ||
911 | |||
912 | kfree(info); | ||
913 | |||
914 | return 0; | ||
915 | } | ||
916 | |||
917 | static int blkif_open(struct inode *inode, struct file *filep) | ||
918 | { | ||
919 | struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; | ||
920 | info->users++; | ||
921 | return 0; | ||
922 | } | ||
923 | |||
924 | static int blkif_release(struct inode *inode, struct file *filep) | ||
925 | { | ||
926 | struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; | ||
927 | info->users--; | ||
928 | if (info->users == 0) { | ||
929 | /* Check whether we have been instructed to close. We will | ||
930 | have ignored this request initially, as the device was | ||
931 | still mounted. */ | ||
932 | struct xenbus_device *dev = info->xbdev; | ||
933 | enum xenbus_state state = xenbus_read_driver_state(dev->otherend); | ||
934 | |||
935 | if (state == XenbusStateClosing) | ||
936 | blkfront_closing(dev); | ||
937 | } | ||
938 | return 0; | ||
939 | } | ||
940 | |||
941 | static struct block_device_operations xlvbd_block_fops = | ||
942 | { | ||
943 | .owner = THIS_MODULE, | ||
944 | .open = blkif_open, | ||
945 | .release = blkif_release, | ||
946 | }; | ||
947 | |||
948 | |||
949 | static struct xenbus_device_id blkfront_ids[] = { | ||
950 | { "vbd" }, | ||
951 | { "" } | ||
952 | }; | ||
953 | |||
954 | static struct xenbus_driver blkfront = { | ||
955 | .name = "vbd", | ||
956 | .owner = THIS_MODULE, | ||
957 | .ids = blkfront_ids, | ||
958 | .probe = blkfront_probe, | ||
959 | .remove = blkfront_remove, | ||
960 | .resume = blkfront_resume, | ||
961 | .otherend_changed = backend_changed, | ||
962 | }; | ||
963 | |||
964 | static int __init xlblk_init(void) | ||
965 | { | ||
966 | if (!is_running_on_xen()) | ||
967 | return -ENODEV; | ||
968 | |||
969 | if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) { | ||
970 | printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n", | ||
971 | XENVBD_MAJOR, DEV_NAME); | ||
972 | return -ENODEV; | ||
973 | } | ||
974 | |||
975 | return xenbus_register_frontend(&blkfront); | ||
976 | } | ||
977 | module_init(xlblk_init); | ||
978 | |||
979 | |||
980 | static void xlblk_exit(void) | ||
981 | { | ||
982 | xenbus_unregister_driver(&blkfront); | ||
983 | } | ||
984 | module_exit(xlblk_exit); | ||
985 | |||
986 | MODULE_DESCRIPTION("Xen virtual block device frontend"); | ||
987 | MODULE_LICENSE("GPL"); | ||
988 | MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR); | ||
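The block ring above, the console ring below, and the network rings at the end
of this patch all rely on the same idiom: free-running producer/consumer
indices over a power-of-two buffer, masked only at access time. A minimal
self-contained model of the idiom (plain userspace C; names invented for
illustration, not the kernel's ring macros, and without the memory barriers a
real cross-domain ring needs):

	#include <stdio.h>

	#define RING_SIZE 8			/* must be a power of two */
	#define MASK(i)   ((i) & (RING_SIZE - 1))

	static int ring[RING_SIZE];
	static unsigned int prod, cons;		/* free-running; wraparound is harmless */

	static int ring_put(int v)
	{
		if (prod - cons == RING_SIZE)	/* full: unsigned arithmetic survives wrap */
			return -1;
		ring[MASK(prod++)] = v;
		return 0;
	}

	static int ring_get(int *v)
	{
		if (prod == cons)		/* empty */
			return -1;
		*v = ring[MASK(cons++)];
		return 0;
	}

	int main(void)
	{
		int i, v;

		for (i = 0; i < 10; i++)	/* the last two puts fail: ring is full */
			ring_put(i);
		while (ring_get(&v) == 0)
			printf("%d\n", v);	/* prints 0..7 */
		return 0;
	}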
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 97bd71bc3aea..9e8f21410d2d 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig | |||
@@ -604,6 +604,14 @@ config HVC_BEAT | |||
604 | help | 604 | help |
605 | Toshiba's Cell Reference Set Beat Console device driver | 605 | Toshiba's Cell Reference Set Beat Console device driver |
606 | 606 | ||
607 | config HVC_XEN | ||
608 | bool "Xen Hypervisor Console support" | ||
609 | depends on XEN | ||
610 | select HVC_DRIVER | ||
611 | default y | ||
612 | help | ||
613 | Xen virtual console device driver | ||
614 | |||
607 | config HVCS | 615 | config HVCS |
608 | tristate "IBM Hypervisor Virtual Console Server support" | 616 | tristate "IBM Hypervisor Virtual Console Server support" |
609 | depends on PPC_PSERIES | 617 | depends on PPC_PSERIES |
diff --git a/drivers/char/Makefile b/drivers/char/Makefile index f2996a95eb07..8852b8d643cf 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile | |||
@@ -48,6 +48,7 @@ obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o | |||
48 | obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o | 48 | obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o |
49 | obj-$(CONFIG_HVC_BEAT) += hvc_beat.o | 49 | obj-$(CONFIG_HVC_BEAT) += hvc_beat.o |
50 | obj-$(CONFIG_HVC_DRIVER) += hvc_console.o | 50 | obj-$(CONFIG_HVC_DRIVER) += hvc_console.o |
51 | obj-$(CONFIG_HVC_XEN) += hvc_xen.o | ||
51 | obj-$(CONFIG_RAW_DRIVER) += raw.o | 52 | obj-$(CONFIG_RAW_DRIVER) += raw.o |
52 | obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o | 53 | obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o |
53 | obj-$(CONFIG_MSPEC) += mspec.o | 54 | obj-$(CONFIG_MSPEC) += mspec.o |
diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c new file mode 100644 index 000000000000..dd68f8541c2d --- /dev/null +++ b/drivers/char/hvc_xen.c | |||
@@ -0,0 +1,159 @@ | |||
1 | /* | ||
2 | * xen console driver interface to hvc_console.c | ||
3 | * | ||
4 | * (c) 2007 Gerd Hoffmann <kraxel@suse.de> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
19 | */ | ||
20 | |||
21 | #include <linux/console.h> | ||
22 | #include <linux/delay.h> | ||
23 | #include <linux/err.h> | ||
24 | #include <linux/init.h> | ||
25 | #include <linux/types.h> | ||
26 | |||
27 | #include <asm/xen/hypervisor.h> | ||
28 | #include <xen/page.h> | ||
29 | #include <xen/events.h> | ||
30 | #include <xen/interface/io/console.h> | ||
31 | #include <xen/hvc-console.h> | ||
32 | |||
33 | #include "hvc_console.h" | ||
34 | |||
35 | #define HVC_COOKIE 0x58656e /* "Xen" in hex */ | ||
36 | |||
37 | static struct hvc_struct *hvc; | ||
38 | static int xencons_irq; | ||
39 | |||
40 | /* ------------------------------------------------------------------ */ | ||
41 | |||
42 | static inline struct xencons_interface *xencons_interface(void) | ||
43 | { | ||
44 | return mfn_to_virt(xen_start_info->console.domU.mfn); | ||
45 | } | ||
46 | |||
47 | static inline void notify_daemon(void) | ||
48 | { | ||
49 | /* Use evtchn: this is called early, before irq is set up. */ | ||
50 | notify_remote_via_evtchn(xen_start_info->console.domU.evtchn); | ||
51 | } | ||
52 | |||
53 | static int write_console(uint32_t vtermno, const char *data, int len) | ||
54 | { | ||
55 | struct xencons_interface *intf = xencons_interface(); | ||
56 | XENCONS_RING_IDX cons, prod; | ||
57 | int sent = 0; | ||
58 | |||
59 | cons = intf->out_cons; | ||
60 | prod = intf->out_prod; | ||
61 | mb(); /* update queue values before going on */ | ||
62 | BUG_ON((prod - cons) > sizeof(intf->out)); | ||
63 | |||
64 | while ((sent < len) && ((prod - cons) < sizeof(intf->out))) | ||
65 | intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++]; | ||
66 | |||
67 | wmb(); /* write ring before updating pointer */ | ||
68 | intf->out_prod = prod; | ||
69 | |||
70 | notify_daemon(); | ||
71 | return sent; | ||
72 | } | ||
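/* The in/out indices are free-running: they are never reduced modulo the
 * ring size, so 'prod - cons' counts bytes in flight correctly even across
 * wraparound, and MASK_XENCONS_IDX() confines the actual array access to
 * the power-of-two buffer. */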
73 | |||
74 | static int read_console(uint32_t vtermno, char *buf, int len) | ||
75 | { | ||
76 | struct xencons_interface *intf = xencons_interface(); | ||
77 | XENCONS_RING_IDX cons, prod; | ||
78 | int recv = 0; | ||
79 | |||
80 | cons = intf->in_cons; | ||
81 | prod = intf->in_prod; | ||
82 | mb(); /* get pointers before reading ring */ | ||
83 | BUG_ON((prod - cons) > sizeof(intf->in)); | ||
84 | |||
85 | while (cons != prod && recv < len) | ||
86 | buf[recv++] = intf->in[MASK_XENCONS_IDX(cons++, intf->in)]; | ||
87 | |||
88 | mb(); /* read ring before consuming */ | ||
89 | intf->in_cons = cons; | ||
90 | |||
91 | notify_daemon(); | ||
92 | return recv; | ||
93 | } | ||
94 | |||
95 | static struct hv_ops hvc_ops = { | ||
96 | .get_chars = read_console, | ||
97 | .put_chars = write_console, | ||
98 | }; | ||
99 | |||
100 | static int __init xen_init(void) | ||
101 | { | ||
102 | struct hvc_struct *hp; | ||
103 | |||
104 | if (!is_running_on_xen()) | ||
105 | return 0; | ||
106 | |||
107 | xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn); | ||
108 | if (xencons_irq < 0) | ||
109 | xencons_irq = 0 /* NO_IRQ */; | ||
110 | hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256); | ||
111 | if (IS_ERR(hp)) | ||
112 | return PTR_ERR(hp); | ||
113 | |||
114 | hvc = hp; | ||
115 | return 0; | ||
116 | } | ||
117 | |||
118 | static void __exit xen_fini(void) | ||
119 | { | ||
120 | if (hvc) | ||
121 | hvc_remove(hvc); | ||
122 | } | ||
123 | |||
124 | static int xen_cons_init(void) | ||
125 | { | ||
126 | if (!is_running_on_xen()) | ||
127 | return 0; | ||
128 | |||
129 | hvc_instantiate(HVC_COOKIE, 0, &hvc_ops); | ||
130 | return 0; | ||
131 | } | ||
132 | |||
133 | module_init(xen_init); | ||
134 | module_exit(xen_fini); | ||
135 | console_initcall(xen_cons_init); | ||
136 | |||
137 | static void xenboot_write_console(struct console *console, const char *string, | ||
138 | unsigned len) | ||
139 | { | ||
140 | unsigned int linelen, off = 0; | ||
141 | const char *pos; | ||
142 | |||
143 | while (off < len && NULL != (pos = strchr(string+off, '\n'))) { | ||
144 | linelen = pos - (string + off); | ||
145 | if (off + linelen > len) | ||
146 | break; | ||
147 | write_console(0, string+off, linelen); | ||
148 | write_console(0, "\r\n", 2); | ||
149 | off += linelen + 1; | ||
150 | } | ||
151 | if (off < len) | ||
152 | write_console(0, string+off, len-off); | ||
153 | } | ||
154 | |||
155 | struct console xenboot_console = { | ||
156 | .name = "xenboot", | ||
157 | .write = xenboot_write_console, | ||
158 | .flags = CON_PRINTBUFFER | CON_BOOT, | ||
159 | }; | ||
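Together with the early_printk hook earlier in this patch, the guest ends up
with two console paths: hvc0 as the regular console, and xenboot for boot-time
output. Booting with the (illustrative) command line

	earlyprintk=xen

routes early kernel messages through write_console() before the event-channel
IRQ and the hvc infrastructure are set up; setup_early_printk() matches the
"xen" prefix shown in the early_printk.c hunk above.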
diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c index dbb22403979f..3d90fc002097 100644 --- a/drivers/macintosh/therm_pm72.c +++ b/drivers/macintosh/therm_pm72.c | |||
@@ -1770,7 +1770,8 @@ static int call_critical_overtemp(void) | |||
1770 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | 1770 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", |
1771 | NULL }; | 1771 | NULL }; |
1772 | 1772 | ||
1773 | return call_usermodehelper(critical_overtemp_path, argv, envp, 0); | 1773 | return call_usermodehelper(critical_overtemp_path, |
1774 | argv, envp, UMH_WAIT_EXEC); | ||
1774 | } | 1775 | } |
1775 | 1776 | ||
1776 | 1777 | ||
diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c index e18d265d5d33..516d943227e2 100644 --- a/drivers/macintosh/windfarm_core.c +++ b/drivers/macintosh/windfarm_core.c | |||
@@ -80,7 +80,8 @@ int wf_critical_overtemp(void) | |||
80 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | 80 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", |
81 | NULL }; | 81 | NULL }; |
82 | 82 | ||
83 | return call_usermodehelper(critical_overtemp_path, argv, envp, 0); | 83 | return call_usermodehelper(critical_overtemp_path, |
84 | argv, envp, UMH_WAIT_EXEC); | ||
84 | } | 85 | } |
85 | EXPORT_SYMBOL_GPL(wf_critical_overtemp); | 86 | EXPORT_SYMBOL_GPL(wf_critical_overtemp); |
86 | 87 | ||
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 43d03178064d..5fb659f8b20e 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig | |||
@@ -2486,6 +2486,18 @@ source "drivers/atm/Kconfig" | |||
2486 | 2486 | ||
2487 | source "drivers/s390/net/Kconfig" | 2487 | source "drivers/s390/net/Kconfig" |
2488 | 2488 | ||
2489 | config XEN_NETDEV_FRONTEND | ||
2490 | tristate "Xen network device frontend driver" | ||
2491 | depends on XEN | ||
2492 | default y | ||
2493 | help | ||
2494 | The network device frontend driver allows the kernel to | ||
2495 | access network devices exported by a virtual | ||
2496 | machine containing a physical network device driver. The | ||
2497 | frontend driver is intended for unprivileged guest domains; | ||
2498 | if you are compiling a kernel for a Xen guest, you almost | ||
2499 | certainly want to enable this. | ||
2500 | |||
2489 | config ISERIES_VETH | 2501 | config ISERIES_VETH |
2490 | tristate "iSeries Virtual Ethernet driver support" | 2502 | tristate "iSeries Virtual Ethernet driver support" |
2491 | depends on PPC_ISERIES | 2503 | depends on PPC_ISERIES |
diff --git a/drivers/net/Makefile b/drivers/net/Makefile index eb4167622a6a..0e286ab8855a 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile | |||
@@ -127,6 +127,8 @@ obj-$(CONFIG_PPPOL2TP) += pppox.o pppol2tp.o | |||
127 | obj-$(CONFIG_SLIP) += slip.o | 127 | obj-$(CONFIG_SLIP) += slip.o |
128 | obj-$(CONFIG_SLHC) += slhc.o | 128 | obj-$(CONFIG_SLHC) += slhc.o |
129 | 129 | ||
130 | obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o | ||
131 | |||
130 | obj-$(CONFIG_DUMMY) += dummy.o | 132 | obj-$(CONFIG_DUMMY) += dummy.o |
131 | obj-$(CONFIG_IFB) += ifb.o | 133 | obj-$(CONFIG_IFB) += ifb.o |
132 | obj-$(CONFIG_MACVLAN) += macvlan.o | 134 | obj-$(CONFIG_MACVLAN) += macvlan.o |
diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 84aa2117c0ee..355c6cf3d112 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c | |||
@@ -320,7 +320,7 @@ static int eppconfig(struct baycom_state *bc) | |||
320 | sprintf(portarg, "%ld", bc->pdev->port->base); | 320 | sprintf(portarg, "%ld", bc->pdev->port->base); |
321 | printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg); | 321 | printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg); |
322 | 322 | ||
323 | return call_usermodehelper(eppconfig_path, argv, envp, 1); | 323 | return call_usermodehelper(eppconfig_path, argv, envp, UMH_WAIT_PROC); |
324 | } | 324 | } |
325 | 325 | ||
326 | /* ---------------------------------------------------------------------- */ | 326 | /* ---------------------------------------------------------------------- */ |
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c new file mode 100644 index 000000000000..489f69c5d6ca --- /dev/null +++ b/drivers/net/xen-netfront.c | |||
@@ -0,0 +1,1863 @@ | |||
1 | /* | ||
2 | * Virtual network driver for conversing with remote driver backends. | ||
3 | * | ||
4 | * Copyright (c) 2002-2005, K A Fraser | ||
5 | * Copyright (c) 2005, XenSource Ltd | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License version 2 | ||
9 | * as published by the Free Software Foundation; or, when distributed | ||
10 | * separately from the Linux kernel or incorporated into other | ||
11 | * software packages, subject to the following license: | ||
12 | * | ||
13 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
14 | * of this source file (the "Software"), to deal in the Software without | ||
15 | * restriction, including without limitation the rights to use, copy, modify, | ||
16 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
17 | * and to permit persons to whom the Software is furnished to do so, subject to | ||
18 | * the following conditions: | ||
19 | * | ||
20 | * The above copyright notice and this permission notice shall be included in | ||
21 | * all copies or substantial portions of the Software. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
24 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
25 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
26 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
27 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
28 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
29 | * IN THE SOFTWARE. | ||
30 | */ | ||
31 | |||
32 | #include <linux/module.h> | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/netdevice.h> | ||
35 | #include <linux/etherdevice.h> | ||
36 | #include <linux/skbuff.h> | ||
37 | #include <linux/ethtool.h> | ||
38 | #include <linux/if_ether.h> | ||
39 | #include <linux/tcp.h> | ||
40 | #include <linux/udp.h> | ||
41 | #include <linux/moduleparam.h> | ||
42 | #include <linux/mm.h> | ||
43 | #include <net/ip.h> | ||
44 | |||
45 | #include <xen/xenbus.h> | ||
46 | #include <xen/events.h> | ||
47 | #include <xen/page.h> | ||
48 | #include <xen/grant_table.h> | ||
49 | |||
50 | #include <xen/interface/io/netif.h> | ||
51 | #include <xen/interface/memory.h> | ||
52 | #include <xen/interface/grant_table.h> | ||
53 | |||
54 | static struct ethtool_ops xennet_ethtool_ops; | ||
55 | |||
56 | struct netfront_cb { | ||
57 | struct page *page; | ||
58 | unsigned offset; | ||
59 | }; | ||
60 | |||
61 | #define NETFRONT_SKB_CB(skb) ((struct netfront_cb *)((skb)->cb)) | ||
62 | |||
63 | #define RX_COPY_THRESHOLD 256 | ||
64 | |||
65 | #define GRANT_INVALID_REF 0 | ||
66 | |||
67 | #define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE) | ||
68 | #define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE) | ||
69 | #define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256) | ||
70 | |||
71 | struct netfront_info { | ||
72 | struct list_head list; | ||
73 | struct net_device *netdev; | ||
74 | |||
75 | struct net_device_stats stats; | ||
76 | |||
77 | struct xen_netif_tx_front_ring tx; | ||
78 | struct xen_netif_rx_front_ring rx; | ||
79 | |||
80 | spinlock_t tx_lock; | ||
81 | spinlock_t rx_lock; | ||
82 | |||
83 | unsigned int evtchn; | ||
84 | |||
85 | /* Receive-ring batched refills. */ | ||
86 | #define RX_MIN_TARGET 8 | ||
87 | #define RX_DFL_MIN_TARGET 64 | ||
88 | #define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256) | ||
89 | unsigned rx_min_target, rx_max_target, rx_target; | ||
90 | struct sk_buff_head rx_batch; | ||
91 | |||
92 | struct timer_list rx_refill_timer; | ||
93 | |||
94 | /* | ||
95 | * {tx,rx}_skbs store outstanding skbuffs. Free tx_skb entries | ||
96 | * are linked from tx_skb_freelist through skb_entry.link. | ||
97 | * | ||
98 | * NB. Freelist index entries are always going to be less than | ||
99 | * PAGE_OFFSET, whereas pointers to skbs will always be equal or | ||
100 | * greater than PAGE_OFFSET: we use this property to distinguish | ||
101 | * them. | ||
102 | */ | ||
103 | union skb_entry { | ||
104 | struct sk_buff *skb; | ||
105 | unsigned link; | ||
106 | } tx_skbs[NET_TX_RING_SIZE]; | ||
107 | grant_ref_t gref_tx_head; | ||
108 | grant_ref_t grant_tx_ref[NET_TX_RING_SIZE]; | ||
109 | unsigned tx_skb_freelist; | ||
110 | |||
111 | struct sk_buff *rx_skbs[NET_RX_RING_SIZE]; | ||
112 | grant_ref_t gref_rx_head; | ||
113 | grant_ref_t grant_rx_ref[NET_RX_RING_SIZE]; | ||
114 | |||
115 | struct xenbus_device *xbdev; | ||
116 | int tx_ring_ref; | ||
117 | int rx_ring_ref; | ||
118 | |||
119 | unsigned long rx_pfn_array[NET_RX_RING_SIZE]; | ||
120 | struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1]; | ||
121 | struct mmu_update rx_mmu[NET_RX_RING_SIZE]; | ||
122 | }; | ||
123 | |||
124 | struct netfront_rx_info { | ||
125 | struct xen_netif_rx_response rx; | ||
126 | struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; | ||
127 | }; | ||
128 | |||
129 | /* | ||
130 | * Access macros for acquiring freeing slots in tx_skbs[]. | ||
131 | */ | ||
132 | |||
133 | static void add_id_to_freelist(unsigned *head, union skb_entry *list, | ||
134 | unsigned short id) | ||
135 | { | ||
136 | list[id].link = *head; | ||
137 | *head = id; | ||
138 | } | ||
139 | |||
140 | static unsigned short get_id_from_freelist(unsigned *head, | ||
141 | union skb_entry *list) | ||
142 | { | ||
143 | unsigned int id = *head; | ||
144 | *head = list[id].link; | ||
145 | return id; | ||
146 | } | ||
147 | |||
148 | static int xennet_rxidx(RING_IDX idx) | ||
149 | { | ||
150 | return idx & (NET_RX_RING_SIZE - 1); | ||
151 | } | ||
152 | |||
153 | static struct sk_buff *xennet_get_rx_skb(struct netfront_info *np, | ||
154 | RING_IDX ri) | ||
155 | { | ||
156 | int i = xennet_rxidx(ri); | ||
157 | struct sk_buff *skb = np->rx_skbs[i]; | ||
158 | np->rx_skbs[i] = NULL; | ||
159 | return skb; | ||
160 | } | ||
161 | |||
162 | static grant_ref_t xennet_get_rx_ref(struct netfront_info *np, | ||
163 | RING_IDX ri) | ||
164 | { | ||
165 | int i = xennet_rxidx(ri); | ||
166 | grant_ref_t ref = np->grant_rx_ref[i]; | ||
167 | np->grant_rx_ref[i] = GRANT_INVALID_REF; | ||
168 | return ref; | ||
169 | } | ||
170 | |||
171 | #ifdef CONFIG_SYSFS | ||
172 | static int xennet_sysfs_addif(struct net_device *netdev); | ||
173 | static void xennet_sysfs_delif(struct net_device *netdev); | ||
174 | #else /* !CONFIG_SYSFS */ | ||
175 | #define xennet_sysfs_addif(dev) (0) | ||
176 | #define xennet_sysfs_delif(dev) do { } while (0) | ||
177 | #endif | ||
178 | |||
179 | static int xennet_can_sg(struct net_device *dev) | ||
180 | { | ||
181 | return dev->features & NETIF_F_SG; | ||
182 | } | ||
183 | |||
184 | |||
185 | static void rx_refill_timeout(unsigned long data) | ||
186 | { | ||
187 | struct net_device *dev = (struct net_device *)data; | ||
188 | netif_rx_schedule(dev); | ||
189 | } | ||
190 | |||
191 | static int netfront_tx_slot_available(struct netfront_info *np) | ||
192 | { | ||
193 | return ((np->tx.req_prod_pvt - np->tx.rsp_cons) < | ||
194 | (TX_MAX_TARGET - MAX_SKB_FRAGS - 2)); | ||
195 | } | ||
196 | |||
197 | static void xennet_maybe_wake_tx(struct net_device *dev) | ||
198 | { | ||
199 | struct netfront_info *np = netdev_priv(dev); | ||
200 | |||
201 | if (unlikely(netif_queue_stopped(dev)) && | ||
202 | netfront_tx_slot_available(np) && | ||
203 | likely(netif_running(dev))) | ||
204 | netif_wake_queue(dev); | ||
205 | } | ||
206 | |||
207 | static void xennet_alloc_rx_buffers(struct net_device *dev) | ||
208 | { | ||
209 | unsigned short id; | ||
210 | struct netfront_info *np = netdev_priv(dev); | ||
211 | struct sk_buff *skb; | ||
212 | struct page *page; | ||
213 | int i, batch_target, notify; | ||
214 | RING_IDX req_prod = np->rx.req_prod_pvt; | ||
215 | struct xen_memory_reservation reservation; | ||
216 | grant_ref_t ref; | ||
217 | unsigned long pfn; | ||
218 | void *vaddr; | ||
219 | int nr_flips; | ||
220 | struct xen_netif_rx_request *req; | ||
221 | |||
222 | if (unlikely(!netif_carrier_ok(dev))) | ||
223 | return; | ||
224 | |||
225 | /* | ||
226 | * Allocate skbuffs greedily, even though we batch updates to the | ||
227 | * receive ring. This creates a less bursty demand on the memory | ||
228 | * allocator, so should reduce the chance of failed allocation requests | ||
229 | * both for ourself and for other kernel subsystems. | ||
230 | */ | ||
231 | batch_target = np->rx_target - (req_prod - np->rx.rsp_cons); | ||
232 | for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) { | ||
233 | skb = __netdev_alloc_skb(dev, RX_COPY_THRESHOLD, | ||
234 | GFP_ATOMIC | __GFP_NOWARN); | ||
235 | if (unlikely(!skb)) | ||
236 | goto no_skb; | ||
237 | |||
238 | page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); | ||
239 | if (!page) { | ||
240 | kfree_skb(skb); | ||
241 | no_skb: | ||
242 | /* Any skbuffs queued for refill? Force them out. */ | ||
243 | if (i != 0) | ||
244 | goto refill; | ||
245 | /* Could not allocate any skbuffs. Try again later. */ | ||
246 | mod_timer(&np->rx_refill_timer, | ||
247 | jiffies + (HZ/10)); | ||
248 | break; | ||
249 | } | ||
250 | |||
251 | skb_shinfo(skb)->frags[0].page = page; | ||
252 | skb_shinfo(skb)->nr_frags = 1; | ||
253 | __skb_queue_tail(&np->rx_batch, skb); | ||
254 | } | ||
255 | |||
256 | /* Is the batch large enough to be worthwhile? */ | ||
257 | if (i < (np->rx_target/2)) { | ||
258 | if (req_prod > np->rx.sring->req_prod) | ||
259 | goto push; | ||
260 | return; | ||
261 | } | ||
262 | |||
263 | /* Adjust our fill target if we risked running out of buffers. */ | ||
264 | if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) && | ||
265 | ((np->rx_target *= 2) > np->rx_max_target)) | ||
266 | np->rx_target = np->rx_max_target; | ||
267 | |||
268 | refill: | ||
269 | for (nr_flips = i = 0; ; i++) { | ||
270 | skb = __skb_dequeue(&np->rx_batch); | ||
271 | if (skb == NULL) | ||
272 | break; | ||
273 | |||
274 | skb->dev = dev; | ||
275 | |||
276 | id = xennet_rxidx(req_prod + i); | ||
277 | |||
278 | BUG_ON(np->rx_skbs[id]); | ||
279 | np->rx_skbs[id] = skb; | ||
280 | |||
281 | ref = gnttab_claim_grant_reference(&np->gref_rx_head); | ||
282 | BUG_ON((signed short)ref < 0); | ||
283 | np->grant_rx_ref[id] = ref; | ||
284 | |||
285 | pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page); | ||
286 | vaddr = page_address(skb_shinfo(skb)->frags[0].page); | ||
287 | |||
288 | req = RING_GET_REQUEST(&np->rx, req_prod + i); | ||
289 | gnttab_grant_foreign_access_ref(ref, | ||
290 | np->xbdev->otherend_id, | ||
291 | pfn_to_mfn(pfn), | ||
292 | 0); | ||
293 | |||
294 | req->id = id; | ||
295 | req->gref = ref; | ||
296 | } | ||
297 | |||
298 | if (nr_flips != 0) { | ||
299 | reservation.extent_start = np->rx_pfn_array; | ||
300 | reservation.nr_extents = nr_flips; | ||
301 | reservation.extent_order = 0; | ||
302 | reservation.address_bits = 0; | ||
303 | reservation.domid = DOMID_SELF; | ||
304 | |||
305 | if (!xen_feature(XENFEAT_auto_translated_physmap)) { | ||
306 | /* After all PTEs have been zapped, flush the TLB. */ | ||
307 | np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] = | ||
308 | UVMF_TLB_FLUSH|UVMF_ALL; | ||
309 | |||
310 | /* Give away a batch of pages. */ | ||
311 | np->rx_mcl[i].op = __HYPERVISOR_memory_op; | ||
312 | np->rx_mcl[i].args[0] = XENMEM_decrease_reservation; | ||
313 | np->rx_mcl[i].args[1] = (unsigned long)&reservation; | ||
314 | |||
315 | /* Zap PTEs and give away pages in one big | ||
316 | * multicall. */ | ||
317 | (void)HYPERVISOR_multicall(np->rx_mcl, i+1); | ||
318 | |||
319 | /* Check return status of HYPERVISOR_memory_op(). */ | ||
320 | if (unlikely(np->rx_mcl[i].result != i)) | ||
321 | panic("Unable to reduce memory reservation\n"); | ||
322 | } else { | ||
323 | if (HYPERVISOR_memory_op(XENMEM_decrease_reservation, | ||
324 | &reservation) != i) | ||
325 | panic("Unable to reduce memory reservation\n"); | ||
326 | } | ||
327 | } else { | ||
328 | wmb(); /* barrier so backend sees requests */ | ||
329 | } | ||
330 | |||
331 | /* Above is a suitable barrier to ensure backend will see requests. */ | ||
332 | np->rx.req_prod_pvt = req_prod + i; | ||
333 | push: | ||
334 | RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify); | ||
335 | if (notify) | ||
336 | notify_remote_via_irq(np->netdev->irq); | ||
337 | } | ||
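
The fill-target adjustment in the middle of this function doubles rx_target whenever fewer than a quarter of the target is still outstanding at the backend, clamped at rx_max_target (exponential increase; xennet_poll later performs the matching linear decrease). The same arithmetic stand-alone, with assumed values:

	#include <stdio.h>

	int main(void)
	{
		unsigned rx_target = 64, rx_max_target = 256;	/* assumed */
		unsigned in_flight = 10; /* buffers posted, not yet consumed */

		/* Ran low: double the target, clamped at the maximum. */
		if (in_flight < rx_target / 4 &&
		    (rx_target *= 2) > rx_max_target)
			rx_target = rx_max_target;

		printf("new target: %u\n", rx_target);	/* prints 128 */
		return 0;
	}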
338 | |||
339 | static int xennet_open(struct net_device *dev) | ||
340 | { | ||
341 | struct netfront_info *np = netdev_priv(dev); | ||
342 | |||
343 | memset(&np->stats, 0, sizeof(np->stats)); | ||
344 | |||
345 | spin_lock_bh(&np->rx_lock); | ||
346 | if (netif_carrier_ok(dev)) { | ||
347 | xennet_alloc_rx_buffers(dev); | ||
348 | np->rx.sring->rsp_event = np->rx.rsp_cons + 1; | ||
349 | if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) | ||
350 | netif_rx_schedule(dev); | ||
351 | } | ||
352 | spin_unlock_bh(&np->rx_lock); | ||
353 | |||
354 | xennet_maybe_wake_tx(dev); | ||
355 | |||
356 | return 0; | ||
357 | } | ||
358 | |||
359 | static void xennet_tx_buf_gc(struct net_device *dev) | ||
360 | { | ||
361 | RING_IDX cons, prod; | ||
362 | unsigned short id; | ||
363 | struct netfront_info *np = netdev_priv(dev); | ||
364 | struct sk_buff *skb; | ||
365 | |||
366 | BUG_ON(!netif_carrier_ok(dev)); | ||
367 | |||
368 | do { | ||
369 | prod = np->tx.sring->rsp_prod; | ||
370 | rmb(); /* Ensure we see responses up to 'rp'. */ | ||
371 | |||
372 | for (cons = np->tx.rsp_cons; cons != prod; cons++) { | ||
373 | struct xen_netif_tx_response *txrsp; | ||
374 | |||
375 | txrsp = RING_GET_RESPONSE(&np->tx, cons); | ||
376 | if (txrsp->status == NETIF_RSP_NULL) | ||
377 | continue; | ||
378 | |||
379 | id = txrsp->id; | ||
380 | skb = np->tx_skbs[id].skb; | ||
381 | if (unlikely(gnttab_query_foreign_access( | ||
382 | np->grant_tx_ref[id]) != 0)) { | ||
383 | printk(KERN_ALERT "xennet_tx_buf_gc: warning " | ||
384 | "-- grant still in use by backend " | ||
385 | "domain.\n"); | ||
386 | BUG(); | ||
387 | } | ||
388 | gnttab_end_foreign_access_ref( | ||
389 | np->grant_tx_ref[id], GNTMAP_readonly); | ||
390 | gnttab_release_grant_reference( | ||
391 | &np->gref_tx_head, np->grant_tx_ref[id]); | ||
392 | np->grant_tx_ref[id] = GRANT_INVALID_REF; | ||
393 | add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, id); | ||
394 | dev_kfree_skb_irq(skb); | ||
395 | } | ||
396 | |||
397 | np->tx.rsp_cons = prod; | ||
398 | |||
399 | /* | ||
400 | * Set a new event, then check for race with update of tx_cons. | ||
401 | * Note that it is essential to schedule a callback, no matter | ||
402 | * how few buffers are pending. Even if there is space in the | ||
403 | * transmit ring, higher layers may be blocked because too much | ||
404 | * data is outstanding: in such cases notification from Xen is | ||
405 | * likely to be the only kick that we'll get. | ||
406 | */ | ||
407 | np->tx.sring->rsp_event = | ||
408 | prod + ((np->tx.sring->req_prod - prod) >> 1) + 1; | ||
409 | mb(); /* update shared area */ | ||
410 | } while ((cons == prod) && (prod != np->tx.sring->rsp_prod)); | ||
411 | |||
412 | xennet_maybe_wake_tx(dev); | ||
413 | } | ||
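
The rsp_event assignment above asks the backend to deliver the next event roughly halfway through the requests still outstanding, so under load the frontend takes about one interrupt per half ring rather than one per packet. Worked with assumed ring indices:

	#include <stdio.h>

	int main(void)
	{
		unsigned prod = 100;	 /* responses consumed so far */
		unsigned req_prod = 140; /* requests queued so far    */
		unsigned rsp_event = prod + ((req_prod - prod) >> 1) + 1;

		printf("notify after response %u\n", rsp_event); /* 121 */
		return 0;
	}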
414 | |||
415 | static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev, | ||
416 | struct xen_netif_tx_request *tx) | ||
417 | { | ||
418 | struct netfront_info *np = netdev_priv(dev); | ||
419 | char *data = skb->data; | ||
420 | unsigned long mfn; | ||
421 | RING_IDX prod = np->tx.req_prod_pvt; | ||
422 | int frags = skb_shinfo(skb)->nr_frags; | ||
423 | unsigned int offset = offset_in_page(data); | ||
424 | unsigned int len = skb_headlen(skb); | ||
425 | unsigned int id; | ||
426 | grant_ref_t ref; | ||
427 | int i; | ||
428 | |||
429 | /* While the header overlaps a page boundary (including being | ||
430 | larger than a page), split it into page-sized chunks. */ | ||
431 | while (len > PAGE_SIZE - offset) { | ||
432 | tx->size = PAGE_SIZE - offset; | ||
433 | tx->flags |= NETTXF_more_data; | ||
434 | len -= tx->size; | ||
435 | data += tx->size; | ||
436 | offset = 0; | ||
437 | |||
438 | id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs); | ||
439 | np->tx_skbs[id].skb = skb_get(skb); | ||
440 | tx = RING_GET_REQUEST(&np->tx, prod++); | ||
441 | tx->id = id; | ||
442 | ref = gnttab_claim_grant_reference(&np->gref_tx_head); | ||
443 | BUG_ON((signed short)ref < 0); | ||
444 | |||
445 | mfn = virt_to_mfn(data); | ||
446 | gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id, | ||
447 | mfn, GNTMAP_readonly); | ||
448 | |||
449 | tx->gref = np->grant_tx_ref[id] = ref; | ||
450 | tx->offset = offset; | ||
451 | tx->size = len; | ||
452 | tx->flags = 0; | ||
453 | } | ||
454 | |||
455 | /* Grant backend access to each skb fragment page. */ | ||
456 | for (i = 0; i < frags; i++) { | ||
457 | skb_frag_t *frag = skb_shinfo(skb)->frags + i; | ||
458 | |||
459 | tx->flags |= NETTXF_more_data; | ||
460 | |||
461 | id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs); | ||
462 | np->tx_skbs[id].skb = skb_get(skb); | ||
463 | tx = RING_GET_REQUEST(&np->tx, prod++); | ||
464 | tx->id = id; | ||
465 | ref = gnttab_claim_grant_reference(&np->gref_tx_head); | ||
466 | BUG_ON((signed short)ref < 0); | ||
467 | |||
468 | mfn = pfn_to_mfn(page_to_pfn(frag->page)); | ||
469 | gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id, | ||
470 | mfn, GNTMAP_readonly); | ||
471 | |||
472 | tx->gref = np->grant_tx_ref[id] = ref; | ||
473 | tx->offset = frag->page_offset; | ||
474 | tx->size = frag->size; | ||
475 | tx->flags = 0; | ||
476 | } | ||
477 | |||
478 | np->tx.req_prod_pvt = prod; | ||
479 | } | ||
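
The while loop at the top of this function carves a linear header that crosses page boundaries into per-page grant chunks. The same arithmetic run stand-alone with assumed numbers (a 5000-byte header beginning at offset 3000 of a 4096-byte page):

	#include <stdio.h>

	#define PAGE_SIZE 4096u

	int main(void)
	{
		unsigned len = 5000, offset = 3000;	/* assumed */

		while (len > PAGE_SIZE - offset) {
			printf("chunk: %u bytes at offset %u\n",
			       PAGE_SIZE - offset, offset); /* 1096 @ 3000 */
			len -= PAGE_SIZE - offset;
			offset = 0;
		}
		printf("final slot: %u bytes\n", len);	/* 3904 */
		return 0;
	}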
480 | |||
481 | static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev) | ||
482 | { | ||
483 | unsigned short id; | ||
484 | struct netfront_info *np = netdev_priv(dev); | ||
485 | struct xen_netif_tx_request *tx; | ||
486 | struct xen_netif_extra_info *extra; | ||
487 | char *data = skb->data; | ||
488 | RING_IDX i; | ||
489 | grant_ref_t ref; | ||
490 | unsigned long mfn; | ||
491 | int notify; | ||
492 | int frags = skb_shinfo(skb)->nr_frags; | ||
493 | unsigned int offset = offset_in_page(data); | ||
494 | unsigned int len = skb_headlen(skb); | ||
495 | |||
496 | frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE; | ||
497 | if (unlikely(frags > MAX_SKB_FRAGS + 1)) { | ||
498 | printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n", | ||
499 | frags); | ||
500 | dump_stack(); | ||
501 | goto drop; | ||
502 | } | ||
503 | |||
504 | spin_lock_irq(&np->tx_lock); | ||
505 | |||
506 | if (unlikely(!netif_carrier_ok(dev) || | ||
507 | (frags > 1 && !xennet_can_sg(dev)) || | ||
508 | netif_needs_gso(dev, skb))) { | ||
509 | spin_unlock_irq(&np->tx_lock); | ||
510 | goto drop; | ||
511 | } | ||
512 | |||
513 | i = np->tx.req_prod_pvt; | ||
514 | |||
515 | id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs); | ||
516 | np->tx_skbs[id].skb = skb; | ||
517 | |||
518 | tx = RING_GET_REQUEST(&np->tx, i); | ||
519 | |||
520 | tx->id = id; | ||
521 | ref = gnttab_claim_grant_reference(&np->gref_tx_head); | ||
522 | BUG_ON((signed short)ref < 0); | ||
523 | mfn = virt_to_mfn(data); | ||
524 | gnttab_grant_foreign_access_ref( | ||
525 | ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly); | ||
526 | tx->gref = np->grant_tx_ref[id] = ref; | ||
527 | tx->offset = offset; | ||
528 | tx->size = len; | ||
529 | extra = NULL; | ||
530 | |||
531 | tx->flags = 0; | ||
532 | if (skb->ip_summed == CHECKSUM_PARTIAL) | ||
533 | /* local packet? */ | ||
534 | tx->flags |= NETTXF_csum_blank | NETTXF_data_validated; | ||
535 | else if (skb->ip_summed == CHECKSUM_UNNECESSARY) | ||
536 | /* remote but checksummed. */ | ||
537 | tx->flags |= NETTXF_data_validated; | ||
538 | |||
539 | if (skb_shinfo(skb)->gso_size) { | ||
540 | struct xen_netif_extra_info *gso; | ||
541 | |||
542 | gso = (struct xen_netif_extra_info *) | ||
543 | RING_GET_REQUEST(&np->tx, ++i); | ||
544 | |||
545 | if (extra) | ||
546 | extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE; | ||
547 | else | ||
548 | tx->flags |= NETTXF_extra_info; | ||
549 | |||
550 | gso->u.gso.size = skb_shinfo(skb)->gso_size; | ||
551 | gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; | ||
552 | gso->u.gso.pad = 0; | ||
553 | gso->u.gso.features = 0; | ||
554 | |||
555 | gso->type = XEN_NETIF_EXTRA_TYPE_GSO; | ||
556 | gso->flags = 0; | ||
557 | extra = gso; | ||
558 | } | ||
559 | |||
560 | np->tx.req_prod_pvt = i + 1; | ||
561 | |||
562 | xennet_make_frags(skb, dev, tx); | ||
563 | tx->size = skb->len; | ||
564 | |||
565 | RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify); | ||
566 | if (notify) | ||
567 | notify_remote_via_irq(np->netdev->irq); | ||
568 | |||
569 | xennet_tx_buf_gc(dev); | ||
570 | |||
571 | if (!netfront_tx_slot_available(np)) | ||
572 | netif_stop_queue(dev); | ||
573 | |||
574 | spin_unlock_irq(&np->tx_lock); | ||
575 | |||
576 | np->stats.tx_bytes += skb->len; | ||
577 | np->stats.tx_packets++; | ||
578 | |||
579 | return 0; | ||
580 | |||
581 | drop: | ||
582 | np->stats.tx_dropped++; | ||
583 | dev_kfree_skb(skb); | ||
584 | return 0; | ||
585 | } | ||
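
The slot estimate at the top of xennet_start_xmit rounds the linear area up to whole pages with the usual ceiling division, then adds the page fragments, and drops the packet if the total could overrun MAX_SKB_FRAGS + 1 ring slots. With assumed values:

	#include <stdio.h>

	#define PAGE_SIZE 4096u

	int main(void)
	{
		unsigned offset = 3000, len = 5000, frags = 2;	/* assumed */

		/* ceil((3000 + 5000) / 4096) = 2 pages of linear area */
		frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
		printf("ring slots needed: %u\n", frags);	/* prints 4 */
		return 0;
	}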
586 | |||
587 | static int xennet_close(struct net_device *dev) | ||
588 | { | ||
589 | struct netfront_info *np = netdev_priv(dev); | ||
590 | netif_stop_queue(np->netdev); | ||
591 | return 0; | ||
592 | } | ||
593 | |||
594 | static struct net_device_stats *xennet_get_stats(struct net_device *dev) | ||
595 | { | ||
596 | struct netfront_info *np = netdev_priv(dev); | ||
597 | return &np->stats; | ||
598 | } | ||
599 | |||
600 | static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb, | ||
601 | grant_ref_t ref) | ||
602 | { | ||
603 | int new = xennet_rxidx(np->rx.req_prod_pvt); | ||
604 | |||
605 | BUG_ON(np->rx_skbs[new]); | ||
606 | np->rx_skbs[new] = skb; | ||
607 | np->grant_rx_ref[new] = ref; | ||
608 | RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new; | ||
609 | RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref; | ||
610 | np->rx.req_prod_pvt++; | ||
611 | } | ||
612 | |||
613 | static int xennet_get_extras(struct netfront_info *np, | ||
614 | struct xen_netif_extra_info *extras, | ||
615 | RING_IDX rp) | ||
616 | |||
617 | { | ||
618 | struct xen_netif_extra_info *extra; | ||
619 | struct device *dev = &np->netdev->dev; | ||
620 | RING_IDX cons = np->rx.rsp_cons; | ||
621 | int err = 0; | ||
622 | |||
623 | do { | ||
624 | struct sk_buff *skb; | ||
625 | grant_ref_t ref; | ||
626 | |||
627 | if (unlikely(cons + 1 == rp)) { | ||
628 | if (net_ratelimit()) | ||
629 | dev_warn(dev, "Missing extra info\n"); | ||
630 | err = -EBADR; | ||
631 | break; | ||
632 | } | ||
633 | |||
634 | extra = (struct xen_netif_extra_info *) | ||
635 | RING_GET_RESPONSE(&np->rx, ++cons); | ||
636 | |||
637 | if (unlikely(!extra->type || | ||
638 | extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) { | ||
639 | if (net_ratelimit()) | ||
640 | dev_warn(dev, "Invalid extra type: %d\n", | ||
641 | extra->type); | ||
642 | err = -EINVAL; | ||
643 | } else { | ||
644 | memcpy(&extras[extra->type - 1], extra, | ||
645 | sizeof(*extra)); | ||
646 | } | ||
647 | |||
648 | skb = xennet_get_rx_skb(np, cons); | ||
649 | ref = xennet_get_rx_ref(np, cons); | ||
650 | xennet_move_rx_slot(np, skb, ref); | ||
651 | } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE); | ||
652 | |||
653 | np->rx.rsp_cons = cons; | ||
654 | return err; | ||
655 | } | ||
656 | |||
657 | static int xennet_get_responses(struct netfront_info *np, | ||
658 | struct netfront_rx_info *rinfo, RING_IDX rp, | ||
659 | struct sk_buff_head *list) | ||
660 | { | ||
661 | struct xen_netif_rx_response *rx = &rinfo->rx; | ||
662 | struct xen_netif_extra_info *extras = rinfo->extras; | ||
663 | struct device *dev = &np->netdev->dev; | ||
664 | RING_IDX cons = np->rx.rsp_cons; | ||
665 | struct sk_buff *skb = xennet_get_rx_skb(np, cons); | ||
666 | grant_ref_t ref = xennet_get_rx_ref(np, cons); | ||
667 | int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD); | ||
668 | int frags = 1; | ||
669 | int err = 0; | ||
670 | unsigned long ret; | ||
671 | |||
672 | if (rx->flags & NETRXF_extra_info) { | ||
673 | err = xennet_get_extras(np, extras, rp); | ||
674 | cons = np->rx.rsp_cons; | ||
675 | } | ||
676 | |||
677 | for (;;) { | ||
678 | if (unlikely(rx->status < 0 || | ||
679 | rx->offset + rx->status > PAGE_SIZE)) { | ||
680 | if (net_ratelimit()) | ||
681 | dev_warn(dev, "rx->offset: %x, size: %u\n", | ||
682 | rx->offset, rx->status); | ||
683 | xennet_move_rx_slot(np, skb, ref); | ||
684 | err = -EINVAL; | ||
685 | goto next; | ||
686 | } | ||
687 | |||
688 | /* | ||
689 | * This definitely indicates a bug, either in this driver or in | ||
690 | * the backend driver. In future this should flag the bad | ||
691 | * situation to the system controller to reboot the backend. | ||
692 | */ | ||
693 | if (ref == GRANT_INVALID_REF) { | ||
694 | if (net_ratelimit()) | ||
695 | dev_warn(dev, "Bad rx response id %d.\n", | ||
696 | rx->id); | ||
697 | err = -EINVAL; | ||
698 | goto next; | ||
699 | } | ||
700 | |||
701 | ret = gnttab_end_foreign_access_ref(ref, 0); | ||
702 | BUG_ON(!ret); | ||
703 | |||
704 | gnttab_release_grant_reference(&np->gref_rx_head, ref); | ||
705 | |||
706 | __skb_queue_tail(list, skb); | ||
707 | |||
708 | next: | ||
709 | if (!(rx->flags & NETRXF_more_data)) | ||
710 | break; | ||
711 | |||
712 | if (cons + frags == rp) { | ||
713 | if (net_ratelimit()) | ||
714 | dev_warn(dev, "Need more frags\n"); | ||
715 | err = -ENOENT; | ||
716 | break; | ||
717 | } | ||
718 | |||
719 | rx = RING_GET_RESPONSE(&np->rx, cons + frags); | ||
720 | skb = xennet_get_rx_skb(np, cons + frags); | ||
721 | ref = xennet_get_rx_ref(np, cons + frags); | ||
722 | frags++; | ||
723 | } | ||
724 | |||
725 | if (unlikely(frags > max)) { | ||
726 | if (net_ratelimit()) | ||
727 | dev_warn(dev, "Too many frags\n"); | ||
728 | err = -E2BIG; | ||
729 | } | ||
730 | |||
731 | if (unlikely(err)) | ||
732 | np->rx.rsp_cons = cons + frags; | ||
733 | |||
734 | return err; | ||
735 | } | ||
736 | |||
737 | static int xennet_set_skb_gso(struct sk_buff *skb, | ||
738 | struct xen_netif_extra_info *gso) | ||
739 | { | ||
740 | if (!gso->u.gso.size) { | ||
741 | if (net_ratelimit()) | ||
742 | printk(KERN_WARNING "GSO size must not be zero.\n"); | ||
743 | return -EINVAL; | ||
744 | } | ||
745 | |||
746 | /* Currently only TCPv4 segmentation offload is supported. */ | ||
747 | if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) { | ||
748 | if (net_ratelimit()) | ||
749 | printk(KERN_WARNING "Bad GSO type %d.\n", gso->u.gso.type); | ||
750 | return -EINVAL; | ||
751 | } | ||
752 | |||
753 | skb_shinfo(skb)->gso_size = gso->u.gso.size; | ||
754 | skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; | ||
755 | |||
756 | /* Header must be checked, and gso_segs computed. */ | ||
757 | skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; | ||
758 | skb_shinfo(skb)->gso_segs = 0; | ||
759 | |||
760 | return 0; | ||
761 | } | ||
762 | |||
763 | static RING_IDX xennet_fill_frags(struct netfront_info *np, | ||
764 | struct sk_buff *skb, | ||
765 | struct sk_buff_head *list) | ||
766 | { | ||
767 | struct skb_shared_info *shinfo = skb_shinfo(skb); | ||
768 | int nr_frags = shinfo->nr_frags; | ||
769 | RING_IDX cons = np->rx.rsp_cons; | ||
770 | skb_frag_t *frag = shinfo->frags + nr_frags; | ||
771 | struct sk_buff *nskb; | ||
772 | |||
773 | while ((nskb = __skb_dequeue(list))) { | ||
774 | struct xen_netif_rx_response *rx = | ||
775 | RING_GET_RESPONSE(&np->rx, ++cons); | ||
776 | |||
777 | frag->page = skb_shinfo(nskb)->frags[0].page; | ||
778 | frag->page_offset = rx->offset; | ||
779 | frag->size = rx->status; | ||
780 | |||
781 | skb->data_len += rx->status; | ||
782 | |||
783 | skb_shinfo(nskb)->nr_frags = 0; | ||
784 | kfree_skb(nskb); | ||
785 | |||
786 | frag++; | ||
787 | nr_frags++; | ||
788 | } | ||
789 | |||
790 | shinfo->nr_frags = nr_frags; | ||
791 | return cons; | ||
792 | } | ||
793 | |||
794 | static int skb_checksum_setup(struct sk_buff *skb) | ||
795 | { | ||
796 | struct iphdr *iph; | ||
797 | unsigned char *th; | ||
798 | int err = -EPROTO; | ||
799 | |||
800 | if (skb->protocol != htons(ETH_P_IP)) | ||
801 | goto out; | ||
802 | |||
803 | iph = (void *)skb->data; | ||
804 | th = skb->data + 4 * iph->ihl; | ||
805 | if (th >= skb_tail_pointer(skb)) | ||
806 | goto out; | ||
807 | |||
808 | skb->csum_start = th - skb->head; | ||
809 | switch (iph->protocol) { | ||
810 | case IPPROTO_TCP: | ||
811 | skb->csum_offset = offsetof(struct tcphdr, check); | ||
812 | break; | ||
813 | case IPPROTO_UDP: | ||
814 | skb->csum_offset = offsetof(struct udphdr, check); | ||
815 | break; | ||
816 | default: | ||
817 | if (net_ratelimit()) | ||
818 | printk(KERN_ERR "Attempting to checksum a non-" | ||
819 | "TCP/UDP packet, dropping a protocol" | ||
820 | " %d packet\n", iph->protocol); | ||
821 | goto out; | ||
822 | } | ||
823 | |||
824 | if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb)) | ||
825 | goto out; | ||
826 | |||
827 | err = 0; | ||
828 | |||
829 | out: | ||
830 | return err; | ||
831 | } | ||
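
skb_checksum_setup points csum_start at the transport header and csum_offset at the checksum field within it, so whoever finishes the partial checksum knows where to write. A sketch of the resulting offsets for a plain TCPv4 packet, using a stand-in header layout rather than the kernel's struct tcphdr:

	#include <stdio.h>
	#include <stddef.h>

	/* Stand-in for the kernel's struct tcphdr (field layout per
	 * RFC 793; this sketch is not the kernel definition). */
	struct tcphdr_sketch {
		unsigned short	source, dest;
		unsigned int	seq, ack_seq;
		unsigned short	len_flags, window, check, urg_ptr;
	};

	int main(void)
	{
		unsigned ihl = 5;  /* IPv4 header length, in 32-bit words */

		/* The transport header begins 4 * ihl bytes past the IP
		 * header; csum_start additionally covers any headroom
		 * that precedes skb->data. */
		printf("transport header offset: %u\n", 4 * ihl);  /* 20 */

		/* csum_offset for TCP: where the checksum field lives. */
		printf("csum_offset: %zu\n",
		       offsetof(struct tcphdr_sketch, check));     /* 16 */
		return 0;
	}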
832 | |||
833 | static int handle_incoming_queue(struct net_device *dev, | ||
834 | struct sk_buff_head *rxq) | ||
835 | { | ||
836 | struct netfront_info *np = netdev_priv(dev); | ||
837 | int packets_dropped = 0; | ||
838 | struct sk_buff *skb; | ||
839 | |||
840 | while ((skb = __skb_dequeue(rxq)) != NULL) { | ||
841 | struct page *page = NETFRONT_SKB_CB(skb)->page; | ||
842 | void *vaddr = page_address(page); | ||
843 | unsigned offset = NETFRONT_SKB_CB(skb)->offset; | ||
844 | |||
845 | memcpy(skb->data, vaddr + offset, | ||
846 | skb_headlen(skb)); | ||
847 | |||
848 | if (page != skb_shinfo(skb)->frags[0].page) | ||
849 | __free_page(page); | ||
850 | |||
851 | /* Ethernet work: Delayed to here as it peeks the header. */ | ||
852 | skb->protocol = eth_type_trans(skb, dev); | ||
853 | |||
854 | if (skb->ip_summed == CHECKSUM_PARTIAL) { | ||
855 | if (skb_checksum_setup(skb)) { | ||
856 | kfree_skb(skb); | ||
857 | packets_dropped++; | ||
858 | np->stats.rx_errors++; | ||
859 | continue; | ||
860 | } | ||
861 | } | ||
862 | |||
863 | np->stats.rx_packets++; | ||
864 | np->stats.rx_bytes += skb->len; | ||
865 | |||
866 | /* Pass it up. */ | ||
867 | netif_receive_skb(skb); | ||
868 | dev->last_rx = jiffies; | ||
869 | } | ||
870 | |||
871 | return packets_dropped; | ||
872 | } | ||
873 | |||
874 | static int xennet_poll(struct net_device *dev, int *pbudget) | ||
875 | { | ||
876 | struct netfront_info *np = netdev_priv(dev); | ||
877 | struct sk_buff *skb; | ||
878 | struct netfront_rx_info rinfo; | ||
879 | struct xen_netif_rx_response *rx = &rinfo.rx; | ||
880 | struct xen_netif_extra_info *extras = rinfo.extras; | ||
881 | RING_IDX i, rp; | ||
882 | int work_done, budget, more_to_do = 1; | ||
883 | struct sk_buff_head rxq; | ||
884 | struct sk_buff_head errq; | ||
885 | struct sk_buff_head tmpq; | ||
886 | unsigned long flags; | ||
887 | unsigned int len; | ||
888 | int err; | ||
889 | |||
890 | spin_lock(&np->rx_lock); | ||
891 | |||
892 | if (unlikely(!netif_carrier_ok(dev))) { | ||
893 | spin_unlock(&np->rx_lock); | ||
894 | return 0; | ||
895 | } | ||
896 | |||
897 | skb_queue_head_init(&rxq); | ||
898 | skb_queue_head_init(&errq); | ||
899 | skb_queue_head_init(&tmpq); | ||
900 | |||
901 | budget = *pbudget; | ||
902 | if (budget > dev->quota) | ||
903 | budget = dev->quota; | ||
904 | rp = np->rx.sring->rsp_prod; | ||
905 | rmb(); /* Ensure we see queued responses up to 'rp'. */ | ||
906 | |||
907 | i = np->rx.rsp_cons; | ||
908 | work_done = 0; | ||
909 | while ((i != rp) && (work_done < budget)) { | ||
910 | memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx)); | ||
911 | memset(extras, 0, sizeof(rinfo.extras)); | ||
912 | |||
913 | err = xennet_get_responses(np, &rinfo, rp, &tmpq); | ||
914 | |||
915 | if (unlikely(err)) { | ||
916 | err: | ||
917 | while ((skb = __skb_dequeue(&tmpq))) | ||
918 | __skb_queue_tail(&errq, skb); | ||
919 | np->stats.rx_errors++; | ||
920 | i = np->rx.rsp_cons; | ||
921 | continue; | ||
922 | } | ||
923 | |||
924 | skb = __skb_dequeue(&tmpq); | ||
925 | |||
926 | if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) { | ||
927 | struct xen_netif_extra_info *gso; | ||
928 | gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1]; | ||
929 | |||
930 | if (unlikely(xennet_set_skb_gso(skb, gso))) { | ||
931 | __skb_queue_head(&tmpq, skb); | ||
932 | np->rx.rsp_cons += skb_queue_len(&tmpq); | ||
933 | goto err; | ||
934 | } | ||
935 | } | ||
936 | |||
937 | NETFRONT_SKB_CB(skb)->page = skb_shinfo(skb)->frags[0].page; | ||
938 | NETFRONT_SKB_CB(skb)->offset = rx->offset; | ||
939 | |||
940 | len = rx->status; | ||
941 | if (len > RX_COPY_THRESHOLD) | ||
942 | len = RX_COPY_THRESHOLD; | ||
943 | skb_put(skb, len); | ||
944 | |||
945 | if (rx->status > len) { | ||
946 | skb_shinfo(skb)->frags[0].page_offset = | ||
947 | rx->offset + len; | ||
948 | skb_shinfo(skb)->frags[0].size = rx->status - len; | ||
949 | skb->data_len = rx->status - len; | ||
950 | } else { | ||
951 | skb_shinfo(skb)->frags[0].page = NULL; | ||
952 | skb_shinfo(skb)->nr_frags = 0; | ||
953 | } | ||
954 | |||
955 | i = xennet_fill_frags(np, skb, &tmpq); | ||
956 | |||
957 | /* | ||
958 | * Truesize approximates the size of true data plus | ||
959 | * any supervisor overheads. Adding hypervisor | ||
960 | * overheads has been shown to significantly reduce | ||
961 | * achievable bandwidth with the default receive | ||
962 | * buffer size. It is therefore not wise to account | ||
963 | * for it here. | ||
964 | * | ||
965 | * After alloc_skb(RX_COPY_THRESHOLD), truesize is set | ||
966 | * to RX_COPY_THRESHOLD + the supervisor | ||
967 | * overheads. Here, we add the size of the data pulled | ||
968 | * in xennet_fill_frags(). | ||
969 | * | ||
970 | * We also adjust for any unused space in the main | ||
971 | * data area by subtracting (RX_COPY_THRESHOLD - | ||
972 | * len). This is especially important with drivers | ||
973 | * which split incoming packets into header and data, | ||
974 | * using only 66 bytes of the main data area (see the | ||
975 | * e1000 driver, for example). On such systems, | ||
976 | * without this last adjustment, our achievable | ||
977 | * receive throughput using the standard receive | ||
978 | * buffer size was cut by 25%(!!!). | ||
979 | */ | ||
980 | skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len); | ||
981 | skb->len += skb->data_len; | ||
982 | |||
983 | if (rx->flags & NETRXF_csum_blank) | ||
984 | skb->ip_summed = CHECKSUM_PARTIAL; | ||
985 | else if (rx->flags & NETRXF_data_validated) | ||
986 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
987 | |||
988 | __skb_queue_tail(&rxq, skb); | ||
989 | |||
990 | np->rx.rsp_cons = ++i; | ||
991 | work_done++; | ||
992 | } | ||
993 | |||
994 | while ((skb = __skb_dequeue(&errq))) | ||
995 | kfree_skb(skb); | ||
996 | |||
997 | work_done -= handle_incoming_queue(dev, &rxq); | ||
998 | |||
999 | /* If we get a callback with very few responses, reduce fill target. */ | ||
1000 | /* NB. Note exponential increase, linear decrease. */ | ||
1001 | if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) > | ||
1002 | ((3*np->rx_target) / 4)) && | ||
1003 | (--np->rx_target < np->rx_min_target)) | ||
1004 | np->rx_target = np->rx_min_target; | ||
1005 | |||
1006 | xennet_alloc_rx_buffers(dev); | ||
1007 | |||
1008 | *pbudget -= work_done; | ||
1009 | dev->quota -= work_done; | ||
1010 | |||
1011 | if (work_done < budget) { | ||
1012 | local_irq_save(flags); | ||
1013 | |||
1014 | RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do); | ||
1015 | if (!more_to_do) | ||
1016 | __netif_rx_complete(dev); | ||
1017 | |||
1018 | local_irq_restore(flags); | ||
1019 | } | ||
1020 | |||
1021 | spin_unlock(&np->rx_lock); | ||
1022 | |||
1023 | return more_to_do; | ||
1024 | } | ||
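
To make the truesize comment above concrete, the two cases it describes with assumed sizes (RX_COPY_THRESHOLD = 256): a large packet grows truesize by the bytes left in the fragment, while a short header-only packet shrinks it by the unused part of the copy area:

	#include <stdio.h>

	#define RX_COPY_THRESHOLD 256

	int main(void)
	{
		/* 1500-byte packet: 256 bytes copied, 1244 in the frag */
		int len = 256, data_len = 1500 - 256;
		printf("adjust by %d\n",
		       data_len - (RX_COPY_THRESHOLD - len));	/* +1244 */

		/* 66-byte header-only packet (the e1000-style case) */
		len = 66; data_len = 0;
		printf("adjust by %d\n",
		       data_len - (RX_COPY_THRESHOLD - len));	/* -190 */
		return 0;
	}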
1025 | |||
1026 | static int xennet_change_mtu(struct net_device *dev, int mtu) | ||
1027 | { | ||
1028 | int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN; | ||
1029 | |||
1030 | if (mtu > max) | ||
1031 | return -EINVAL; | ||
1032 | dev->mtu = mtu; | ||
1033 | return 0; | ||
1034 | } | ||
1035 | |||
1036 | static void xennet_release_tx_bufs(struct netfront_info *np) | ||
1037 | { | ||
1038 | struct sk_buff *skb; | ||
1039 | int i; | ||
1040 | |||
1041 | for (i = 0; i < NET_TX_RING_SIZE; i++) { | ||
1042 | /* Skip over entries which are actually freelist references */ | ||
1043 | if ((unsigned long)np->tx_skbs[i].skb < PAGE_OFFSET) | ||
1044 | continue; | ||
1045 | |||
1046 | skb = np->tx_skbs[i].skb; | ||
1047 | gnttab_end_foreign_access_ref(np->grant_tx_ref[i], | ||
1048 | GNTMAP_readonly); | ||
1049 | gnttab_release_grant_reference(&np->gref_tx_head, | ||
1050 | np->grant_tx_ref[i]); | ||
1051 | np->grant_tx_ref[i] = GRANT_INVALID_REF; | ||
1052 | add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, i); | ||
1053 | dev_kfree_skb_irq(skb); | ||
1054 | } | ||
1055 | } | ||
1056 | |||
1057 | static void xennet_release_rx_bufs(struct netfront_info *np) | ||
1058 | { | ||
1059 | struct mmu_update *mmu = np->rx_mmu; | ||
1060 | struct multicall_entry *mcl = np->rx_mcl; | ||
1061 | struct sk_buff_head free_list; | ||
1062 | struct sk_buff *skb; | ||
1063 | unsigned long mfn; | ||
1064 | int xfer = 0, noxfer = 0, unused = 0; | ||
1065 | int id, ref; | ||
1066 | |||
1067 | dev_warn(&np->netdev->dev, "%s: fix me for copying receiver.\n", | ||
1068 | __func__); | ||
1069 | return; | ||
1070 | |||
1071 | skb_queue_head_init(&free_list); | ||
1072 | |||
1073 | spin_lock_bh(&np->rx_lock); | ||
1074 | |||
1075 | for (id = 0; id < NET_RX_RING_SIZE; id++) { | ||
1076 | ref = np->grant_rx_ref[id]; | ||
1077 | if (ref == GRANT_INVALID_REF) { | ||
1078 | unused++; | ||
1079 | continue; | ||
1080 | } | ||
1081 | |||
1082 | skb = np->rx_skbs[id]; | ||
1083 | mfn = gnttab_end_foreign_transfer_ref(ref); | ||
1084 | gnttab_release_grant_reference(&np->gref_rx_head, ref); | ||
1085 | np->grant_rx_ref[id] = GRANT_INVALID_REF; | ||
1086 | |||
1087 | if (0 == mfn) { | ||
1088 | skb_shinfo(skb)->nr_frags = 0; | ||
1089 | dev_kfree_skb(skb); | ||
1090 | noxfer++; | ||
1091 | continue; | ||
1092 | } | ||
1093 | |||
1094 | if (!xen_feature(XENFEAT_auto_translated_physmap)) { | ||
1095 | /* Remap the page. */ | ||
1096 | struct page *page = skb_shinfo(skb)->frags[0].page; | ||
1097 | unsigned long pfn = page_to_pfn(page); | ||
1098 | void *vaddr = page_address(page); | ||
1099 | |||
1100 | MULTI_update_va_mapping(mcl, (unsigned long)vaddr, | ||
1101 | mfn_pte(mfn, PAGE_KERNEL), | ||
1102 | 0); | ||
1103 | mcl++; | ||
1104 | mmu->ptr = ((u64)mfn << PAGE_SHIFT) | ||
1105 | | MMU_MACHPHYS_UPDATE; | ||
1106 | mmu->val = pfn; | ||
1107 | mmu++; | ||
1108 | |||
1109 | set_phys_to_machine(pfn, mfn); | ||
1110 | } | ||
1111 | __skb_queue_tail(&free_list, skb); | ||
1112 | xfer++; | ||
1113 | } | ||
1114 | |||
1115 | dev_info(&np->netdev->dev, "%s: %d xfer, %d noxfer, %d unused\n", | ||
1116 | __func__, xfer, noxfer, unused); | ||
1117 | |||
1118 | if (xfer) { | ||
1119 | if (!xen_feature(XENFEAT_auto_translated_physmap)) { | ||
1120 | /* Do all the remapping work and M2P updates. */ | ||
1121 | MULTI_mmu_update(mcl, np->rx_mmu, mmu - np->rx_mmu, | ||
1122 | 0, DOMID_SELF); | ||
1123 | mcl++; | ||
1124 | HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl); | ||
1125 | } | ||
1126 | } | ||
1127 | |||
1128 | while ((skb = __skb_dequeue(&free_list)) != NULL) | ||
1129 | dev_kfree_skb(skb); | ||
1130 | |||
1131 | spin_unlock_bh(&np->rx_lock); | ||
1132 | } | ||
1133 | |||
1134 | static void xennet_uninit(struct net_device *dev) | ||
1135 | { | ||
1136 | struct netfront_info *np = netdev_priv(dev); | ||
1137 | xennet_release_tx_bufs(np); | ||
1138 | xennet_release_rx_bufs(np); | ||
1139 | gnttab_free_grant_references(np->gref_tx_head); | ||
1140 | gnttab_free_grant_references(np->gref_rx_head); | ||
1141 | } | ||
1142 | |||
1143 | static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev) | ||
1144 | { | ||
1145 | int i, err; | ||
1146 | struct net_device *netdev; | ||
1147 | struct netfront_info *np; | ||
1148 | |||
1149 | netdev = alloc_etherdev(sizeof(struct netfront_info)); | ||
1150 | if (!netdev) { | ||
1151 | printk(KERN_WARNING "%s> alloc_etherdev failed.\n", | ||
1152 | __func__); | ||
1153 | return ERR_PTR(-ENOMEM); | ||
1154 | } | ||
1155 | |||
1156 | np = netdev_priv(netdev); | ||
1157 | np->xbdev = dev; | ||
1158 | |||
1159 | spin_lock_init(&np->tx_lock); | ||
1160 | spin_lock_init(&np->rx_lock); | ||
1161 | |||
1162 | skb_queue_head_init(&np->rx_batch); | ||
1163 | np->rx_target = RX_DFL_MIN_TARGET; | ||
1164 | np->rx_min_target = RX_DFL_MIN_TARGET; | ||
1165 | np->rx_max_target = RX_MAX_TARGET; | ||
1166 | |||
1167 | init_timer(&np->rx_refill_timer); | ||
1168 | np->rx_refill_timer.data = (unsigned long)netdev; | ||
1169 | np->rx_refill_timer.function = rx_refill_timeout; | ||
1170 | |||
1171 | /* Initialise tx_skbs as a free chain containing every entry. */ | ||
1172 | np->tx_skb_freelist = 0; | ||
1173 | for (i = 0; i < NET_TX_RING_SIZE; i++) { | ||
1174 | np->tx_skbs[i].link = i+1; | ||
1175 | np->grant_tx_ref[i] = GRANT_INVALID_REF; | ||
1176 | } | ||
1177 | |||
1178 | /* Clear out rx_skbs */ | ||
1179 | for (i = 0; i < NET_RX_RING_SIZE; i++) { | ||
1180 | np->rx_skbs[i] = NULL; | ||
1181 | np->grant_rx_ref[i] = GRANT_INVALID_REF; | ||
1182 | } | ||
1183 | |||
1184 | /* A grant for every tx ring slot */ | ||
1185 | if (gnttab_alloc_grant_references(TX_MAX_TARGET, | ||
1186 | &np->gref_tx_head) < 0) { | ||
1187 | printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n"); | ||
1188 | err = -ENOMEM; | ||
1189 | goto exit; | ||
1190 | } | ||
1191 | /* A grant for every rx ring slot */ | ||
1192 | if (gnttab_alloc_grant_references(RX_MAX_TARGET, | ||
1193 | &np->gref_rx_head) < 0) { | ||
1194 | printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n"); | ||
1195 | err = -ENOMEM; | ||
1196 | goto exit_free_tx; | ||
1197 | } | ||
1198 | |||
1199 | netdev->open = xennet_open; | ||
1200 | netdev->hard_start_xmit = xennet_start_xmit; | ||
1201 | netdev->stop = xennet_close; | ||
1202 | netdev->get_stats = xennet_get_stats; | ||
1203 | netdev->poll = xennet_poll; | ||
1204 | netdev->uninit = xennet_uninit; | ||
1205 | netdev->change_mtu = xennet_change_mtu; | ||
1206 | netdev->weight = 64; | ||
1207 | netdev->features = NETIF_F_IP_CSUM; | ||
1208 | |||
1209 | SET_ETHTOOL_OPS(netdev, &xennet_ethtool_ops); | ||
1210 | SET_MODULE_OWNER(netdev); | ||
1211 | SET_NETDEV_DEV(netdev, &dev->dev); | ||
1212 | |||
1213 | np->netdev = netdev; | ||
1214 | |||
1215 | netif_carrier_off(netdev); | ||
1216 | |||
1217 | return netdev; | ||
1218 | |||
1219 | exit_free_tx: | ||
1220 | gnttab_free_grant_references(np->gref_tx_head); | ||
1221 | exit: | ||
1222 | free_netdev(netdev); | ||
1223 | return ERR_PTR(err); | ||
1224 | } | ||
1225 | |||
1226 | /** | ||
1227 | * Entry point to this code when a new device is created. Allocate the basic | ||
1228 | * structures and the ring buffers for communication with the backend, and | ||
1229 | * inform the backend of the appropriate details for those. | ||
1230 | */ | ||
1231 | static int __devinit netfront_probe(struct xenbus_device *dev, | ||
1232 | const struct xenbus_device_id *id) | ||
1233 | { | ||
1234 | int err; | ||
1235 | struct net_device *netdev; | ||
1236 | struct netfront_info *info; | ||
1237 | |||
1238 | netdev = xennet_create_dev(dev); | ||
1239 | if (IS_ERR(netdev)) { | ||
1240 | err = PTR_ERR(netdev); | ||
1241 | xenbus_dev_fatal(dev, err, "creating netdev"); | ||
1242 | return err; | ||
1243 | } | ||
1244 | |||
1245 | info = netdev_priv(netdev); | ||
1246 | dev->dev.driver_data = info; | ||
1247 | |||
1248 | err = register_netdev(info->netdev); | ||
1249 | if (err) { | ||
1250 | printk(KERN_WARNING "%s: register_netdev err=%d\n", | ||
1251 | __func__, err); | ||
1252 | goto fail; | ||
1253 | } | ||
1254 | |||
1255 | err = xennet_sysfs_addif(info->netdev); | ||
1256 | if (err) { | ||
1257 | unregister_netdev(info->netdev); | ||
1258 | printk(KERN_WARNING "%s: add sysfs failed err=%d\n", | ||
1259 | __func__, err); | ||
1260 | goto fail; | ||
1261 | } | ||
1262 | |||
1263 | return 0; | ||
1264 | |||
1265 | fail: | ||
1266 | free_netdev(netdev); | ||
1267 | dev->dev.driver_data = NULL; | ||
1268 | return err; | ||
1269 | } | ||
1270 | |||
1271 | static void xennet_end_access(int ref, void *page) | ||
1272 | { | ||
1273 | /* This frees the page as a side-effect */ | ||
1274 | if (ref != GRANT_INVALID_REF) | ||
1275 | gnttab_end_foreign_access(ref, 0, (unsigned long)page); | ||
1276 | } | ||
1277 | |||
1278 | static void xennet_disconnect_backend(struct netfront_info *info) | ||
1279 | { | ||
1280 | /* Stop old i/f to prevent errors whilst we rebuild the state. */ | ||
1281 | spin_lock_bh(&info->rx_lock); | ||
1282 | spin_lock_irq(&info->tx_lock); | ||
1283 | netif_carrier_off(info->netdev); | ||
1284 | spin_unlock_irq(&info->tx_lock); | ||
1285 | spin_unlock_bh(&info->rx_lock); | ||
1286 | |||
1287 | if (info->netdev->irq) | ||
1288 | unbind_from_irqhandler(info->netdev->irq, info->netdev); | ||
1289 | info->evtchn = info->netdev->irq = 0; | ||
1290 | |||
1291 | /* End access and free the pages */ | ||
1292 | xennet_end_access(info->tx_ring_ref, info->tx.sring); | ||
1293 | xennet_end_access(info->rx_ring_ref, info->rx.sring); | ||
1294 | |||
1295 | info->tx_ring_ref = GRANT_INVALID_REF; | ||
1296 | info->rx_ring_ref = GRANT_INVALID_REF; | ||
1297 | info->tx.sring = NULL; | ||
1298 | info->rx.sring = NULL; | ||
1299 | } | ||
1300 | |||
1301 | /** | ||
1302 | * We are reconnecting to the backend, due to a suspend/resume, or a backend | ||
1303 | * driver restart. We tear down our netif structure and recreate it, but | ||
1304 | * leave the device-layer structures intact so that this is transparent to the | ||
1305 | * rest of the kernel. | ||
1306 | */ | ||
1307 | static int netfront_resume(struct xenbus_device *dev) | ||
1308 | { | ||
1309 | struct netfront_info *info = dev->dev.driver_data; | ||
1310 | |||
1311 | dev_dbg(&dev->dev, "%s\n", dev->nodename); | ||
1312 | |||
1313 | xennet_disconnect_backend(info); | ||
1314 | return 0; | ||
1315 | } | ||
1316 | |||
1317 | static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]) | ||
1318 | { | ||
1319 | char *s, *e, *macstr; | ||
1320 | int i; | ||
1321 | |||
1322 | macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL); | ||
1323 | if (IS_ERR(macstr)) | ||
1324 | return PTR_ERR(macstr); | ||
1325 | |||
1326 | for (i = 0; i < ETH_ALEN; i++) { | ||
1327 | mac[i] = simple_strtoul(s, &e, 16); | ||
1328 | if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) { | ||
1329 | kfree(macstr); | ||
1330 | return -ENOENT; | ||
1331 | } | ||
1332 | s = e+1; | ||
1333 | } | ||
1334 | |||
1335 | kfree(macstr); | ||
1336 | return 0; | ||
1337 | } | ||
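
xen_net_read_mac expects the backend-provided "mac" node to hold the conventional colon-separated form. A user-space rendering of the same parse loop (hypothetical address; simple_strtoul replaced by libc strtoul):

	#include <stdio.h>
	#include <stdlib.h>

	#define ETH_ALEN 6

	int main(void)
	{
		const char *s = "00:16:3e:12:34:56";	/* assumed example */
		unsigned char mac[ETH_ALEN];
		char *e;
		int i;

		for (i = 0; i < ETH_ALEN; i++) {
			mac[i] = strtoul(s, &e, 16);
			if (s == e || *e != (i == ETH_ALEN - 1 ? '\0' : ':'))
				return 1;	/* malformed string */
			s = e + 1;
		}
		printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
		       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
		return 0;
	}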
1338 | |||
1339 | static irqreturn_t xennet_interrupt(int irq, void *dev_id) | ||
1340 | { | ||
1341 | struct net_device *dev = dev_id; | ||
1342 | struct netfront_info *np = netdev_priv(dev); | ||
1343 | unsigned long flags; | ||
1344 | |||
1345 | spin_lock_irqsave(&np->tx_lock, flags); | ||
1346 | |||
1347 | if (likely(netif_carrier_ok(dev))) { | ||
1348 | xennet_tx_buf_gc(dev); | ||
1349 | /* Under tx_lock: protects access to rx shared-ring indexes. */ | ||
1350 | if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) | ||
1351 | netif_rx_schedule(dev); | ||
1352 | } | ||
1353 | |||
1354 | spin_unlock_irqrestore(&np->tx_lock, flags); | ||
1355 | |||
1356 | return IRQ_HANDLED; | ||
1357 | } | ||
1358 | |||
1359 | static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info) | ||
1360 | { | ||
1361 | struct xen_netif_tx_sring *txs; | ||
1362 | struct xen_netif_rx_sring *rxs; | ||
1363 | int err; | ||
1364 | struct net_device *netdev = info->netdev; | ||
1365 | |||
1366 | info->tx_ring_ref = GRANT_INVALID_REF; | ||
1367 | info->rx_ring_ref = GRANT_INVALID_REF; | ||
1368 | info->rx.sring = NULL; | ||
1369 | info->tx.sring = NULL; | ||
1370 | netdev->irq = 0; | ||
1371 | |||
1372 | err = xen_net_read_mac(dev, netdev->dev_addr); | ||
1373 | if (err) { | ||
1374 | xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename); | ||
1375 | goto fail; | ||
1376 | } | ||
1377 | |||
1378 | txs = (struct xen_netif_tx_sring *)get_zeroed_page(GFP_KERNEL); | ||
1379 | if (!txs) { | ||
1380 | err = -ENOMEM; | ||
1381 | xenbus_dev_fatal(dev, err, "allocating tx ring page"); | ||
1382 | goto fail; | ||
1383 | } | ||
1384 | SHARED_RING_INIT(txs); | ||
1385 | FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE); | ||
1386 | |||
1387 | err = xenbus_grant_ring(dev, virt_to_mfn(txs)); | ||
1388 | if (err < 0) { | ||
1389 | free_page((unsigned long)txs); | ||
1390 | goto fail; | ||
1391 | } | ||
1392 | |||
1393 | info->tx_ring_ref = err; | ||
1394 | rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_KERNEL); | ||
1395 | if (!rxs) { | ||
1396 | err = -ENOMEM; | ||
1397 | xenbus_dev_fatal(dev, err, "allocating rx ring page"); | ||
1398 | goto fail; | ||
1399 | } | ||
1400 | SHARED_RING_INIT(rxs); | ||
1401 | FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE); | ||
1402 | |||
1403 | err = xenbus_grant_ring(dev, virt_to_mfn(rxs)); | ||
1404 | if (err < 0) { | ||
1405 | free_page((unsigned long)rxs); | ||
1406 | goto fail; | ||
1407 | } | ||
1408 | info->rx_ring_ref = err; | ||
1409 | |||
1410 | err = xenbus_alloc_evtchn(dev, &info->evtchn); | ||
1411 | if (err) | ||
1412 | goto fail; | ||
1413 | |||
1414 | err = bind_evtchn_to_irqhandler(info->evtchn, xennet_interrupt, | ||
1415 | IRQF_SAMPLE_RANDOM, netdev->name, | ||
1416 | netdev); | ||
1417 | if (err < 0) | ||
1418 | goto fail; | ||
1419 | netdev->irq = err; | ||
1420 | return 0; | ||
1421 | |||
1422 | fail: | ||
1423 | return err; | ||
1424 | } | ||
1425 | |||
1426 | /* Common code used when first setting up, and when resuming. */ | ||
1427 | static int talk_to_backend(struct xenbus_device *dev, | ||
1428 | struct netfront_info *info) | ||
1429 | { | ||
1430 | const char *message; | ||
1431 | struct xenbus_transaction xbt; | ||
1432 | int err; | ||
1433 | |||
1434 | /* Create shared ring, alloc event channel. */ | ||
1435 | err = setup_netfront(dev, info); | ||
1436 | if (err) | ||
1437 | goto out; | ||
1438 | |||
1439 | again: | ||
1440 | err = xenbus_transaction_start(&xbt); | ||
1441 | if (err) { | ||
1442 | xenbus_dev_fatal(dev, err, "starting transaction"); | ||
1443 | goto destroy_ring; | ||
1444 | } | ||
1445 | |||
1446 | err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref", "%u", | ||
1447 | info->tx_ring_ref); | ||
1448 | if (err) { | ||
1449 | message = "writing tx ring-ref"; | ||
1450 | goto abort_transaction; | ||
1451 | } | ||
1452 | err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref", "%u", | ||
1453 | info->rx_ring_ref); | ||
1454 | if (err) { | ||
1455 | message = "writing rx ring-ref"; | ||
1456 | goto abort_transaction; | ||
1457 | } | ||
1458 | err = xenbus_printf(xbt, dev->nodename, | ||
1459 | "event-channel", "%u", info->evtchn); | ||
1460 | if (err) { | ||
1461 | message = "writing event-channel"; | ||
1462 | goto abort_transaction; | ||
1463 | } | ||
1464 | |||
1465 | err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u", | ||
1466 | 1); | ||
1467 | if (err) { | ||
1468 | message = "writing request-rx-copy"; | ||
1469 | goto abort_transaction; | ||
1470 | } | ||
1471 | |||
1472 | err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1); | ||
1473 | if (err) { | ||
1474 | message = "writing feature-rx-notify"; | ||
1475 | goto abort_transaction; | ||
1476 | } | ||
1477 | |||
1478 | err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1); | ||
1479 | if (err) { | ||
1480 | message = "writing feature-sg"; | ||
1481 | goto abort_transaction; | ||
1482 | } | ||
1483 | |||
1484 | err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1); | ||
1485 | if (err) { | ||
1486 | message = "writing feature-gso-tcpv4"; | ||
1487 | goto abort_transaction; | ||
1488 | } | ||
1489 | |||
1490 | err = xenbus_transaction_end(xbt, 0); | ||
1491 | if (err) { | ||
1492 | if (err == -EAGAIN) | ||
1493 | goto again; | ||
1494 | xenbus_dev_fatal(dev, err, "completing transaction"); | ||
1495 | goto destroy_ring; | ||
1496 | } | ||
1497 | |||
1498 | return 0; | ||
1499 | |||
1500 | abort_transaction: | ||
1501 | xenbus_transaction_end(xbt, 1); | ||
1502 | xenbus_dev_fatal(dev, err, "%s", message); | ||
1503 | destroy_ring: | ||
1504 | xennet_disconnect_backend(info); | ||
1505 | out: | ||
1506 | return err; | ||
1507 | } | ||
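
For reference, a successful run of this transaction leaves the frontend's xenbus directory looking roughly like the following (key names are from the code above; the path and the values are illustrative):

	/local/domain/<domid>/device/vif/0/tx-ring-ref       = "8"
	/local/domain/<domid>/device/vif/0/rx-ring-ref       = "9"
	/local/domain/<domid>/device/vif/0/event-channel     = "12"
	/local/domain/<domid>/device/vif/0/request-rx-copy   = "1"
	/local/domain/<domid>/device/vif/0/feature-rx-notify = "1"
	/local/domain/<domid>/device/vif/0/feature-sg        = "1"
	/local/domain/<domid>/device/vif/0/feature-gso-tcpv4 = "1"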
1508 | |||
1509 | static int xennet_set_sg(struct net_device *dev, u32 data) | ||
1510 | { | ||
1511 | if (data) { | ||
1512 | struct netfront_info *np = netdev_priv(dev); | ||
1513 | int val; | ||
1514 | |||
1515 | if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg", | ||
1516 | "%d", &val) < 0) | ||
1517 | val = 0; | ||
1518 | if (!val) | ||
1519 | return -ENOSYS; | ||
1520 | } else if (dev->mtu > ETH_DATA_LEN) | ||
1521 | dev->mtu = ETH_DATA_LEN; | ||
1522 | |||
1523 | return ethtool_op_set_sg(dev, data); | ||
1524 | } | ||
1525 | |||
1526 | static int xennet_set_tso(struct net_device *dev, u32 data) | ||
1527 | { | ||
1528 | if (data) { | ||
1529 | struct netfront_info *np = netdev_priv(dev); | ||
1530 | int val; | ||
1531 | |||
1532 | if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, | ||
1533 | "feature-gso-tcpv4", "%d", &val) < 0) | ||
1534 | val = 0; | ||
1535 | if (!val) | ||
1536 | return -ENOSYS; | ||
1537 | } | ||
1538 | |||
1539 | return ethtool_op_set_tso(dev, data); | ||
1540 | } | ||
1541 | |||
1542 | static void xennet_set_features(struct net_device *dev) | ||
1543 | { | ||
1544 | /* Turn off all GSO bits except ROBUST. */ | ||
1545 | dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1; | ||
1546 | dev->features |= NETIF_F_GSO_ROBUST; | ||
1547 | xennet_set_sg(dev, 0); | ||
1548 | |||
1549 | /* We need checksum offload to enable scatter/gather and TSO. */ | ||
1550 | if (!(dev->features & NETIF_F_IP_CSUM)) | ||
1551 | return; | ||
1552 | |||
1553 | if (!xennet_set_sg(dev, 1)) | ||
1554 | xennet_set_tso(dev, 1); | ||
1555 | } | ||
1556 | |||
1557 | static int xennet_connect(struct net_device *dev) | ||
1558 | { | ||
1559 | struct netfront_info *np = netdev_priv(dev); | ||
1560 | int i, requeue_idx, err; | ||
1561 | struct sk_buff *skb; | ||
1562 | grant_ref_t ref; | ||
1563 | struct xen_netif_rx_request *req; | ||
1564 | unsigned int feature_rx_copy; | ||
1565 | |||
1566 | err = xenbus_scanf(XBT_NIL, np->xbdev->otherend, | ||
1567 | "feature-rx-copy", "%u", &feature_rx_copy); | ||
1568 | if (err != 1) | ||
1569 | feature_rx_copy = 0; | ||
1570 | |||
1571 | if (!feature_rx_copy) { | ||
1572 | dev_info(&dev->dev, | ||
1573 | "backend does not support copying receive path\n"); | ||
1574 | return -ENODEV; | ||
1575 | } | ||
1576 | |||
1577 | err = talk_to_backend(np->xbdev, np); | ||
1578 | if (err) | ||
1579 | return err; | ||
1580 | |||
1581 | xennet_set_features(dev); | ||
1582 | |||
1583 | spin_lock_bh(&np->rx_lock); | ||
1584 | spin_lock_irq(&np->tx_lock); | ||
1585 | |||
1586 | /* Step 1: Discard all pending TX packet fragments. */ | ||
1587 | xennet_release_tx_bufs(np); | ||
1588 | |||
1589 | /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */ | ||
1590 | for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) { | ||
1591 | if (!np->rx_skbs[i]) | ||
1592 | continue; | ||
1593 | |||
1594 | skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i); | ||
1595 | ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i); | ||
1596 | req = RING_GET_REQUEST(&np->rx, requeue_idx); | ||
1597 | |||
1598 | gnttab_grant_foreign_access_ref( | ||
1599 | ref, np->xbdev->otherend_id, | ||
1600 | pfn_to_mfn(page_to_pfn(skb_shinfo(skb)-> | ||
1601 | frags->page)), | ||
1602 | 0); | ||
1603 | req->gref = ref; | ||
1604 | req->id = requeue_idx; | ||
1605 | |||
1606 | requeue_idx++; | ||
1607 | } | ||
1608 | |||
1609 | np->rx.req_prod_pvt = requeue_idx; | ||
1610 | |||
1611 | /* | ||
1612 | * Step 3: All public and private state should now be sane. Get | ||
1613 | * ready to start sending and receiving packets and give the driver | ||
1614 | * domain a kick because we've probably just requeued some | ||
1615 | * packets. | ||
1616 | */ | ||
1617 | netif_carrier_on(np->netdev); | ||
1618 | notify_remote_via_irq(np->netdev->irq); | ||
1619 | xennet_tx_buf_gc(dev); | ||
1620 | xennet_alloc_rx_buffers(dev); | ||
1621 | |||
1622 | spin_unlock_irq(&np->tx_lock); | ||
1623 | spin_unlock_bh(&np->rx_lock); | ||
1624 | |||
1625 | return 0; | ||
1626 | } | ||
1627 | |||
1628 | /** | ||
1629 | * Callback received when the backend's state changes. | ||
1630 | */ | ||
1631 | static void backend_changed(struct xenbus_device *dev, | ||
1632 | enum xenbus_state backend_state) | ||
1633 | { | ||
1634 | struct netfront_info *np = dev->dev.driver_data; | ||
1635 | struct net_device *netdev = np->netdev; | ||
1636 | |||
1637 | dev_dbg(&dev->dev, "%s\n", xenbus_strstate(backend_state)); | ||
1638 | |||
1639 | switch (backend_state) { | ||
1640 | case XenbusStateInitialising: | ||
1641 | case XenbusStateInitialised: | ||
1642 | case XenbusStateConnected: | ||
1643 | case XenbusStateUnknown: | ||
1644 | case XenbusStateClosed: | ||
1645 | break; | ||
1646 | |||
1647 | case XenbusStateInitWait: | ||
1648 | if (dev->state != XenbusStateInitialising) | ||
1649 | break; | ||
1650 | if (xennet_connect(netdev) != 0) | ||
1651 | break; | ||
1652 | xenbus_switch_state(dev, XenbusStateConnected); | ||
1653 | break; | ||
1654 | |||
1655 | case XenbusStateClosing: | ||
1656 | xenbus_frontend_closed(dev); | ||
1657 | break; | ||
1658 | } | ||
1659 | } | ||
1660 | |||
1661 | static struct ethtool_ops xennet_ethtool_ops = | ||
1662 | { | ||
1663 | .get_tx_csum = ethtool_op_get_tx_csum, | ||
1664 | .set_tx_csum = ethtool_op_set_tx_csum, | ||
1665 | .get_sg = ethtool_op_get_sg, | ||
1666 | .set_sg = xennet_set_sg, | ||
1667 | .get_tso = ethtool_op_get_tso, | ||
1668 | .set_tso = xennet_set_tso, | ||
1669 | .get_link = ethtool_op_get_link, | ||
1670 | }; | ||
1671 | |||
1672 | #ifdef CONFIG_SYSFS | ||
1673 | static ssize_t show_rxbuf_min(struct device *dev, | ||
1674 | struct device_attribute *attr, char *buf) | ||
1675 | { | ||
1676 | struct net_device *netdev = to_net_dev(dev); | ||
1677 | struct netfront_info *info = netdev_priv(netdev); | ||
1678 | |||
1679 | return sprintf(buf, "%u\n", info->rx_min_target); | ||
1680 | } | ||
1681 | |||
1682 | static ssize_t store_rxbuf_min(struct device *dev, | ||
1683 | struct device_attribute *attr, | ||
1684 | const char *buf, size_t len) | ||
1685 | { | ||
1686 | struct net_device *netdev = to_net_dev(dev); | ||
1687 | struct netfront_info *np = netdev_priv(netdev); | ||
1688 | char *endp; | ||
1689 | unsigned long target; | ||
1690 | |||
1691 | if (!capable(CAP_NET_ADMIN)) | ||
1692 | return -EPERM; | ||
1693 | |||
1694 | target = simple_strtoul(buf, &endp, 0); | ||
1695 | if (endp == buf) | ||
1696 | return -EBADMSG; | ||
1697 | |||
1698 | if (target < RX_MIN_TARGET) | ||
1699 | target = RX_MIN_TARGET; | ||
1700 | if (target > RX_MAX_TARGET) | ||
1701 | target = RX_MAX_TARGET; | ||
1702 | |||
1703 | spin_lock_bh(&np->rx_lock); | ||
1704 | if (target > np->rx_max_target) | ||
1705 | np->rx_max_target = target; | ||
1706 | np->rx_min_target = target; | ||
1707 | if (target > np->rx_target) | ||
1708 | np->rx_target = target; | ||
1709 | |||
1710 | xennet_alloc_rx_buffers(netdev); | ||
1711 | |||
1712 | spin_unlock_bh(&np->rx_lock); | ||
1713 | return len; | ||
1714 | } | ||
1715 | |||
1716 | static ssize_t show_rxbuf_max(struct device *dev, | ||
1717 | struct device_attribute *attr, char *buf) | ||
1718 | { | ||
1719 | struct net_device *netdev = to_net_dev(dev); | ||
1720 | struct netfront_info *info = netdev_priv(netdev); | ||
1721 | |||
1722 | return sprintf(buf, "%u\n", info->rx_max_target); | ||
1723 | } | ||
1724 | |||
1725 | static ssize_t store_rxbuf_max(struct device *dev, | ||
1726 | struct device_attribute *attr, | ||
1727 | const char *buf, size_t len) | ||
1728 | { | ||
1729 | struct net_device *netdev = to_net_dev(dev); | ||
1730 | struct netfront_info *np = netdev_priv(netdev); | ||
1731 | char *endp; | ||
1732 | unsigned long target; | ||
1733 | |||
1734 | if (!capable(CAP_NET_ADMIN)) | ||
1735 | return -EPERM; | ||
1736 | |||
1737 | target = simple_strtoul(buf, &endp, 0); | ||
1738 | if (endp == buf) | ||
1739 | return -EBADMSG; | ||
1740 | |||
1741 | if (target < RX_MIN_TARGET) | ||
1742 | target = RX_MIN_TARGET; | ||
1743 | if (target > RX_MAX_TARGET) | ||
1744 | target = RX_MAX_TARGET; | ||
1745 | |||
1746 | spin_lock_bh(&np->rx_lock); | ||
1747 | if (target < np->rx_min_target) | ||
1748 | np->rx_min_target = target; | ||
1749 | np->rx_max_target = target; | ||
1750 | if (target < np->rx_target) | ||
1751 | np->rx_target = target; | ||
1752 | |||
1753 | xennet_alloc_rx_buffers(netdev); | ||
1754 | |||
1755 | spin_unlock_bh(&np->rx_lock); | ||
1756 | return len; | ||
1757 | } | ||
1758 | |||
1759 | static ssize_t show_rxbuf_cur(struct device *dev, | ||
1760 | struct device_attribute *attr, char *buf) | ||
1761 | { | ||
1762 | struct net_device *netdev = to_net_dev(dev); | ||
1763 | struct netfront_info *info = netdev_priv(netdev); | ||
1764 | |||
1765 | return sprintf(buf, "%u\n", info->rx_target); | ||
1766 | } | ||
1767 | |||
1768 | static struct device_attribute xennet_attrs[] = { | ||
1769 | __ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min), | ||
1770 | __ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max), | ||
1771 | __ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL), | ||
1772 | }; | ||
1773 | |||
1774 | static int xennet_sysfs_addif(struct net_device *netdev) | ||
1775 | { | ||
1776 | int i; | ||
1777 | int err; | ||
1778 | |||
1779 | for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) { | ||
1780 | err = device_create_file(&netdev->dev, | ||
1781 | &xennet_attrs[i]); | ||
1782 | if (err) | ||
1783 | goto fail; | ||
1784 | } | ||
1785 | return 0; | ||
1786 | |||
1787 | fail: | ||
1788 | while (--i >= 0) | ||
1789 | device_remove_file(&netdev->dev, &xennet_attrs[i]); | ||
1790 | return err; | ||
1791 | } | ||
1792 | |||
1793 | static void xennet_sysfs_delif(struct net_device *netdev) | ||
1794 | { | ||
1795 | int i; | ||
1796 | |||
1797 | for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) | ||
1798 | device_remove_file(&netdev->dev, &xennet_attrs[i]); | ||
1799 | } | ||
1800 | |||
1801 | #endif /* CONFIG_SYSFS */ | ||
1802 | |||
1803 | static struct xenbus_device_id netfront_ids[] = { | ||
1804 | { "vif" }, | ||
1805 | { "" } | ||
1806 | }; | ||
1807 | |||
1808 | |||
1809 | static int __devexit xennet_remove(struct xenbus_device *dev) | ||
1810 | { | ||
1811 | struct netfront_info *info = dev->dev.driver_data; | ||
1812 | |||
1813 | dev_dbg(&dev->dev, "%s\n", dev->nodename); | ||
1814 | |||
1815 | unregister_netdev(info->netdev); | ||
1816 | |||
1817 | xennet_disconnect_backend(info); | ||
1818 | |||
1819 | del_timer_sync(&info->rx_refill_timer); | ||
1820 | |||
1821 | xennet_sysfs_delif(info->netdev); | ||
1822 | |||
1823 | free_netdev(info->netdev); | ||
1824 | |||
1825 | return 0; | ||
1826 | } | ||
1827 | |||
1828 | static struct xenbus_driver netfront = { | ||
1829 | .name = "vif", | ||
1830 | .owner = THIS_MODULE, | ||
1831 | .ids = netfront_ids, | ||
1832 | .probe = netfront_probe, | ||
1833 | .remove = __devexit_p(xennet_remove), | ||
1834 | .resume = netfront_resume, | ||
1835 | .otherend_changed = backend_changed, | ||
1836 | }; | ||
1837 | |||
1838 | static int __init netif_init(void) | ||
1839 | { | ||
1840 | if (!is_running_on_xen()) | ||
1841 | return -ENODEV; | ||
1842 | |||
1843 | if (is_initial_xendomain()) | ||
1844 | return 0; | ||
1845 | |||
1846 | printk(KERN_INFO "Initialising Xen virtual ethernet driver.\n"); | ||
1847 | |||
1848 | return xenbus_register_frontend(&netfront); | ||
1849 | } | ||
1850 | module_init(netif_init); | ||
1851 | |||
1852 | |||
1853 | static void __exit netif_exit(void) | ||
1854 | { | ||
1855 | if (is_initial_xendomain()) | ||
1856 | return; | ||
1857 | |||
1858 | xenbus_unregister_driver(&netfront); | ||
1859 | } | ||
1860 | module_exit(netif_exit); | ||
1861 | |||
1862 | MODULE_DESCRIPTION("Xen virtual network device frontend"); | ||
1863 | MODULE_LICENSE("GPL"); | ||
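
For reference, the registration boilerplate above distils to a small pattern that any xenbus frontend follows. A minimal sketch using only calls that appear in this patch; the "demo" driver name and its probe callback are hypothetical:

/* Hypothetical frontend skeleton -- not part of this patch. */
static struct xenbus_device_id demofront_ids[] = {
	{ "demo" },
	{ "" }
};

static int demofront_probe(struct xenbus_device *dev,
			   const struct xenbus_device_id *id)
{
	/* allocate per-device state and stash it in dev->dev.driver_data */
	return 0;
}

static struct xenbus_driver demofront = {
	.name  = "demo",
	.owner = THIS_MODULE,
	.ids   = demofront_ids,
	.probe = demofront_probe,
};

static int __init demofront_init(void)
{
	if (!is_running_on_xen())	/* only load inside a Xen guest */
		return -ENODEV;
	return xenbus_register_frontend(&demofront);
}
module_init(demofront_init);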
diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index 03baf1c64a2e..ed112ee16012 100644 --- a/drivers/pnp/pnpbios/core.c +++ b/drivers/pnp/pnpbios/core.c | |||
@@ -147,7 +147,7 @@ static int pnp_dock_event(int dock, struct pnp_docking_station_info *info) | |||
147 | info->location_id, info->serial, info->capabilities); | 147 | info->location_id, info->serial, info->capabilities); |
148 | envp[i] = NULL; | 148 | envp[i] = NULL; |
149 | 149 | ||
150 | value = call_usermodehelper (argv [0], argv, envp, 0); | 150 | value = call_usermodehelper (argv [0], argv, envp, UMH_WAIT_EXEC); |
151 | kfree (buf); | 151 | kfree (buf); |
152 | kfree (envp); | 152 | kfree (envp); |
153 | return 0; | 153 | return 0; |
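
The hunk above replaces the old magic constant 0 with the explicit UMH_WAIT_EXEC, which makes call_usermodehelper() return once the helper has been exec'd rather than waiting for it to exit. A sketch of the calling convention; the helper path is hypothetical:

char *argv[] = { "/sbin/dock-notify", NULL };	/* hypothetical helper */
char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
int ret;

/* UMH_NO_WAIT: don't wait at all; UMH_WAIT_EXEC: wait for exec only;
 * UMH_WAIT_PROC: block until the helper exits. */
ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);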
diff --git a/drivers/sbus/char/bbc_envctrl.c b/drivers/sbus/char/bbc_envctrl.c index a54e4140683a..e821a155b658 100644 --- a/drivers/sbus/char/bbc_envctrl.c +++ b/drivers/sbus/char/bbc_envctrl.c | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <linux/kthread.h> | 7 | #include <linux/kthread.h> |
8 | #include <linux/delay.h> | 8 | #include <linux/delay.h> |
9 | #include <linux/kmod.h> | 9 | #include <linux/kmod.h> |
10 | #include <linux/reboot.h> | ||
10 | #include <asm/oplib.h> | 11 | #include <asm/oplib.h> |
11 | #include <asm/ebus.h> | 12 | #include <asm/ebus.h> |
12 | 13 | ||
@@ -170,8 +171,6 @@ static void get_current_temps(struct bbc_cpu_temperature *tp) | |||
170 | static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp) | 171 | static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp) |
171 | { | 172 | { |
172 | static int shutting_down = 0; | 173 | static int shutting_down = 0; |
173 | static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; | ||
174 | char *argv[] = { "/sbin/shutdown", "-h", "now", NULL }; | ||
175 | char *type = "???"; | 174 | char *type = "???"; |
176 | s8 val = -1; | 175 | s8 val = -1; |
177 | 176 | ||
@@ -195,7 +194,7 @@ static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp) | |||
195 | printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n"); | 194 | printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n"); |
196 | 195 | ||
197 | shutting_down = 1; | 196 | shutting_down = 1; |
198 | if (call_usermodehelper("/sbin/shutdown", argv, envp, 0) < 0) | 197 | if (orderly_poweroff(true) < 0) |
199 | printk(KERN_CRIT "envctrl: shutdown execution failed\n"); | 198 | printk(KERN_CRIT "envctrl: shutdown execution failed\n"); |
200 | } | 199 | } |
201 | 200 | ||
diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c index 8328acab47fd..dadabef116b6 100644 --- a/drivers/sbus/char/envctrl.c +++ b/drivers/sbus/char/envctrl.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/ioport.h> | 26 | #include <linux/ioport.h> |
27 | #include <linux/miscdevice.h> | 27 | #include <linux/miscdevice.h> |
28 | #include <linux/kmod.h> | 28 | #include <linux/kmod.h> |
29 | #include <linux/reboot.h> | ||
29 | 30 | ||
30 | #include <asm/ebus.h> | 31 | #include <asm/ebus.h> |
31 | #include <asm/uaccess.h> | 32 | #include <asm/uaccess.h> |
@@ -966,10 +967,6 @@ static struct i2c_child_t *envctrl_get_i2c_child(unsigned char mon_type) | |||
966 | static void envctrl_do_shutdown(void) | 967 | static void envctrl_do_shutdown(void) |
967 | { | 968 | { |
968 | static int inprog = 0; | 969 | static int inprog = 0; |
969 | static char *envp[] = { | ||
970 | "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; | ||
971 | char *argv[] = { | ||
972 | "/sbin/shutdown", "-h", "now", NULL }; | ||
973 | int ret; | 970 | int ret; |
974 | 971 | ||
975 | if (inprog != 0) | 972 | if (inprog != 0) |
@@ -977,7 +974,7 @@ static void envctrl_do_shutdown(void) | |||
977 | 974 | ||
978 | inprog = 1; | 975 | inprog = 1; |
979 | printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n"); | 976 | printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n"); |
980 | ret = call_usermodehelper("/sbin/shutdown", argv, envp, 0); | 977 | ret = orderly_poweroff(true); |
981 | if (ret < 0) { | 978 | if (ret < 0) { |
982 | printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n"); | 979 | printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n"); |
983 | inprog = 0; /* unlikely to succeed, but we could try again */ | 980 | inprog = 0; /* unlikely to succeed, but we could try again */ |
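
Both sbus drivers now delegate to orderly_poweroff() instead of hand-rolling an argv/envp pair for /sbin/shutdown. A sketch of the replacement pattern in a hypothetical driver; with force set to true, orderly_poweroff() falls back to an emergency power-off if the userspace helper fails:

#include <linux/reboot.h>

static void demo_thermal_shutdown(void)
{
	if (orderly_poweroff(true) < 0)
		printk(KERN_CRIT "demo: shutdown execution failed\n");
}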
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile new file mode 100644 index 000000000000..56592f0d6cef --- /dev/null +++ b/drivers/xen/Makefile | |||
@@ -0,0 +1,2 @@ | |||
1 | obj-y += grant-table.o | ||
2 | obj-y += xenbus/ | ||
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c new file mode 100644 index 000000000000..ea94dbabf9a9 --- /dev/null +++ b/drivers/xen/grant-table.c | |||
@@ -0,0 +1,582 @@ | |||
1 | /****************************************************************************** | ||
2 | * grant_table.c | ||
3 | * | ||
4 | * Granting foreign access to our memory reservation. | ||
5 | * | ||
6 | * Copyright (c) 2005-2006, Christopher Clark | ||
7 | * Copyright (c) 2004-2005, K A Fraser | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License version 2 | ||
11 | * as published by the Free Software Foundation; or, when distributed | ||
12 | * separately from the Linux kernel or incorporated into other | ||
13 | * software packages, subject to the following license: | ||
14 | * | ||
15 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
16 | * of this source file (the "Software"), to deal in the Software without | ||
17 | * restriction, including without limitation the rights to use, copy, modify, | ||
18 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
19 | * and to permit persons to whom the Software is furnished to do so, subject to | ||
20 | * the following conditions: | ||
21 | * | ||
22 | * The above copyright notice and this permission notice shall be included in | ||
23 | * all copies or substantial portions of the Software. | ||
24 | * | ||
25 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
26 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
27 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
28 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
29 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
30 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
31 | * IN THE SOFTWARE. | ||
32 | */ | ||
33 | |||
34 | #include <linux/module.h> | ||
35 | #include <linux/sched.h> | ||
36 | #include <linux/mm.h> | ||
37 | #include <linux/vmalloc.h> | ||
38 | #include <linux/uaccess.h> | ||
39 | |||
40 | #include <xen/interface/xen.h> | ||
41 | #include <xen/page.h> | ||
42 | #include <xen/grant_table.h> | ||
43 | |||
44 | #include <asm/pgtable.h> | ||
45 | #include <asm/sync_bitops.h> | ||
46 | |||
47 | |||
48 | /* External tools reserve first few grant table entries. */ | ||
49 | #define NR_RESERVED_ENTRIES 8 | ||
50 | #define GNTTAB_LIST_END 0xffffffff | ||
51 | #define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry)) | ||
52 | |||
53 | static grant_ref_t **gnttab_list; | ||
54 | static unsigned int nr_grant_frames; | ||
55 | static unsigned int boot_max_nr_grant_frames; | ||
56 | static int gnttab_free_count; | ||
57 | static grant_ref_t gnttab_free_head; | ||
58 | static DEFINE_SPINLOCK(gnttab_list_lock); | ||
59 | |||
60 | static struct grant_entry *shared; | ||
61 | |||
62 | static struct gnttab_free_callback *gnttab_free_callback_list; | ||
63 | |||
64 | static int gnttab_expand(unsigned int req_entries); | ||
65 | |||
66 | #define RPP (PAGE_SIZE / sizeof(grant_ref_t)) | ||
67 | |||
68 | static inline grant_ref_t *__gnttab_entry(grant_ref_t entry) | ||
69 | { | ||
70 | return &gnttab_list[(entry) / RPP][(entry) % RPP]; | ||
71 | } | ||
72 | /* This can be used as an l-value */ | ||
73 | #define gnttab_entry(entry) (*__gnttab_entry(entry)) | ||
74 | |||
75 | static int get_free_entries(unsigned count) | ||
76 | { | ||
77 | unsigned long flags; | ||
78 | int ref, rc; | ||
79 | grant_ref_t head; | ||
80 | |||
81 | spin_lock_irqsave(&gnttab_list_lock, flags); | ||
82 | |||
83 | if ((gnttab_free_count < count) && | ||
84 | ((rc = gnttab_expand(count - gnttab_free_count)) < 0)) { | ||
85 | spin_unlock_irqrestore(&gnttab_list_lock, flags); | ||
86 | return rc; | ||
87 | } | ||
88 | |||
89 | ref = head = gnttab_free_head; | ||
90 | gnttab_free_count -= count; | ||
91 | while (count-- > 1) | ||
92 | head = gnttab_entry(head); | ||
93 | gnttab_free_head = gnttab_entry(head); | ||
94 | gnttab_entry(head) = GNTTAB_LIST_END; | ||
95 | |||
96 | spin_unlock_irqrestore(&gnttab_list_lock, flags); | ||
97 | |||
98 | return ref; | ||
99 | } | ||
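
The free list threads through the grant bookkeeping pages themselves: for a free entry i, gnttab_entry(i) holds the index of the next free entry. A worked example with hypothetical indices, for get_free_entries(2) on the list 5 -> 6 -> 7 -> 8:

	before:	gnttab_free_head = 5,	5 -> 6 -> 7 -> 8 -> ... -> GNTTAB_LIST_END
	after:	gnttab_free_head = 7,	caller holds 5 -> 6 -> GNTTAB_LIST_END

The loop walks count - 1 links to the tail of the carved-out chain, repoints the global head past it, and terminates the chain; put_free_entry() later pushes entries back one at a time.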
100 | |||
101 | static void do_free_callbacks(void) | ||
102 | { | ||
103 | struct gnttab_free_callback *callback, *next; | ||
104 | |||
105 | callback = gnttab_free_callback_list; | ||
106 | gnttab_free_callback_list = NULL; | ||
107 | |||
108 | while (callback != NULL) { | ||
109 | next = callback->next; | ||
110 | if (gnttab_free_count >= callback->count) { | ||
111 | callback->next = NULL; | ||
112 | callback->fn(callback->arg); | ||
113 | } else { | ||
114 | callback->next = gnttab_free_callback_list; | ||
115 | gnttab_free_callback_list = callback; | ||
116 | } | ||
117 | callback = next; | ||
118 | } | ||
119 | } | ||
120 | |||
121 | static inline void check_free_callbacks(void) | ||
122 | { | ||
123 | if (unlikely(gnttab_free_callback_list)) | ||
124 | do_free_callbacks(); | ||
125 | } | ||
126 | |||
127 | static void put_free_entry(grant_ref_t ref) | ||
128 | { | ||
129 | unsigned long flags; | ||
130 | spin_lock_irqsave(&gnttab_list_lock, flags); | ||
131 | gnttab_entry(ref) = gnttab_free_head; | ||
132 | gnttab_free_head = ref; | ||
133 | gnttab_free_count++; | ||
134 | check_free_callbacks(); | ||
135 | spin_unlock_irqrestore(&gnttab_list_lock, flags); | ||
136 | } | ||
137 | |||
138 | static void update_grant_entry(grant_ref_t ref, domid_t domid, | ||
139 | unsigned long frame, unsigned flags) | ||
140 | { | ||
141 | /* | ||
142 | * Introducing a valid entry into the grant table: | ||
143 | * 1. Write ent->domid. | ||
144 | * 2. Write ent->frame: | ||
145 | * GTF_permit_access: Frame to which access is permitted. | ||
146 | * GTF_accept_transfer: Pseudo-phys frame slot being filled by new | ||
147 | * frame, or zero if none. | ||
148 | * 3. Write memory barrier (WMB). | ||
149 | * 4. Write ent->flags, inc. valid type. | ||
150 | */ | ||
151 | shared[ref].frame = frame; | ||
152 | shared[ref].domid = domid; | ||
153 | wmb(); | ||
154 | shared[ref].flags = flags; | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * Public grant-issuing interface functions | ||
159 | */ | ||
160 | void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, | ||
161 | unsigned long frame, int readonly) | ||
162 | { | ||
163 | update_grant_entry(ref, domid, frame, | ||
164 | GTF_permit_access | (readonly ? GTF_readonly : 0)); | ||
165 | } | ||
166 | EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref); | ||
167 | |||
168 | int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, | ||
169 | int readonly) | ||
170 | { | ||
171 | int ref; | ||
172 | |||
173 | ref = get_free_entries(1); | ||
174 | if (unlikely(ref < 0)) | ||
175 | return -ENOSPC; | ||
176 | |||
177 | gnttab_grant_foreign_access_ref(ref, domid, frame, readonly); | ||
178 | |||
179 | return ref; | ||
180 | } | ||
181 | EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); | ||
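
A sketch of typical frontend usage of this pair (the driver code is hypothetical; pfn_to_mfn() comes from the Xen page headers already included above):

static int demo_share_page(domid_t backend_id, struct page *page)
{
	unsigned long mfn = pfn_to_mfn(page_to_pfn(page));
	int ref = gnttab_grant_foreign_access(backend_id, mfn, 0 /* rw */);

	if (ref < 0)
		return ref;	/* -ENOSPC: no free grant entries */

	/* ... advertise 'ref' to the backend, run the I/O ... */

	gnttab_end_foreign_access(ref, 0, 0 /* keep the page */);
	return 0;
}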
182 | |||
183 | int gnttab_query_foreign_access(grant_ref_t ref) | ||
184 | { | ||
185 | u16 nflags; | ||
186 | |||
187 | nflags = shared[ref].flags; | ||
188 | |||
189 | return (nflags & (GTF_reading|GTF_writing)); | ||
190 | } | ||
191 | EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); | ||
192 | |||
193 | int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) | ||
194 | { | ||
195 | u16 flags, nflags; | ||
196 | |||
197 | nflags = shared[ref].flags; | ||
198 | do { | ||
199 | flags = nflags; | ||
200 | if (flags & (GTF_reading|GTF_writing)) { | ||
201 | printk(KERN_ALERT "WARNING: g.e. still in use!\n"); | ||
202 | return 0; | ||
203 | } | ||
204 | } while ((nflags = sync_cmpxchg(&shared[ref].flags, flags, 0)) != flags); | ||
205 | |||
206 | return 1; | ||
207 | } | ||
208 | EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); | ||
209 | |||
210 | void gnttab_end_foreign_access(grant_ref_t ref, int readonly, | ||
211 | unsigned long page) | ||
212 | { | ||
213 | if (gnttab_end_foreign_access_ref(ref, readonly)) { | ||
214 | put_free_entry(ref); | ||
215 | if (page != 0) | ||
216 | free_page(page); | ||
217 | } else { | ||
218 | /* XXX This needs to be fixed so that the ref and page are | ||
219 | placed on a list to be freed up later. */ | ||
220 | printk(KERN_WARNING | ||
221 | "WARNING: leaking g.e. and page still in use!\n"); | ||
222 | } | ||
223 | } | ||
224 | EXPORT_SYMBOL_GPL(gnttab_end_foreign_access); | ||
225 | |||
226 | int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn) | ||
227 | { | ||
228 | int ref; | ||
229 | |||
230 | ref = get_free_entries(1); | ||
231 | if (unlikely(ref < 0)) | ||
232 | return -ENOSPC; | ||
233 | gnttab_grant_foreign_transfer_ref(ref, domid, pfn); | ||
234 | |||
235 | return ref; | ||
236 | } | ||
237 | EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer); | ||
238 | |||
239 | void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid, | ||
240 | unsigned long pfn) | ||
241 | { | ||
242 | update_grant_entry(ref, domid, pfn, GTF_accept_transfer); | ||
243 | } | ||
244 | EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref); | ||
245 | |||
246 | unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) | ||
247 | { | ||
248 | unsigned long frame; | ||
249 | u16 flags; | ||
250 | |||
251 | /* | ||
252 | * If a transfer is not even yet started, try to reclaim the grant | ||
253 | * reference and return failure (== 0). | ||
254 | */ | ||
255 | while (!((flags = shared[ref].flags) & GTF_transfer_committed)) { | ||
256 | if (sync_cmpxchg(&shared[ref].flags, flags, 0) == flags) | ||
257 | return 0; | ||
258 | cpu_relax(); | ||
259 | } | ||
260 | |||
261 | /* If a transfer is in progress then wait until it is completed. */ | ||
262 | while (!(flags & GTF_transfer_completed)) { | ||
263 | flags = shared[ref].flags; | ||
264 | cpu_relax(); | ||
265 | } | ||
266 | |||
267 | rmb(); /* Read the frame number /after/ reading completion status. */ | ||
268 | frame = shared[ref].frame; | ||
269 | BUG_ON(frame == 0); | ||
270 | |||
271 | return frame; | ||
272 | } | ||
273 | EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref); | ||
274 | |||
275 | unsigned long gnttab_end_foreign_transfer(grant_ref_t ref) | ||
276 | { | ||
277 | unsigned long frame = gnttab_end_foreign_transfer_ref(ref); | ||
278 | put_free_entry(ref); | ||
279 | return frame; | ||
280 | } | ||
281 | EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer); | ||
282 | |||
283 | void gnttab_free_grant_reference(grant_ref_t ref) | ||
284 | { | ||
285 | put_free_entry(ref); | ||
286 | } | ||
287 | EXPORT_SYMBOL_GPL(gnttab_free_grant_reference); | ||
288 | |||
289 | void gnttab_free_grant_references(grant_ref_t head) | ||
290 | { | ||
291 | grant_ref_t ref; | ||
292 | unsigned long flags; | ||
293 | int count = 1; | ||
294 | if (head == GNTTAB_LIST_END) | ||
295 | return; | ||
296 | spin_lock_irqsave(&gnttab_list_lock, flags); | ||
297 | ref = head; | ||
298 | while (gnttab_entry(ref) != GNTTAB_LIST_END) { | ||
299 | ref = gnttab_entry(ref); | ||
300 | count++; | ||
301 | } | ||
302 | gnttab_entry(ref) = gnttab_free_head; | ||
303 | gnttab_free_head = head; | ||
304 | gnttab_free_count += count; | ||
305 | check_free_callbacks(); | ||
306 | spin_unlock_irqrestore(&gnttab_list_lock, flags); | ||
307 | } | ||
308 | EXPORT_SYMBOL_GPL(gnttab_free_grant_references); | ||
309 | |||
310 | int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) | ||
311 | { | ||
312 | int h = get_free_entries(count); | ||
313 | |||
314 | if (h < 0) | ||
315 | return -ENOSPC; | ||
316 | |||
317 | *head = h; | ||
318 | |||
319 | return 0; | ||
320 | } | ||
321 | EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references); | ||
322 | |||
323 | int gnttab_empty_grant_references(const grant_ref_t *private_head) | ||
324 | { | ||
325 | return (*private_head == GNTTAB_LIST_END); | ||
326 | } | ||
327 | EXPORT_SYMBOL_GPL(gnttab_empty_grant_references); | ||
328 | |||
329 | int gnttab_claim_grant_reference(grant_ref_t *private_head) | ||
330 | { | ||
331 | grant_ref_t g = *private_head; | ||
332 | if (unlikely(g == GNTTAB_LIST_END)) | ||
333 | return -ENOSPC; | ||
334 | *private_head = gnttab_entry(g); | ||
335 | return g; | ||
336 | } | ||
337 | EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference); | ||
338 | |||
339 | void gnttab_release_grant_reference(grant_ref_t *private_head, | ||
340 | grant_ref_t release) | ||
341 | { | ||
342 | gnttab_entry(release) = *private_head; | ||
343 | *private_head = release; | ||
344 | } | ||
345 | EXPORT_SYMBOL_GPL(gnttab_release_grant_reference); | ||
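
The alloc/claim/release/free calls above combine into a reservation pattern: take a batch of references up front so a hot path cannot fail, then claim them one at a time. A hedged sketch (hypothetical function and batch size):

static int demo_batch(void)
{
	grant_ref_t head;
	int ref;

	if (gnttab_alloc_grant_references(16, &head) < 0)
		return -ENOSPC;

	ref = gnttab_claim_grant_reference(&head); /* -ENOSPC once exhausted */
	if (ref >= 0)
		gnttab_release_grant_reference(&head, ref); /* or keep and use it */

	gnttab_free_grant_references(head); /* hand back whatever is left */
	return 0;
}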
346 | |||
347 | void gnttab_request_free_callback(struct gnttab_free_callback *callback, | ||
348 | void (*fn)(void *), void *arg, u16 count) | ||
349 | { | ||
350 | unsigned long flags; | ||
351 | spin_lock_irqsave(&gnttab_list_lock, flags); | ||
352 | if (callback->next) | ||
353 | goto out; | ||
354 | callback->fn = fn; | ||
355 | callback->arg = arg; | ||
356 | callback->count = count; | ||
357 | callback->next = gnttab_free_callback_list; | ||
358 | gnttab_free_callback_list = callback; | ||
359 | check_free_callbacks(); | ||
360 | out: | ||
361 | spin_unlock_irqrestore(&gnttab_list_lock, flags); | ||
362 | } | ||
363 | EXPORT_SYMBOL_GPL(gnttab_request_free_callback); | ||
364 | |||
365 | void gnttab_cancel_free_callback(struct gnttab_free_callback *callback) | ||
366 | { | ||
367 | struct gnttab_free_callback **pcb; | ||
368 | unsigned long flags; | ||
369 | |||
370 | spin_lock_irqsave(&gnttab_list_lock, flags); | ||
371 | for (pcb = &gnttab_free_callback_list; *pcb; pcb = &(*pcb)->next) { | ||
372 | if (*pcb == callback) { | ||
373 | *pcb = callback->next; | ||
374 | break; | ||
375 | } | ||
376 | } | ||
377 | spin_unlock_irqrestore(&gnttab_list_lock, flags); | ||
378 | } | ||
379 | EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback); | ||
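
A sketch of the callback protocol (hypothetical driver): after an allocation fails with -ENOSPC, the driver can ask to be notified once enough entries are free again instead of polling. Note that the callback runs with gnttab_list_lock held, so it must not sleep:

static void demo_refill(void *arg)
{
	/* restart the queue that was stopped on -ENOSPC; no sleeping here */
}

static struct gnttab_free_callback demo_cb;

static void demo_wait_for_grants(void)
{
	gnttab_request_free_callback(&demo_cb, demo_refill, NULL, 16);
}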
380 | |||
381 | static int grow_gnttab_list(unsigned int more_frames) | ||
382 | { | ||
383 | unsigned int new_nr_grant_frames, extra_entries, i; | ||
384 | |||
385 | new_nr_grant_frames = nr_grant_frames + more_frames; | ||
386 | extra_entries = more_frames * GREFS_PER_GRANT_FRAME; | ||
387 | |||
388 | for (i = nr_grant_frames; i < new_nr_grant_frames; i++) { | ||
389 | gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC); | ||
390 | if (!gnttab_list[i]) | ||
391 | goto grow_nomem; | ||
392 | } | ||
393 | |||
394 | |||
395 | for (i = GREFS_PER_GRANT_FRAME * nr_grant_frames; | ||
396 | i < GREFS_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++) | ||
397 | gnttab_entry(i) = i + 1; | ||
398 | |||
399 | gnttab_entry(i) = gnttab_free_head; | ||
400 | gnttab_free_head = GREFS_PER_GRANT_FRAME * nr_grant_frames; | ||
401 | gnttab_free_count += extra_entries; | ||
402 | |||
403 | nr_grant_frames = new_nr_grant_frames; | ||
404 | |||
405 | check_free_callbacks(); | ||
406 | |||
407 | return 0; | ||
408 | |||
409 | grow_nomem: | ||
410 | for ( ; i >= nr_grant_frames; i--) | ||
411 | free_page((unsigned long) gnttab_list[i]); | ||
412 | return -ENOMEM; | ||
413 | } | ||
414 | |||
415 | static unsigned int __max_nr_grant_frames(void) | ||
416 | { | ||
417 | struct gnttab_query_size query; | ||
418 | int rc; | ||
419 | |||
420 | query.dom = DOMID_SELF; | ||
421 | |||
422 | rc = HYPERVISOR_grant_table_op(GNTTABOP_query_size, &query, 1); | ||
423 | if ((rc < 0) || (query.status != GNTST_okay)) | ||
424 | return 4; /* Legacy max supported number of frames */ | ||
425 | |||
426 | return query.max_nr_frames; | ||
427 | } | ||
428 | |||
429 | static inline unsigned int max_nr_grant_frames(void) | ||
430 | { | ||
431 | unsigned int xen_max = __max_nr_grant_frames(); | ||
432 | |||
433 | if (xen_max > boot_max_nr_grant_frames) | ||
434 | return boot_max_nr_grant_frames; | ||
435 | return xen_max; | ||
436 | } | ||
437 | |||
438 | static int map_pte_fn(pte_t *pte, struct page *pmd_page, | ||
439 | unsigned long addr, void *data) | ||
440 | { | ||
441 | unsigned long **frames = (unsigned long **)data; | ||
442 | |||
443 | set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL)); | ||
444 | (*frames)++; | ||
445 | return 0; | ||
446 | } | ||
447 | |||
448 | static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, | ||
449 | unsigned long addr, void *data) | ||
450 | { | ||
451 | |||
452 | set_pte_at(&init_mm, addr, pte, __pte(0)); | ||
453 | return 0; | ||
454 | } | ||
455 | |||
456 | static int gnttab_map(unsigned int start_idx, unsigned int end_idx) | ||
457 | { | ||
458 | struct gnttab_setup_table setup; | ||
459 | unsigned long *frames; | ||
460 | unsigned int nr_gframes = end_idx + 1; | ||
461 | int rc; | ||
462 | |||
463 | frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC); | ||
464 | if (!frames) | ||
465 | return -ENOMEM; | ||
466 | |||
467 | setup.dom = DOMID_SELF; | ||
468 | setup.nr_frames = nr_gframes; | ||
469 | setup.frame_list = frames; | ||
470 | |||
471 | rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1); | ||
472 | if (rc == -ENOSYS) { | ||
473 | kfree(frames); | ||
474 | return -ENOSYS; | ||
475 | } | ||
476 | |||
477 | BUG_ON(rc || setup.status); | ||
478 | |||
479 | if (shared == NULL) { | ||
480 | struct vm_struct *area; | ||
481 | area = alloc_vm_area(PAGE_SIZE * max_nr_grant_frames()); | ||
482 | BUG_ON(area == NULL); | ||
483 | shared = area->addr; | ||
484 | } | ||
485 | rc = apply_to_page_range(&init_mm, (unsigned long)shared, | ||
486 | PAGE_SIZE * nr_gframes, | ||
487 | map_pte_fn, &frames); | ||
488 | BUG_ON(rc); | ||
489 | frames -= nr_gframes; /* adjust after map_pte_fn() */ | ||
490 | |||
491 | kfree(frames); | ||
492 | |||
493 | return 0; | ||
494 | } | ||
495 | |||
496 | static int gnttab_resume(void) | ||
497 | { | ||
498 | if (max_nr_grant_frames() < nr_grant_frames) | ||
499 | return -ENOSYS; | ||
500 | return gnttab_map(0, nr_grant_frames - 1); | ||
501 | } | ||
502 | |||
503 | static int gnttab_suspend(void) | ||
504 | { | ||
505 | apply_to_page_range(&init_mm, (unsigned long)shared, | ||
506 | PAGE_SIZE * nr_grant_frames, | ||
507 | unmap_pte_fn, NULL); | ||
508 | |||
509 | return 0; | ||
510 | } | ||
511 | |||
512 | static int gnttab_expand(unsigned int req_entries) | ||
513 | { | ||
514 | int rc; | ||
515 | unsigned int cur, extra; | ||
516 | |||
517 | cur = nr_grant_frames; | ||
518 | extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) / | ||
519 | GREFS_PER_GRANT_FRAME); | ||
520 | if (cur + extra > max_nr_grant_frames()) | ||
521 | return -ENOSPC; | ||
522 | |||
523 | rc = gnttab_map(cur, cur + extra - 1); | ||
524 | if (rc == 0) | ||
525 | rc = grow_gnttab_list(extra); | ||
526 | |||
527 | return rc; | ||
528 | } | ||
529 | |||
530 | static int __devinit gnttab_init(void) | ||
531 | { | ||
532 | int i; | ||
533 | unsigned int max_nr_glist_frames; | ||
534 | unsigned int nr_init_grefs; | ||
535 | |||
536 | if (!is_running_on_xen()) | ||
537 | return -ENODEV; | ||
538 | |||
539 | nr_grant_frames = 1; | ||
540 | boot_max_nr_grant_frames = __max_nr_grant_frames(); | ||
541 | |||
542 | /* Determine the maximum number of frames required for the | ||
543 | * grant reference free list on the current hypervisor. | ||
544 | */ | ||
545 | max_nr_glist_frames = (boot_max_nr_grant_frames * | ||
546 | GREFS_PER_GRANT_FRAME / | ||
547 | (PAGE_SIZE / sizeof(grant_ref_t))); | ||
548 | |||
549 | gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), | ||
550 | GFP_KERNEL); | ||
551 | if (gnttab_list == NULL) | ||
552 | return -ENOMEM; | ||
553 | |||
554 | for (i = 0; i < nr_grant_frames; i++) { | ||
555 | gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL); | ||
556 | if (gnttab_list[i] == NULL) | ||
557 | goto ini_nomem; | ||
558 | } | ||
559 | |||
560 | if (gnttab_resume() < 0) | ||
561 | return -ENODEV; | ||
562 | |||
563 | nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME; | ||
564 | |||
565 | for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) | ||
566 | gnttab_entry(i) = i + 1; | ||
567 | |||
568 | gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END; | ||
569 | gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES; | ||
570 | gnttab_free_head = NR_RESERVED_ENTRIES; | ||
571 | |||
572 | printk("Grant table initialized\n"); | ||
573 | return 0; | ||
574 | |||
575 | ini_nomem: | ||
576 | for (i--; i >= 0; i--) | ||
577 | free_page((unsigned long)gnttab_list[i]); | ||
578 | kfree(gnttab_list); | ||
579 | return -ENOMEM; | ||
580 | } | ||
581 | |||
582 | core_initcall(gnttab_init); | ||
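
For scale, assuming 4 KiB pages: struct grant_entry is 8 bytes (u16 flags, u16 domid, u32 frame), so GREFS_PER_GRANT_FRAME = 4096 / 8 = 512. The single frame mapped at boot therefore leaves 512 - NR_RESERVED_ENTRIES = 504 free references (the initial gnttab_free_count), and the 4-frame legacy fallback in __max_nr_grant_frames() caps the table at 2048 entries on hypervisors without GNTTABOP_query_size.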
diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile new file mode 100644 index 000000000000..5571f5b84223 --- /dev/null +++ b/drivers/xen/xenbus/Makefile | |||
@@ -0,0 +1,7 @@ | |||
1 | obj-y += xenbus.o | ||
2 | |||
3 | xenbus-objs = | ||
4 | xenbus-objs += xenbus_client.o | ||
5 | xenbus-objs += xenbus_comms.o | ||
6 | xenbus-objs += xenbus_xs.o | ||
7 | xenbus-objs += xenbus_probe.o | ||
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c new file mode 100644 index 000000000000..9fd2f70ab46d --- /dev/null +++ b/drivers/xen/xenbus/xenbus_client.c | |||
@@ -0,0 +1,569 @@ | |||
1 | /****************************************************************************** | ||
2 | * Client-facing interface for the Xenbus driver. In other words, the | ||
3 | * interface between the Xenbus and the device-specific code, be it the | ||
4 | * frontend or the backend of that driver. | ||
5 | * | ||
6 | * Copyright (C) 2005 XenSource Ltd | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License version 2 | ||
10 | * as published by the Free Software Foundation; or, when distributed | ||
11 | * separately from the Linux kernel or incorporated into other | ||
12 | * software packages, subject to the following license: | ||
13 | * | ||
14 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
15 | * of this source file (the "Software"), to deal in the Software without | ||
16 | * restriction, including without limitation the rights to use, copy, modify, | ||
17 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
18 | * and to permit persons to whom the Software is furnished to do so, subject to | ||
19 | * the following conditions: | ||
20 | * | ||
21 | * The above copyright notice and this permission notice shall be included in | ||
22 | * all copies or substantial portions of the Software. | ||
23 | * | ||
24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
26 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
27 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
28 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
29 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
30 | * IN THE SOFTWARE. | ||
31 | */ | ||
32 | |||
33 | #include <linux/types.h> | ||
34 | #include <linux/vmalloc.h> | ||
35 | #include <asm/xen/hypervisor.h> | ||
36 | #include <xen/interface/xen.h> | ||
37 | #include <xen/interface/event_channel.h> | ||
38 | #include <xen/events.h> | ||
39 | #include <xen/grant_table.h> | ||
40 | #include <xen/xenbus.h> | ||
41 | |||
42 | const char *xenbus_strstate(enum xenbus_state state) | ||
43 | { | ||
44 | static const char *const name[] = { | ||
45 | [ XenbusStateUnknown ] = "Unknown", | ||
46 | [ XenbusStateInitialising ] = "Initialising", | ||
47 | [ XenbusStateInitWait ] = "InitWait", | ||
48 | [ XenbusStateInitialised ] = "Initialised", | ||
49 | [ XenbusStateConnected ] = "Connected", | ||
50 | [ XenbusStateClosing ] = "Closing", | ||
51 | [ XenbusStateClosed ] = "Closed", | ||
52 | }; | ||
53 | return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID"; | ||
54 | } | ||
55 | EXPORT_SYMBOL_GPL(xenbus_strstate); | ||
56 | |||
57 | /** | ||
58 | * xenbus_watch_path - register a watch | ||
59 | * @dev: xenbus device | ||
60 | * @path: path to watch | ||
61 | * @watch: watch to register | ||
62 | * @callback: callback to register | ||
63 | * | ||
64 | * Register a @watch on the given path, using the given xenbus_watch structure | ||
65 | * for storage, and the given @callback function as the callback. Return 0 on | ||
66 | * success, or -errno on error. On success, the given @path will be saved as | ||
67 | * @watch->node, and remains the caller's to free. On error, @watch->node will | ||
68 | * be NULL, the device will switch to %XenbusStateClosing, and the error will | ||
69 | * be saved in the store. | ||
70 | */ | ||
71 | int xenbus_watch_path(struct xenbus_device *dev, const char *path, | ||
72 | struct xenbus_watch *watch, | ||
73 | void (*callback)(struct xenbus_watch *, | ||
74 | const char **, unsigned int)) | ||
75 | { | ||
76 | int err; | ||
77 | |||
78 | watch->node = path; | ||
79 | watch->callback = callback; | ||
80 | |||
81 | err = register_xenbus_watch(watch); | ||
82 | |||
83 | if (err) { | ||
84 | watch->node = NULL; | ||
85 | watch->callback = NULL; | ||
86 | xenbus_dev_fatal(dev, err, "adding watch on %s", path); | ||
87 | } | ||
88 | |||
89 | return err; | ||
90 | } | ||
91 | EXPORT_SYMBOL_GPL(xenbus_watch_path); | ||
92 | |||
93 | |||
94 | /** | ||
95 | * xenbus_watch_pathfmt - register a watch on a sprintf-formatted path | ||
96 | * @dev: xenbus device | ||
97 | * @watch: watch to register | ||
98 | * @callback: callback to register | ||
99 | * @pathfmt: format of path to watch | ||
100 | * | ||
101 | * Register a watch on the path built from @pathfmt, using the given | ||
102 | * xenbus_watch structure for storage, and the given @callback function as | ||
103 | * the callback.  Return 0 on success, or -errno on error.  On success, the | ||
104 | * formatted path will be saved as @watch->node, and becomes the caller's to | ||
105 | * kfree(). On error, watch->node will be NULL, so the caller has nothing to | ||
106 | * free, the device will switch to %XenbusStateClosing, and the error will be | ||
107 | * saved in the store. | ||
108 | */ | ||
109 | int xenbus_watch_pathfmt(struct xenbus_device *dev, | ||
110 | struct xenbus_watch *watch, | ||
111 | void (*callback)(struct xenbus_watch *, | ||
112 | const char **, unsigned int), | ||
113 | const char *pathfmt, ...) | ||
114 | { | ||
115 | int err; | ||
116 | va_list ap; | ||
117 | char *path; | ||
118 | |||
119 | va_start(ap, pathfmt); | ||
120 | path = kvasprintf(GFP_KERNEL, pathfmt, ap); | ||
121 | va_end(ap); | ||
122 | |||
123 | if (!path) { | ||
124 | xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch"); | ||
125 | return -ENOMEM; | ||
126 | } | ||
127 | err = xenbus_watch_path(dev, path, watch, callback); | ||
128 | |||
129 | if (err) | ||
130 | kfree(path); | ||
131 | return err; | ||
132 | } | ||
133 | EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt); | ||
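
A sketch of typical use (hypothetical driver): watching the peer's "state" node via a formatted path. The callback signature matches the one these helpers register:

static void demo_otherend_changed(struct xenbus_watch *watch,
				  const char **vec, unsigned int len)
{
	/* e.g. re-read the state: xenbus_read_driver_state(watch->node) */
}

static struct xenbus_watch demo_watch;

static int demo_register_watch(struct xenbus_device *dev)
{
	return xenbus_watch_pathfmt(dev, &demo_watch, demo_otherend_changed,
				    "%s/state", dev->otherend);
}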
134 | |||
135 | |||
136 | /** | ||
137 | * xenbus_switch_state | ||
138 | * @dev: xenbus device | ||
139 | * @state: new state to advertise | ||
140 | |||
141 | * Advertise in the store a change of the given driver to the given | ||
142 | * @state.  Return 0 on success, or -errno on error.  On error, the | ||
143 | * device will switch to XenbusStateClosing, and the error will be | ||
144 | * saved in the store. | ||
145 | */ | ||
146 | int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) | ||
147 | { | ||
148 | /* We check whether the state is currently set to the given value, and | ||
149 | if not, then the state is set. We don't want to unconditionally | ||
150 | write the given state, because we don't want to fire watches | ||
151 | unnecessarily. Furthermore, if the node has gone, we don't write | ||
152 | to it, as the device will be tearing down, and we don't want to | ||
153 | resurrect that directory. | ||
154 | |||
155 | Note that, because of this cached value of our state, this function | ||
156 | will not work inside a Xenstore transaction (something it was | ||
157 | trying to in the past) because dev->state would not get reset if | ||
158 | the transaction was aborted. | ||
159 | |||
160 | */ | ||
161 | |||
162 | int current_state; | ||
163 | int err; | ||
164 | |||
165 | if (state == dev->state) | ||
166 | return 0; | ||
167 | |||
168 | err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d", | ||
169 | &current_state); | ||
170 | if (err != 1) | ||
171 | return 0; | ||
172 | |||
173 | err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state); | ||
174 | if (err) { | ||
175 | if (state != XenbusStateClosing) /* Avoid looping */ | ||
176 | xenbus_dev_fatal(dev, err, "writing new state"); | ||
177 | return err; | ||
178 | } | ||
179 | |||
180 | dev->state = state; | ||
181 | |||
182 | return 0; | ||
183 | } | ||
184 | EXPORT_SYMBOL_GPL(xenbus_switch_state); | ||
185 | |||
186 | int xenbus_frontend_closed(struct xenbus_device *dev) | ||
187 | { | ||
188 | xenbus_switch_state(dev, XenbusStateClosed); | ||
189 | complete(&dev->down); | ||
190 | return 0; | ||
191 | } | ||
192 | EXPORT_SYMBOL_GPL(xenbus_frontend_closed); | ||
193 | |||
194 | /** | ||
195 | * Return the path to the error node for the given device, or NULL on failure. | ||
196 | * If the value returned is non-NULL, then it is the caller's to kfree. | ||
197 | */ | ||
198 | static char *error_path(struct xenbus_device *dev) | ||
199 | { | ||
200 | return kasprintf(GFP_KERNEL, "error/%s", dev->nodename); | ||
201 | } | ||
202 | |||
203 | |||
204 | static void xenbus_va_dev_error(struct xenbus_device *dev, int err, | ||
205 | const char *fmt, va_list ap) | ||
206 | { | ||
207 | int ret; | ||
208 | unsigned int len; | ||
209 | char *printf_buffer = NULL; | ||
210 | char *path_buffer = NULL; | ||
211 | |||
212 | #define PRINTF_BUFFER_SIZE 4096 | ||
213 | printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL); | ||
214 | if (printf_buffer == NULL) | ||
215 | goto fail; | ||
216 | |||
217 | len = sprintf(printf_buffer, "%i ", -err); | ||
218 | ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap); | ||
219 | |||
220 | BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1); | ||
221 | |||
222 | dev_err(&dev->dev, "%s\n", printf_buffer); | ||
223 | |||
224 | path_buffer = error_path(dev); | ||
225 | |||
226 | if (path_buffer == NULL) { | ||
227 | dev_err(&dev->dev, "failed to write error node for %s (%s)\n", | ||
228 | dev->nodename, printf_buffer); | ||
229 | goto fail; | ||
230 | } | ||
231 | |||
232 | if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) { | ||
233 | dev_err(&dev->dev, "failed to write error node for %s (%s)\n", | ||
234 | dev->nodename, printf_buffer); | ||
235 | goto fail; | ||
236 | } | ||
237 | |||
238 | fail: | ||
239 | kfree(printf_buffer); | ||
240 | kfree(path_buffer); | ||
241 | } | ||
242 | |||
243 | |||
244 | /** | ||
245 | * xenbus_dev_error | ||
246 | * @dev: xenbus device | ||
247 | * @err: error to report | ||
248 | * @fmt: error message format | ||
249 | * | ||
250 | * Report the given negative errno into the store, along with the given | ||
251 | * formatted message. | ||
252 | */ | ||
253 | void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...) | ||
254 | { | ||
255 | va_list ap; | ||
256 | |||
257 | va_start(ap, fmt); | ||
258 | xenbus_va_dev_error(dev, err, fmt, ap); | ||
259 | va_end(ap); | ||
260 | } | ||
261 | EXPORT_SYMBOL_GPL(xenbus_dev_error); | ||
262 | |||
263 | /** | ||
264 | * xenbus_dev_fatal | ||
265 | * @dev: xenbus device | ||
266 | * @err: error to report | ||
267 | * @fmt: error message format | ||
268 | * | ||
269 | * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by | ||
270 | * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly | ||
271 | * closedown of this driver and its peer. | ||
272 | */ | ||
273 | |||
274 | void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...) | ||
275 | { | ||
276 | va_list ap; | ||
277 | |||
278 | va_start(ap, fmt); | ||
279 | xenbus_va_dev_error(dev, err, fmt, ap); | ||
280 | va_end(ap); | ||
281 | |||
282 | xenbus_switch_state(dev, XenbusStateClosing); | ||
283 | } | ||
284 | EXPORT_SYMBOL_GPL(xenbus_dev_fatal); | ||
285 | |||
286 | /** | ||
287 | * xenbus_grant_ring | ||
288 | * @dev: xenbus device | ||
289 | * @ring_mfn: mfn of ring to grant | ||
290 | * | ||
291 | * Grant access to the given @ring_mfn to the peer of the given device. Return | ||
292 | * 0 on success, or -errno on error. On error, the device will switch to | ||
293 | * XenbusStateClosing, and the error will be saved in the store. | ||
294 | */ | ||
295 | int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn) | ||
296 | { | ||
297 | int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0); | ||
298 | if (err < 0) | ||
299 | xenbus_dev_fatal(dev, err, "granting access to ring page"); | ||
300 | return err; | ||
301 | } | ||
302 | EXPORT_SYMBOL_GPL(xenbus_grant_ring); | ||
303 | |||
304 | |||
305 | /** | ||
306 | * Allocate an event channel for the given xenbus_device, assigning the newly | ||
307 | * created local port to *port. Return 0 on success, or -errno on error. On | ||
308 | * error, the device will switch to XenbusStateClosing, and the error will be | ||
309 | * saved in the store. | ||
310 | */ | ||
311 | int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port) | ||
312 | { | ||
313 | struct evtchn_alloc_unbound alloc_unbound; | ||
314 | int err; | ||
315 | |||
316 | alloc_unbound.dom = DOMID_SELF; | ||
317 | alloc_unbound.remote_dom = dev->otherend_id; | ||
318 | |||
319 | err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, | ||
320 | &alloc_unbound); | ||
321 | if (err) | ||
322 | xenbus_dev_fatal(dev, err, "allocating event channel"); | ||
323 | else | ||
324 | *port = alloc_unbound.port; | ||
325 | |||
326 | return err; | ||
327 | } | ||
328 | EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn); | ||
329 | |||
330 | |||
331 | /** | ||
332 | * Bind to an existing interdomain event channel in another domain. Returns 0 | ||
333 | * on success and stores the local port in *port. On error, returns -errno, | ||
334 | * switches the device to XenbusStateClosing, and saves the error in XenStore. | ||
335 | */ | ||
336 | int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port) | ||
337 | { | ||
338 | struct evtchn_bind_interdomain bind_interdomain; | ||
339 | int err; | ||
340 | |||
341 | bind_interdomain.remote_dom = dev->otherend_id; | ||
342 | bind_interdomain.remote_port = remote_port; | ||
343 | |||
344 | err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, | ||
345 | &bind_interdomain); | ||
346 | if (err) | ||
347 | xenbus_dev_fatal(dev, err, | ||
348 | "binding to event channel %d from domain %d", | ||
349 | remote_port, dev->otherend_id); | ||
350 | else | ||
351 | *port = bind_interdomain.local_port; | ||
352 | |||
353 | return err; | ||
354 | } | ||
355 | EXPORT_SYMBOL_GPL(xenbus_bind_evtchn); | ||
356 | |||
357 | |||
358 | /** | ||
359 | * Free an existing event channel. Returns 0 on success or -errno on error. | ||
360 | */ | ||
361 | int xenbus_free_evtchn(struct xenbus_device *dev, int port) | ||
362 | { | ||
363 | struct evtchn_close close; | ||
364 | int err; | ||
365 | |||
366 | close.port = port; | ||
367 | |||
368 | err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close); | ||
369 | if (err) | ||
370 | xenbus_dev_error(dev, err, "freeing event channel %d", port); | ||
371 | |||
372 | return err; | ||
373 | } | ||
374 | EXPORT_SYMBOL_GPL(xenbus_free_evtchn); | ||
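
Taken together, the helpers above cover the usual frontend connect sequence: share the ring page, then allocate an unbound event channel for the backend to bind. A hedged sketch (hypothetical function; the xenstore writes are elided):

static int demo_setup_ring(struct xenbus_device *dev, unsigned long ring_mfn,
			   int *gref, int *evtchn)
{
	int err;

	*gref = xenbus_grant_ring(dev, ring_mfn); /* returns the grant ref */
	if (*gref < 0)
		return *gref;	/* already reported via xenbus_dev_fatal() */

	err = xenbus_alloc_evtchn(dev, evtchn);
	if (err)
		return err;

	/* next: write *gref and *evtchn under dev->nodename so the
	 * backend can map the ring and bind the channel */
	return 0;
}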
375 | |||
376 | |||
377 | /** | ||
378 | * xenbus_map_ring_valloc | ||
379 | * @dev: xenbus device | ||
380 | * @gnt_ref: grant reference | ||
381 | * @vaddr: pointer to address to be filled out by mapping | ||
382 | * | ||
383 | * Based on Rusty Russell's skeleton driver's map_page. | ||
384 | * Map a page of memory into this domain from another domain's grant table. | ||
385 | * xenbus_map_ring_valloc allocates a page of virtual address space, maps the | ||
386 | * page to that address, and sets *vaddr to that address. | ||
387 | * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h) | ||
388 | * or -ENOMEM on error. If an error is returned, device will switch to | ||
389 | * XenbusStateClosing and the error message will be saved in XenStore. | ||
390 | */ | ||
391 | int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr) | ||
392 | { | ||
393 | struct gnttab_map_grant_ref op = { | ||
394 | .flags = GNTMAP_host_map, | ||
395 | .ref = gnt_ref, | ||
396 | .dom = dev->otherend_id, | ||
397 | }; | ||
398 | struct vm_struct *area; | ||
399 | |||
400 | *vaddr = NULL; | ||
401 | |||
402 | area = alloc_vm_area(PAGE_SIZE); | ||
403 | if (!area) | ||
404 | return -ENOMEM; | ||
405 | |||
406 | op.host_addr = (unsigned long)area->addr; | ||
407 | |||
408 | if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) | ||
409 | BUG(); | ||
410 | |||
411 | if (op.status != GNTST_okay) { | ||
412 | free_vm_area(area); | ||
413 | xenbus_dev_fatal(dev, op.status, | ||
414 | "mapping in shared page %d from domain %d", | ||
415 | gnt_ref, dev->otherend_id); | ||
416 | return op.status; | ||
417 | } | ||
418 | |||
419 | /* Stuff the handle in an unused field */ | ||
420 | area->phys_addr = (unsigned long)op.handle; | ||
421 | |||
422 | *vaddr = area->addr; | ||
423 | return 0; | ||
424 | } | ||
425 | EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc); | ||
426 | |||
427 | |||
428 | /** | ||
429 | * xenbus_map_ring | ||
430 | * @dev: xenbus device | ||
431 | * @gnt_ref: grant reference | ||
432 | * @handle: pointer to grant handle to be filled | ||
433 | * @vaddr: address to be mapped to | ||
434 | * | ||
435 | * Map a page of memory into this domain from another domain's grant table. | ||
436 | * xenbus_map_ring does not allocate the virtual address space (you must do | ||
437 | * this yourself!). It only maps in the page to the specified address. | ||
438 | * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h) | ||
439 | * or -ENOMEM on error. If an error is returned, device will switch to | ||
440 | * XenbusStateClosing and the error message will be saved in XenStore. | ||
441 | */ | ||
442 | int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref, | ||
443 | grant_handle_t *handle, void *vaddr) | ||
444 | { | ||
445 | struct gnttab_map_grant_ref op = { | ||
446 | .host_addr = (unsigned long)vaddr, | ||
447 | .flags = GNTMAP_host_map, | ||
448 | .ref = gnt_ref, | ||
449 | .dom = dev->otherend_id, | ||
450 | }; | ||
451 | |||
452 | if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) | ||
453 | BUG(); | ||
454 | |||
455 | if (op.status != GNTST_okay) { | ||
456 | xenbus_dev_fatal(dev, op.status, | ||
457 | "mapping in shared page %d from domain %d", | ||
458 | gnt_ref, dev->otherend_id); | ||
459 | } else | ||
460 | *handle = op.handle; | ||
461 | |||
462 | return op.status; | ||
463 | } | ||
464 | EXPORT_SYMBOL_GPL(xenbus_map_ring); | ||
465 | |||
466 | |||
467 | /** | ||
468 | * xenbus_unmap_ring_vfree | ||
469 | * @dev: xenbus device | ||
470 | * @vaddr: addr to unmap | ||
471 | * | ||
472 | * Based on Rusty Russell's skeleton driver's unmap_page. | ||
473 | * Unmap a page of memory in this domain that was imported from another domain. | ||
474 | * Use xenbus_unmap_ring_vfree if you mapped in your memory with | ||
475 | * xenbus_map_ring_valloc (it will free the virtual address space). | ||
476 | * Returns 0 on success and returns GNTST_* on error | ||
477 | * (see xen/include/interface/grant_table.h). | ||
478 | */ | ||
479 | int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr) | ||
480 | { | ||
481 | struct vm_struct *area; | ||
482 | struct gnttab_unmap_grant_ref op = { | ||
483 | .host_addr = (unsigned long)vaddr, | ||
484 | }; | ||
485 | |||
486 | /* It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr) | ||
487 | * method so that we don't have to muck with vmalloc internals here. | ||
488 | * We could force the user to hang on to their struct vm_struct from | ||
489 | * xenbus_map_ring_valloc, but these 6 lines considerably simplify | ||
490 | * this API. | ||
491 | */ | ||
492 | read_lock(&vmlist_lock); | ||
493 | for (area = vmlist; area != NULL; area = area->next) { | ||
494 | if (area->addr == vaddr) | ||
495 | break; | ||
496 | } | ||
497 | read_unlock(&vmlist_lock); | ||
498 | |||
499 | if (!area) { | ||
500 | xenbus_dev_error(dev, -ENOENT, | ||
501 | "can't find mapped virtual address %p", vaddr); | ||
502 | return GNTST_bad_virt_addr; | ||
503 | } | ||
504 | |||
505 | op.handle = (grant_handle_t)area->phys_addr; | ||
506 | |||
507 | if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) | ||
508 | BUG(); | ||
509 | |||
510 | if (op.status == GNTST_okay) | ||
511 | free_vm_area(area); | ||
512 | else | ||
513 | xenbus_dev_error(dev, op.status, | ||
514 | "unmapping page at handle %d error %d", | ||
515 | (int16_t)area->phys_addr, op.status); | ||
516 | |||
517 | return op.status; | ||
518 | } | ||
519 | EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); | ||
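
A sketch of the valloc/vfree pairing from the mapping side (hypothetical function): map the peer's ring by grant reference, run the protocol, then unmap and free the virtual area in one call:

static int demo_map_peer_ring(struct xenbus_device *dev, int gnt_ref)
{
	void *ring;
	int err = xenbus_map_ring_valloc(dev, gnt_ref, &ring);

	if (err)
		return err;	/* GNTST_* status or -ENOMEM */

	/* ... run the ring protocol on 'ring' ... */

	return xenbus_unmap_ring_vfree(dev, ring);
}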
520 | |||
521 | |||
522 | /** | ||
523 | * xenbus_unmap_ring | ||
524 | * @dev: xenbus device | ||
525 | * @handle: grant handle | ||
526 | * @vaddr: addr to unmap | ||
527 | * | ||
528 | * Unmap a page of memory in this domain that was imported from another domain. | ||
529 | * Returns 0 on success and returns GNTST_* on error | ||
530 | * (see xen/include/interface/grant_table.h). | ||
531 | */ | ||
532 | int xenbus_unmap_ring(struct xenbus_device *dev, | ||
533 | grant_handle_t handle, void *vaddr) | ||
534 | { | ||
535 | struct gnttab_unmap_grant_ref op = { | ||
536 | .host_addr = (unsigned long)vaddr, | ||
537 | .handle = handle, | ||
538 | }; | ||
539 | |||
540 | if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) | ||
541 | BUG(); | ||
542 | |||
543 | if (op.status != GNTST_okay) | ||
544 | xenbus_dev_error(dev, op.status, | ||
545 | "unmapping page at handle %d error %d", | ||
546 | handle, op.status); | ||
547 | |||
548 | return op.status; | ||
549 | } | ||
550 | EXPORT_SYMBOL_GPL(xenbus_unmap_ring); | ||
551 | |||
552 | |||
553 | /** | ||
554 | * xenbus_read_driver_state | ||
555 | * @path: path for driver | ||
556 | * | ||
557 | * Return the state of the driver rooted at the given store path, or | ||
558 | * XenbusStateUnknown if no state can be read. | ||
559 | */ | ||
560 | enum xenbus_state xenbus_read_driver_state(const char *path) | ||
561 | { | ||
562 | enum xenbus_state result; | ||
563 | int err = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL); | ||
564 | if (err) | ||
565 | result = XenbusStateUnknown; | ||
566 | |||
567 | return result; | ||
568 | } | ||
569 | EXPORT_SYMBOL_GPL(xenbus_read_driver_state); | ||
diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c new file mode 100644 index 000000000000..6efbe3f29ca5 --- /dev/null +++ b/drivers/xen/xenbus/xenbus_comms.c | |||
@@ -0,0 +1,233 @@ | |||
1 | /****************************************************************************** | ||
2 | * xenbus_comms.c | ||
3 | * | ||
4 | * Low level code that talks to the Xen Store: ring buffer and event channel. | ||
5 | * | ||
6 | * Copyright (C) 2005 Rusty Russell, IBM Corporation | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License version 2 | ||
10 | * as published by the Free Software Foundation; or, when distributed | ||
11 | * separately from the Linux kernel or incorporated into other | ||
12 | * software packages, subject to the following license: | ||
13 | * | ||
14 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
15 | * of this source file (the "Software"), to deal in the Software without | ||
16 | * restriction, including without limitation the rights to use, copy, modify, | ||
17 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
18 | * and to permit persons to whom the Software is furnished to do so, subject to | ||
19 | * the following conditions: | ||
20 | * | ||
21 | * The above copyright notice and this permission notice shall be included in | ||
22 | * all copies or substantial portions of the Software. | ||
23 | * | ||
24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
26 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
27 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
28 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
29 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
30 | * IN THE SOFTWARE. | ||
31 | */ | ||
32 | |||
33 | #include <linux/wait.h> | ||
34 | #include <linux/interrupt.h> | ||
35 | #include <linux/sched.h> | ||
36 | #include <linux/err.h> | ||
37 | #include <xen/xenbus.h> | ||
38 | #include <asm/xen/hypervisor.h> | ||
39 | #include <xen/events.h> | ||
40 | #include <xen/page.h> | ||
41 | #include "xenbus_comms.h" | ||
42 | |||
43 | static int xenbus_irq; | ||
44 | |||
45 | static DECLARE_WORK(probe_work, xenbus_probe); | ||
46 | |||
47 | static DECLARE_WAIT_QUEUE_HEAD(xb_waitq); | ||
48 | |||
49 | static irqreturn_t wake_waiting(int irq, void *unused) | ||
50 | { | ||
51 | if (unlikely(xenstored_ready == 0)) { | ||
52 | xenstored_ready = 1; | ||
53 | schedule_work(&probe_work); | ||
54 | } | ||
55 | |||
56 | wake_up(&xb_waitq); | ||
57 | return IRQ_HANDLED; | ||
58 | } | ||
59 | |||
60 | static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod) | ||
61 | { | ||
62 | return ((prod - cons) <= XENSTORE_RING_SIZE); | ||
63 | } | ||
64 | |||
65 | static void *get_output_chunk(XENSTORE_RING_IDX cons, | ||
66 | XENSTORE_RING_IDX prod, | ||
67 | char *buf, uint32_t *len) | ||
68 | { | ||
69 | *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod); | ||
70 | if ((XENSTORE_RING_SIZE - (prod - cons)) < *len) | ||
71 | *len = XENSTORE_RING_SIZE - (prod - cons); | ||
72 | return buf + MASK_XENSTORE_IDX(prod); | ||
73 | } | ||
74 | |||
75 | static const void *get_input_chunk(XENSTORE_RING_IDX cons, | ||
76 | XENSTORE_RING_IDX prod, | ||
77 | const char *buf, uint32_t *len) | ||
78 | { | ||
79 | *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons); | ||
80 | if ((prod - cons) < *len) | ||
81 | *len = prod - cons; | ||
82 | return buf + MASK_XENSTORE_IDX(cons); | ||
83 | } | ||
84 | |||
85 | /** | ||
86 | * xb_write - low level write | ||
87 | * @data: buffer to send | ||
88 | * @len: length of buffer | ||
89 | * | ||
90 | * Returns 0 on success, error otherwise. | ||
91 | */ | ||
92 | int xb_write(const void *data, unsigned len) | ||
93 | { | ||
94 | struct xenstore_domain_interface *intf = xen_store_interface; | ||
95 | XENSTORE_RING_IDX cons, prod; | ||
96 | int rc; | ||
97 | |||
98 | while (len != 0) { | ||
99 | void *dst; | ||
100 | unsigned int avail; | ||
101 | |||
102 | rc = wait_event_interruptible( | ||
103 | xb_waitq, | ||
104 | (intf->req_prod - intf->req_cons) != | ||
105 | XENSTORE_RING_SIZE); | ||
106 | if (rc < 0) | ||
107 | return rc; | ||
108 | |||
109 | /* Read indexes, then verify. */ | ||
110 | cons = intf->req_cons; | ||
111 | prod = intf->req_prod; | ||
112 | if (!check_indexes(cons, prod)) { | ||
113 | intf->req_cons = intf->req_prod = 0; | ||
114 | return -EIO; | ||
115 | } | ||
116 | |||
117 | dst = get_output_chunk(cons, prod, intf->req, &avail); | ||
118 | if (avail == 0) | ||
119 | continue; | ||
120 | if (avail > len) | ||
121 | avail = len; | ||
122 | |||
123 | /* Must write data /after/ reading the consumer index. */ | ||
124 | mb(); | ||
125 | |||
126 | memcpy(dst, data, avail); | ||
127 | data += avail; | ||
128 | len -= avail; | ||
129 | |||
130 | /* Other side must not see new producer until data is there. */ | ||
131 | wmb(); | ||
132 | intf->req_prod += avail; | ||
133 | |||
134 | /* Implies mb(): other side will see the updated producer. */ | ||
135 | notify_remote_via_evtchn(xen_store_evtchn); | ||
136 | } | ||
137 | |||
138 | return 0; | ||
139 | } | ||
140 | |||
141 | int xb_data_to_read(void) | ||
142 | { | ||
143 | struct xenstore_domain_interface *intf = xen_store_interface; | ||
144 | return (intf->rsp_cons != intf->rsp_prod); | ||
145 | } | ||
146 | |||
147 | int xb_wait_for_data_to_read(void) | ||
148 | { | ||
149 | return wait_event_interruptible(xb_waitq, xb_data_to_read()); | ||
150 | } | ||
151 | |||
152 | int xb_read(void *data, unsigned len) | ||
153 | { | ||
154 | struct xenstore_domain_interface *intf = xen_store_interface; | ||
155 | XENSTORE_RING_IDX cons, prod; | ||
156 | int rc; | ||
157 | |||
158 | while (len != 0) { | ||
159 | unsigned int avail; | ||
160 | const char *src; | ||
161 | |||
162 | rc = xb_wait_for_data_to_read(); | ||
163 | if (rc < 0) | ||
164 | return rc; | ||
165 | |||
166 | /* Read indexes, then verify. */ | ||
167 | cons = intf->rsp_cons; | ||
168 | prod = intf->rsp_prod; | ||
169 | if (!check_indexes(cons, prod)) { | ||
170 | intf->rsp_cons = intf->rsp_prod = 0; | ||
171 | return -EIO; | ||
172 | } | ||
173 | |||
174 | src = get_input_chunk(cons, prod, intf->rsp, &avail); | ||
175 | if (avail == 0) | ||
176 | continue; | ||
177 | if (avail > len) | ||
178 | avail = len; | ||
179 | |||
180 | /* Must read data /after/ reading the producer index. */ | ||
181 | rmb(); | ||
182 | |||
183 | memcpy(data, src, avail); | ||
184 | data += avail; | ||
185 | len -= avail; | ||
186 | |||
187 | /* Other side must not see free space until we've copied out */ | ||
188 | mb(); | ||
189 | intf->rsp_cons += avail; | ||
190 | |||
191 | pr_debug("Finished read of %i bytes (%i to go)\n", avail, len); | ||
192 | |||
193 | /* Implies mb(): other side will see the updated consumer. */ | ||
194 | notify_remote_via_evtchn(xen_store_evtchn); | ||
195 | } | ||
196 | |||
197 | return 0; | ||
198 | } | ||
199 | |||
200 | /** | ||
201 | * xb_init_comms - Set up interrupt handler off store event channel. | ||
202 | */ | ||
203 | int xb_init_comms(void) | ||
204 | { | ||
205 | struct xenstore_domain_interface *intf = xen_store_interface; | ||
206 | int err; | ||
207 | |||
208 | if (intf->req_prod != intf->req_cons) | ||
209 | printk(KERN_ERR "XENBUS request ring is not quiescent " | ||
210 | "(%08x:%08x)!\n", intf->req_cons, intf->req_prod); | ||
211 | |||
212 | if (intf->rsp_prod != intf->rsp_cons) { | ||
213 | printk(KERN_WARNING "XENBUS response ring is not quiescent " | ||
214 | "(%08x:%08x): fixing up\n", | ||
215 | intf->rsp_cons, intf->rsp_prod); | ||
216 | intf->rsp_cons = intf->rsp_prod; | ||
217 | } | ||
218 | |||
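| /* Unbind any handler left over from before a suspend/resume, then | ||
| * bind the (possibly changed) store event channel afresh. */ | ||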
219 | if (xenbus_irq) | ||
220 | unbind_from_irqhandler(xenbus_irq, &xb_waitq); | ||
221 | |||
222 | err = bind_evtchn_to_irqhandler( | ||
223 | xen_store_evtchn, wake_waiting, | ||
224 | 0, "xenbus", &xb_waitq); | ||
225 | if (err <= 0) { | ||
226 | printk(KERN_ERR "XENBUS request irq failed %i\n", err); | ||
227 | return err; | ||
228 | } | ||
229 | |||
230 | xenbus_irq = err; | ||
231 | |||
232 | return 0; | ||
233 | } | ||
diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h new file mode 100644 index 000000000000..c21db7513736 --- /dev/null +++ b/drivers/xen/xenbus/xenbus_comms.h | |||
@@ -0,0 +1,46 @@ | |||
1 | /* | ||
2 | * Private include for xenbus communications. | ||
3 | * | ||
4 | * Copyright (C) 2005 Rusty Russell, IBM Corporation | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License version 2 | ||
8 | * as published by the Free Software Foundation; or, when distributed | ||
9 | * separately from the Linux kernel or incorporated into other | ||
10 | * software packages, subject to the following license: | ||
11 | * | ||
12 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
13 | * of this source file (the "Software"), to deal in the Software without | ||
14 | * restriction, including without limitation the rights to use, copy, modify, | ||
15 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
16 | * and to permit persons to whom the Software is furnished to do so, subject to | ||
17 | * the following conditions: | ||
18 | * | ||
19 | * The above copyright notice and this permission notice shall be included in | ||
20 | * all copies or substantial portions of the Software. | ||
21 | * | ||
22 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
23 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
24 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
25 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
26 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
27 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
28 | * IN THE SOFTWARE. | ||
29 | */ | ||
30 | |||
31 | #ifndef _XENBUS_COMMS_H | ||
32 | #define _XENBUS_COMMS_H | ||
33 | |||
34 | int xs_init(void); | ||
35 | int xb_init_comms(void); | ||
36 | |||
37 | /* Low level routines. */ | ||
38 | int xb_write(const void *data, unsigned len); | ||
39 | int xb_read(void *data, unsigned len); | ||
40 | int xb_data_to_read(void); | ||
41 | int xb_wait_for_data_to_read(void); | ||
42 | int xs_input_avail(void); | ||
43 | extern struct xenstore_domain_interface *xen_store_interface; | ||
44 | extern int xen_store_evtchn; | ||
45 | |||
46 | #endif /* _XENBUS_COMMS_H */ | ||
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c new file mode 100644 index 000000000000..0b769f7c4a48 --- /dev/null +++ b/drivers/xen/xenbus/xenbus_probe.c | |||
@@ -0,0 +1,935 @@ | |||
1 | /****************************************************************************** | ||
2 | * Talks to Xen Store to figure out what devices we have. | ||
3 | * | ||
4 | * Copyright (C) 2005 Rusty Russell, IBM Corporation | ||
5 | * Copyright (C) 2005 Mike Wray, Hewlett-Packard | ||
6 | * Copyright (C) 2005, 2006 XenSource Ltd | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License version 2 | ||
10 | * as published by the Free Software Foundation; or, when distributed | ||
11 | * separately from the Linux kernel or incorporated into other | ||
12 | * software packages, subject to the following license: | ||
13 | * | ||
14 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
15 | * of this source file (the "Software"), to deal in the Software without | ||
16 | * restriction, including without limitation the rights to use, copy, modify, | ||
17 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
18 | * and to permit persons to whom the Software is furnished to do so, subject to | ||
19 | * the following conditions: | ||
20 | * | ||
21 | * The above copyright notice and this permission notice shall be included in | ||
22 | * all copies or substantial portions of the Software. | ||
23 | * | ||
24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
26 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
27 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
28 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
29 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
30 | * IN THE SOFTWARE. | ||
31 | */ | ||
32 | |||
33 | #define DPRINTK(fmt, args...) \ | ||
34 | pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ | ||
35 | __func__, __LINE__, ##args) | ||
36 | |||
37 | #include <linux/kernel.h> | ||
38 | #include <linux/err.h> | ||
39 | #include <linux/string.h> | ||
40 | #include <linux/ctype.h> | ||
41 | #include <linux/fcntl.h> | ||
42 | #include <linux/mm.h> | ||
43 | #include <linux/notifier.h> | ||
44 | #include <linux/kthread.h> | ||
45 | #include <linux/mutex.h> | ||
46 | #include <linux/io.h> | ||
47 | |||
48 | #include <asm/page.h> | ||
49 | #include <asm/pgtable.h> | ||
50 | #include <asm/xen/hypervisor.h> | ||
51 | #include <xen/xenbus.h> | ||
52 | #include <xen/events.h> | ||
53 | #include <xen/page.h> | ||
54 | |||
55 | #include "xenbus_comms.h" | ||
56 | #include "xenbus_probe.h" | ||
57 | |||
58 | int xen_store_evtchn; | ||
59 | struct xenstore_domain_interface *xen_store_interface; | ||
60 | static unsigned long xen_store_mfn; | ||
61 | |||
62 | static BLOCKING_NOTIFIER_HEAD(xenstore_chain); | ||
63 | |||
64 | static void wait_for_devices(struct xenbus_driver *xendrv); | ||
65 | |||
66 | static int xenbus_probe_frontend(const char *type, const char *name); | ||
67 | |||
68 | static void xenbus_dev_shutdown(struct device *_dev); | ||
69 | |||
70 | /* If something in array of ids matches this device, return it. */ | ||
71 | static const struct xenbus_device_id * | ||
72 | match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) | ||
73 | { | ||
74 | for (; *arr->devicetype != '\0'; arr++) { | ||
75 | if (!strcmp(arr->devicetype, dev->devicetype)) | ||
76 | return arr; | ||
77 | } | ||
78 | return NULL; | ||
79 | } | ||
80 | |||
81 | int xenbus_match(struct device *_dev, struct device_driver *_drv) | ||
82 | { | ||
83 | struct xenbus_driver *drv = to_xenbus_driver(_drv); | ||
84 | |||
85 | if (!drv->ids) | ||
86 | return 0; | ||
87 | |||
88 | return match_device(drv->ids, to_xenbus_device(_dev)) != NULL; | ||
89 | } | ||
90 | |||
91 | /* device/<type>/<id> => <type>-<id> */ | ||
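| /* e.g. nodename "device/vif/0" becomes bus_id "vif-0". */ | ||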
92 | static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) | ||
93 | { | ||
94 | nodename = strchr(nodename, '/'); | ||
95 | if (!nodename || strlen(nodename + 1) >= BUS_ID_SIZE) { | ||
96 | printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename); | ||
97 | return -EINVAL; | ||
98 | } | ||
99 | |||
100 | strlcpy(bus_id, nodename + 1, BUS_ID_SIZE); | ||
101 | if (!strchr(bus_id, '/')) { | ||
102 | printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id); | ||
103 | return -EINVAL; | ||
104 | } | ||
105 | *strchr(bus_id, '/') = '-'; | ||
106 | return 0; | ||
107 | } | ||
108 | |||
109 | |||
110 | static void free_otherend_details(struct xenbus_device *dev) | ||
111 | { | ||
112 | kfree(dev->otherend); | ||
113 | dev->otherend = NULL; | ||
114 | } | ||
115 | |||
116 | |||
117 | static void free_otherend_watch(struct xenbus_device *dev) | ||
118 | { | ||
119 | if (dev->otherend_watch.node) { | ||
120 | unregister_xenbus_watch(&dev->otherend_watch); | ||
121 | kfree(dev->otherend_watch.node); | ||
122 | dev->otherend_watch.node = NULL; | ||
123 | } | ||
124 | } | ||
125 | |||
126 | |||
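| /* Read the other end's domid and store path; for a frontend these | ||
| * are the "backend-id" and "backend" nodes under its directory. */ | ||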
127 | int read_otherend_details(struct xenbus_device *xendev, | ||
128 | char *id_node, char *path_node) | ||
129 | { | ||
130 | int err = xenbus_gather(XBT_NIL, xendev->nodename, | ||
131 | id_node, "%i", &xendev->otherend_id, | ||
132 | path_node, NULL, &xendev->otherend, | ||
133 | NULL); | ||
134 | if (err) { | ||
135 | xenbus_dev_fatal(xendev, err, | ||
136 | "reading other end details from %s", | ||
137 | xendev->nodename); | ||
138 | return err; | ||
139 | } | ||
140 | if (strlen(xendev->otherend) == 0 || | ||
141 | !xenbus_exists(XBT_NIL, xendev->otherend, "")) { | ||
142 | xenbus_dev_fatal(xendev, -ENOENT, | ||
143 | "unable to read other end from %s. " | ||
144 | "missing or inaccessible.", | ||
145 | xendev->nodename); | ||
146 | free_otherend_details(xendev); | ||
147 | return -ENOENT; | ||
148 | } | ||
149 | |||
150 | return 0; | ||
151 | } | ||
152 | |||
153 | |||
154 | static int read_backend_details(struct xenbus_device *xendev) | ||
155 | { | ||
156 | return read_otherend_details(xendev, "backend-id", "backend"); | ||
157 | } | ||
158 | |||
159 | |||
160 | /* Bus type for frontend drivers. */ | ||
161 | static struct xen_bus_type xenbus_frontend = { | ||
162 | .root = "device", | ||
163 | .levels = 2, /* device/type/<id> */ | ||
164 | .get_bus_id = frontend_bus_id, | ||
165 | .probe = xenbus_probe_frontend, | ||
166 | .bus = { | ||
167 | .name = "xen", | ||
168 | .match = xenbus_match, | ||
169 | .probe = xenbus_dev_probe, | ||
170 | .remove = xenbus_dev_remove, | ||
171 | .shutdown = xenbus_dev_shutdown, | ||
172 | }, | ||
173 | }; | ||
174 | |||
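| /* Watch callback: fires whenever the other end's "state" node | ||
| * (registered by watch_otherend() below) changes. */ | ||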
175 | static void otherend_changed(struct xenbus_watch *watch, | ||
176 | const char **vec, unsigned int len) | ||
177 | { | ||
178 | struct xenbus_device *dev = | ||
179 | container_of(watch, struct xenbus_device, otherend_watch); | ||
180 | struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver); | ||
181 | enum xenbus_state state; | ||
182 | |||
183 | /* Protect us against watches firing on old details when the otherend | ||
184 | details change, say immediately after a resume. */ | ||
185 | if (!dev->otherend || | ||
186 | strncmp(dev->otherend, vec[XS_WATCH_PATH], | ||
187 | strlen(dev->otherend))) { | ||
188 | dev_dbg(&dev->dev, "Ignoring watch at %s\n", vec[XS_WATCH_PATH]); | ||
189 | return; | ||
190 | } | ||
191 | |||
192 | state = xenbus_read_driver_state(dev->otherend); | ||
193 | |||
194 | dev_dbg(&dev->dev, "state is %d (%s), %s, %s\n", | ||
195 | state, xenbus_strstate(state), dev->otherend_watch.node, | ||
196 | vec[XS_WATCH_PATH]); | ||
197 | |||
198 | /* | ||
199 | * Ignore xenbus transitions during shutdown. This prevents us doing | ||
200 | * work that can fail e.g., when the rootfs is gone. | ||
201 | */ | ||
202 | if (system_state > SYSTEM_RUNNING) { | ||
203 | struct xen_bus_type *bus = | ||
204 | container_of(dev->dev.bus, struct xen_bus_type, bus); | ||
205 | /* If we're frontend, drive the state machine to Closed. */ | ||
206 | /* This should cause the backend to release our resources. */ | ||
207 | if ((bus == &xenbus_frontend) && (state == XenbusStateClosing)) | ||
208 | xenbus_frontend_closed(dev); | ||
209 | return; | ||
210 | } | ||
211 | |||
212 | if (drv->otherend_changed) | ||
213 | drv->otherend_changed(dev, state); | ||
214 | } | ||
215 | |||
216 | |||
217 | static int talk_to_otherend(struct xenbus_device *dev) | ||
218 | { | ||
219 | struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver); | ||
220 | |||
221 | free_otherend_watch(dev); | ||
222 | free_otherend_details(dev); | ||
223 | |||
224 | return drv->read_otherend_details(dev); | ||
225 | } | ||
226 | |||
227 | |||
228 | static int watch_otherend(struct xenbus_device *dev) | ||
229 | { | ||
230 | return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed, | ||
231 | "%s/%s", dev->otherend, "state"); | ||
232 | } | ||
233 | |||
234 | |||
235 | int xenbus_dev_probe(struct device *_dev) | ||
236 | { | ||
237 | struct xenbus_device *dev = to_xenbus_device(_dev); | ||
238 | struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); | ||
239 | const struct xenbus_device_id *id; | ||
240 | int err; | ||
241 | |||
242 | DPRINTK("%s", dev->nodename); | ||
243 | |||
244 | if (!drv->probe) { | ||
245 | err = -ENODEV; | ||
246 | goto fail; | ||
247 | } | ||
248 | |||
249 | id = match_device(drv->ids, dev); | ||
250 | if (!id) { | ||
251 | err = -ENODEV; | ||
252 | goto fail; | ||
253 | } | ||
254 | |||
255 | err = talk_to_otherend(dev); | ||
256 | if (err) { | ||
257 | dev_warn(&dev->dev, "talk_to_otherend on %s failed.\n", | ||
258 | dev->nodename); | ||
259 | return err; | ||
260 | } | ||
261 | |||
262 | err = drv->probe(dev, id); | ||
263 | if (err) | ||
264 | goto fail; | ||
265 | |||
266 | err = watch_otherend(dev); | ||
267 | if (err) { | ||
268 | dev_warn(&dev->dev, "watch_otherend on %s failed.\n", | ||
269 | dev->nodename); | ||
270 | return err; | ||
271 | } | ||
272 | |||
273 | return 0; | ||
274 | fail: | ||
275 | xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename); | ||
276 | xenbus_switch_state(dev, XenbusStateClosed); | ||
277 | return -ENODEV; | ||
278 | } | ||
279 | |||
280 | int xenbus_dev_remove(struct device *_dev) | ||
281 | { | ||
282 | struct xenbus_device *dev = to_xenbus_device(_dev); | ||
283 | struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); | ||
284 | |||
285 | DPRINTK("%s", dev->nodename); | ||
286 | |||
287 | free_otherend_watch(dev); | ||
288 | free_otherend_details(dev); | ||
289 | |||
290 | if (drv->remove) | ||
291 | drv->remove(dev); | ||
292 | |||
293 | xenbus_switch_state(dev, XenbusStateClosed); | ||
294 | return 0; | ||
295 | } | ||
296 | |||
297 | static void xenbus_dev_shutdown(struct device *_dev) | ||
298 | { | ||
299 | struct xenbus_device *dev = to_xenbus_device(_dev); | ||
300 | unsigned long timeout = 5*HZ; | ||
301 | |||
302 | DPRINTK("%s", dev->nodename); | ||
303 | |||
304 | get_device(&dev->dev); | ||
305 | if (dev->state != XenbusStateConnected) { | ||
306 | printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__, | ||
307 | dev->nodename, xenbus_strstate(dev->state)); | ||
308 | goto out; | ||
309 | } | ||
310 | xenbus_switch_state(dev, XenbusStateClosing); | ||
311 | timeout = wait_for_completion_timeout(&dev->down, timeout); | ||
312 | if (!timeout) | ||
313 | printk(KERN_INFO "%s: %s timeout closing device\n", | ||
314 | __func__, dev->nodename); | ||
315 | out: | ||
316 | put_device(&dev->dev); | ||
317 | } | ||
318 | |||
319 | int xenbus_register_driver_common(struct xenbus_driver *drv, | ||
320 | struct xen_bus_type *bus, | ||
321 | struct module *owner, | ||
322 | const char *mod_name) | ||
323 | { | ||
324 | drv->driver.name = drv->name; | ||
325 | drv->driver.bus = &bus->bus; | ||
326 | drv->driver.owner = owner; | ||
327 | drv->driver.mod_name = mod_name; | ||
328 | |||
329 | return driver_register(&drv->driver); | ||
330 | } | ||
331 | |||
332 | int __xenbus_register_frontend(struct xenbus_driver *drv, | ||
333 | struct module *owner, const char *mod_name) | ||
334 | { | ||
335 | int ret; | ||
336 | |||
337 | drv->read_otherend_details = read_backend_details; | ||
338 | |||
339 | ret = xenbus_register_driver_common(drv, &xenbus_frontend, | ||
340 | owner, mod_name); | ||
341 | if (ret) | ||
342 | return ret; | ||
343 | |||
344 | /* If this driver is loaded as a module wait for devices to attach. */ | ||
345 | wait_for_devices(drv); | ||
346 | |||
347 | return 0; | ||
348 | } | ||
349 | EXPORT_SYMBOL_GPL(__xenbus_register_frontend); | ||
350 | |||
351 | void xenbus_unregister_driver(struct xenbus_driver *drv) | ||
352 | { | ||
353 | driver_unregister(&drv->driver); | ||
354 | } | ||
355 | EXPORT_SYMBOL_GPL(xenbus_unregister_driver); | ||
356 | |||
357 | struct xb_find_info { | ||
359 | struct xenbus_device *dev; | ||
360 | const char *nodename; | ||
361 | }; | ||
362 | |||
363 | static int cmp_dev(struct device *dev, void *data) | ||
364 | { | ||
365 | struct xenbus_device *xendev = to_xenbus_device(dev); | ||
366 | struct xb_find_info *info = data; | ||
367 | |||
368 | if (!strcmp(xendev->nodename, info->nodename)) { | ||
369 | info->dev = xendev; | ||
370 | get_device(dev); | ||
371 | return 1; | ||
372 | } | ||
373 | return 0; | ||
374 | } | ||
375 | |||
376 | struct xenbus_device *xenbus_device_find(const char *nodename, | ||
377 | struct bus_type *bus) | ||
378 | { | ||
379 | struct xb_find_info info = { .dev = NULL, .nodename = nodename }; | ||
380 | |||
381 | bus_for_each_dev(bus, NULL, &info, cmp_dev); | ||
382 | return info.dev; | ||
383 | } | ||
384 | |||
385 | static int cleanup_dev(struct device *dev, void *data) | ||
386 | { | ||
387 | struct xenbus_device *xendev = to_xenbus_device(dev); | ||
388 | struct xb_find_info *info = data; | ||
389 | int len = strlen(info->nodename); | ||
390 | |||
391 | DPRINTK("%s", info->nodename); | ||
392 | |||
393 | /* Match the info->nodename path, or any subdirectory of that path. */ | ||
394 | if (strncmp(xendev->nodename, info->nodename, len)) | ||
395 | return 0; | ||
396 | |||
397 | /* If the node name is longer, ensure it really is a subdirectory. */ | ||
398 | if ((strlen(xendev->nodename) > len) && (xendev->nodename[len] != '/')) | ||
399 | return 0; | ||
400 | |||
401 | info->dev = xendev; | ||
402 | get_device(dev); | ||
403 | return 1; | ||
404 | } | ||
405 | |||
406 | static void xenbus_cleanup_devices(const char *path, struct bus_type *bus) | ||
407 | { | ||
408 | struct xb_find_info info = { .nodename = path }; | ||
409 | |||
410 | do { | ||
411 | info.dev = NULL; | ||
412 | bus_for_each_dev(bus, NULL, &info, cleanup_dev); | ||
413 | if (info.dev) { | ||
414 | device_unregister(&info.dev->dev); | ||
415 | put_device(&info.dev->dev); | ||
416 | } | ||
417 | } while (info.dev); | ||
418 | } | ||
419 | |||
420 | static void xenbus_dev_release(struct device *dev) | ||
421 | { | ||
422 | if (dev) | ||
423 | kfree(to_xenbus_device(dev)); | ||
424 | } | ||
425 | |||
426 | static ssize_t xendev_show_nodename(struct device *dev, | ||
427 | struct device_attribute *attr, char *buf) | ||
428 | { | ||
429 | return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); | ||
430 | } | ||
431 | DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL); | ||
432 | |||
433 | static ssize_t xendev_show_devtype(struct device *dev, | ||
434 | struct device_attribute *attr, char *buf) | ||
435 | { | ||
436 | return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype); | ||
437 | } | ||
438 | DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL); | ||
439 | |||
440 | |||
441 | int xenbus_probe_node(struct xen_bus_type *bus, | ||
442 | const char *type, | ||
443 | const char *nodename) | ||
444 | { | ||
445 | int err; | ||
446 | struct xenbus_device *xendev; | ||
447 | size_t stringlen; | ||
448 | char *tmpstring; | ||
449 | |||
450 | enum xenbus_state state = xenbus_read_driver_state(nodename); | ||
451 | |||
452 | if (state != XenbusStateInitialising) { | ||
453 | /* Device is not new, so ignore it. This can happen if a | ||
454 | device is going away after switching to Closed. */ | ||
455 | return 0; | ||
456 | } | ||
457 | |||
458 | stringlen = strlen(nodename) + 1 + strlen(type) + 1; | ||
459 | xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL); | ||
460 | if (!xendev) | ||
461 | return -ENOMEM; | ||
462 | |||
463 | xendev->state = XenbusStateInitialising; | ||
464 | |||
465 | /* Copy the strings into the extra space. */ | ||
466 | |||
467 | tmpstring = (char *)(xendev + 1); | ||
468 | strcpy(tmpstring, nodename); | ||
469 | xendev->nodename = tmpstring; | ||
470 | |||
471 | tmpstring += strlen(tmpstring) + 1; | ||
472 | strcpy(tmpstring, type); | ||
473 | xendev->devicetype = tmpstring; | ||
474 | init_completion(&xendev->down); | ||
475 | |||
476 | xendev->dev.bus = &bus->bus; | ||
477 | xendev->dev.release = xenbus_dev_release; | ||
478 | |||
479 | err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename); | ||
480 | if (err) | ||
481 | goto fail; | ||
482 | |||
483 | /* Register with generic device framework. */ | ||
484 | err = device_register(&xendev->dev); | ||
485 | if (err) | ||
486 | goto fail; | ||
487 | |||
488 | err = device_create_file(&xendev->dev, &dev_attr_nodename); | ||
489 | if (err) | ||
490 | goto fail_unregister; | ||
491 | |||
492 | err = device_create_file(&xendev->dev, &dev_attr_devtype); | ||
493 | if (err) | ||
494 | goto fail_remove_file; | ||
495 | |||
496 | return 0; | ||
497 | fail_remove_file: | ||
498 | device_remove_file(&xendev->dev, &dev_attr_nodename); | ||
499 | fail_unregister: | ||
500 | device_unregister(&xendev->dev); | ||
501 | fail: | ||
502 | kfree(xendev); | ||
503 | return err; | ||
504 | } | ||
505 | |||
506 | /* device/<typename>/<name> */ | ||
507 | static int xenbus_probe_frontend(const char *type, const char *name) | ||
508 | { | ||
509 | char *nodename; | ||
510 | int err; | ||
511 | |||
512 | nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", | ||
513 | xenbus_frontend.root, type, name); | ||
514 | if (!nodename) | ||
515 | return -ENOMEM; | ||
516 | |||
517 | DPRINTK("%s", nodename); | ||
518 | |||
519 | err = xenbus_probe_node(&xenbus_frontend, type, nodename); | ||
520 | kfree(nodename); | ||
521 | return err; | ||
522 | } | ||
523 | |||
524 | static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) | ||
525 | { | ||
526 | int err = 0; | ||
527 | char **dir; | ||
528 | unsigned int dir_n = 0; | ||
529 | unsigned int i; | ||
530 | |||
531 | dir = xenbus_directory(XBT_NIL, bus->root, type, &dir_n); | ||
532 | if (IS_ERR(dir)) | ||
533 | return PTR_ERR(dir); | ||
534 | |||
535 | for (i = 0; i < dir_n; i++) { | ||
536 | err = bus->probe(type, dir[i]); | ||
537 | if (err) | ||
538 | break; | ||
539 | } | ||
540 | kfree(dir); | ||
541 | return err; | ||
542 | } | ||
543 | |||
544 | int xenbus_probe_devices(struct xen_bus_type *bus) | ||
545 | { | ||
546 | int err = 0; | ||
547 | char **dir; | ||
548 | unsigned int i, dir_n; | ||
549 | |||
550 | dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n); | ||
551 | if (IS_ERR(dir)) | ||
552 | return PTR_ERR(dir); | ||
553 | |||
554 | for (i = 0; i < dir_n; i++) { | ||
555 | err = xenbus_probe_device_type(bus, dir[i]); | ||
556 | if (err) | ||
557 | break; | ||
558 | } | ||
559 | kfree(dir); | ||
560 | return err; | ||
561 | } | ||
562 | |||
563 | static unsigned int char_count(const char *str, char c) | ||
564 | { | ||
565 | unsigned int i, ret = 0; | ||
566 | |||
567 | for (i = 0; str[i]; i++) | ||
568 | if (str[i] == c) | ||
569 | ret++; | ||
570 | return ret; | ||
571 | } | ||
572 | |||
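| /* Return the index of the (len+1)th occurrence of c in str, or the | ||
| * string length if str contains exactly len occurrences; -ERANGE if | ||
| * it contains fewer. */ | ||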
573 | static int strsep_len(const char *str, char c, unsigned int len) | ||
574 | { | ||
575 | unsigned int i; | ||
576 | |||
577 | for (i = 0; str[i]; i++) | ||
578 | if (str[i] == c) { | ||
579 | if (len == 0) | ||
580 | return i; | ||
581 | len--; | ||
582 | } | ||
583 | return (len == 0) ? i : -ERANGE; | ||
584 | } | ||
585 | |||
586 | void xenbus_dev_changed(const char *node, struct xen_bus_type *bus) | ||
587 | { | ||
588 | int exists, rootlen; | ||
589 | struct xenbus_device *dev; | ||
590 | char type[BUS_ID_SIZE]; | ||
591 | const char *p, *root; | ||
592 | |||
593 | if (char_count(node, '/') < 2) | ||
594 | return; | ||
595 | |||
596 | exists = xenbus_exists(XBT_NIL, node, ""); | ||
597 | if (!exists) { | ||
598 | xenbus_cleanup_devices(node, &bus->bus); | ||
599 | return; | ||
600 | } | ||
601 | |||
602 | /* backend/<type>/... or device/<type>/... */ | ||
603 | p = strchr(node, '/') + 1; | ||
604 | snprintf(type, BUS_ID_SIZE, "%.*s", (int)strcspn(p, "/"), p); | ||
605 | type[BUS_ID_SIZE-1] = '\0'; | ||
606 | |||
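| /* e.g. node "device/vif/0/state" with levels == 2 gives type "vif" | ||
| * and root "device/vif/0". */ | ||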
607 | rootlen = strsep_len(node, '/', bus->levels); | ||
608 | if (rootlen < 0) | ||
609 | return; | ||
610 | root = kasprintf(GFP_KERNEL, "%.*s", rootlen, node); | ||
611 | if (!root) | ||
612 | return; | ||
613 | |||
614 | dev = xenbus_device_find(root, &bus->bus); | ||
615 | if (!dev) | ||
616 | xenbus_probe_node(bus, type, root); | ||
617 | else | ||
618 | put_device(&dev->dev); | ||
619 | |||
620 | kfree(root); | ||
621 | } | ||
622 | |||
623 | static void frontend_changed(struct xenbus_watch *watch, | ||
624 | const char **vec, unsigned int len) | ||
625 | { | ||
626 | DPRINTK(""); | ||
627 | |||
628 | xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); | ||
629 | } | ||
630 | |||
631 | /* We watch for devices appearing and vanishing. */ | ||
632 | static struct xenbus_watch fe_watch = { | ||
633 | .node = "device", | ||
634 | .callback = frontend_changed, | ||
635 | }; | ||
636 | |||
637 | static int suspend_dev(struct device *dev, void *data) | ||
638 | { | ||
639 | int err = 0; | ||
640 | struct xenbus_driver *drv; | ||
641 | struct xenbus_device *xdev; | ||
642 | |||
643 | DPRINTK(""); | ||
644 | |||
645 | if (dev->driver == NULL) | ||
646 | return 0; | ||
647 | drv = to_xenbus_driver(dev->driver); | ||
648 | xdev = container_of(dev, struct xenbus_device, dev); | ||
649 | if (drv->suspend) | ||
650 | err = drv->suspend(xdev); | ||
651 | if (err) | ||
652 | printk(KERN_WARNING | ||
653 | "xenbus: suspend %s failed: %i\n", dev->bus_id, err); | ||
654 | return 0; | ||
655 | } | ||
656 | |||
657 | static int suspend_cancel_dev(struct device *dev, void *data) | ||
658 | { | ||
659 | int err = 0; | ||
660 | struct xenbus_driver *drv; | ||
661 | struct xenbus_device *xdev; | ||
662 | |||
663 | DPRINTK(""); | ||
664 | |||
665 | if (dev->driver == NULL) | ||
666 | return 0; | ||
667 | drv = to_xenbus_driver(dev->driver); | ||
668 | xdev = container_of(dev, struct xenbus_device, dev); | ||
669 | if (drv->suspend_cancel) | ||
670 | err = drv->suspend_cancel(xdev); | ||
671 | if (err) | ||
672 | printk(KERN_WARNING | ||
673 | "xenbus: suspend_cancel %s failed: %i\n", | ||
674 | dev->bus_id, err); | ||
675 | return 0; | ||
676 | } | ||
677 | |||
678 | static int resume_dev(struct device *dev, void *data) | ||
679 | { | ||
680 | int err; | ||
681 | struct xenbus_driver *drv; | ||
682 | struct xenbus_device *xdev; | ||
683 | |||
684 | DPRINTK(""); | ||
685 | |||
686 | if (dev->driver == NULL) | ||
687 | return 0; | ||
688 | |||
689 | drv = to_xenbus_driver(dev->driver); | ||
690 | xdev = container_of(dev, struct xenbus_device, dev); | ||
691 | |||
692 | err = talk_to_otherend(xdev); | ||
693 | if (err) { | ||
694 | printk(KERN_WARNING | ||
695 | "xenbus: resume (talk_to_otherend) %s failed: %i\n", | ||
696 | dev->bus_id, err); | ||
697 | return err; | ||
698 | } | ||
699 | |||
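| /* The connection must be renegotiated from scratch after a restore, | ||
| * so drop back to Initialising before the driver's resume hook. */ | ||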
700 | xdev->state = XenbusStateInitialising; | ||
701 | |||
702 | if (drv->resume) { | ||
703 | err = drv->resume(xdev); | ||
704 | if (err) { | ||
705 | printk(KERN_WARNING | ||
706 | "xenbus: resume %s failed: %i\n", | ||
707 | dev->bus_id, err); | ||
708 | return err; | ||
709 | } | ||
710 | } | ||
711 | |||
712 | err = watch_otherend(xdev); | ||
713 | if (err) { | ||
714 | printk(KERN_WARNING | ||
715 | "xenbus_probe: resume (watch_otherend) %s failed: " | ||
716 | "%d.\n", dev->bus_id, err); | ||
717 | return err; | ||
718 | } | ||
719 | |||
720 | return 0; | ||
721 | } | ||
722 | |||
723 | void xenbus_suspend(void) | ||
724 | { | ||
725 | DPRINTK(""); | ||
726 | |||
727 | bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev); | ||
728 | xenbus_backend_suspend(suspend_dev); | ||
729 | xs_suspend(); | ||
730 | } | ||
731 | EXPORT_SYMBOL_GPL(xenbus_suspend); | ||
732 | |||
733 | void xenbus_resume(void) | ||
734 | { | ||
735 | xb_init_comms(); | ||
736 | xs_resume(); | ||
737 | bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev); | ||
738 | xenbus_backend_resume(resume_dev); | ||
739 | } | ||
740 | EXPORT_SYMBOL_GPL(xenbus_resume); | ||
741 | |||
742 | void xenbus_suspend_cancel(void) | ||
743 | { | ||
744 | xs_suspend_cancel(); | ||
745 | bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev); | ||
746 | xenbus_backend_resume(suspend_cancel_dev); | ||
747 | } | ||
748 | EXPORT_SYMBOL_GPL(xenbus_suspend_cancel); | ||
749 | |||
750 | /* A flag to determine if xenstored is 'ready' (i.e. has started) */ | ||
751 | int xenstored_ready = 0; | ||
752 | |||
753 | |||
754 | int register_xenstore_notifier(struct notifier_block *nb) | ||
755 | { | ||
756 | int ret = 0; | ||
757 | |||
758 | if (xenstored_ready > 0) | ||
759 | ret = nb->notifier_call(nb, 0, NULL); | ||
760 | else | ||
761 | blocking_notifier_chain_register(&xenstore_chain, nb); | ||
762 | |||
763 | return ret; | ||
764 | } | ||
765 | EXPORT_SYMBOL_GPL(register_xenstore_notifier); | ||
766 | |||
767 | void unregister_xenstore_notifier(struct notifier_block *nb) | ||
768 | { | ||
769 | blocking_notifier_chain_unregister(&xenstore_chain, nb); | ||
770 | } | ||
771 | EXPORT_SYMBOL_GPL(unregister_xenstore_notifier); | ||
772 | |||
773 | void xenbus_probe(struct work_struct *unused) | ||
774 | { | ||
775 | BUG_ON(xenstored_ready <= 0); | ||
776 | |||
777 | /* Enumerate devices in xenstore and watch for changes. */ | ||
778 | xenbus_probe_devices(&xenbus_frontend); | ||
779 | register_xenbus_watch(&fe_watch); | ||
780 | xenbus_backend_probe_and_watch(); | ||
781 | |||
782 | /* Notify others that xenstore is up */ | ||
783 | blocking_notifier_call_chain(&xenstore_chain, 0, NULL); | ||
784 | } | ||
785 | |||
786 | static int __init xenbus_probe_init(void) | ||
787 | { | ||
788 | int err = 0; | ||
789 | |||
790 | DPRINTK(""); | ||
791 | |||
792 | err = -ENODEV; | ||
793 | if (!is_running_on_xen()) | ||
794 | goto out_error; | ||
795 | |||
796 | /* Register ourselves with the kernel bus subsystem */ | ||
797 | err = bus_register(&xenbus_frontend.bus); | ||
798 | if (err) | ||
799 | goto out_error; | ||
800 | |||
801 | err = xenbus_backend_bus_register(); | ||
802 | if (err) | ||
803 | goto out_unreg_front; | ||
804 | |||
805 | /* | ||
806 | * Domain0 doesn't have a store_evtchn or store_mfn yet. | ||
807 | */ | ||
808 | if (is_initial_xendomain()) { | ||
809 | /* dom0 not yet supported */ | ||
810 | } else { | ||
811 | xenstored_ready = 1; | ||
812 | xen_store_evtchn = xen_start_info->store_evtchn; | ||
813 | xen_store_mfn = xen_start_info->store_mfn; | ||
814 | } | ||
815 | xen_store_interface = mfn_to_virt(xen_store_mfn); | ||
816 | |||
817 | /* Initialize the interface to xenstore. */ | ||
818 | err = xs_init(); | ||
819 | if (err) { | ||
820 | printk(KERN_WARNING | ||
821 | "XENBUS: Error initializing xenstore comms: %i\n", err); | ||
822 | goto out_unreg_back; | ||
823 | } | ||
824 | |||
825 | if (!is_initial_xendomain()) | ||
826 | xenbus_probe(NULL); | ||
827 | |||
828 | return 0; | ||
829 | |||
830 | out_unreg_back: | ||
831 | xenbus_backend_bus_unregister(); | ||
832 | |||
833 | out_unreg_front: | ||
834 | bus_unregister(&xenbus_frontend.bus); | ||
835 | |||
836 | out_error: | ||
837 | return err; | ||
838 | } | ||
839 | |||
840 | postcore_initcall(xenbus_probe_init); | ||
841 | |||
842 | MODULE_LICENSE("GPL"); | ||
843 | |||
844 | static int is_disconnected_device(struct device *dev, void *data) | ||
845 | { | ||
846 | struct xenbus_device *xendev = to_xenbus_device(dev); | ||
847 | struct device_driver *drv = data; | ||
848 | |||
849 | /* | ||
850 | * A device with no driver will never connect. We care only about | ||
851 | * devices which should currently be in the process of connecting. | ||
852 | */ | ||
853 | if (!dev->driver) | ||
854 | return 0; | ||
855 | |||
856 | /* Is this search limited to a particular driver? */ | ||
857 | if (drv && (dev->driver != drv)) | ||
858 | return 0; | ||
859 | |||
860 | return (xendev->state != XenbusStateConnected); | ||
861 | } | ||
862 | |||
863 | static int exists_disconnected_device(struct device_driver *drv) | ||
864 | { | ||
865 | return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, | ||
866 | is_disconnected_device); | ||
867 | } | ||
868 | |||
869 | static int print_device_status(struct device *dev, void *data) | ||
870 | { | ||
871 | struct xenbus_device *xendev = to_xenbus_device(dev); | ||
872 | struct device_driver *drv = data; | ||
873 | |||
874 | /* Is this operation limited to a particular driver? */ | ||
875 | if (drv && (dev->driver != drv)) | ||
876 | return 0; | ||
877 | |||
878 | if (!dev->driver) { | ||
879 | /* Information only: is this too noisy? */ | ||
880 | printk(KERN_INFO "XENBUS: Device with no driver: %s\n", | ||
881 | xendev->nodename); | ||
882 | } else if (xendev->state != XenbusStateConnected) { | ||
883 | printk(KERN_WARNING "XENBUS: Timeout connecting " | ||
884 | "to device: %s (state %d)\n", | ||
885 | xendev->nodename, xendev->state); | ||
886 | } | ||
887 | |||
888 | return 0; | ||
889 | } | ||
890 | |||
891 | /* We only wait for device setup after most initcalls have run. */ | ||
892 | static int ready_to_wait_for_devices; | ||
893 | |||
894 | /* | ||
895 | * On a 10 second timeout, wait for all devices currently configured. We need | ||
896 | * to do this to guarantee that the filesystems and / or network devices | ||
897 | * needed for boot are available, before we can allow the boot to proceed. | ||
898 | * | ||
899 | * This needs to be on a late_initcall, to happen after the frontend device | ||
900 | * drivers have been initialised, but before the root fs is mounted. | ||
901 | * | ||
902 | * A possible improvement here would be to have the tools add a per-device | ||
903 | * flag to the store entry, indicating whether it is needed at boot time. | ||
904 | * This would allow people who knew what they were doing to accelerate their | ||
905 | * boot slightly, but of course needs tools or manual intervention to set up | ||
906 | * those flags correctly. | ||
907 | */ | ||
908 | static void wait_for_devices(struct xenbus_driver *xendrv) | ||
909 | { | ||
910 | unsigned long timeout = jiffies + 10*HZ; | ||
911 | struct device_driver *drv = xendrv ? &xendrv->driver : NULL; | ||
912 | |||
913 | if (!ready_to_wait_for_devices || !is_running_on_xen()) | ||
914 | return; | ||
915 | |||
916 | while (exists_disconnected_device(drv)) { | ||
917 | if (time_after(jiffies, timeout)) | ||
918 | break; | ||
919 | schedule_timeout_interruptible(HZ/10); | ||
920 | } | ||
921 | |||
922 | bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, | ||
923 | print_device_status); | ||
924 | } | ||
925 | |||
926 | #ifndef MODULE | ||
927 | static int __init boot_wait_for_devices(void) | ||
928 | { | ||
929 | ready_to_wait_for_devices = 1; | ||
930 | wait_for_devices(NULL); | ||
931 | return 0; | ||
932 | } | ||
933 | |||
934 | late_initcall(boot_wait_for_devices); | ||
935 | #endif | ||
diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h new file mode 100644 index 000000000000..e09b19415a40 --- /dev/null +++ b/drivers/xen/xenbus/xenbus_probe.h | |||
@@ -0,0 +1,74 @@ | |||
1 | /****************************************************************************** | ||
2 | * xenbus_probe.h | ||
3 | * | ||
4 | * Talks to Xen Store to figure out what devices we have. | ||
5 | * | ||
6 | * Copyright (C) 2005 Rusty Russell, IBM Corporation | ||
7 | * Copyright (C) 2005 XenSource Ltd. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License version 2 | ||
11 | * as published by the Free Software Foundation; or, when distributed | ||
12 | * separately from the Linux kernel or incorporated into other | ||
13 | * software packages, subject to the following license: | ||
14 | * | ||
15 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
16 | * of this source file (the "Software"), to deal in the Software without | ||
17 | * restriction, including without limitation the rights to use, copy, modify, | ||
18 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
19 | * and to permit persons to whom the Software is furnished to do so, subject to | ||
20 | * the following conditions: | ||
21 | * | ||
22 | * The above copyright notice and this permission notice shall be included in | ||
23 | * all copies or substantial portions of the Software. | ||
24 | * | ||
25 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
26 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
27 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
28 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
29 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
30 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
31 | * IN THE SOFTWARE. | ||
32 | */ | ||
33 | |||
34 | #ifndef _XENBUS_PROBE_H | ||
35 | #define _XENBUS_PROBE_H | ||
36 | |||
37 | #ifdef CONFIG_XEN_BACKEND | ||
38 | extern void xenbus_backend_suspend(int (*fn)(struct device *, void *)); | ||
39 | extern void xenbus_backend_resume(int (*fn)(struct device *, void *)); | ||
40 | extern void xenbus_backend_probe_and_watch(void); | ||
41 | extern int xenbus_backend_bus_register(void); | ||
42 | extern void xenbus_backend_bus_unregister(void); | ||
43 | #else | ||
44 | static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {} | ||
45 | static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {} | ||
46 | static inline void xenbus_backend_probe_and_watch(void) {} | ||
47 | static inline int xenbus_backend_bus_register(void) { return 0; } | ||
48 | static inline void xenbus_backend_bus_unregister(void) {} | ||
49 | #endif | ||
50 | |||
51 | struct xen_bus_type { | ||
53 | char *root; | ||
54 | unsigned int levels; | ||
55 | int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename); | ||
56 | int (*probe)(const char *type, const char *dir); | ||
57 | struct bus_type bus; | ||
58 | }; | ||
59 | |||
60 | extern int xenbus_match(struct device *_dev, struct device_driver *_drv); | ||
61 | extern int xenbus_dev_probe(struct device *_dev); | ||
62 | extern int xenbus_dev_remove(struct device *_dev); | ||
63 | extern int xenbus_register_driver_common(struct xenbus_driver *drv, | ||
64 | struct xen_bus_type *bus, | ||
65 | struct module *owner, | ||
66 | const char *mod_name); | ||
67 | extern int xenbus_probe_node(struct xen_bus_type *bus, | ||
68 | const char *type, | ||
69 | const char *nodename); | ||
70 | extern int xenbus_probe_devices(struct xen_bus_type *bus); | ||
71 | |||
72 | extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); | ||
73 | |||
74 | #endif | ||
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c new file mode 100644 index 000000000000..9e943fbce81b --- /dev/null +++ b/drivers/xen/xenbus/xenbus_xs.c | |||
@@ -0,0 +1,861 @@ | |||
1 | /****************************************************************************** | ||
2 | * xenbus_xs.c | ||
3 | * | ||
4 | * This is the kernel equivalent of the "xs" library. We don't need everything | ||
5 | * and we use xenbus_comms for communication. | ||
6 | * | ||
7 | * Copyright (C) 2005 Rusty Russell, IBM Corporation | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License version 2 | ||
11 | * as published by the Free Software Foundation; or, when distributed | ||
12 | * separately from the Linux kernel or incorporated into other | ||
13 | * software packages, subject to the following license: | ||
14 | * | ||
15 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
16 | * of this source file (the "Software"), to deal in the Software without | ||
17 | * restriction, including without limitation the rights to use, copy, modify, | ||
18 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
19 | * and to permit persons to whom the Software is furnished to do so, subject to | ||
20 | * the following conditions: | ||
21 | * | ||
22 | * The above copyright notice and this permission notice shall be included in | ||
23 | * all copies or substantial portions of the Software. | ||
24 | * | ||
25 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
26 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
27 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
28 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
29 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
30 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
31 | * IN THE SOFTWARE. | ||
32 | */ | ||
33 | |||
34 | #include <linux/unistd.h> | ||
35 | #include <linux/errno.h> | ||
36 | #include <linux/types.h> | ||
37 | #include <linux/uio.h> | ||
38 | #include <linux/kernel.h> | ||
39 | #include <linux/string.h> | ||
40 | #include <linux/err.h> | ||
41 | #include <linux/slab.h> | ||
42 | #include <linux/fcntl.h> | ||
43 | #include <linux/kthread.h> | ||
44 | #include <linux/rwsem.h> | ||
45 | #include <linux/module.h> | ||
46 | #include <linux/mutex.h> | ||
47 | #include <xen/xenbus.h> | ||
48 | #include "xenbus_comms.h" | ||
49 | |||
50 | struct xs_stored_msg { | ||
51 | struct list_head list; | ||
52 | |||
53 | struct xsd_sockmsg hdr; | ||
54 | |||
55 | union { | ||
56 | /* Queued replies. */ | ||
57 | struct { | ||
58 | char *body; | ||
59 | } reply; | ||
60 | |||
61 | /* Queued watch events. */ | ||
62 | struct { | ||
63 | struct xenbus_watch *handle; | ||
64 | char **vec; | ||
65 | unsigned int vec_size; | ||
66 | } watch; | ||
67 | } u; | ||
68 | }; | ||
69 | |||
70 | struct xs_handle { | ||
71 | /* A list of replies. Currently only one will ever be outstanding. */ | ||
72 | struct list_head reply_list; | ||
73 | spinlock_t reply_lock; | ||
74 | wait_queue_head_t reply_waitq; | ||
75 | |||
76 | /* | ||
77 | * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex. | ||
78 | * response_mutex is never taken simultaneously with the other three. | ||
79 | */ | ||
80 | |||
81 | /* One request at a time. */ | ||
82 | struct mutex request_mutex; | ||
83 | |||
84 | /* Protect xenbus reader thread against save/restore. */ | ||
85 | struct mutex response_mutex; | ||
86 | |||
87 | /* Protect transactions against save/restore. */ | ||
88 | struct rw_semaphore transaction_mutex; | ||
89 | |||
90 | /* Protect watch (de)register against save/restore. */ | ||
91 | struct rw_semaphore watch_mutex; | ||
92 | }; | ||
93 | |||
94 | static struct xs_handle xs_state; | ||
95 | |||
96 | /* List of registered watches, and a lock to protect it. */ | ||
97 | static LIST_HEAD(watches); | ||
98 | static DEFINE_SPINLOCK(watches_lock); | ||
99 | |||
100 | /* List of pending watch callback events, and a lock to protect it. */ | ||
101 | static LIST_HEAD(watch_events); | ||
102 | static DEFINE_SPINLOCK(watch_events_lock); | ||
103 | |||
104 | /* | ||
105 | * Details of the xenwatch callback kernel thread. The thread waits on the | ||
106 | * watch_events_waitq for work to do (queued on watch_events list). When it | ||
107 | * wakes up it acquires the xenwatch_mutex before reading the list and | ||
108 | * carrying out work. | ||
109 | */ | ||
110 | static pid_t xenwatch_pid; | ||
111 | static DEFINE_MUTEX(xenwatch_mutex); | ||
112 | static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq); | ||
113 | |||
114 | static int get_error(const char *errorstring) | ||
115 | { | ||
116 | unsigned int i; | ||
117 | |||
118 | for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) { | ||
119 | if (i == ARRAY_SIZE(xsd_errors) - 1) { | ||
120 | printk(KERN_WARNING | ||
121 | "XENBUS xen store gave: unknown error %s", | ||
122 | errorstring); | ||
123 | return EINVAL; | ||
124 | } | ||
125 | } | ||
126 | return xsd_errors[i].errnum; | ||
127 | } | ||
128 | |||
129 | static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) | ||
130 | { | ||
131 | struct xs_stored_msg *msg; | ||
132 | char *body; | ||
133 | |||
134 | spin_lock(&xs_state.reply_lock); | ||
135 | |||
136 | while (list_empty(&xs_state.reply_list)) { | ||
137 | spin_unlock(&xs_state.reply_lock); | ||
138 | /* XXX FIXME: Avoid synchronous wait for response here. */ | ||
139 | wait_event(xs_state.reply_waitq, | ||
140 | !list_empty(&xs_state.reply_list)); | ||
141 | spin_lock(&xs_state.reply_lock); | ||
142 | } | ||
143 | |||
144 | msg = list_entry(xs_state.reply_list.next, | ||
145 | struct xs_stored_msg, list); | ||
146 | list_del(&msg->list); | ||
147 | |||
148 | spin_unlock(&xs_state.reply_lock); | ||
149 | |||
150 | *type = msg->hdr.type; | ||
151 | if (len) | ||
152 | *len = msg->hdr.len; | ||
153 | body = msg->u.reply.body; | ||
154 | |||
155 | kfree(msg); | ||
156 | |||
157 | return body; | ||
158 | } | ||
159 | |||
160 | void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) | ||
161 | { | ||
162 | void *ret; | ||
163 | struct xsd_sockmsg req_msg = *msg; | ||
164 | int err; | ||
165 | |||
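| /* XS_TRANSACTION_START takes transaction_mutex for reading; it is | ||
| * held until the matching END (or a failed START) so that a writer, | ||
| * e.g. xs_suspend(), can wait for all transactions to drain. */ | ||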
166 | if (req_msg.type == XS_TRANSACTION_START) | ||
167 | down_read(&xs_state.transaction_mutex); | ||
168 | |||
169 | mutex_lock(&xs_state.request_mutex); | ||
170 | |||
171 | err = xb_write(msg, sizeof(*msg) + msg->len); | ||
172 | if (err) { | ||
173 | msg->type = XS_ERROR; | ||
174 | ret = ERR_PTR(err); | ||
175 | } else | ||
176 | ret = read_reply(&msg->type, &msg->len); | ||
177 | |||
178 | mutex_unlock(&xs_state.request_mutex); | ||
179 | |||
180 | if ((msg->type == XS_TRANSACTION_END) || | ||
181 | ((req_msg.type == XS_TRANSACTION_START) && | ||
182 | (msg->type == XS_ERROR))) | ||
183 | up_read(&xs_state.transaction_mutex); | ||
184 | |||
185 | return ret; | ||
186 | } | ||
187 | |||
188 | /* Send message to xs, get kmalloc'ed reply. ERR_PTR() on error. */ | ||
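| /* The wire format is a struct xsd_sockmsg header followed by | ||
| * msg.len bytes of payload. */ | ||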
189 | static void *xs_talkv(struct xenbus_transaction t, | ||
190 | enum xsd_sockmsg_type type, | ||
191 | const struct kvec *iovec, | ||
192 | unsigned int num_vecs, | ||
193 | unsigned int *len) | ||
194 | { | ||
195 | struct xsd_sockmsg msg; | ||
196 | void *ret = NULL; | ||
197 | unsigned int i; | ||
198 | int err; | ||
199 | |||
200 | msg.tx_id = t.id; | ||
201 | msg.req_id = 0; | ||
202 | msg.type = type; | ||
203 | msg.len = 0; | ||
204 | for (i = 0; i < num_vecs; i++) | ||
205 | msg.len += iovec[i].iov_len; | ||
206 | |||
207 | mutex_lock(&xs_state.request_mutex); | ||
208 | |||
209 | err = xb_write(&msg, sizeof(msg)); | ||
210 | if (err) { | ||
211 | mutex_unlock(&xs_state.request_mutex); | ||
212 | return ERR_PTR(err); | ||
213 | } | ||
214 | |||
215 | for (i = 0; i < num_vecs; i++) { | ||
216 | err = xb_write(iovec[i].iov_base, iovec[i].iov_len); | ||
217 | if (err) { | ||
218 | mutex_unlock(&xs_state.request_mutex); | ||
219 | return ERR_PTR(err); | ||
220 | } | ||
221 | } | ||
222 | |||
223 | ret = read_reply(&msg.type, len); | ||
224 | |||
225 | mutex_unlock(&xs_state.request_mutex); | ||
226 | |||
227 | if (IS_ERR(ret)) | ||
228 | return ret; | ||
229 | |||
230 | if (msg.type == XS_ERROR) { | ||
231 | err = get_error(ret); | ||
232 | kfree(ret); | ||
233 | return ERR_PTR(-err); | ||
234 | } | ||
235 | |||
236 | if (msg.type != type) { | ||
237 | if (printk_ratelimit()) | ||
238 | printk(KERN_WARNING | ||
239 | "XENBUS unexpected type [%d], expected [%d]\n", | ||
240 | msg.type, type); | ||
241 | kfree(ret); | ||
242 | return ERR_PTR(-EINVAL); | ||
243 | } | ||
244 | return ret; | ||
245 | } | ||
246 | |||
247 | /* Simplified version of xs_talkv: single message. */ | ||
248 | static void *xs_single(struct xenbus_transaction t, | ||
249 | enum xsd_sockmsg_type type, | ||
250 | const char *string, | ||
251 | unsigned int *len) | ||
252 | { | ||
253 | struct kvec iovec; | ||
254 | |||
255 | iovec.iov_base = (void *)string; | ||
256 | iovec.iov_len = strlen(string) + 1; | ||
257 | return xs_talkv(t, type, &iovec, 1, len); | ||
258 | } | ||
259 | |||
260 | /* Many commands only need an ack, don't care what it says. */ | ||
261 | static int xs_error(char *reply) | ||
262 | { | ||
263 | if (IS_ERR(reply)) | ||
264 | return PTR_ERR(reply); | ||
265 | kfree(reply); | ||
266 | return 0; | ||
267 | } | ||
268 | |||
269 | static unsigned int count_strings(const char *strings, unsigned int len) | ||
270 | { | ||
271 | unsigned int num; | ||
272 | const char *p; | ||
273 | |||
274 | for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1) | ||
275 | num++; | ||
276 | |||
277 | return num; | ||
278 | } | ||
279 | |||
280 | /* Return the path to dir with /name appended. Buffer must be kfree()'ed. */ | ||
281 | static char *join(const char *dir, const char *name) | ||
282 | { | ||
283 | char *buffer; | ||
284 | |||
285 | if (strlen(name) == 0) | ||
286 | buffer = kasprintf(GFP_KERNEL, "%s", dir); | ||
287 | else | ||
288 | buffer = kasprintf(GFP_KERNEL, "%s/%s", dir, name); | ||
289 | return (!buffer) ? ERR_PTR(-ENOMEM) : buffer; | ||
290 | } | ||
291 | |||
292 | static char **split(char *strings, unsigned int len, unsigned int *num) | ||
293 | { | ||
294 | char *p, **ret; | ||
295 | |||
296 | /* Count the strings. */ | ||
297 | *num = count_strings(strings, len); | ||
298 | |||
299 | /* Transfer to one big alloc for easy freeing. */ | ||
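| /* Layout: *num char pointers followed by the string data they | ||
| * point into, all in one allocation. */ | ||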
300 | ret = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL); | ||
301 | if (!ret) { | ||
302 | kfree(strings); | ||
303 | return ERR_PTR(-ENOMEM); | ||
304 | } | ||
305 | memcpy(&ret[*num], strings, len); | ||
306 | kfree(strings); | ||
307 | |||
308 | strings = (char *)&ret[*num]; | ||
309 | for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1) | ||
310 | ret[(*num)++] = p; | ||
311 | |||
312 | return ret; | ||
313 | } | ||
314 | |||
315 | char **xenbus_directory(struct xenbus_transaction t, | ||
316 | const char *dir, const char *node, unsigned int *num) | ||
317 | { | ||
318 | char *strings, *path; | ||
319 | unsigned int len; | ||
320 | |||
321 | path = join(dir, node); | ||
322 | if (IS_ERR(path)) | ||
323 | return (char **)path; | ||
324 | |||
325 | strings = xs_single(t, XS_DIRECTORY, path, &len); | ||
326 | kfree(path); | ||
327 | if (IS_ERR(strings)) | ||
328 | return (char **)strings; | ||
329 | |||
330 | return split(strings, len, num); | ||
331 | } | ||
332 | EXPORT_SYMBOL_GPL(xenbus_directory); | ||
333 | |||
334 | /* Check if a path exists. Return 1 if it does. */ | ||
335 | int xenbus_exists(struct xenbus_transaction t, | ||
336 | const char *dir, const char *node) | ||
337 | { | ||
338 | char **d; | ||
339 | unsigned int dir_n; | ||
340 | |||
341 | d = xenbus_directory(t, dir, node, &dir_n); | ||
342 | if (IS_ERR(d)) | ||
343 | return 0; | ||
344 | kfree(d); | ||
345 | return 1; | ||
346 | } | ||
347 | EXPORT_SYMBOL_GPL(xenbus_exists); | ||
348 | |||
349 | /* Get the value of a single file. | ||
350 | * Returns a kmalloced value: call kfree() on it after use. | ||
351 | * len indicates length in bytes. | ||
352 | */ | ||
353 | void *xenbus_read(struct xenbus_transaction t, | ||
354 | const char *dir, const char *node, unsigned int *len) | ||
355 | { | ||
356 | char *path; | ||
357 | void *ret; | ||
358 | |||
359 | path = join(dir, node); | ||
360 | if (IS_ERR(path)) | ||
361 | return (void *)path; | ||
362 | |||
363 | ret = xs_single(t, XS_READ, path, len); | ||
364 | kfree(path); | ||
365 | return ret; | ||
366 | } | ||
367 | EXPORT_SYMBOL_GPL(xenbus_read); | ||
368 | |||
369 | /* Write the value of a single file. | ||
370 | * Returns -err on failure. | ||
371 | */ | ||
372 | int xenbus_write(struct xenbus_transaction t, | ||
373 | const char *dir, const char *node, const char *string) | ||
374 | { | ||
375 | const char *path; | ||
376 | struct kvec iovec[2]; | ||
377 | int ret; | ||
378 | |||
379 | path = join(dir, node); | ||
380 | if (IS_ERR(path)) | ||
381 | return PTR_ERR(path); | ||
382 | |||
383 | iovec[0].iov_base = (void *)path; | ||
384 | iovec[0].iov_len = strlen(path) + 1; | ||
385 | iovec[1].iov_base = (void *)string; | ||
386 | iovec[1].iov_len = strlen(string); | ||
387 | |||
388 | ret = xs_error(xs_talkv(t, XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL)); | ||
389 | kfree(path); | ||
390 | return ret; | ||
391 | } | ||
392 | EXPORT_SYMBOL_GPL(xenbus_write); | ||
393 | |||
394 | /* Create a new directory. */ | ||
395 | int xenbus_mkdir(struct xenbus_transaction t, | ||
396 | const char *dir, const char *node) | ||
397 | { | ||
398 | char *path; | ||
399 | int ret; | ||
400 | |||
401 | path = join(dir, node); | ||
402 | if (IS_ERR(path)) | ||
403 | return PTR_ERR(path); | ||
404 | |||
405 | ret = xs_error(xs_single(t, XS_MKDIR, path, NULL)); | ||
406 | kfree(path); | ||
407 | return ret; | ||
408 | } | ||
409 | EXPORT_SYMBOL_GPL(xenbus_mkdir); | ||
410 | |||
411 | /* Destroy a file or directory (directories must be empty). */ | ||
412 | int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node) | ||
413 | { | ||
414 | char *path; | ||
415 | int ret; | ||
416 | |||
417 | path = join(dir, node); | ||
418 | if (IS_ERR(path)) | ||
419 | return PTR_ERR(path); | ||
420 | |||
421 | ret = xs_error(xs_single(t, XS_RM, path, NULL)); | ||
422 | kfree(path); | ||
423 | return ret; | ||
424 | } | ||
425 | EXPORT_SYMBOL_GPL(xenbus_rm); | ||
426 | |||
427 | /* Start a transaction: changes by others will not be seen during this | ||
428 | * transaction, and changes will not be visible to others until end. | ||
429 | */ | ||
430 | int xenbus_transaction_start(struct xenbus_transaction *t) | ||
431 | { | ||
432 | char *id_str; | ||
433 | |||
434 | down_read(&xs_state.transaction_mutex); | ||
435 | |||
436 | id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL); | ||
437 | if (IS_ERR(id_str)) { | ||
438 | up_read(&xs_state.transaction_mutex); | ||
439 | return PTR_ERR(id_str); | ||
440 | } | ||
441 | |||
442 | t->id = simple_strtoul(id_str, NULL, 0); | ||
443 | kfree(id_str); | ||
444 | return 0; | ||
445 | } | ||
446 | EXPORT_SYMBOL_GPL(xenbus_transaction_start); | ||
447 | |||
448 | /* End a transaction. | ||
449 | * If abort is true, the transaction is discarded instead of committed. | ||
450 | */ | ||
451 | int xenbus_transaction_end(struct xenbus_transaction t, int abort) | ||
452 | { | ||
453 | char abortstr[2]; | ||
454 | int err; | ||
455 | |||
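| /* xenstored expects "T" to commit the transaction, "F" to abort. */ | ||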
456 | if (abort) | ||
457 | strcpy(abortstr, "F"); | ||
458 | else | ||
459 | strcpy(abortstr, "T"); | ||
460 | |||
461 | err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL)); | ||
462 | |||
463 | up_read(&xs_state.transaction_mutex); | ||
464 | |||
465 | return err; | ||
466 | } | ||
467 | EXPORT_SYMBOL_GPL(xenbus_transaction_end); | ||
468 | |||
469 | /* Single read and scanf: returns -errno or num scanned. */ | ||
470 | int xenbus_scanf(struct xenbus_transaction t, | ||
471 | const char *dir, const char *node, const char *fmt, ...) | ||
472 | { | ||
473 | va_list ap; | ||
474 | int ret; | ||
475 | char *val; | ||
476 | |||
477 | val = xenbus_read(t, dir, node, NULL); | ||
478 | if (IS_ERR(val)) | ||
479 | return PTR_ERR(val); | ||
480 | |||
481 | va_start(ap, fmt); | ||
482 | ret = vsscanf(val, fmt, ap); | ||
483 | va_end(ap); | ||
484 | kfree(val); | ||
485 | /* Distinctive errno. */ | ||
486 | if (ret == 0) | ||
487 | return -ERANGE; | ||
488 | return ret; | ||
489 | } | ||
490 | EXPORT_SYMBOL_GPL(xenbus_scanf); | ||
491 | |||
492 | /* Single printf and write: returns -errno or 0. */ | ||
493 | int xenbus_printf(struct xenbus_transaction t, | ||
494 | const char *dir, const char *node, const char *fmt, ...) | ||
495 | { | ||
496 | va_list ap; | ||
497 | int ret; | ||
498 | #define PRINTF_BUFFER_SIZE 4096 | ||
499 | char *printf_buffer; | ||
500 | |||
501 | printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL); | ||
502 | if (printf_buffer == NULL) | ||
503 | return -ENOMEM; | ||
504 | |||
505 | va_start(ap, fmt); | ||
506 | ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap); | ||
507 | va_end(ap); | ||
508 | |||
509 | BUG_ON(ret > PRINTF_BUFFER_SIZE-1); | ||
510 | ret = xenbus_write(t, dir, node, printf_buffer); | ||
511 | |||
512 | kfree(printf_buffer); | ||
513 | |||
514 | return ret; | ||
515 | } | ||
516 | EXPORT_SYMBOL_GPL(xenbus_printf); | ||
517 | |||
518 | /* Takes tuples of names, scanf-style args, and void **, NULL terminated. */ | ||
519 | int xenbus_gather(struct xenbus_transaction t, const char *dir, ...) | ||
520 | { | ||
521 | va_list ap; | ||
522 | const char *name; | ||
523 | int ret = 0; | ||
524 | |||
525 | va_start(ap, dir); | ||
526 | while (ret == 0 && (name = va_arg(ap, char *)) != NULL) { | ||
527 | const char *fmt = va_arg(ap, char *); | ||
528 | void *result = va_arg(ap, void *); | ||
529 | char *p; | ||
530 | |||
531 | p = xenbus_read(t, dir, name, NULL); | ||
532 | if (IS_ERR(p)) { | ||
533 | ret = PTR_ERR(p); | ||
534 | break; | ||
535 | } | ||
536 | if (fmt) { | ||
537 | if (sscanf(p, fmt, result) == 0) | ||
538 | ret = -EINVAL; | ||
539 | kfree(p); | ||
540 | } else | ||
541 | *(char **)result = p; | ||
542 | } | ||
543 | va_end(ap); | ||
544 | return ret; | ||
545 | } | ||
546 | EXPORT_SYMBOL_GPL(xenbus_gather); | ||
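xenbus_gather() bundles several such reads into one call; a sketch under the same assumptions (node names illustrative). A NULL format stores the raw allocated string instead of scanning it, and the caller must kfree() it:

	unsigned long ring_ref;
	unsigned int evtchn;
	char *protocol;
	int err;

	err = xenbus_gather(XBT_NIL, dev->otherend,
			    "ring-ref", "%lu", &ring_ref,
			    "event-channel", "%u", &evtchn,
			    "protocol", NULL, &protocol,
			    NULL);
	if (err)
		return err;
	/* ... use the values; kfree(protocol) when done ... */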
547 | |||
548 | static int xs_watch(const char *path, const char *token) | ||
549 | { | ||
550 | struct kvec iov[2]; | ||
551 | |||
552 | iov[0].iov_base = (void *)path; | ||
553 | iov[0].iov_len = strlen(path) + 1; | ||
554 | iov[1].iov_base = (void *)token; | ||
555 | iov[1].iov_len = strlen(token) + 1; | ||
556 | |||
557 | return xs_error(xs_talkv(XBT_NIL, XS_WATCH, iov, | ||
558 | ARRAY_SIZE(iov), NULL)); | ||
559 | } | ||
560 | |||
561 | static int xs_unwatch(const char *path, const char *token) | ||
562 | { | ||
563 | struct kvec iov[2]; | ||
564 | |||
565 | iov[0].iov_base = (char *)path; | ||
566 | iov[0].iov_len = strlen(path) + 1; | ||
567 | iov[1].iov_base = (char *)token; | ||
568 | iov[1].iov_len = strlen(token) + 1; | ||
569 | |||
570 | return xs_error(xs_talkv(XBT_NIL, XS_UNWATCH, iov, | ||
571 | ARRAY_SIZE(iov), NULL)); | ||
572 | } | ||
573 | |||
574 | static struct xenbus_watch *find_watch(const char *token) | ||
575 | { | ||
576 | struct xenbus_watch *i, *cmp; | ||
577 | |||
578 | cmp = (void *)simple_strtoul(token, NULL, 16); | ||
579 | |||
580 | list_for_each_entry(i, &watches, list) | ||
581 | if (i == cmp) | ||
582 | return i; | ||
583 | |||
584 | return NULL; | ||
585 | } | ||
586 | |||
587 | /* Register callback to watch this node. */ | ||
588 | int register_xenbus_watch(struct xenbus_watch *watch) | ||
589 | { | ||
590 | /* The watch pointer, rendered in ASCII hex, is the token. */ | ||
591 | char token[sizeof(watch) * 2 + 1]; | ||
592 | int err; | ||
593 | |||
594 | sprintf(token, "%lX", (long)watch); | ||
595 | |||
596 | down_read(&xs_state.watch_mutex); | ||
597 | |||
598 | spin_lock(&watches_lock); | ||
599 | BUG_ON(find_watch(token)); | ||
600 | list_add(&watch->list, &watches); | ||
601 | spin_unlock(&watches_lock); | ||
602 | |||
603 | err = xs_watch(watch->node, token); | ||
604 | |||
605 | /* Ignore errors due to multiple registration. */ | ||
606 | if ((err != 0) && (err != -EEXIST)) { | ||
607 | spin_lock(&watches_lock); | ||
608 | list_del(&watch->list); | ||
609 | spin_unlock(&watches_lock); | ||
610 | } | ||
611 | |||
612 | up_read(&xs_state.watch_mutex); | ||
613 | |||
614 | return err; | ||
615 | } | ||
616 | EXPORT_SYMBOL_GPL(register_xenbus_watch); | ||
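Putting the token scheme above to use — a sketch of a static watch registration (the node path is hypothetical; the callback runs single-threaded from the xenwatch kthread further down):

	static void example_changed(struct xenbus_watch *watch,
				    const char **vec, unsigned int len)
	{
		/* vec[XS_WATCH_PATH] names the node that fired */
	}

	static struct xenbus_watch example_watch = {
		.node     = "device/example/0/state",
		.callback = example_changed,
	};

	...
	err = register_xenbus_watch(&example_watch);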
617 | |||
618 | void unregister_xenbus_watch(struct xenbus_watch *watch) | ||
619 | { | ||
620 | struct xs_stored_msg *msg, *tmp; | ||
621 | char token[sizeof(watch) * 2 + 1]; | ||
622 | int err; | ||
623 | |||
624 | sprintf(token, "%lX", (long)watch); | ||
625 | |||
626 | down_read(&xs_state.watch_mutex); | ||
627 | |||
628 | spin_lock(&watches_lock); | ||
629 | BUG_ON(!find_watch(token)); | ||
630 | list_del(&watch->list); | ||
631 | spin_unlock(&watches_lock); | ||
632 | |||
633 | err = xs_unwatch(watch->node, token); | ||
634 | if (err) | ||
635 | printk(KERN_WARNING | ||
636 | "XENBUS Failed to release watch %s: %i\n", | ||
637 | watch->node, err); | ||
638 | |||
639 | up_read(&xs_state.watch_mutex); | ||
640 | |||
641 | /* Make sure there are no callbacks running currently (unless | ||
642 | it's us) */ | ||
643 | if (current->pid != xenwatch_pid) | ||
644 | mutex_lock(&xenwatch_mutex); | ||
645 | |||
646 | /* Cancel pending watch events. */ | ||
647 | spin_lock(&watch_events_lock); | ||
648 | list_for_each_entry_safe(msg, tmp, &watch_events, list) { | ||
649 | if (msg->u.watch.handle != watch) | ||
650 | continue; | ||
651 | list_del(&msg->list); | ||
652 | kfree(msg->u.watch.vec); | ||
653 | kfree(msg); | ||
654 | } | ||
655 | spin_unlock(&watch_events_lock); | ||
656 | |||
657 | if (current->pid != xenwatch_pid) | ||
658 | mutex_unlock(&xenwatch_mutex); | ||
659 | } | ||
660 | EXPORT_SYMBOL_GPL(unregister_xenbus_watch); | ||
661 | |||
662 | void xs_suspend(void) | ||
663 | { | ||
664 | down_write(&xs_state.transaction_mutex); | ||
665 | down_write(&xs_state.watch_mutex); | ||
666 | mutex_lock(&xs_state.request_mutex); | ||
667 | mutex_lock(&xs_state.response_mutex); | ||
668 | } | ||
669 | |||
670 | void xs_resume(void) | ||
671 | { | ||
672 | struct xenbus_watch *watch; | ||
673 | char token[sizeof(watch) * 2 + 1]; | ||
674 | |||
675 | mutex_unlock(&xs_state.response_mutex); | ||
676 | mutex_unlock(&xs_state.request_mutex); | ||
677 | up_write(&xs_state.transaction_mutex); | ||
678 | |||
679 | /* No need for watches_lock: the watch_mutex is sufficient. */ | ||
680 | list_for_each_entry(watch, &watches, list) { | ||
681 | sprintf(token, "%lX", (long)watch); | ||
682 | xs_watch(watch->node, token); | ||
683 | } | ||
684 | |||
685 | up_write(&xs_state.watch_mutex); | ||
686 | } | ||
687 | |||
688 | void xs_suspend_cancel(void) | ||
689 | { | ||
690 | mutex_unlock(&xs_state.response_mutex); | ||
691 | mutex_unlock(&xs_state.request_mutex); | ||
692 | up_write(&xs_state.watch_mutex); | ||
693 | up_write(&xs_state.transaction_mutex); | ||
694 | } | ||
695 | |||
696 | static int xenwatch_thread(void *unused) | ||
697 | { | ||
698 | struct list_head *ent; | ||
699 | struct xs_stored_msg *msg; | ||
700 | |||
701 | for (;;) { | ||
702 | wait_event_interruptible(watch_events_waitq, | ||
703 | !list_empty(&watch_events)); | ||
704 | |||
705 | if (kthread_should_stop()) | ||
706 | break; | ||
707 | |||
708 | mutex_lock(&xenwatch_mutex); | ||
709 | |||
710 | spin_lock(&watch_events_lock); | ||
711 | ent = watch_events.next; | ||
712 | if (ent != &watch_events) | ||
713 | list_del(ent); | ||
714 | spin_unlock(&watch_events_lock); | ||
715 | |||
716 | if (ent != &watch_events) { | ||
717 | msg = list_entry(ent, struct xs_stored_msg, list); | ||
718 | msg->u.watch.handle->callback( | ||
719 | msg->u.watch.handle, | ||
720 | (const char **)msg->u.watch.vec, | ||
721 | msg->u.watch.vec_size); | ||
722 | kfree(msg->u.watch.vec); | ||
723 | kfree(msg); | ||
724 | } | ||
725 | |||
726 | mutex_unlock(&xenwatch_mutex); | ||
727 | } | ||
728 | |||
729 | return 0; | ||
730 | } | ||
731 | |||
732 | static int process_msg(void) | ||
733 | { | ||
734 | struct xs_stored_msg *msg; | ||
735 | char *body; | ||
736 | int err; | ||
737 | |||
738 | /* | ||
739 | * We must disallow save/restore while reading a xenstore message. | ||
740 | * A partial read across s/r leaves us out of sync with xenstored. | ||
741 | */ | ||
742 | for (;;) { | ||
743 | err = xb_wait_for_data_to_read(); | ||
744 | if (err) | ||
745 | return err; | ||
746 | mutex_lock(&xs_state.response_mutex); | ||
747 | if (xb_data_to_read()) | ||
748 | break; | ||
749 | /* We raced with save/restore: pending data 'disappeared'. */ | ||
750 | mutex_unlock(&xs_state.response_mutex); | ||
751 | } | ||
752 | |||
753 | |||
754 | msg = kmalloc(sizeof(*msg), GFP_KERNEL); | ||
755 | if (msg == NULL) { | ||
756 | err = -ENOMEM; | ||
757 | goto out; | ||
758 | } | ||
759 | |||
760 | err = xb_read(&msg->hdr, sizeof(msg->hdr)); | ||
761 | if (err) { | ||
762 | kfree(msg); | ||
763 | goto out; | ||
764 | } | ||
765 | |||
766 | body = kmalloc(msg->hdr.len + 1, GFP_KERNEL); | ||
767 | if (body == NULL) { | ||
768 | kfree(msg); | ||
769 | err = -ENOMEM; | ||
770 | goto out; | ||
771 | } | ||
772 | |||
773 | err = xb_read(body, msg->hdr.len); | ||
774 | if (err) { | ||
775 | kfree(body); | ||
776 | kfree(msg); | ||
777 | goto out; | ||
778 | } | ||
779 | body[msg->hdr.len] = '\0'; | ||
780 | |||
781 | if (msg->hdr.type == XS_WATCH_EVENT) { | ||
782 | msg->u.watch.vec = split(body, msg->hdr.len, | ||
783 | &msg->u.watch.vec_size); | ||
784 | if (IS_ERR(msg->u.watch.vec)) { | ||
785 | kfree(msg); | ||
786 | err = PTR_ERR(msg->u.watch.vec); | ||
787 | goto out; | ||
788 | } | ||
789 | |||
790 | spin_lock(&watches_lock); | ||
791 | msg->u.watch.handle = find_watch( | ||
792 | msg->u.watch.vec[XS_WATCH_TOKEN]); | ||
793 | if (msg->u.watch.handle != NULL) { | ||
794 | spin_lock(&watch_events_lock); | ||
795 | list_add_tail(&msg->list, &watch_events); | ||
796 | wake_up(&watch_events_waitq); | ||
797 | spin_unlock(&watch_events_lock); | ||
798 | } else { | ||
799 | kfree(msg->u.watch.vec); | ||
800 | kfree(msg); | ||
801 | } | ||
802 | spin_unlock(&watches_lock); | ||
803 | } else { | ||
804 | msg->u.reply.body = body; | ||
805 | spin_lock(&xs_state.reply_lock); | ||
806 | list_add_tail(&msg->list, &xs_state.reply_list); | ||
807 | spin_unlock(&xs_state.reply_lock); | ||
808 | wake_up(&xs_state.reply_waitq); | ||
809 | } | ||
810 | |||
811 | out: | ||
812 | mutex_unlock(&xs_state.response_mutex); | ||
813 | return err; | ||
814 | } | ||
815 | |||
816 | static int xenbus_thread(void *unused) | ||
817 | { | ||
818 | int err; | ||
819 | |||
820 | for (;;) { | ||
821 | err = process_msg(); | ||
822 | if (err) | ||
823 | printk(KERN_WARNING "XENBUS error %d while reading " | ||
824 | "message\n", err); | ||
825 | if (kthread_should_stop()) | ||
826 | break; | ||
827 | } | ||
828 | |||
829 | return 0; | ||
830 | } | ||
831 | |||
832 | int xs_init(void) | ||
833 | { | ||
834 | int err; | ||
835 | struct task_struct *task; | ||
836 | |||
837 | INIT_LIST_HEAD(&xs_state.reply_list); | ||
838 | spin_lock_init(&xs_state.reply_lock); | ||
839 | init_waitqueue_head(&xs_state.reply_waitq); | ||
840 | |||
841 | mutex_init(&xs_state.request_mutex); | ||
842 | mutex_init(&xs_state.response_mutex); | ||
843 | init_rwsem(&xs_state.transaction_mutex); | ||
844 | init_rwsem(&xs_state.watch_mutex); | ||
845 | |||
846 | /* Initialize the shared memory rings to talk to xenstored */ | ||
847 | err = xb_init_comms(); | ||
848 | if (err) | ||
849 | return err; | ||
850 | |||
851 | task = kthread_run(xenwatch_thread, NULL, "xenwatch"); | ||
852 | if (IS_ERR(task)) | ||
853 | return PTR_ERR(task); | ||
854 | xenwatch_pid = task->pid; | ||
855 | |||
856 | task = kthread_run(xenbus_thread, NULL, "xenbus"); | ||
857 | if (IS_ERR(task)) | ||
858 | return PTR_ERR(task); | ||
859 | |||
860 | return 0; | ||
861 | } | ||
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 352eb4a13f98..c4c36171240d 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c | |||
@@ -209,7 +209,7 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb) | |||
209 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | 209 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; |
210 | envp[2] = NULL; | 210 | envp[2] = NULL; |
211 | 211 | ||
212 | ret = call_usermodehelper(argv[0], argv, envp, 1); | 212 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); |
213 | if (ret < 0) | 213 | if (ret < 0) |
214 | mlog_errno(ret); | 214 | mlog_errno(ret); |
215 | } | 215 | } |
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h index 9e15ce0006eb..36f310632c49 100644 --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h | |||
@@ -41,6 +41,7 @@ extern int irqbalance_disable(char *str); | |||
41 | extern void fixup_irqs(cpumask_t map); | 41 | extern void fixup_irqs(cpumask_t map); |
42 | #endif | 42 | #endif |
43 | 43 | ||
44 | unsigned int do_IRQ(struct pt_regs *regs); | ||
44 | void init_IRQ(void); | 45 | void init_IRQ(void); |
45 | void __init native_init_IRQ(void); | 46 | void __init native_init_IRQ(void); |
46 | 47 | ||
diff --git a/include/asm-i386/mach-default/irq_vectors_limits.h b/include/asm-i386/mach-default/irq_vectors_limits.h index 7f161e760be6..a90c7a60109f 100644 --- a/include/asm-i386/mach-default/irq_vectors_limits.h +++ b/include/asm-i386/mach-default/irq_vectors_limits.h | |||
@@ -1,7 +1,7 @@ | |||
1 | #ifndef _ASM_IRQ_VECTORS_LIMITS_H | 1 | #ifndef _ASM_IRQ_VECTORS_LIMITS_H |
2 | #define _ASM_IRQ_VECTORS_LIMITS_H | 2 | #define _ASM_IRQ_VECTORS_LIMITS_H |
3 | 3 | ||
4 | #ifdef CONFIG_X86_IO_APIC | 4 | #if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT) |
5 | #define NR_IRQS 224 | 5 | #define NR_IRQS 224 |
6 | # if (224 >= 32 * NR_CPUS) | 6 | # if (224 >= 32 * NR_CPUS) |
7 | # define NR_IRQ_VECTORS NR_IRQS | 7 | # define NR_IRQ_VECTORS NR_IRQS |
diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h index 8198d1cca1f3..7eb0b0b1fb3c 100644 --- a/include/asm-i386/mmu_context.h +++ b/include/asm-i386/mmu_context.h | |||
@@ -32,6 +32,8 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) | |||
32 | #endif | 32 | #endif |
33 | } | 33 | } |
34 | 34 | ||
35 | void leave_mm(unsigned long cpu); | ||
36 | |||
35 | static inline void switch_mm(struct mm_struct *prev, | 37 | static inline void switch_mm(struct mm_struct *prev, |
36 | struct mm_struct *next, | 38 | struct mm_struct *next, |
37 | struct task_struct *tsk) | 39 | struct task_struct *tsk) |
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 7f846a7d6bcc..7df88be2dd9e 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h | |||
@@ -52,6 +52,8 @@ struct paravirt_ops | |||
52 | /* Basic arch-specific setup */ | 52 | /* Basic arch-specific setup */ |
53 | void (*arch_setup)(void); | 53 | void (*arch_setup)(void); |
54 | char *(*memory_setup)(void); | 54 | char *(*memory_setup)(void); |
55 | void (*post_allocator_init)(void); | ||
56 | |||
55 | void (*init_IRQ)(void); | 57 | void (*init_IRQ)(void); |
56 | void (*time_init)(void); | 58 | void (*time_init)(void); |
57 | 59 | ||
@@ -116,7 +118,7 @@ struct paravirt_ops | |||
116 | 118 | ||
117 | u64 (*read_tsc)(void); | 119 | u64 (*read_tsc)(void); |
118 | u64 (*read_pmc)(void); | 120 | u64 (*read_pmc)(void); |
119 | u64 (*get_scheduled_cycles)(void); | 121 | unsigned long long (*sched_clock)(void); |
120 | unsigned long (*get_cpu_khz)(void); | 122 | unsigned long (*get_cpu_khz)(void); |
121 | 123 | ||
122 | /* Segment descriptor handling */ | 124 | /* Segment descriptor handling */ |
@@ -173,7 +175,7 @@ struct paravirt_ops | |||
173 | unsigned long va); | 175 | unsigned long va); |
174 | 176 | ||
175 | /* Hooks for allocating/releasing pagetable pages */ | 177 | /* Hooks for allocating/releasing pagetable pages */ |
176 | void (*alloc_pt)(u32 pfn); | 178 | void (*alloc_pt)(struct mm_struct *mm, u32 pfn); |
177 | void (*alloc_pd)(u32 pfn); | 179 | void (*alloc_pd)(u32 pfn); |
178 | void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count); | 180 | void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count); |
179 | void (*release_pt)(u32 pfn); | 181 | void (*release_pt)(u32 pfn); |
@@ -260,6 +262,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len) | |||
260 | unsigned paravirt_patch_insns(void *site, unsigned len, | 262 | unsigned paravirt_patch_insns(void *site, unsigned len, |
261 | const char *start, const char *end); | 263 | const char *start, const char *end); |
262 | 264 | ||
265 | int paravirt_disable_iospace(void); | ||
263 | 266 | ||
264 | /* | 267 | /* |
265 | * This generates an indirect call based on the operation type number. | 268 | * This generates an indirect call based on the operation type number. |
@@ -563,7 +566,10 @@ static inline u64 paravirt_read_tsc(void) | |||
563 | 566 | ||
564 | #define rdtscll(val) (val = paravirt_read_tsc()) | 567 | #define rdtscll(val) (val = paravirt_read_tsc()) |
565 | 568 | ||
566 | #define get_scheduled_cycles(val) (val = paravirt_ops.get_scheduled_cycles()) | 569 | static inline unsigned long long paravirt_sched_clock(void) |
570 | { | ||
571 | return PVOP_CALL0(unsigned long long, sched_clock); | ||
572 | } | ||
567 | #define calculate_cpu_khz() (paravirt_ops.get_cpu_khz()) | 573 | #define calculate_cpu_khz() (paravirt_ops.get_cpu_khz()) |
568 | 574 | ||
569 | #define write_tsc(val1,val2) wrmsr(0x10, val1, val2) | 575 | #define write_tsc(val1,val2) wrmsr(0x10, val1, val2) |
@@ -669,6 +675,12 @@ static inline void setup_secondary_clock(void) | |||
669 | } | 675 | } |
670 | #endif | 676 | #endif |
671 | 677 | ||
678 | static inline void paravirt_post_allocator_init(void) | ||
679 | { | ||
680 | if (paravirt_ops.post_allocator_init) | ||
681 | (*paravirt_ops.post_allocator_init)(); | ||
682 | } | ||
683 | |||
672 | static inline void paravirt_pagetable_setup_start(pgd_t *base) | 684 | static inline void paravirt_pagetable_setup_start(pgd_t *base) |
673 | { | 685 | { |
674 | if (paravirt_ops.pagetable_setup_start) | 686 | if (paravirt_ops.pagetable_setup_start) |
@@ -725,9 +737,9 @@ static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, | |||
725 | PVOP_VCALL3(flush_tlb_others, &cpumask, mm, va); | 737 | PVOP_VCALL3(flush_tlb_others, &cpumask, mm, va); |
726 | } | 738 | } |
727 | 739 | ||
728 | static inline void paravirt_alloc_pt(unsigned pfn) | 740 | static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn) |
729 | { | 741 | { |
730 | PVOP_VCALL1(alloc_pt, pfn); | 742 | PVOP_VCALL2(alloc_pt, mm, pfn); |
731 | } | 743 | } |
732 | static inline void paravirt_release_pt(unsigned pfn) | 744 | static inline void paravirt_release_pt(unsigned pfn) |
733 | { | 745 | { |
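Since the get_scheduled_cycles hook becomes sched_clock here, a hedged sketch of how a backend might fill it (native_read_tsc() and the cycles_2_ns() helper from the timer.h change below are assumptions about the surrounding tree; Xen's real implementation arrives later in this series):

	static unsigned long long example_sched_clock(void)
	{
		/* raw cycle count, scaled to nanoseconds */
		return cycles_2_ns(native_read_tsc());
	}

	/* wired up in the backend's paravirt_ops initializer: */
	.sched_clock = example_sched_clock,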
diff --git a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h index d07b7afc2692..f2fc33ceb9f2 100644 --- a/include/asm-i386/pgalloc.h +++ b/include/asm-i386/pgalloc.h | |||
@@ -7,7 +7,7 @@ | |||
7 | #ifdef CONFIG_PARAVIRT | 7 | #ifdef CONFIG_PARAVIRT |
8 | #include <asm/paravirt.h> | 8 | #include <asm/paravirt.h> |
9 | #else | 9 | #else |
10 | #define paravirt_alloc_pt(pfn) do { } while (0) | 10 | #define paravirt_alloc_pt(mm, pfn) do { } while (0) |
11 | #define paravirt_alloc_pd(pfn) do { } while (0) | 11 | #define paravirt_alloc_pd(pfn) do { } while (0) |
12 | #define paravirt_alloc_pd(pfn) do { } while (0) | 12 | #define paravirt_alloc_pd(pfn) do { } while (0) |
13 | #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0) | 13 | #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0) |
@@ -17,13 +17,13 @@ | |||
17 | 17 | ||
18 | #define pmd_populate_kernel(mm, pmd, pte) \ | 18 | #define pmd_populate_kernel(mm, pmd, pte) \ |
19 | do { \ | 19 | do { \ |
20 | paravirt_alloc_pt(__pa(pte) >> PAGE_SHIFT); \ | 20 | paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); \ |
21 | set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \ | 21 | set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \ |
22 | } while (0) | 22 | } while (0) |
23 | 23 | ||
24 | #define pmd_populate(mm, pmd, pte) \ | 24 | #define pmd_populate(mm, pmd, pte) \ |
25 | do { \ | 25 | do { \ |
26 | paravirt_alloc_pt(page_to_pfn(pte)); \ | 26 | paravirt_alloc_pt(mm, page_to_pfn(pte)); \ |
27 | set_pmd(pmd, __pmd(_PAGE_TABLE + \ | 27 | set_pmd(pmd, __pmd(_PAGE_TABLE + \ |
28 | ((unsigned long long)page_to_pfn(pte) << \ | 28 | ((unsigned long long)page_to_pfn(pte) << \ |
29 | (unsigned long long) PAGE_SHIFT))); \ | 29 | (unsigned long long) PAGE_SHIFT))); \ |
diff --git a/include/asm-i386/setup.h b/include/asm-i386/setup.h index 0d5bff9dc4a5..7862fe858a9e 100644 --- a/include/asm-i386/setup.h +++ b/include/asm-i386/setup.h | |||
@@ -81,6 +81,10 @@ void __init add_memory_region(unsigned long long start, | |||
81 | 81 | ||
82 | extern unsigned long init_pg_tables_end; | 82 | extern unsigned long init_pg_tables_end; |
83 | 83 | ||
84 | #ifndef CONFIG_PARAVIRT | ||
85 | #define paravirt_post_allocator_init() do {} while (0) | ||
86 | #endif | ||
87 | |||
84 | #endif /* __ASSEMBLY__ */ | 88 | #endif /* __ASSEMBLY__ */ |
85 | 89 | ||
86 | #endif /* __KERNEL__ */ | 90 | #endif /* __KERNEL__ */ |
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index 0c7132787062..1f73bde165b1 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h | |||
@@ -43,9 +43,12 @@ extern u8 x86_cpu_to_apicid[]; | |||
43 | 43 | ||
44 | #define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] | 44 | #define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] |
45 | 45 | ||
46 | extern void set_cpu_sibling_map(int cpu); | ||
47 | |||
46 | #ifdef CONFIG_HOTPLUG_CPU | 48 | #ifdef CONFIG_HOTPLUG_CPU |
47 | extern void cpu_exit_clear(void); | 49 | extern void cpu_exit_clear(void); |
48 | extern void cpu_uninit(void); | 50 | extern void cpu_uninit(void); |
51 | extern void remove_siblinginfo(int cpu); | ||
49 | #endif | 52 | #endif |
50 | 53 | ||
51 | struct smp_ops | 54 | struct smp_ops |
@@ -129,6 +132,8 @@ extern int __cpu_disable(void); | |||
129 | extern void __cpu_die(unsigned int cpu); | 132 | extern void __cpu_die(unsigned int cpu); |
130 | extern unsigned int num_processors; | 133 | extern unsigned int num_processors; |
131 | 134 | ||
135 | void __cpuinit smp_store_cpu_info(int id); | ||
136 | |||
132 | #endif /* !__ASSEMBLY__ */ | 137 | #endif /* !__ASSEMBLY__ */ |
133 | 138 | ||
134 | #else /* CONFIG_SMP */ | 139 | #else /* CONFIG_SMP */ |
diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h index 153770e25faa..51a713e33a9e 100644 --- a/include/asm-i386/timer.h +++ b/include/asm-i386/timer.h | |||
@@ -15,8 +15,38 @@ extern int no_sync_cmos_clock; | |||
15 | extern int recalibrate_cpu_khz(void); | 15 | extern int recalibrate_cpu_khz(void); |
16 | 16 | ||
17 | #ifndef CONFIG_PARAVIRT | 17 | #ifndef CONFIG_PARAVIRT |
18 | #define get_scheduled_cycles(val) rdtscll(val) | ||
19 | #define calculate_cpu_khz() native_calculate_cpu_khz() | 18 | #define calculate_cpu_khz() native_calculate_cpu_khz() |
20 | #endif | 19 | #endif |
21 | 20 | ||
21 | /* Accelerators for sched_clock() | ||
22 | * convert from cycles (64 bits) => nanoseconds (64 bits) | ||
23 | * basic equation: | ||
24 | * ns = cycles / (freq / ns_per_sec) | ||
25 | * ns = cycles * (ns_per_sec / freq) | ||
26 | * ns = cycles * (10^9 / (cpu_khz * 10^3)) | ||
27 | * ns = cycles * (10^6 / cpu_khz) | ||
28 | * | ||
29 | * Then we use scaling math (suggested by george@mvista.com) to get: | ||
30 | * ns = cycles * (10^6 * SC / cpu_khz) / SC | ||
31 | * ns = cycles * cyc2ns_scale / SC | ||
32 | * | ||
33 | * And since SC is a constant power of two, we can convert the div | ||
34 | * into a shift. | ||
35 | * | ||
36 | * We can use a khz divisor instead of mhz to keep better precision, since | ||
37 | * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. | ||
38 | * (mathieu.desnoyers@polymtl.ca) | ||
39 | * | ||
40 | * -johnstul@us.ibm.com "math is hard, lets go shopping!" | ||
41 | */ | ||
42 | extern unsigned long cyc2ns_scale __read_mostly; | ||
43 | |||
44 | #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ | ||
45 | |||
46 | static inline unsigned long long cycles_2_ns(unsigned long long cyc) | ||
47 | { | ||
48 | return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; | ||
49 | } | ||
50 | |||
51 | |||
22 | #endif | 52 | #endif |
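A quick worked example of the scaled math above (the scale itself is set elsewhere in the tree, essentially cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR) / cpu_khz): on a 2 GHz CPU, cpu_khz = 2000000, so cyc2ns_scale = (10^6 * 2^10) / (2 * 10^6) = 512, and cycles_2_ns(cyc) = (cyc * 512) >> 10 = cyc / 2 — i.e. 0.5 ns per cycle, as expected for 2 GHz.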
diff --git a/include/asm-i386/vmi_time.h b/include/asm-i386/vmi_time.h index 213930b995cb..478188130328 100644 --- a/include/asm-i386/vmi_time.h +++ b/include/asm-i386/vmi_time.h | |||
@@ -49,7 +49,7 @@ extern struct vmi_timer_ops { | |||
49 | extern void __init vmi_time_init(void); | 49 | extern void __init vmi_time_init(void); |
50 | extern unsigned long vmi_get_wallclock(void); | 50 | extern unsigned long vmi_get_wallclock(void); |
51 | extern int vmi_set_wallclock(unsigned long now); | 51 | extern int vmi_set_wallclock(unsigned long now); |
52 | extern unsigned long long vmi_get_sched_cycles(void); | 52 | extern unsigned long long vmi_sched_clock(void); |
53 | extern unsigned long vmi_cpu_khz(void); | 53 | extern unsigned long vmi_cpu_khz(void); |
54 | 54 | ||
55 | #ifdef CONFIG_X86_LOCAL_APIC | 55 | #ifdef CONFIG_X86_LOCAL_APIC |
diff --git a/include/asm-i386/xen/hypercall.h b/include/asm-i386/xen/hypercall.h new file mode 100644 index 000000000000..bc0ee7d961ca --- /dev/null +++ b/include/asm-i386/xen/hypercall.h | |||
@@ -0,0 +1,413 @@ | |||
1 | /****************************************************************************** | ||
2 | * hypercall.h | ||
3 | * | ||
4 | * Linux-specific hypervisor handling. | ||
5 | * | ||
6 | * Copyright (c) 2002-2004, K A Fraser | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License version 2 | ||
10 | * as published by the Free Software Foundation; or, when distributed | ||
11 | * separately from the Linux kernel or incorporated into other | ||
12 | * software packages, subject to the following license: | ||
13 | * | ||
14 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
15 | * of this source file (the "Software"), to deal in the Software without | ||
16 | * restriction, including without limitation the rights to use, copy, modify, | ||
17 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
18 | * and to permit persons to whom the Software is furnished to do so, subject to | ||
19 | * the following conditions: | ||
20 | * | ||
21 | * The above copyright notice and this permission notice shall be included in | ||
22 | * all copies or substantial portions of the Software. | ||
23 | * | ||
24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
26 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
27 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
28 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
29 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
30 | * IN THE SOFTWARE. | ||
31 | */ | ||
32 | |||
33 | #ifndef __HYPERCALL_H__ | ||
34 | #define __HYPERCALL_H__ | ||
35 | |||
36 | #include <linux/errno.h> | ||
37 | #include <linux/string.h> | ||
38 | |||
39 | #include <xen/interface/xen.h> | ||
40 | #include <xen/interface/sched.h> | ||
41 | #include <xen/interface/physdev.h> | ||
42 | |||
43 | extern struct { char _entry[32]; } hypercall_page[]; | ||
44 | |||
45 | #define _hypercall0(type, name) \ | ||
46 | ({ \ | ||
47 | long __res; \ | ||
48 | asm volatile ( \ | ||
49 | "call %[call]" \ | ||
50 | : "=a" (__res) \ | ||
51 | : [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ | ||
52 | : "memory" ); \ | ||
53 | (type)__res; \ | ||
54 | }) | ||
55 | |||
56 | #define _hypercall1(type, name, a1) \ | ||
57 | ({ \ | ||
58 | long __res, __ign1; \ | ||
59 | asm volatile ( \ | ||
60 | "call %[call]" \ | ||
61 | : "=a" (__res), "=b" (__ign1) \ | ||
62 | : "1" ((long)(a1)), \ | ||
63 | [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ | ||
64 | : "memory" ); \ | ||
65 | (type)__res; \ | ||
66 | }) | ||
67 | |||
68 | #define _hypercall2(type, name, a1, a2) \ | ||
69 | ({ \ | ||
70 | long __res, __ign1, __ign2; \ | ||
71 | asm volatile ( \ | ||
72 | "call %[call]" \ | ||
73 | : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \ | ||
74 | : "1" ((long)(a1)), "2" ((long)(a2)), \ | ||
75 | [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ | ||
76 | : "memory" ); \ | ||
77 | (type)__res; \ | ||
78 | }) | ||
79 | |||
80 | #define _hypercall3(type, name, a1, a2, a3) \ | ||
81 | ({ \ | ||
82 | long __res, __ign1, __ign2, __ign3; \ | ||
83 | asm volatile ( \ | ||
84 | "call %[call]" \ | ||
85 | : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ | ||
86 | "=d" (__ign3) \ | ||
87 | : "1" ((long)(a1)), "2" ((long)(a2)), \ | ||
88 | "3" ((long)(a3)), \ | ||
89 | [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ | ||
90 | : "memory" ); \ | ||
91 | (type)__res; \ | ||
92 | }) | ||
93 | |||
94 | #define _hypercall4(type, name, a1, a2, a3, a4) \ | ||
95 | ({ \ | ||
96 | long __res, __ign1, __ign2, __ign3, __ign4; \ | ||
97 | asm volatile ( \ | ||
98 | "call %[call]" \ | ||
99 | : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ | ||
100 | "=d" (__ign3), "=S" (__ign4) \ | ||
101 | : "1" ((long)(a1)), "2" ((long)(a2)), \ | ||
102 | "3" ((long)(a3)), "4" ((long)(a4)), \ | ||
103 | [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ | ||
104 | : "memory" ); \ | ||
105 | (type)__res; \ | ||
106 | }) | ||
107 | |||
108 | #define _hypercall5(type, name, a1, a2, a3, a4, a5) \ | ||
109 | ({ \ | ||
110 | long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \ | ||
111 | asm volatile ( \ | ||
112 | "call %[call]" \ | ||
113 | : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ | ||
114 | "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \ | ||
115 | : "1" ((long)(a1)), "2" ((long)(a2)), \ | ||
116 | "3" ((long)(a3)), "4" ((long)(a4)), \ | ||
117 | "5" ((long)(a5)), \ | ||
118 | [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ | ||
119 | : "memory" ); \ | ||
120 | (type)__res; \ | ||
121 | }) | ||
122 | |||
123 | static inline int | ||
124 | HYPERVISOR_set_trap_table(struct trap_info *table) | ||
125 | { | ||
126 | return _hypercall1(int, set_trap_table, table); | ||
127 | } | ||
128 | |||
129 | static inline int | ||
130 | HYPERVISOR_mmu_update(struct mmu_update *req, int count, | ||
131 | int *success_count, domid_t domid) | ||
132 | { | ||
133 | return _hypercall4(int, mmu_update, req, count, success_count, domid); | ||
134 | } | ||
135 | |||
136 | static inline int | ||
137 | HYPERVISOR_mmuext_op(struct mmuext_op *op, int count, | ||
138 | int *success_count, domid_t domid) | ||
139 | { | ||
140 | return _hypercall4(int, mmuext_op, op, count, success_count, domid); | ||
141 | } | ||
142 | |||
143 | static inline int | ||
144 | HYPERVISOR_set_gdt(unsigned long *frame_list, int entries) | ||
145 | { | ||
146 | return _hypercall2(int, set_gdt, frame_list, entries); | ||
147 | } | ||
148 | |||
149 | static inline int | ||
150 | HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp) | ||
151 | { | ||
152 | return _hypercall2(int, stack_switch, ss, esp); | ||
153 | } | ||
154 | |||
155 | static inline int | ||
156 | HYPERVISOR_set_callbacks(unsigned long event_selector, | ||
157 | unsigned long event_address, | ||
158 | unsigned long failsafe_selector, | ||
159 | unsigned long failsafe_address) | ||
160 | { | ||
161 | return _hypercall4(int, set_callbacks, | ||
162 | event_selector, event_address, | ||
163 | failsafe_selector, failsafe_address); | ||
164 | } | ||
165 | |||
166 | static inline int | ||
167 | HYPERVISOR_fpu_taskswitch(int set) | ||
168 | { | ||
169 | return _hypercall1(int, fpu_taskswitch, set); | ||
170 | } | ||
171 | |||
172 | static inline int | ||
173 | HYPERVISOR_sched_op(int cmd, unsigned long arg) | ||
174 | { | ||
175 | return _hypercall2(int, sched_op, cmd, arg); | ||
176 | } | ||
177 | |||
178 | static inline long | ||
179 | HYPERVISOR_set_timer_op(u64 timeout) | ||
180 | { | ||
181 | unsigned long timeout_hi = (unsigned long)(timeout>>32); | ||
182 | unsigned long timeout_lo = (unsigned long)timeout; | ||
183 | return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi); | ||
184 | } | ||
185 | |||
186 | static inline int | ||
187 | HYPERVISOR_set_debugreg(int reg, unsigned long value) | ||
188 | { | ||
189 | return _hypercall2(int, set_debugreg, reg, value); | ||
190 | } | ||
191 | |||
192 | static inline unsigned long | ||
193 | HYPERVISOR_get_debugreg(int reg) | ||
194 | { | ||
195 | return _hypercall1(unsigned long, get_debugreg, reg); | ||
196 | } | ||
197 | |||
198 | static inline int | ||
199 | HYPERVISOR_update_descriptor(u64 ma, u64 desc) | ||
200 | { | ||
201 | return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); | ||
202 | } | ||
203 | |||
204 | static inline int | ||
205 | HYPERVISOR_memory_op(unsigned int cmd, void *arg) | ||
206 | { | ||
207 | return _hypercall2(int, memory_op, cmd, arg); | ||
208 | } | ||
209 | |||
210 | static inline int | ||
211 | HYPERVISOR_multicall(void *call_list, int nr_calls) | ||
212 | { | ||
213 | return _hypercall2(int, multicall, call_list, nr_calls); | ||
214 | } | ||
215 | |||
216 | static inline int | ||
217 | HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val, | ||
218 | unsigned long flags) | ||
219 | { | ||
220 | unsigned long pte_hi = 0; | ||
221 | #ifdef CONFIG_X86_PAE | ||
222 | pte_hi = new_val.pte_high; | ||
223 | #endif | ||
224 | return _hypercall4(int, update_va_mapping, va, | ||
225 | new_val.pte_low, pte_hi, flags); | ||
226 | } | ||
227 | |||
228 | static inline int | ||
229 | HYPERVISOR_event_channel_op(int cmd, void *arg) | ||
230 | { | ||
231 | int rc = _hypercall2(int, event_channel_op, cmd, arg); | ||
232 | if (unlikely(rc == -ENOSYS)) { | ||
233 | struct evtchn_op op; | ||
234 | op.cmd = cmd; | ||
235 | memcpy(&op.u, arg, sizeof(op.u)); | ||
236 | rc = _hypercall1(int, event_channel_op_compat, &op); | ||
237 | memcpy(arg, &op.u, sizeof(op.u)); | ||
238 | } | ||
239 | return rc; | ||
240 | } | ||
241 | |||
242 | static inline int | ||
243 | HYPERVISOR_xen_version(int cmd, void *arg) | ||
244 | { | ||
245 | return _hypercall2(int, xen_version, cmd, arg); | ||
246 | } | ||
247 | |||
248 | static inline int | ||
249 | HYPERVISOR_console_io(int cmd, int count, char *str) | ||
250 | { | ||
251 | return _hypercall3(int, console_io, cmd, count, str); | ||
252 | } | ||
253 | |||
254 | static inline int | ||
255 | HYPERVISOR_physdev_op(int cmd, void *arg) | ||
256 | { | ||
257 | int rc = _hypercall2(int, physdev_op, cmd, arg); | ||
258 | if (unlikely(rc == -ENOSYS)) { | ||
259 | struct physdev_op op; | ||
260 | op.cmd = cmd; | ||
261 | memcpy(&op.u, arg, sizeof(op.u)); | ||
262 | rc = _hypercall1(int, physdev_op_compat, &op); | ||
263 | memcpy(arg, &op.u, sizeof(op.u)); | ||
264 | } | ||
265 | return rc; | ||
266 | } | ||
267 | |||
268 | static inline int | ||
269 | HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count) | ||
270 | { | ||
271 | return _hypercall3(int, grant_table_op, cmd, uop, count); | ||
272 | } | ||
273 | |||
274 | static inline int | ||
275 | HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val, | ||
276 | unsigned long flags, domid_t domid) | ||
277 | { | ||
278 | unsigned long pte_hi = 0; | ||
279 | #ifdef CONFIG_X86_PAE | ||
280 | pte_hi = new_val.pte_high; | ||
281 | #endif | ||
282 | return _hypercall5(int, update_va_mapping_otherdomain, va, | ||
283 | new_val.pte_low, pte_hi, flags, domid); | ||
284 | } | ||
285 | |||
286 | static inline int | ||
287 | HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type) | ||
288 | { | ||
289 | return _hypercall2(int, vm_assist, cmd, type); | ||
290 | } | ||
291 | |||
292 | static inline int | ||
293 | HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args) | ||
294 | { | ||
295 | return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); | ||
296 | } | ||
297 | |||
298 | static inline int | ||
299 | HYPERVISOR_suspend(unsigned long srec) | ||
300 | { | ||
301 | return _hypercall3(int, sched_op, SCHEDOP_shutdown, | ||
302 | SHUTDOWN_suspend, srec); | ||
303 | } | ||
304 | |||
305 | static inline int | ||
306 | HYPERVISOR_nmi_op(unsigned long op, unsigned long arg) | ||
307 | { | ||
308 | return _hypercall2(int, nmi_op, op, arg); | ||
309 | } | ||
310 | |||
311 | static inline void | ||
312 | MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, | ||
313 | pte_t new_val, unsigned long flags) | ||
314 | { | ||
315 | mcl->op = __HYPERVISOR_update_va_mapping; | ||
316 | mcl->args[0] = va; | ||
317 | #ifdef CONFIG_X86_PAE | ||
318 | mcl->args[1] = new_val.pte_low; | ||
319 | mcl->args[2] = new_val.pte_high; | ||
320 | #else | ||
321 | mcl->args[1] = new_val.pte_low; | ||
322 | mcl->args[2] = 0; | ||
323 | #endif | ||
324 | mcl->args[3] = flags; | ||
325 | } | ||
326 | |||
327 | static inline void | ||
328 | MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd, | ||
329 | void *uop, unsigned int count) | ||
330 | { | ||
331 | mcl->op = __HYPERVISOR_grant_table_op; | ||
332 | mcl->args[0] = cmd; | ||
333 | mcl->args[1] = (unsigned long)uop; | ||
334 | mcl->args[2] = count; | ||
335 | } | ||
336 | |||
337 | static inline void | ||
338 | MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va, | ||
339 | pte_t new_val, unsigned long flags, | ||
340 | domid_t domid) | ||
341 | { | ||
342 | mcl->op = __HYPERVISOR_update_va_mapping_otherdomain; | ||
343 | mcl->args[0] = va; | ||
344 | #ifdef CONFIG_X86_PAE | ||
345 | mcl->args[1] = new_val.pte_low; | ||
346 | mcl->args[2] = new_val.pte_high; | ||
347 | #else | ||
348 | mcl->args[1] = new_val.pte_low; | ||
349 | mcl->args[2] = 0; | ||
350 | #endif | ||
351 | mcl->args[3] = flags; | ||
352 | mcl->args[4] = domid; | ||
353 | } | ||
354 | |||
355 | static inline void | ||
356 | MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, | ||
357 | struct desc_struct desc) | ||
358 | { | ||
359 | mcl->op = __HYPERVISOR_update_descriptor; | ||
360 | mcl->args[0] = maddr; | ||
361 | mcl->args[1] = maddr >> 32; | ||
362 | mcl->args[2] = desc.a; | ||
363 | mcl->args[3] = desc.b; | ||
364 | } | ||
365 | |||
366 | static inline void | ||
367 | MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg) | ||
368 | { | ||
369 | mcl->op = __HYPERVISOR_memory_op; | ||
370 | mcl->args[0] = cmd; | ||
371 | mcl->args[1] = (unsigned long)arg; | ||
372 | } | ||
373 | |||
374 | static inline void | ||
375 | MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req, | ||
376 | int count, int *success_count, domid_t domid) | ||
377 | { | ||
378 | mcl->op = __HYPERVISOR_mmu_update; | ||
379 | mcl->args[0] = (unsigned long)req; | ||
380 | mcl->args[1] = count; | ||
381 | mcl->args[2] = (unsigned long)success_count; | ||
382 | mcl->args[3] = domid; | ||
383 | } | ||
384 | |||
385 | static inline void | ||
386 | MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count, | ||
387 | int *success_count, domid_t domid) | ||
388 | { | ||
389 | mcl->op = __HYPERVISOR_mmuext_op; | ||
390 | mcl->args[0] = (unsigned long)op; | ||
391 | mcl->args[1] = count; | ||
392 | mcl->args[2] = (unsigned long)success_count; | ||
393 | mcl->args[3] = domid; | ||
394 | } | ||
395 | |||
396 | static inline void | ||
397 | MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries) | ||
398 | { | ||
399 | mcl->op = __HYPERVISOR_set_gdt; | ||
400 | mcl->args[0] = (unsigned long)frames; | ||
401 | mcl->args[1] = entries; | ||
402 | } | ||
403 | |||
404 | static inline void | ||
405 | MULTI_stack_switch(struct multicall_entry *mcl, | ||
406 | unsigned long ss, unsigned long esp) | ||
407 | { | ||
408 | mcl->op = __HYPERVISOR_stack_switch; | ||
409 | mcl->args[0] = ss; | ||
410 | mcl->args[1] = esp; | ||
411 | } | ||
412 | |||
413 | #endif /* __HYPERCALL_H__ */ | ||
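A hedged usage sketch for the wrappers above: the _hypercallN() expansions return in %eax and pass arguments in %ebx, %ecx, %edx, %esi, %edi, indirecting through the per-hypercall slot in hypercall_page. For example, yielding the CPU back to Xen (SCHEDOP_yield comes from xen/interface/sched.h):

	static inline void example_yield(void)
	{
		/* a call through hypercall_page[__HYPERVISOR_sched_op],
		 * with SCHEDOP_yield in %ebx and 0 in %ecx */
		HYPERVISOR_sched_op(SCHEDOP_yield, 0);
	}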
diff --git a/include/asm-i386/xen/hypervisor.h b/include/asm-i386/xen/hypervisor.h new file mode 100644 index 000000000000..8e15dd28c91f --- /dev/null +++ b/include/asm-i386/xen/hypervisor.h | |||
@@ -0,0 +1,73 @@ | |||
1 | /****************************************************************************** | ||
2 | * hypervisor.h | ||
3 | * | ||
4 | * Linux-specific hypervisor handling. | ||
5 | * | ||
6 | * Copyright (c) 2002-2004, K A Fraser | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License version 2 | ||
10 | * as published by the Free Software Foundation; or, when distributed | ||
11 | * separately from the Linux kernel or incorporated into other | ||
12 | * software packages, subject to the following license: | ||
13 | * | ||
14 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
15 | * of this source file (the "Software"), to deal in the Software without | ||
16 | * restriction, including without limitation the rights to use, copy, modify, | ||
17 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
18 | * and to permit persons to whom the Software is furnished to do so, subject to | ||
19 | * the following conditions: | ||
20 | * | ||
21 | * The above copyright notice and this permission notice shall be included in | ||
22 | * all copies or substantial portions of the Software. | ||
23 | * | ||
24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
26 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
27 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
28 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
29 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
30 | * IN THE SOFTWARE. | ||
31 | */ | ||
32 | |||
33 | #ifndef __HYPERVISOR_H__ | ||
34 | #define __HYPERVISOR_H__ | ||
35 | |||
36 | #include <linux/types.h> | ||
37 | #include <linux/kernel.h> | ||
38 | #include <linux/version.h> | ||
39 | |||
40 | #include <xen/interface/xen.h> | ||
41 | #include <xen/interface/version.h> | ||
42 | |||
43 | #include <asm/ptrace.h> | ||
44 | #include <asm/page.h> | ||
45 | #include <asm/desc.h> | ||
46 | #if defined(__i386__) | ||
47 | # ifdef CONFIG_X86_PAE | ||
48 | # include <asm-generic/pgtable-nopud.h> | ||
49 | # else | ||
50 | # include <asm-generic/pgtable-nopmd.h> | ||
51 | # endif | ||
52 | #endif | ||
53 | #include <asm/xen/hypercall.h> | ||
54 | |||
55 | /* arch/i386/kernel/setup.c */ | ||
56 | extern struct shared_info *HYPERVISOR_shared_info; | ||
57 | extern struct start_info *xen_start_info; | ||
58 | #define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN) | ||
59 | |||
60 | /* arch/i386/mach-xen/evtchn.c */ | ||
61 | /* Force a proper event-channel callback from Xen. */ | ||
62 | extern void force_evtchn_callback(void); | ||
63 | |||
64 | /* Turn jiffies into Xen system time. */ | ||
65 | u64 jiffies_to_st(unsigned long jiffies); | ||
66 | |||
67 | |||
68 | #define MULTI_UVMFLAGS_INDEX 3 | ||
69 | #define MULTI_UVMDOMID_INDEX 4 | ||
70 | |||
71 | #define is_running_on_xen() (xen_start_info ? 1 : 0) | ||
72 | |||
73 | #endif /* __HYPERVISOR_H__ */ | ||
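A small sketch of the intended guard pattern — xen_start_info is only set on the Xen boot path, so shared code can bail out early on bare hardware:

	static int example_init(void)
	{
		if (!is_running_on_xen())
			return -ENODEV;
		/* ... Xen-specific setup ... */
		return 0;
	}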
diff --git a/include/asm-i386/xen/interface.h b/include/asm-i386/xen/interface.h new file mode 100644 index 000000000000..165c3968e138 --- /dev/null +++ b/include/asm-i386/xen/interface.h | |||
@@ -0,0 +1,188 @@ | |||
1 | /****************************************************************************** | ||
2 | * arch-x86_32.h | ||
3 | * | ||
4 | * Guest OS interface to x86 32-bit Xen. | ||
5 | * | ||
6 | * Copyright (c) 2004, K A Fraser | ||
7 | */ | ||
8 | |||
9 | #ifndef __XEN_PUBLIC_ARCH_X86_32_H__ | ||
10 | #define __XEN_PUBLIC_ARCH_X86_32_H__ | ||
11 | |||
12 | #ifdef __XEN__ | ||
13 | #define __DEFINE_GUEST_HANDLE(name, type) \ | ||
14 | typedef struct { type *p; } __guest_handle_ ## name | ||
15 | #else | ||
16 | #define __DEFINE_GUEST_HANDLE(name, type) \ | ||
17 | typedef type * __guest_handle_ ## name | ||
18 | #endif | ||
19 | |||
20 | #define DEFINE_GUEST_HANDLE_STRUCT(name) \ | ||
21 | __DEFINE_GUEST_HANDLE(name, struct name) | ||
22 | #define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name) | ||
23 | #define GUEST_HANDLE(name) __guest_handle_ ## name | ||
24 | |||
25 | #ifndef __ASSEMBLY__ | ||
26 | /* Guest handles for primitive C types. */ | ||
27 | __DEFINE_GUEST_HANDLE(uchar, unsigned char); | ||
28 | __DEFINE_GUEST_HANDLE(uint, unsigned int); | ||
29 | __DEFINE_GUEST_HANDLE(ulong, unsigned long); | ||
30 | DEFINE_GUEST_HANDLE(char); | ||
31 | DEFINE_GUEST_HANDLE(int); | ||
32 | DEFINE_GUEST_HANDLE(long); | ||
33 | DEFINE_GUEST_HANDLE(void); | ||
34 | #endif | ||
35 | |||
36 | /* | ||
37 | * SEGMENT DESCRIPTOR TABLES | ||
38 | */ | ||
39 | /* | ||
40 | * A number of GDT entries are reserved by Xen. These are not situated at the | ||
41 | * start of the GDT because some stupid OSes export hard-coded selector values | ||
42 | * in their ABI. These hard-coded values are always near the start of the GDT, | ||
43 | * so Xen places itself out of the way, at the far end of the GDT. | ||
44 | */ | ||
45 | #define FIRST_RESERVED_GDT_PAGE 14 | ||
46 | #define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) | ||
47 | #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) | ||
48 | |||
49 | /* | ||
50 | * These flat segments are in the Xen-private section of every GDT. Since these | ||
51 | * are also present in the initial GDT, many OSes will be able to avoid | ||
52 | * installing their own GDT. | ||
53 | */ | ||
54 | #define FLAT_RING1_CS 0xe019 /* GDT index 259 */ | ||
55 | #define FLAT_RING1_DS 0xe021 /* GDT index 260 */ | ||
56 | #define FLAT_RING1_SS 0xe021 /* GDT index 260 */ | ||
57 | #define FLAT_RING3_CS 0xe02b /* GDT index 261 */ | ||
58 | #define FLAT_RING3_DS 0xe033 /* GDT index 262 */ | ||
59 | #define FLAT_RING3_SS 0xe033 /* GDT index 262 */ | ||
60 | |||
61 | #define FLAT_KERNEL_CS FLAT_RING1_CS | ||
62 | #define FLAT_KERNEL_DS FLAT_RING1_DS | ||
63 | #define FLAT_KERNEL_SS FLAT_RING1_SS | ||
64 | #define FLAT_USER_CS FLAT_RING3_CS | ||
65 | #define FLAT_USER_DS FLAT_RING3_DS | ||
66 | #define FLAT_USER_SS FLAT_RING3_SS | ||
67 | |||
68 | /* And the trap vector is... */ | ||
69 | #define TRAP_INSTR "int $0x82" | ||
70 | |||
71 | /* | ||
72 | * Virtual addresses beyond this are not modifiable by guest OSes. The | ||
73 | * machine->physical mapping table starts at this address, read-only. | ||
74 | */ | ||
75 | #ifdef CONFIG_X86_PAE | ||
76 | #define __HYPERVISOR_VIRT_START 0xF5800000 | ||
77 | #else | ||
78 | #define __HYPERVISOR_VIRT_START 0xFC000000 | ||
79 | #endif | ||
80 | |||
81 | #ifndef HYPERVISOR_VIRT_START | ||
82 | #define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) | ||
83 | #endif | ||
84 | |||
85 | #ifndef machine_to_phys_mapping | ||
86 | #define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) | ||
87 | #endif | ||
88 | |||
89 | /* Maximum number of virtual CPUs in multi-processor guests. */ | ||
90 | #define MAX_VIRT_CPUS 32 | ||
91 | |||
92 | #ifndef __ASSEMBLY__ | ||
93 | |||
94 | /* | ||
95 | * Send an array of these to HYPERVISOR_set_trap_table() | ||
96 | */ | ||
97 | #define TI_GET_DPL(_ti) ((_ti)->flags & 3) | ||
98 | #define TI_GET_IF(_ti) ((_ti)->flags & 4) | ||
99 | #define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl)) | ||
100 | #define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2)) | ||
101 | |||
102 | struct trap_info { | ||
103 | uint8_t vector; /* exception vector */ | ||
104 | uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */ | ||
105 | uint16_t cs; /* code selector */ | ||
106 | unsigned long address; /* code offset */ | ||
107 | }; | ||
108 | DEFINE_GUEST_HANDLE_STRUCT(trap_info); | ||
109 | |||
110 | struct cpu_user_regs { | ||
111 | uint32_t ebx; | ||
112 | uint32_t ecx; | ||
113 | uint32_t edx; | ||
114 | uint32_t esi; | ||
115 | uint32_t edi; | ||
116 | uint32_t ebp; | ||
117 | uint32_t eax; | ||
118 | uint16_t error_code; /* private */ | ||
119 | uint16_t entry_vector; /* private */ | ||
120 | uint32_t eip; | ||
121 | uint16_t cs; | ||
122 | uint8_t saved_upcall_mask; | ||
123 | uint8_t _pad0; | ||
124 | uint32_t eflags; /* eflags.IF == !saved_upcall_mask */ | ||
125 | uint32_t esp; | ||
126 | uint16_t ss, _pad1; | ||
127 | uint16_t es, _pad2; | ||
128 | uint16_t ds, _pad3; | ||
129 | uint16_t fs, _pad4; | ||
130 | uint16_t gs, _pad5; | ||
131 | }; | ||
132 | DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs); | ||
133 | |||
134 | typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */ | ||
135 | |||
136 | /* | ||
137 | * The following is all CPU context. Note that the fpu_ctxt block is filled | ||
138 | * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. | ||
139 | */ | ||
140 | struct vcpu_guest_context { | ||
141 | /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ | ||
142 | struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ | ||
143 | #define VGCF_I387_VALID (1<<0) | ||
144 | #define VGCF_HVM_GUEST (1<<1) | ||
145 | #define VGCF_IN_KERNEL (1<<2) | ||
146 | unsigned long flags; /* VGCF_* flags */ | ||
147 | struct cpu_user_regs user_regs; /* User-level CPU registers */ | ||
148 | struct trap_info trap_ctxt[256]; /* Virtual IDT */ | ||
149 | unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */ | ||
150 | unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */ | ||
151 | unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */ | ||
152 | unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */ | ||
153 | unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */ | ||
154 | unsigned long event_callback_cs; /* CS:EIP of event callback */ | ||
155 | unsigned long event_callback_eip; | ||
156 | unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */ | ||
157 | unsigned long failsafe_callback_eip; | ||
158 | unsigned long vm_assist; /* VMASST_TYPE_* bitmap */ | ||
159 | }; | ||
160 | DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); | ||
161 | |||
162 | struct arch_shared_info { | ||
163 | unsigned long max_pfn; /* max pfn that appears in table */ | ||
164 | /* Frame containing list of mfns containing list of mfns containing p2m. */ | ||
165 | unsigned long pfn_to_mfn_frame_list_list; | ||
166 | unsigned long nmi_reason; | ||
167 | }; | ||
168 | |||
169 | struct arch_vcpu_info { | ||
170 | unsigned long cr2; | ||
171 | unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */ | ||
172 | }; | ||
173 | |||
174 | #endif /* !__ASSEMBLY__ */ | ||
175 | |||
176 | /* | ||
177 | * Prefix forces emulation of some non-trapping instructions. | ||
178 | * Currently only CPUID. | ||
179 | */ | ||
180 | #ifdef __ASSEMBLY__ | ||
181 | #define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ; | ||
182 | #define XEN_CPUID XEN_EMULATE_PREFIX cpuid | ||
183 | #else | ||
184 | #define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; " | ||
185 | #define XEN_CPUID XEN_EMULATE_PREFIX "cpuid" | ||
186 | #endif | ||
187 | |||
188 | #endif | ||
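For clarity, a sketch of what the guest-handle macros at the top of this header expand to on the Linux side (the __XEN__ variant instead wraps the pointer in a one-member struct):

	/* DEFINE_GUEST_HANDLE_STRUCT(trap_info) expands to: */
	typedef struct trap_info *__guest_handle_trap_info;
	/* and GUEST_HANDLE(trap_info) then names that type */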
diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h index 9a1e0674e56c..e831759b2fb5 100644 --- a/include/linux/elfnote.h +++ b/include/linux/elfnote.h | |||
@@ -38,17 +38,25 @@ | |||
38 | * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two") | 38 | * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two") |
39 | * ELFNOTE(XYZCo, 12, .long, 0xdeadbeef) | 39 | * ELFNOTE(XYZCo, 12, .long, 0xdeadbeef) |
40 | */ | 40 | */ |
41 | #define ELFNOTE(name, type, desctype, descdata) \ | 41 | #define ELFNOTE_START(name, type, flags) \ |
42 | .pushsection .note.name, "",@note ; \ | 42 | .pushsection .note.name, flags,@note ; \ |
43 | .align 4 ; \ | 43 | .balign 4 ; \ |
44 | .long 2f - 1f /* namesz */ ; \ | 44 | .long 2f - 1f /* namesz */ ; \ |
45 | .long 4f - 3f /* descsz */ ; \ | 45 | .long 4484f - 3f /* descsz */ ; \ |
46 | .long type ; \ | 46 | .long type ; \ |
47 | 1:.asciz #name ; \ | 47 | 1:.asciz #name ; \ |
48 | 2:.align 4 ; \ | 48 | 2:.balign 4 ; \ |
49 | 3:desctype descdata ; \ | 49 | 3: |
50 | 4:.align 4 ; \ | 50 | |
51 | #define ELFNOTE_END \ | ||
52 | 4484:.balign 4 ; \ | ||
51 | .popsection ; | 53 | .popsection ; |
54 | |||
55 | #define ELFNOTE(name, type, desc) \ | ||
56 | ELFNOTE_START(name, type, "") \ | ||
57 | desc ; \ | ||
58 | ELFNOTE_END | ||
59 | |||
52 | #else /* !__ASSEMBLER__ */ | 60 | #else /* !__ASSEMBLER__ */ |
53 | #include <linux/elf.h> | 61 | #include <linux/elf.h> |
54 | /* | 62 | /* |
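With the rework, a note body can span multiple directives. Two hedged usage sketches in assembly (the Xen note types are assumed from xen/interface/elfnote.h, as used by this series' boot code):

	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")

	ELFNOTE_START(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, "")
		.long hypercall_page
	ELFNOTE_END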
diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 10f505c8431d..5dc13848891b 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h | |||
@@ -36,13 +36,57 @@ static inline int request_module(const char * name, ...) { return -ENOSYS; } | |||
36 | #define try_then_request_module(x, mod...) ((x) ?: (request_module(mod), (x))) | 36 | #define try_then_request_module(x, mod...) ((x) ?: (request_module(mod), (x))) |
37 | 37 | ||
38 | struct key; | 38 | struct key; |
39 | extern int call_usermodehelper_keys(char *path, char *argv[], char *envp[], | 39 | struct file; |
40 | struct key *session_keyring, int wait); | 40 | struct subprocess_info; |
41 | |||
42 | /* Allocate a subprocess_info structure */ | ||
43 | struct subprocess_info *call_usermodehelper_setup(char *path, | ||
44 | char **argv, char **envp); | ||
45 | |||
46 | /* Set various pieces of state into the subprocess_info structure */ | ||
47 | void call_usermodehelper_setkeys(struct subprocess_info *info, | ||
48 | struct key *session_keyring); | ||
49 | int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, | ||
50 | struct file **filp); | ||
51 | void call_usermodehelper_setcleanup(struct subprocess_info *info, | ||
52 | void (*cleanup)(char **argv, char **envp)); | ||
53 | |||
54 | enum umh_wait { | ||
55 | UMH_NO_WAIT = -1, /* don't wait at all */ | ||
56 | UMH_WAIT_EXEC = 0, /* wait for the exec, but not the process */ | ||
57 | UMH_WAIT_PROC = 1, /* wait for the process to complete */ | ||
58 | }; | ||
59 | |||
60 | /* Actually execute the sub-process */ | ||
61 | int call_usermodehelper_exec(struct subprocess_info *info, enum umh_wait wait); | ||
62 | |||
63 | /* Free the subprocess_info. This is only needed if you're not going | ||
64 | to call call_usermodehelper_exec */ | ||
65 | void call_usermodehelper_freeinfo(struct subprocess_info *info); | ||
41 | 66 | ||
42 | static inline int | 67 | static inline int |
43 | call_usermodehelper(char *path, char **argv, char **envp, int wait) | 68 | call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait) |
44 | { | 69 | { |
45 | return call_usermodehelper_keys(path, argv, envp, NULL, wait); | 70 | struct subprocess_info *info; |
71 | |||
72 | info = call_usermodehelper_setup(path, argv, envp); | ||
73 | if (info == NULL) | ||
74 | return -ENOMEM; | ||
75 | return call_usermodehelper_exec(info, wait); | ||
76 | } | ||
77 | |||
78 | static inline int | ||
79 | call_usermodehelper_keys(char *path, char **argv, char **envp, | ||
80 | struct key *session_keyring, enum umh_wait wait) | ||
81 | { | ||
82 | struct subprocess_info *info; | ||
83 | |||
84 | info = call_usermodehelper_setup(path, argv, envp); | ||
85 | if (info == NULL) | ||
86 | return -ENOMEM; | ||
87 | |||
88 | call_usermodehelper_setkeys(info, session_keyring); | ||
89 | return call_usermodehelper_exec(info, wait); | ||
46 | } | 90 | } |
47 | 91 | ||
48 | extern void usermodehelper_init(void); | 92 | extern void usermodehelper_init(void); |
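A hedged sketch of driving the split API directly, including the new stdin pipe (paths and error handling are illustrative; call_usermodehelper_exec() consumes the info, so freeinfo is only needed when exec is never reached):

	struct subprocess_info *info;
	struct file *pipe;
	char *argv[] = { "/sbin/example-helper", NULL };	/* hypothetical */
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	int ret;

	info = call_usermodehelper_setup(argv[0], argv, envp);
	if (info == NULL)
		return -ENOMEM;

	ret = call_usermodehelper_stdinpipe(info, &pipe);
	if (ret) {
		call_usermodehelper_freeinfo(info);
		return ret;
	}

	ret = call_usermodehelper_exec(info, UMH_WAIT_EXEC);
	/* on success, write the helper's input to 'pipe', then release it */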
diff --git a/include/linux/major.h b/include/linux/major.h index 7e7c9093919a..0cb98053537a 100644 --- a/include/linux/major.h +++ b/include/linux/major.h | |||
@@ -158,6 +158,8 @@ | |||
158 | #define VXSPEC_MAJOR 200 /* VERITAS volume config driver */ | 158 | #define VXSPEC_MAJOR 200 /* VERITAS volume config driver */ |
159 | #define VXDMP_MAJOR 201 /* VERITAS volume multipath driver */ | 159 | #define VXDMP_MAJOR 201 /* VERITAS volume multipath driver */ |
160 | 160 | ||
161 | #define XENVBD_MAJOR 202 /* Xen virtual block device */ | ||
162 | |||
161 | #define MSR_MAJOR 202 | 163 | #define MSR_MAJOR 202 |
162 | #define CPUID_MAJOR 203 | 164 | #define CPUID_MAJOR 203 |
163 | 165 | ||
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index ae2d79f2107e..731cd2ac3227 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h | |||
@@ -92,6 +92,7 @@ | |||
92 | 92 | ||
93 | /* PG_owner_priv_1 users should have descriptive aliases */ | 93 | /* PG_owner_priv_1 users should have descriptive aliases */ |
94 | #define PG_checked PG_owner_priv_1 /* Used by some filesystems */ | 94 | #define PG_checked PG_owner_priv_1 /* Used by some filesystems */ |
95 | #define PG_pinned PG_owner_priv_1 /* Xen pinned pagetable */ | ||
95 | 96 | ||
96 | #if (BITS_PER_LONG > 32) | 97 | #if (BITS_PER_LONG > 32) |
97 | /* | 98 | /* |
@@ -170,6 +171,10 @@ static inline void SetPageUptodate(struct page *page) | |||
170 | #define SetPageChecked(page) set_bit(PG_checked, &(page)->flags) | 171 | #define SetPageChecked(page) set_bit(PG_checked, &(page)->flags) |
171 | #define ClearPageChecked(page) clear_bit(PG_checked, &(page)->flags) | 172 | #define ClearPageChecked(page) clear_bit(PG_checked, &(page)->flags) |
172 | 173 | ||
174 | #define PagePinned(page) test_bit(PG_pinned, &(page)->flags) | ||
175 | #define SetPagePinned(page) set_bit(PG_pinned, &(page)->flags) | ||
176 | #define ClearPagePinned(page) clear_bit(PG_pinned, &(page)->flags) | ||
177 | |||
173 | #define PageReserved(page) test_bit(PG_reserved, &(page)->flags) | 178 | #define PageReserved(page) test_bit(PG_reserved, &(page)->flags) |
174 | #define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags) | 179 | #define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags) |
175 | #define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) | 180 | #define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) |
diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 1dd1c707311f..85ea63f462af 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h | |||
@@ -67,6 +67,11 @@ extern void kernel_power_off(void); | |||
67 | 67 | ||
68 | void ctrl_alt_del(void); | 68 | void ctrl_alt_del(void); |
69 | 69 | ||
70 | #define POWEROFF_CMD_PATH_LEN 256 | ||
71 | extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN]; | ||
72 | |||
73 | extern int orderly_poweroff(bool force); | ||
74 | |||
70 | /* | 75 | /* |
71 | * Emergency restart, callable from an interrupt handler. | 76 | * Emergency restart, callable from an interrupt handler. |
72 | */ | 77 | */ |
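
orderly_poweroff() runs the user-mode command stored in poweroff_cmd; with force set, a failure to launch the helper falls back to an immediate power-off. A hedged sketch of a caller (critical_overheat() is a hypothetical name):

/* Hypothetical handler: attempt a clean user-space shutdown first,
 * falling back to a hard power-off if the helper cannot run. */
static void critical_overheat(void)
{
	orderly_poweroff(true);		/* force */
}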
diff --git a/include/linux/string.h b/include/linux/string.h index 7f2eb6a477f9..836062b7582a 100644 --- a/include/linux/string.h +++ b/include/linux/string.h | |||
@@ -105,8 +105,12 @@ extern void * memchr(const void *,int,__kernel_size_t); | |||
105 | #endif | 105 | #endif |
106 | 106 | ||
107 | extern char *kstrdup(const char *s, gfp_t gfp); | 107 | extern char *kstrdup(const char *s, gfp_t gfp); |
108 | extern char *kstrndup(const char *s, size_t len, gfp_t gfp); | ||
108 | extern void *kmemdup(const void *src, size_t len, gfp_t gfp); | 109 | extern void *kmemdup(const void *src, size_t len, gfp_t gfp); |
109 | 110 | ||
111 | extern char **argv_split(gfp_t gfp, const char *str, int *argcp); | ||
112 | extern void argv_free(char **argv); | ||
113 | |||
110 | #ifdef __cplusplus | 114 | #ifdef __cplusplus |
111 | } | 115 | } |
112 | #endif | 116 | #endif |
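
argv_split() turns a whitespace-separated command string into a NULL-terminated vector suitable for the usermode-helper API, and argv_free() releases it. A minimal sketch:

static int demo_argv_split(void)
{
	int argc;
	char **argv = argv_split(GFP_KERNEL, "/sbin/poweroff -f", &argc);

	if (!argv)
		return -ENOMEM;
	/* argv[0] == "/sbin/poweroff", argv[1] == "-f", argv[argc] == NULL */
	argv_free(argv);
	return 0;
}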
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 132b260aef1e..c2b10cae5da5 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h | |||
@@ -70,6 +70,10 @@ extern int map_vm_area(struct vm_struct *area, pgprot_t prot, | |||
70 | struct page ***pages); | 70 | struct page ***pages); |
71 | extern void unmap_kernel_range(unsigned long addr, unsigned long size); | 71 | extern void unmap_kernel_range(unsigned long addr, unsigned long size); |
72 | 72 | ||
73 | /* Allocate/destroy a 'vmalloc' VM area. */ | ||
74 | extern struct vm_struct *alloc_vm_area(size_t size); | ||
75 | extern void free_vm_area(struct vm_struct *area); | ||
76 | |||
73 | /* | 77 | /* |
74 | * Internals. Don't use. | 78 | * Internals. Don't use. |
75 | */ | 79 | */ |
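
alloc_vm_area() reserves kernel virtual address space with its page tables populated but no backing pages, exactly what Xen needs before asking the hypervisor to install mappings. A sketch, assuming a page-sized slot:

static void *reserve_mapping_slot(void)
{
	struct vm_struct *area = alloc_vm_area(PAGE_SIZE);

	if (area == NULL)
		return NULL;
	/* The PTEs behind area->addr exist but are empty; a hypercall can
	 * now install mappings there. free_vm_area(area) releases it. */
	return area->addr;
}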
diff --git a/include/xen/events.h b/include/xen/events.h new file mode 100644 index 000000000000..2bde54d29be5 --- /dev/null +++ b/include/xen/events.h | |||
@@ -0,0 +1,48 @@ | |||
1 | #ifndef _XEN_EVENTS_H | ||
2 | #define _XEN_EVENTS_H | ||
3 | |||
4 | #include <linux/interrupt.h> | ||
5 | |||
6 | #include <xen/interface/event_channel.h> | ||
7 | #include <asm/xen/hypercall.h> | ||
8 | |||
9 | enum ipi_vector { | ||
10 | XEN_RESCHEDULE_VECTOR, | ||
11 | XEN_CALL_FUNCTION_VECTOR, | ||
12 | |||
13 | XEN_NR_IPIS, | ||
14 | }; | ||
15 | |||
16 | int bind_evtchn_to_irq(unsigned int evtchn); | ||
17 | int bind_evtchn_to_irqhandler(unsigned int evtchn, | ||
18 | irq_handler_t handler, | ||
19 | unsigned long irqflags, const char *devname, | ||
20 | void *dev_id); | ||
21 | int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, | ||
22 | irq_handler_t handler, | ||
23 | unsigned long irqflags, const char *devname, | ||
24 | void *dev_id); | ||
25 | int bind_ipi_to_irqhandler(enum ipi_vector ipi, | ||
26 | unsigned int cpu, | ||
27 | irq_handler_t handler, | ||
28 | unsigned long irqflags, | ||
29 | const char *devname, | ||
30 | void *dev_id); | ||
31 | |||
32 | /* | ||
33 | * Common unbind function for all event sources. Takes IRQ to unbind from. | ||
34 | * Automatically closes the underlying event channel (even for bindings | ||
35 | * made with bind_evtchn_to_irqhandler()). | ||
36 | */ | ||
37 | void unbind_from_irqhandler(unsigned int irq, void *dev_id); | ||
38 | |||
39 | void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector); | ||
40 | |||
41 | static inline void notify_remote_via_evtchn(int port) | ||
42 | { | ||
43 | struct evtchn_send send = { .port = port }; | ||
44 | (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send); | ||
45 | } | ||
46 | |||
47 | extern void notify_remote_via_irq(int irq); | ||
48 | #endif /* _XEN_EVENTS_H */ | ||
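
A minimal sketch of the binding API above, assuming VIRQ_TIMER from xen/interface/xen.h; the return value is an ordinary Linux IRQ number, negative on error:

static irqreturn_t xen_timer_tick(int irq, void *dev_id)
{
	/* ... process the tick ... */
	return IRQ_HANDLED;
}

static int setup_timer_virq(void)
{
	int irq = bind_virq_to_irqhandler(VIRQ_TIMER, smp_processor_id(),
					  xen_timer_tick, IRQF_DISABLED,
					  "xen-timer", NULL);
	if (irq < 0)
		return irq;
	/* teardown: unbind_from_irqhandler(irq, NULL) also closes the
	 * underlying event channel */
	return 0;
}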
diff --git a/include/xen/features.h b/include/xen/features.h new file mode 100644 index 000000000000..27292d4d2a6a --- /dev/null +++ b/include/xen/features.h | |||
@@ -0,0 +1,23 @@ | |||
1 | /****************************************************************************** | ||
2 | * features.h | ||
3 | * | ||
4 | * Query the features reported by Xen. | ||
5 | * | ||
6 | * Copyright (c) 2006, Ian Campbell | ||
7 | */ | ||
8 | |||
9 | #ifndef __XEN_FEATURES_H__ | ||
10 | #define __XEN_FEATURES_H__ | ||
11 | |||
12 | #include <xen/interface/features.h> | ||
13 | |||
14 | void xen_setup_features(void); | ||
15 | |||
16 | extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32]; | ||
17 | |||
18 | static inline int xen_feature(int flag) | ||
19 | { | ||
20 | return xen_features[flag]; | ||
21 | } | ||
22 | |||
23 | #endif /* __XEN_FEATURES_H__ */ | ||
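
Usage is one setup call followed by cheap array lookups; a sketch using a flag from xen/interface/features.h (can_skip_p2m() is a hypothetical name):

static bool can_skip_p2m(void)
{
	xen_setup_features();	/* fills xen_features[] from the hypervisor */
	return xen_feature(XENFEAT_auto_translated_physmap);
}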
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h new file mode 100644 index 000000000000..761c83498e03 --- /dev/null +++ b/include/xen/grant_table.h | |||
@@ -0,0 +1,107 @@ | |||
1 | /****************************************************************************** | ||
2 | * grant_table.h | ||
3 | * | ||
4 | * Two sets of functionality: | ||
5 | * 1. Granting foreign access to our memory reservation. | ||
6 | * 2. Accessing others' memory reservations via grant references. | ||
7 | * (i.e., mechanisms for both sender and recipient of grant references) | ||
8 | * | ||
9 | * Copyright (c) 2004-2005, K A Fraser | ||
10 | * Copyright (c) 2005, Christopher Clark | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU General Public License version 2 | ||
14 | * as published by the Free Software Foundation; or, when distributed | ||
15 | * separately from the Linux kernel or incorporated into other | ||
16 | * software packages, subject to the following license: | ||
17 | * | ||
18 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
19 | * of this source file (the "Software"), to deal in the Software without | ||
20 | * restriction, including without limitation the rights to use, copy, modify, | ||
21 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
22 | * and to permit persons to whom the Software is furnished to do so, subject to | ||
23 | * the following conditions: | ||
24 | * | ||
25 | * The above copyright notice and this permission notice shall be included in | ||
26 | * all copies or substantial portions of the Software. | ||
27 | * | ||
28 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
29 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
30 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
31 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
32 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
33 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
34 | * IN THE SOFTWARE. | ||
35 | */ | ||
36 | |||
37 | #ifndef __ASM_GNTTAB_H__ | ||
38 | #define __ASM_GNTTAB_H__ | ||
39 | |||
40 | #include <asm/xen/hypervisor.h> | ||
41 | #include <xen/interface/grant_table.h> | ||
42 | |||
43 | /* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */ | ||
44 | #define NR_GRANT_FRAMES 4 | ||
45 | |||
46 | struct gnttab_free_callback { | ||
47 | struct gnttab_free_callback *next; | ||
48 | void (*fn)(void *); | ||
49 | void *arg; | ||
50 | u16 count; | ||
51 | }; | ||
52 | |||
53 | int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, | ||
54 | int readonly); | ||
55 | |||
56 | /* | ||
57 | * End access through the given grant reference, iff the grant entry is no | ||
58 | * longer in use. Return 1 if the grant entry was freed, 0 if it is still in | ||
59 | * use. | ||
60 | */ | ||
61 | int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly); | ||
62 | |||
63 | /* | ||
64 | * Eventually end access through the given grant reference, and once that | ||
65 | * access has been ended, free the given page too. Access will be ended | ||
66 | * immediately iff the grant entry is not in use, otherwise it will happen | ||
67 | * some time later. page may be 0, in which case no freeing will occur. | ||
68 | */ | ||
69 | void gnttab_end_foreign_access(grant_ref_t ref, int readonly, | ||
70 | unsigned long page); | ||
71 | |||
72 | int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); | ||
73 | |||
74 | unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); | ||
75 | unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); | ||
76 | |||
77 | int gnttab_query_foreign_access(grant_ref_t ref); | ||
78 | |||
79 | /* | ||
80 | * operations on reserved batches of grant references | ||
81 | */ | ||
82 | int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head); | ||
83 | |||
84 | void gnttab_free_grant_reference(grant_ref_t ref); | ||
85 | |||
86 | void gnttab_free_grant_references(grant_ref_t head); | ||
87 | |||
88 | int gnttab_empty_grant_references(const grant_ref_t *pprivate_head); | ||
89 | |||
90 | int gnttab_claim_grant_reference(grant_ref_t *pprivate_head); | ||
91 | |||
92 | void gnttab_release_grant_reference(grant_ref_t *private_head, | ||
93 | grant_ref_t release); | ||
94 | |||
95 | void gnttab_request_free_callback(struct gnttab_free_callback *callback, | ||
96 | void (*fn)(void *), void *arg, u16 count); | ||
97 | void gnttab_cancel_free_callback(struct gnttab_free_callback *callback); | ||
98 | |||
99 | void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, | ||
100 | unsigned long frame, int readonly); | ||
101 | |||
102 | void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, | ||
103 | unsigned long pfn); | ||
104 | |||
105 | #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr)) | ||
106 | |||
107 | #endif /* __ASM_GNTTAB_H__ */ | ||
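
A hedged sketch of the granting (frontend) side, where backend_id and mfn are hypothetical values obtained from xenbus and the pseudo-physical map:

static int grant_frame_to_backend(domid_t backend_id, unsigned long mfn)
{
	int ref = gnttab_grant_foreign_access(backend_id, mfn, 0 /* writable */);

	if (ref < 0)
		return ref;
	/* ... pass ref to the backend inside a shared-ring request ... */
	gnttab_end_foreign_access(ref, 0, 0UL);	/* page == 0: free nothing */
	return 0;
}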
diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h new file mode 100644 index 000000000000..21c0ecfd786d --- /dev/null +++ b/include/xen/hvc-console.h | |||
@@ -0,0 +1,6 @@ | |||
1 | #ifndef XEN_HVC_CONSOLE_H | ||
2 | #define XEN_HVC_CONSOLE_H | ||
3 | |||
4 | extern struct console xenboot_console; | ||
5 | |||
6 | #endif /* XEN_HVC_CONSOLE_H */ | ||
diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h new file mode 100644 index 000000000000..a64d3df5bd95 --- /dev/null +++ b/include/xen/interface/elfnote.h | |||
@@ -0,0 +1,133 @@ | |||
1 | /****************************************************************************** | ||
2 | * elfnote.h | ||
3 | * | ||
4 | * Definitions used for the Xen ELF notes. | ||
5 | * | ||
6 | * Copyright (c) 2006, Ian Campbell, XenSource Ltd. | ||
7 | */ | ||
8 | |||
9 | #ifndef __XEN_PUBLIC_ELFNOTE_H__ | ||
10 | #define __XEN_PUBLIC_ELFNOTE_H__ | ||
11 | |||
12 | /* | ||
13 | * The notes should live in a SHT_NOTE segment and have "Xen" in the | ||
14 | * name field. | ||
15 | * | ||
16 | * Numeric types are either 4 or 8 bytes depending on the content of | ||
17 | * the desc field. | ||
18 | * | ||
19 | * LEGACY indicates the fields in the legacy __xen_guest string which | ||
20 | * this note type replaces. | ||
21 | */ | ||
22 | |||
23 | /* | ||
24 | * NAME=VALUE pair (string). | ||
25 | * | ||
26 | * LEGACY: FEATURES and PAE | ||
27 | */ | ||
28 | #define XEN_ELFNOTE_INFO 0 | ||
29 | |||
30 | /* | ||
31 | * The virtual address of the entry point (numeric). | ||
32 | * | ||
33 | * LEGACY: VIRT_ENTRY | ||
34 | */ | ||
35 | #define XEN_ELFNOTE_ENTRY 1 | ||
36 | |||
37 | /* The virtual address of the hypercall transfer page (numeric). | ||
38 | * | ||
39 | * LEGACY: HYPERCALL_PAGE. (n.b. legacy value is a physical page | ||
40 | * number not a virtual address) | ||
41 | */ | ||
42 | #define XEN_ELFNOTE_HYPERCALL_PAGE 2 | ||
43 | |||
44 | /* The virtual address where the kernel image should be mapped (numeric). | ||
45 | * | ||
46 | * Defaults to 0. | ||
47 | * | ||
48 | * LEGACY: VIRT_BASE | ||
49 | */ | ||
50 | #define XEN_ELFNOTE_VIRT_BASE 3 | ||
51 | |||
52 | /* | ||
53 | * The offset of the ELF paddr field from the actual required | ||
54 | * pseudo-physical address (numeric). | ||
55 | * | ||
56 | * This is used to maintain backwards compatibility with older kernels | ||
57 | * which wrote __PAGE_OFFSET into that field. This field defaults to 0 | ||
58 | * if not present. | ||
59 | * | ||
60 | * LEGACY: ELF_PADDR_OFFSET. (n.b. legacy default is VIRT_BASE) | ||
61 | */ | ||
62 | #define XEN_ELFNOTE_PADDR_OFFSET 4 | ||
63 | |||
64 | /* | ||
65 | * The version of Xen that we work with (string). | ||
66 | * | ||
67 | * LEGACY: XEN_VER | ||
68 | */ | ||
69 | #define XEN_ELFNOTE_XEN_VERSION 5 | ||
70 | |||
71 | /* | ||
72 | * The name of the guest operating system (string). | ||
73 | * | ||
74 | * LEGACY: GUEST_OS | ||
75 | */ | ||
76 | #define XEN_ELFNOTE_GUEST_OS 6 | ||
77 | |||
78 | /* | ||
79 | * The version of the guest operating system (string). | ||
80 | * | ||
81 | * LEGACY: GUEST_VER | ||
82 | */ | ||
83 | #define XEN_ELFNOTE_GUEST_VERSION 7 | ||
84 | |||
85 | /* | ||
86 | * The loader type (string). | ||
87 | * | ||
88 | * LEGACY: LOADER | ||
89 | */ | ||
90 | #define XEN_ELFNOTE_LOADER 8 | ||
91 | |||
92 | /* | ||
93 | * The kernel supports PAE (x86/32 only, string = "yes" or "no"). | ||
94 | * | ||
95 | * LEGACY: PAE (n.b. The legacy interface included a provision to | ||
96 | * indicate 'extended-cr3' support allowing L3 page tables to be | ||
97 | * placed above 4G. It is assumed that any kernel new enough to use | ||
98 | * these ELF notes will include this and therefore "yes" here is | ||
99 | * equivalent to "yes[entended-cr3]" in the __xen_guest interface. | ||
100 | */ | ||
101 | #define XEN_ELFNOTE_PAE_MODE 9 | ||
102 | |||
103 | /* | ||
104 | * The features supported/required by this kernel (string). | ||
105 | * | ||
106 | * The string must consist of a list of feature names (as given in | ||
107 | * features.h, without the "XENFEAT_" prefix) separated by '|' | ||
108 | * characters. If a feature is required for the kernel to function | ||
109 | * then the feature name must be preceded by a '!' character. | ||
110 | * | ||
111 | * LEGACY: FEATURES | ||
112 | */ | ||
113 | #define XEN_ELFNOTE_FEATURES 10 | ||
114 | |||
115 | /* | ||
116 | * The kernel requires the symbol table to be loaded (string = "yes" or "no") | ||
117 | * LEGACY: BSD_SYMTAB (n.b. The legacy interface treated the presence | ||
118 | * or absence of this string as a boolean flag rather than requiring | ||
119 | * "yes" or "no".) | ||
120 | */ | ||
121 | #define XEN_ELFNOTE_BSD_SYMTAB 11 | ||
122 | |||
123 | #endif /* __XEN_PUBLIC_ELFNOTE_H__ */ | ||
124 | |||
125 | /* | ||
126 | * Local variables: | ||
127 | * mode: C | ||
128 | * c-set-style: "BSD" | ||
129 | * c-basic-offset: 4 | ||
130 | * tab-width: 4 | ||
131 | * indent-tabs-mode: nil | ||
132 | * End: | ||
133 | */ | ||
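
As a concrete instance of the FEATURES note format described above, a payload that treats writable pagetables as optional but writable descriptor tables as mandatory ('!') could read:

/* Hypothetical note payload; names come from features.h without the
 * XENFEAT_ prefix, '!' marks a feature the kernel cannot run without. */
static const char xen_feat_note[] =
	"writable_page_tables|!writable_descriptor_tables";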
diff --git a/include/xen/interface/event_channel.h b/include/xen/interface/event_channel.h new file mode 100644 index 000000000000..919b5bdcb2bd --- /dev/null +++ b/include/xen/interface/event_channel.h | |||
@@ -0,0 +1,195 @@ | |||
1 | /****************************************************************************** | ||
2 | * event_channel.h | ||
3 | * | ||
4 | * Event channels between domains. | ||
5 | * | ||
6 | * Copyright (c) 2003-2004, K A Fraser. | ||
7 | */ | ||
8 | |||
9 | #ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__ | ||
10 | #define __XEN_PUBLIC_EVENT_CHANNEL_H__ | ||
11 | |||
12 | typedef uint32_t evtchn_port_t; | ||
13 | DEFINE_GUEST_HANDLE(evtchn_port_t); | ||
14 | |||
15 | /* | ||
16 | * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as | ||
17 | * accepting interdomain bindings from domain <remote_dom>. A fresh port | ||
18 | * is allocated in <dom> and returned as <port>. | ||
19 | * NOTES: | ||
20 | * 1. If the caller is unprivileged then <dom> must be DOMID_SELF. | ||
21 | * 2. <rdom> may be DOMID_SELF, allowing loopback connections. | ||
22 | */ | ||
23 | #define EVTCHNOP_alloc_unbound 6 | ||
24 | struct evtchn_alloc_unbound { | ||
25 | /* IN parameters */ | ||
26 | domid_t dom, remote_dom; | ||
27 | /* OUT parameters */ | ||
28 | evtchn_port_t port; | ||
29 | }; | ||
30 | |||
31 | /* | ||
32 | * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between | ||
33 | * the calling domain and <remote_dom>. <remote_dom,remote_port> must identify | ||
34 | * a port that is unbound and marked as accepting bindings from the calling | ||
35 | * domain. A fresh port is allocated in the calling domain and returned as | ||
36 | * <local_port>. | ||
37 | * NOTES: | ||
38 | * 2. <remote_dom> may be DOMID_SELF, allowing loopback connections. | ||
39 | */ | ||
40 | #define EVTCHNOP_bind_interdomain 0 | ||
41 | struct evtchn_bind_interdomain { | ||
42 | /* IN parameters. */ | ||
43 | domid_t remote_dom; | ||
44 | evtchn_port_t remote_port; | ||
45 | /* OUT parameters. */ | ||
46 | evtchn_port_t local_port; | ||
47 | }; | ||
48 | |||
49 | /* | ||
50 | * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <irq> on specified | ||
51 | * vcpu. | ||
52 | * NOTES: | ||
53 | * 1. A virtual IRQ may be bound to at most one event channel per vcpu. | ||
54 | * 2. The allocated event channel is bound to the specified vcpu. The binding | ||
55 | * may not be changed. | ||
56 | */ | ||
57 | #define EVTCHNOP_bind_virq 1 | ||
58 | struct evtchn_bind_virq { | ||
59 | /* IN parameters. */ | ||
60 | uint32_t virq; | ||
61 | uint32_t vcpu; | ||
62 | /* OUT parameters. */ | ||
63 | evtchn_port_t port; | ||
64 | }; | ||
65 | |||
66 | /* | ||
67 | * EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <irq>. | ||
68 | * NOTES: | ||
69 | * 1. A physical IRQ may be bound to at most one event channel per domain. | ||
70 | * 2. Only a sufficiently-privileged domain may bind to a physical IRQ. | ||
71 | */ | ||
72 | #define EVTCHNOP_bind_pirq 2 | ||
73 | struct evtchn_bind_pirq { | ||
74 | /* IN parameters. */ | ||
75 | uint32_t pirq; | ||
76 | #define BIND_PIRQ__WILL_SHARE 1 | ||
77 | uint32_t flags; /* BIND_PIRQ__* */ | ||
78 | /* OUT parameters. */ | ||
79 | evtchn_port_t port; | ||
80 | }; | ||
81 | |||
82 | /* | ||
83 | * EVTCHNOP_bind_ipi: Bind a local event channel to receive events. | ||
84 | * NOTES: | ||
85 | * 1. The allocated event channel is bound to the specified vcpu. The binding | ||
86 | * may not be changed. | ||
87 | */ | ||
88 | #define EVTCHNOP_bind_ipi 7 | ||
89 | struct evtchn_bind_ipi { | ||
90 | uint32_t vcpu; | ||
91 | /* OUT parameters. */ | ||
92 | evtchn_port_t port; | ||
93 | }; | ||
94 | |||
95 | /* | ||
96 | * EVTCHNOP_close: Close a local event channel <port>. If the channel is | ||
97 | * interdomain then the remote end is placed in the unbound state | ||
98 | * (EVTCHNSTAT_unbound), awaiting a new connection. | ||
99 | */ | ||
100 | #define EVTCHNOP_close 3 | ||
101 | struct evtchn_close { | ||
102 | /* IN parameters. */ | ||
103 | evtchn_port_t port; | ||
104 | }; | ||
105 | |||
106 | /* | ||
107 | * EVTCHNOP_send: Send an event to the remote end of the channel whose local | ||
108 | * endpoint is <port>. | ||
109 | */ | ||
110 | #define EVTCHNOP_send 4 | ||
111 | struct evtchn_send { | ||
112 | /* IN parameters. */ | ||
113 | evtchn_port_t port; | ||
114 | }; | ||
115 | |||
116 | /* | ||
117 | * EVTCHNOP_status: Get the current status of the communication channel which | ||
118 | * has an endpoint at <dom, port>. | ||
119 | * NOTES: | ||
120 | * 1. <dom> may be specified as DOMID_SELF. | ||
121 | * 2. Only a sufficiently-privileged domain may obtain the status of an event | ||
122 | * channel for which <dom> is not DOMID_SELF. | ||
123 | */ | ||
124 | #define EVTCHNOP_status 5 | ||
125 | struct evtchn_status { | ||
126 | /* IN parameters */ | ||
127 | domid_t dom; | ||
128 | evtchn_port_t port; | ||
129 | /* OUT parameters */ | ||
130 | #define EVTCHNSTAT_closed 0 /* Channel is not in use. */ | ||
131 | #define EVTCHNSTAT_unbound 1 /* Channel is waiting interdom connection.*/ | ||
132 | #define EVTCHNSTAT_interdomain 2 /* Channel is connected to remote domain. */ | ||
133 | #define EVTCHNSTAT_pirq 3 /* Channel is bound to a phys IRQ line. */ | ||
134 | #define EVTCHNSTAT_virq 4 /* Channel is bound to a virtual IRQ line */ | ||
135 | #define EVTCHNSTAT_ipi 5 /* Channel is bound to a virtual IPI line */ | ||
136 | uint32_t status; | ||
137 | uint32_t vcpu; /* VCPU to which this channel is bound. */ | ||
138 | union { | ||
139 | struct { | ||
140 | domid_t dom; | ||
141 | } unbound; /* EVTCHNSTAT_unbound */ | ||
142 | struct { | ||
143 | domid_t dom; | ||
144 | evtchn_port_t port; | ||
145 | } interdomain; /* EVTCHNSTAT_interdomain */ | ||
146 | uint32_t pirq; /* EVTCHNSTAT_pirq */ | ||
147 | uint32_t virq; /* EVTCHNSTAT_virq */ | ||
148 | } u; | ||
149 | }; | ||
150 | |||
151 | /* | ||
152 | * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an | ||
153 | * event is pending. | ||
154 | * NOTES: | ||
155 | * 1. IPI- and VIRQ-bound channels always notify the vcpu that initialised | ||
156 | * the binding. This binding cannot be changed. | ||
157 | * 2. All other channels notify vcpu0 by default. This default is set when | ||
158 | * the channel is allocated (a port that is freed and subsequently reused | ||
159 | * has its binding reset to vcpu0). | ||
160 | */ | ||
161 | #define EVTCHNOP_bind_vcpu 8 | ||
162 | struct evtchn_bind_vcpu { | ||
163 | /* IN parameters. */ | ||
164 | evtchn_port_t port; | ||
165 | uint32_t vcpu; | ||
166 | }; | ||
167 | |||
168 | /* | ||
169 | * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver | ||
170 | * a notification to the appropriate VCPU if an event is pending. | ||
171 | */ | ||
172 | #define EVTCHNOP_unmask 9 | ||
173 | struct evtchn_unmask { | ||
174 | /* IN parameters. */ | ||
175 | evtchn_port_t port; | ||
176 | }; | ||
177 | |||
178 | struct evtchn_op { | ||
179 | uint32_t cmd; /* EVTCHNOP_* */ | ||
180 | union { | ||
181 | struct evtchn_alloc_unbound alloc_unbound; | ||
182 | struct evtchn_bind_interdomain bind_interdomain; | ||
183 | struct evtchn_bind_virq bind_virq; | ||
184 | struct evtchn_bind_pirq bind_pirq; | ||
185 | struct evtchn_bind_ipi bind_ipi; | ||
186 | struct evtchn_close close; | ||
187 | struct evtchn_send send; | ||
188 | struct evtchn_status status; | ||
189 | struct evtchn_bind_vcpu bind_vcpu; | ||
190 | struct evtchn_unmask unmask; | ||
191 | } u; | ||
192 | }; | ||
193 | DEFINE_GUEST_HANDLE_STRUCT(evtchn_op); | ||
194 | |||
195 | #endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */ | ||
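
Every operation above travels through one hypercall, as notify_remote_via_evtchn() in xen/events.h already shows for EVTCHNOP_send. A sketch of allocating an unbound port for a peer to bind (remote_domid is hypothetical; DOMID_SELF comes from xen/interface/xen.h):

static int alloc_unbound_port(domid_t remote_domid, evtchn_port_t *port)
{
	struct evtchn_alloc_unbound op = {
		.dom        = DOMID_SELF,
		.remote_dom = remote_domid,
	};
	int err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &op);

	if (err)
		return err;
	*port = op.port;	/* fresh local port awaiting the peer's bind */
	return 0;
}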
diff --git a/include/xen/interface/features.h b/include/xen/interface/features.h new file mode 100644 index 000000000000..d73228d16488 --- /dev/null +++ b/include/xen/interface/features.h | |||
@@ -0,0 +1,43 @@ | |||
1 | /****************************************************************************** | ||
2 | * features.h | ||
3 | * | ||
4 | * Feature flags, reported by XENVER_get_features. | ||
5 | * | ||
6 | * Copyright (c) 2006, Keir Fraser <keir@xensource.com> | ||
7 | */ | ||
8 | |||
9 | #ifndef __XEN_PUBLIC_FEATURES_H__ | ||
10 | #define __XEN_PUBLIC_FEATURES_H__ | ||
11 | |||
12 | /* | ||
13 | * If set, the guest does not need to write-protect its pagetables, and can | ||
14 | * update them via direct writes. | ||
15 | */ | ||
16 | #define XENFEAT_writable_page_tables 0 | ||
17 | |||
18 | /* | ||
19 | * If set, the guest does not need to write-protect its segment descriptor | ||
20 | * tables, and can update them via direct writes. | ||
21 | */ | ||
22 | #define XENFEAT_writable_descriptor_tables 1 | ||
23 | |||
24 | /* | ||
25 | * If set, translation between the guest's 'pseudo-physical' address space | ||
26 | * and the host's machine address space are handled by the hypervisor. In this | ||
27 | * mode the guest does not need to perform phys-to/from-machine translations | ||
28 | * when performing page table operations. | ||
29 | */ | ||
30 | #define XENFEAT_auto_translated_physmap 2 | ||
31 | |||
32 | /* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */ | ||
33 | #define XENFEAT_supervisor_mode_kernel 3 | ||
34 | |||
35 | /* | ||
36 | * If set, the guest does not need to allocate x86 PAE page directories | ||
37 | * below 4GB. This flag is usually implied by auto_translated_physmap. | ||
38 | */ | ||
39 | #define XENFEAT_pae_pgdir_above_4gb 4 | ||
40 | |||
41 | #define XENFEAT_NR_SUBMAPS 1 | ||
42 | |||
43 | #endif /* __XEN_PUBLIC_FEATURES_H__ */ | ||
diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h new file mode 100644 index 000000000000..219049802cf2 --- /dev/null +++ b/include/xen/interface/grant_table.h | |||
@@ -0,0 +1,375 @@ | |||
1 | /****************************************************************************** | ||
2 | * grant_table.h | ||
3 | * | ||
4 | * Interface for granting foreign access to page frames, and receiving | ||
5 | * page-ownership transfers. | ||
6 | * | ||
7 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
8 | * of this software and associated documentation files (the "Software"), to | ||
9 | * deal in the Software without restriction, including without limitation the | ||
10 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or | ||
11 | * sell copies of the Software, and to permit persons to whom the Software is | ||
12 | * furnished to do so, subject to the following conditions: | ||
13 | * | ||
14 | * The above copyright notice and this permission notice shall be included in | ||
15 | * all copies or substantial portions of the Software. | ||
16 | * | ||
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
22 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||
23 | * DEALINGS IN THE SOFTWARE. | ||
24 | * | ||
25 | * Copyright (c) 2004, K A Fraser | ||
26 | */ | ||
27 | |||
28 | #ifndef __XEN_PUBLIC_GRANT_TABLE_H__ | ||
29 | #define __XEN_PUBLIC_GRANT_TABLE_H__ | ||
30 | |||
31 | |||
32 | /*********************************** | ||
33 | * GRANT TABLE REPRESENTATION | ||
34 | */ | ||
35 | |||
36 | /* Some rough guidelines on accessing and updating grant-table entries | ||
37 | * in a concurrency-safe manner. For more information, Linux contains a | ||
38 | * reference implementation for guest OSes (arch/xen/kernel/grant_table.c). | ||
39 | * | ||
40 | * NB. WMB is a no-op on current-generation x86 processors. However, a | ||
41 | * compiler barrier will still be required. | ||
42 | * | ||
43 | * Introducing a valid entry into the grant table: | ||
44 | * 1. Write ent->domid. | ||
45 | * 2. Write ent->frame: | ||
46 | * GTF_permit_access: Frame to which access is permitted. | ||
47 | * GTF_accept_transfer: Pseudo-phys frame slot being filled by new | ||
48 | * frame, or zero if none. | ||
49 | * 3. Write memory barrier (WMB). | ||
50 | * 4. Write ent->flags, inc. valid type. | ||
51 | * | ||
52 | * Invalidating an unused GTF_permit_access entry: | ||
53 | * 1. flags = ent->flags. | ||
54 | * 2. Observe that !(flags & (GTF_reading|GTF_writing)). | ||
55 | * 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0). | ||
56 | * NB. No need for WMB as reuse of entry is control-dependent on success of | ||
57 | * step 3, and all architectures guarantee ordering of ctrl-dep writes. | ||
58 | * | ||
59 | * Invalidating an in-use GTF_permit_access entry: | ||
60 | * This cannot be done directly. Request assistance from the domain controller | ||
61 | * which can set a timeout on the use of a grant entry and take necessary | ||
62 | * action. (NB. This is not yet implemented!). | ||
63 | * | ||
64 | * Invalidating an unused GTF_accept_transfer entry: | ||
65 | * 1. flags = ent->flags. | ||
66 | * 2. Observe that !(flags & GTF_transfer_committed). [*] | ||
67 | * 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0). | ||
68 | * NB. No need for WMB as reuse of entry is control-dependent on success of | ||
69 | * step 3, and all architectures guarantee ordering of ctrl-dep writes. | ||
70 | * [*] If GTF_transfer_committed is set then the grant entry is 'committed'. | ||
71 | * The guest must /not/ modify the grant entry until the address of the | ||
72 | * transferred frame is written. It is safe for the guest to spin waiting | ||
73 | * for this to occur (detect by observing GTF_transfer_completed in | ||
74 | * ent->flags). | ||
75 | * | ||
76 | * Invalidating a committed GTF_accept_transfer entry: | ||
77 | * 1. Wait for (ent->flags & GTF_transfer_completed). | ||
78 | * | ||
79 | * Changing a GTF_permit_access from writable to read-only: | ||
80 | * Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing. | ||
81 | * | ||
82 | * Changing a GTF_permit_access from read-only to writable: | ||
83 | * Use SMP-safe bit-setting instruction. | ||
84 | */ | ||
85 | |||
86 | /* | ||
87 | * A grant table comprises a packed array of grant entries in one or more | ||
88 | * page frames shared between Xen and a guest. | ||
89 | * [XEN]: This field is written by Xen and read by the sharing guest. | ||
90 | * [GST]: This field is written by the guest and read by Xen. | ||
91 | */ | ||
92 | struct grant_entry { | ||
93 | /* GTF_xxx: various type and flag information. [XEN,GST] */ | ||
94 | uint16_t flags; | ||
95 | /* The domain being granted foreign privileges. [GST] */ | ||
96 | domid_t domid; | ||
97 | /* | ||
98 | * GTF_permit_access: Frame that @domid is allowed to map and access. [GST] | ||
99 | * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN] | ||
100 | */ | ||
101 | uint32_t frame; | ||
102 | }; | ||
103 | |||
104 | /* | ||
105 | * Type of grant entry. | ||
106 | * GTF_invalid: This grant entry grants no privileges. | ||
107 | * GTF_permit_access: Allow @domid to map/access @frame. | ||
108 | * GTF_accept_transfer: Allow @domid to transfer ownership of one page frame | ||
109 | * to this guest. Xen writes the page number to @frame. | ||
110 | */ | ||
111 | #define GTF_invalid (0U<<0) | ||
112 | #define GTF_permit_access (1U<<0) | ||
113 | #define GTF_accept_transfer (2U<<0) | ||
114 | #define GTF_type_mask (3U<<0) | ||
115 | |||
116 | /* | ||
117 | * Subflags for GTF_permit_access. | ||
118 | * GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST] | ||
119 | * GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN] | ||
120 | * GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN] | ||
121 | */ | ||
122 | #define _GTF_readonly (2) | ||
123 | #define GTF_readonly (1U<<_GTF_readonly) | ||
124 | #define _GTF_reading (3) | ||
125 | #define GTF_reading (1U<<_GTF_reading) | ||
126 | #define _GTF_writing (4) | ||
127 | #define GTF_writing (1U<<_GTF_writing) | ||
128 | |||
129 | /* | ||
130 | * Subflags for GTF_accept_transfer: | ||
131 | * GTF_transfer_committed: Xen sets this flag to indicate that it is committed | ||
132 | * to transferring ownership of a page frame. When a guest sees this flag | ||
133 | * it must /not/ modify the grant entry until GTF_transfer_completed is | ||
134 | * set by Xen. | ||
135 | * GTF_transfer_completed: It is safe for the guest to spin-wait on this flag | ||
136 | * after reading GTF_transfer_committed. Xen will always write the frame | ||
137 | * address, followed by ORing this flag, in a timely manner. | ||
138 | */ | ||
139 | #define _GTF_transfer_committed (2) | ||
140 | #define GTF_transfer_committed (1U<<_GTF_transfer_committed) | ||
141 | #define _GTF_transfer_completed (3) | ||
142 | #define GTF_transfer_completed (1U<<_GTF_transfer_completed) | ||
143 | |||
144 | |||
145 | /*********************************** | ||
146 | * GRANT TABLE QUERIES AND USES | ||
147 | */ | ||
148 | |||
149 | /* | ||
150 | * Reference to a grant entry in a specified domain's grant table. | ||
151 | */ | ||
152 | typedef uint32_t grant_ref_t; | ||
153 | |||
154 | /* | ||
155 | * Handle to track a mapping created via a grant reference. | ||
156 | */ | ||
157 | typedef uint32_t grant_handle_t; | ||
158 | |||
159 | /* | ||
160 | * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access | ||
161 | * by devices and/or host CPUs. If successful, <handle> is a tracking number | ||
162 | * that must be presented later to destroy the mapping(s). On error, <handle> | ||
163 | * is a negative status code. | ||
164 | * NOTES: | ||
165 | * 1. If GNTMAP_device_map is specified then <dev_bus_addr> is the address | ||
166 | * via which I/O devices may access the granted frame. | ||
167 | * 2. If GNTMAP_host_map is specified then a mapping will be added at | ||
168 | * either a host virtual address in the current address space, or at | ||
169 | * a PTE at the specified machine address. The type of mapping to | ||
170 | * perform is selected through the GNTMAP_contains_pte flag, and the | ||
171 | * address is specified in <host_addr>. | ||
172 | * 3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a | ||
173 | * host mapping is destroyed by other means then it is *NOT* guaranteed | ||
174 | * to be accounted to the correct grant reference! | ||
175 | */ | ||
176 | #define GNTTABOP_map_grant_ref 0 | ||
177 | struct gnttab_map_grant_ref { | ||
178 | /* IN parameters. */ | ||
179 | uint64_t host_addr; | ||
180 | uint32_t flags; /* GNTMAP_* */ | ||
181 | grant_ref_t ref; | ||
182 | domid_t dom; | ||
183 | /* OUT parameters. */ | ||
184 | int16_t status; /* GNTST_* */ | ||
185 | grant_handle_t handle; | ||
186 | uint64_t dev_bus_addr; | ||
187 | }; | ||
188 | |||
189 | /* | ||
190 | * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings | ||
191 | * tracked by <handle>. If <host_addr> or <dev_bus_addr> is zero, that | ||
192 | * field is ignored. If non-zero, they must refer to a device/host mapping | ||
193 | * that is tracked by <handle> | ||
194 | * NOTES: | ||
195 | * 1. The call may fail in an undefined manner if either mapping is not | ||
196 | * tracked by <handle>. | ||
197 | * 2. After executing a batch of unmaps, it is guaranteed that no stale | ||
198 | * mappings will remain in the device or host TLBs. | ||
199 | */ | ||
200 | #define GNTTABOP_unmap_grant_ref 1 | ||
201 | struct gnttab_unmap_grant_ref { | ||
202 | /* IN parameters. */ | ||
203 | uint64_t host_addr; | ||
204 | uint64_t dev_bus_addr; | ||
205 | grant_handle_t handle; | ||
206 | /* OUT parameters. */ | ||
207 | int16_t status; /* GNTST_* */ | ||
208 | }; | ||
209 | |||
210 | /* | ||
211 | * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least | ||
212 | * <nr_frames> pages. The frame addresses are written to the <frame_list>. | ||
213 | * Only <nr_frames> addresses are written, even if the table is larger. | ||
214 | * NOTES: | ||
215 | * 1. <dom> may be specified as DOMID_SELF. | ||
216 | * 2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF. | ||
217 | * 3. Xen may not support more than a single grant-table page per domain. | ||
218 | */ | ||
219 | #define GNTTABOP_setup_table 2 | ||
220 | struct gnttab_setup_table { | ||
221 | /* IN parameters. */ | ||
222 | domid_t dom; | ||
223 | uint32_t nr_frames; | ||
224 | /* OUT parameters. */ | ||
225 | int16_t status; /* GNTST_* */ | ||
226 | ulong *frame_list; | ||
227 | }; | ||
228 | |||
229 | /* | ||
230 | * GNTTABOP_dump_table: Dump the contents of the grant table to the | ||
231 | * xen console. Debugging use only. | ||
232 | */ | ||
233 | #define GNTTABOP_dump_table 3 | ||
234 | struct gnttab_dump_table { | ||
235 | /* IN parameters. */ | ||
236 | domid_t dom; | ||
237 | /* OUT parameters. */ | ||
238 | int16_t status; /* GNTST_* */ | ||
239 | }; | ||
240 | |||
241 | /* | ||
242 | * GNTTABOP_transfer: Transfer <frame> to a foreign domain. The | ||
243 | * foreign domain has previously registered its interest in the transfer via | ||
244 | * <domid, ref>. | ||
245 | * | ||
246 | * Note that, even if the transfer fails, the specified page no longer belongs | ||
247 | * to the calling domain *unless* the error is GNTST_bad_page. | ||
248 | */ | ||
249 | #define GNTTABOP_transfer 4 | ||
250 | struct gnttab_transfer { | ||
251 | /* IN parameters. */ | ||
252 | unsigned long mfn; | ||
253 | domid_t domid; | ||
254 | grant_ref_t ref; | ||
255 | /* OUT parameters. */ | ||
256 | int16_t status; | ||
257 | }; | ||
258 | |||
259 | |||
260 | /* | ||
261 | * GNTTABOP_copy: Hypervisor-based copy. | ||
262 | * Source and destination can be either MFNs or, for foreign domains, | ||
263 | * grant references. The foreign domain has to grant read/write access | ||
264 | * in its grant table. | ||
265 | * | ||
266 | * The flags specify what type source and destinations are (either MFN | ||
267 | * or grant reference). | ||
268 | * | ||
269 | * Note that this can also be used to copy data between two domains | ||
270 | * via a third party if the source and destination domains had previously | ||
271 | * granted appropriate access to their pages to the third party. | ||
272 | * | ||
273 | * source_offset specifies an offset in the source frame, dest_offset | ||
274 | * the offset in the target frame and len specifies the number of | ||
275 | * bytes to be copied. | ||
276 | */ | ||
277 | |||
278 | #define _GNTCOPY_source_gref (0) | ||
279 | #define GNTCOPY_source_gref (1<<_GNTCOPY_source_gref) | ||
280 | #define _GNTCOPY_dest_gref (1) | ||
281 | #define GNTCOPY_dest_gref (1<<_GNTCOPY_dest_gref) | ||
282 | |||
283 | #define GNTTABOP_copy 5 | ||
284 | struct gnttab_copy { | ||
285 | /* IN parameters. */ | ||
286 | struct { | ||
287 | union { | ||
288 | grant_ref_t ref; | ||
289 | unsigned long gmfn; | ||
290 | } u; | ||
291 | domid_t domid; | ||
292 | uint16_t offset; | ||
293 | } source, dest; | ||
294 | uint16_t len; | ||
295 | uint16_t flags; /* GNTCOPY_* */ | ||
296 | /* OUT parameters. */ | ||
297 | int16_t status; | ||
298 | }; | ||
299 | |||
300 | /* | ||
301 | * GNTTABOP_query_size: Query the current and maximum sizes of the shared | ||
302 | * grant table. | ||
303 | * NOTES: | ||
304 | * 1. <dom> may be specified as DOMID_SELF. | ||
305 | * 2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF. | ||
306 | */ | ||
307 | #define GNTTABOP_query_size 6 | ||
308 | struct gnttab_query_size { | ||
309 | /* IN parameters. */ | ||
310 | domid_t dom; | ||
311 | /* OUT parameters. */ | ||
312 | uint32_t nr_frames; | ||
313 | uint32_t max_nr_frames; | ||
314 | int16_t status; /* GNTST_* */ | ||
315 | }; | ||
316 | |||
317 | |||
318 | /* | ||
319 | * Bitfield values for gnttab_map_grant_ref.flags. | ||
320 | */ | ||
321 | /* Map the grant entry for access by I/O devices. */ | ||
322 | #define _GNTMAP_device_map (0) | ||
323 | #define GNTMAP_device_map (1<<_GNTMAP_device_map) | ||
324 | /* Map the grant entry for access by host CPUs. */ | ||
325 | #define _GNTMAP_host_map (1) | ||
326 | #define GNTMAP_host_map (1<<_GNTMAP_host_map) | ||
327 | /* Accesses to the granted frame will be restricted to read-only access. */ | ||
328 | #define _GNTMAP_readonly (2) | ||
329 | #define GNTMAP_readonly (1<<_GNTMAP_readonly) | ||
330 | /* | ||
331 | * GNTMAP_host_map subflag: | ||
332 | * 0 => The host mapping is usable only by the guest OS. | ||
333 | * 1 => The host mapping is usable by guest OS + current application. | ||
334 | */ | ||
335 | #define _GNTMAP_application_map (3) | ||
336 | #define GNTMAP_application_map (1<<_GNTMAP_application_map) | ||
337 | |||
338 | /* | ||
339 | * GNTMAP_contains_pte subflag: | ||
340 | * 0 => This map request contains a host virtual address. | ||
341 | * 1 => This map request contains the machine addess of the PTE to update. | ||
342 | */ | ||
343 | #define _GNTMAP_contains_pte (4) | ||
344 | #define GNTMAP_contains_pte (1<<_GNTMAP_contains_pte) | ||
345 | |||
346 | /* | ||
347 | * Values for error status returns. All errors are -ve. | ||
348 | */ | ||
349 | #define GNTST_okay (0) /* Normal return. */ | ||
350 | #define GNTST_general_error (-1) /* General undefined error. */ | ||
351 | #define GNTST_bad_domain (-2) /* Unrecognised domain id. */ | ||
352 | #define GNTST_bad_gntref (-3) /* Unrecognised or inappropriate gntref. */ | ||
353 | #define GNTST_bad_handle (-4) /* Unrecognised or inappropriate handle. */ | ||
354 | #define GNTST_bad_virt_addr (-5) /* Inappropriate virtual address to map. */ | ||
355 | #define GNTST_bad_dev_addr (-6) /* Inappropriate device address to unmap.*/ | ||
356 | #define GNTST_no_device_space (-7) /* Out of space in I/O MMU. */ | ||
357 | #define GNTST_permission_denied (-8) /* Not enough privilege for operation. */ | ||
358 | #define GNTST_bad_page (-9) /* Specified page was invalid for op. */ | ||
359 | #define GNTST_bad_copy_arg (-10) /* copy arguments cross page boundary */ | ||
360 | |||
361 | #define GNTTABOP_error_msgs { \ | ||
362 | "okay", \ | ||
363 | "undefined error", \ | ||
364 | "unrecognised domain id", \ | ||
365 | "invalid grant reference", \ | ||
366 | "invalid mapping handle", \ | ||
367 | "invalid virtual address", \ | ||
368 | "invalid device address", \ | ||
369 | "no spare translation slot in the I/O MMU", \ | ||
370 | "permission denied", \ | ||
371 | "bad page", \ | ||
372 | "copy arguments cross page boundary" \ | ||
373 | } | ||
374 | |||
375 | #endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */ | ||
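
A hedged sketch of the mapping (backend) side: one granted frame mapped at a kernel virtual address, where vaddr could come from alloc_vm_area() and gref/frontend_id arrive via the shared ring and xenbus:

static int map_frontend_frame(void *vaddr, grant_ref_t gref,
			      domid_t frontend_id, grant_handle_t *handle)
{
	struct gnttab_map_grant_ref map = {
		.host_addr = (uint64_t)(unsigned long)vaddr,
		.flags     = GNTMAP_host_map,
		.ref       = gref,
		.dom       = frontend_id,
	};

	HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map, 1);
	if (map.status != GNTST_okay)
		return -EINVAL;
	*handle = map.handle;	/* needed for GNTTABOP_unmap_grant_ref */
	return 0;
}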
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h new file mode 100644 index 000000000000..c2d1fa4dc1ee --- /dev/null +++ b/include/xen/interface/io/blkif.h | |||
@@ -0,0 +1,94 @@ | |||
1 | /****************************************************************************** | ||
2 | * blkif.h | ||
3 | * | ||
4 | * Unified block-device I/O interface for Xen guest OSes. | ||
5 | * | ||
6 | * Copyright (c) 2003-2004, Keir Fraser | ||
7 | */ | ||
8 | |||
9 | #ifndef __XEN_PUBLIC_IO_BLKIF_H__ | ||
10 | #define __XEN_PUBLIC_IO_BLKIF_H__ | ||
11 | |||
12 | #include "ring.h" | ||
13 | #include "../grant_table.h" | ||
14 | |||
15 | /* | ||
16 | * Front->back notifications: When enqueuing a new request, sending a | ||
17 | * notification can be made conditional on req_event (i.e., the generic | ||
18 | * hold-off mechanism provided by the ring macros). Backends must set | ||
19 | * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()). | ||
20 | * | ||
21 | * Back->front notifications: When enqueuing a new response, sending a | ||
22 | * notification can be made conditional on rsp_event (i.e., the generic | ||
23 | * hold-off mechanism provided by the ring macros). Frontends must set | ||
24 | * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()). | ||
25 | */ | ||
26 | |||
27 | typedef uint16_t blkif_vdev_t; | ||
28 | typedef uint64_t blkif_sector_t; | ||
29 | |||
30 | /* | ||
31 | * REQUEST CODES. | ||
32 | */ | ||
33 | #define BLKIF_OP_READ 0 | ||
34 | #define BLKIF_OP_WRITE 1 | ||
35 | /* | ||
36 | * Recognised only if "feature-barrier" is present in backend xenbus info. | ||
37 | * The "feature_barrier" node contains a boolean indicating whether barrier | ||
38 | * requests are likely to succeed or fail. Either way, a barrier request | ||
39 | * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by | ||
40 | * the underlying block-device hardware. The boolean simply indicates whether | ||
41 | * or not it is worthwhile for the frontend to attempt barrier requests. | ||
42 | * If a backend does not recognise BLKIF_OP_WRITE_BARRIER, it should *not* | ||
43 | * create the "feature-barrier" node! | ||
44 | */ | ||
45 | #define BLKIF_OP_WRITE_BARRIER 2 | ||
46 | |||
47 | /* | ||
48 | * Maximum scatter/gather segments per request. | ||
49 | * This is carefully chosen so that sizeof(struct blkif_sring) <= PAGE_SIZE. | ||
50 | * NB. This could be 12 if the ring indexes weren't stored in the same page. | ||
51 | */ | ||
52 | #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 | ||
53 | |||
54 | struct blkif_request { | ||
55 | uint8_t operation; /* BLKIF_OP_??? */ | ||
56 | uint8_t nr_segments; /* number of segments */ | ||
57 | blkif_vdev_t handle; /* only for read/write requests */ | ||
58 | uint64_t id; /* private guest value, echoed in resp */ | ||
59 | blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ | ||
60 | struct blkif_request_segment { | ||
61 | grant_ref_t gref; /* reference to I/O buffer frame */ | ||
62 | /* @first_sect: first sector in frame to transfer (inclusive). */ | ||
63 | /* @last_sect: last sector in frame to transfer (inclusive). */ | ||
64 | uint8_t first_sect, last_sect; | ||
65 | } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
66 | }; | ||
67 | |||
68 | struct blkif_response { | ||
69 | uint64_t id; /* copied from request */ | ||
70 | uint8_t operation; /* copied from request */ | ||
71 | int16_t status; /* BLKIF_RSP_??? */ | ||
72 | }; | ||
73 | |||
74 | /* | ||
75 | * STATUS RETURN CODES. | ||
76 | */ | ||
77 | /* Operation not supported (only happens on barrier writes). */ | ||
78 | #define BLKIF_RSP_EOPNOTSUPP -2 | ||
79 | /* Operation failed for some unspecified reason (-EIO). */ | ||
80 | #define BLKIF_RSP_ERROR -1 | ||
81 | /* Operation completed successfully. */ | ||
82 | #define BLKIF_RSP_OKAY 0 | ||
83 | |||
84 | /* | ||
85 | * Generate blkif ring structures and types. | ||
86 | */ | ||
87 | |||
88 | DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response); | ||
89 | |||
90 | #define VDISK_CDROM 0x1 | ||
91 | #define VDISK_REMOVABLE 0x2 | ||
92 | #define VDISK_READONLY 0x4 | ||
93 | |||
94 | #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */ | ||
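
A sketch of a frontend queuing one read, using RING_GET_REQUEST() from ring.h and the front-ring type generated by DEFINE_RING_TYPES() above (the request id of 42 is arbitrary; gref, vdev and start_sector are hypothetical):

static void queue_read(struct blkif_front_ring *ring, blkif_vdev_t vdev,
		       grant_ref_t gref, blkif_sector_t start_sector)
{
	struct blkif_request *req =
		RING_GET_REQUEST(ring, ring->req_prod_pvt);

	req->operation         = BLKIF_OP_READ;
	req->handle            = vdev;
	req->id                = 42;	/* echoed back in the response */
	req->sector_number     = start_sector;
	req->nr_segments       = 1;
	req->seg[0].gref       = gref;	/* grant covering the I/O buffer */
	req->seg[0].first_sect = 0;
	req->seg[0].last_sect  = 7;	/* 8 x 512-byte sectors = one 4K page */
	ring->req_prod_pvt++;
}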
diff --git a/include/xen/interface/io/console.h b/include/xen/interface/io/console.h new file mode 100644 index 000000000000..e563de70f784 --- /dev/null +++ b/include/xen/interface/io/console.h | |||
@@ -0,0 +1,23 @@ | |||
1 | /****************************************************************************** | ||
2 | * console.h | ||
3 | * | ||
4 | * Console I/O interface for Xen guest OSes. | ||
5 | * | ||
6 | * Copyright (c) 2005, Keir Fraser | ||
7 | */ | ||
8 | |||
9 | #ifndef __XEN_PUBLIC_IO_CONSOLE_H__ | ||
10 | #define __XEN_PUBLIC_IO_CONSOLE_H__ | ||
11 | |||
12 | typedef uint32_t XENCONS_RING_IDX; | ||
13 | |||
14 | #define MASK_XENCONS_IDX(idx, ring) ((idx) & (sizeof(ring)-1)) | ||
15 | |||
16 | struct xencons_interface { | ||
17 | char in[1024]; | ||
18 | char out[2048]; | ||
19 | XENCONS_RING_IDX in_cons, in_prod; | ||
20 | XENCONS_RING_IDX out_cons, out_prod; | ||
21 | }; | ||
22 | |||
23 | #endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */ | ||
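
The two character arrays form single-producer/single-consumer rings indexed modulo their size via MASK_XENCONS_IDX(). A sketch of the guest-side write path; the barrier ordering follows the shared-ring conventions:

static void console_write(struct xencons_interface *intf,
			  const char *data, size_t len)
{
	XENCONS_RING_IDX cons = intf->out_cons, prod = intf->out_prod;

	while (len && (prod - cons) < sizeof(intf->out)) {
		intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = *data++;
		len--;
	}
	wmb();		/* bytes must be visible before the index moves */
	intf->out_prod = prod;
	/* then kick the console daemon's event channel */
}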
diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h new file mode 100644 index 000000000000..518481c95f18 --- /dev/null +++ b/include/xen/interface/io/netif.h | |||
@@ -0,0 +1,158 @@ | |||
1 | /****************************************************************************** | ||
2 | * netif.h | ||
3 | * | ||
4 | * Unified network-device I/O interface for Xen guest OSes. | ||
5 | * | ||
6 | * Copyright (c) 2003-2004, Keir Fraser | ||
7 | */ | ||
8 | |||
9 | #ifndef __XEN_PUBLIC_IO_NETIF_H__ | ||
10 | #define __XEN_PUBLIC_IO_NETIF_H__ | ||
11 | |||
12 | #include "ring.h" | ||
13 | #include "../grant_table.h" | ||
14 | |||
15 | /* | ||
16 | * Notifications after enqueuing any type of message should be conditional on | ||
17 | * the appropriate req_event or rsp_event field in the shared ring. | ||
18 | * If the client sends notification for rx requests then it should specify | ||
19 | * feature 'feature-rx-notify' via xenbus. Otherwise the backend will assume | ||
20 | * that it cannot safely queue packets (as it may not be kicked to send them). | ||
21 | */ | ||
22 | |||
23 | /* | ||
24 | * This is the 'wire' format for packets: | ||
25 | * Request 1: netif_tx_request -- NETTXF_* (any flags) | ||
26 | * [Request 2: netif_tx_extra] (only if request 1 has NETTXF_extra_info) | ||
27 | * [Request 3: netif_tx_extra] (only if request 2 has XEN_NETIF_EXTRA_MORE) | ||
28 | * Request 4: netif_tx_request -- NETTXF_more_data | ||
29 | * Request 5: netif_tx_request -- NETTXF_more_data | ||
30 | * ... | ||
31 | * Request N: netif_tx_request -- 0 | ||
32 | */ | ||
33 | |||
34 | /* Protocol checksum field is blank in the packet (hardware offload)? */ | ||
35 | #define _NETTXF_csum_blank (0) | ||
36 | #define NETTXF_csum_blank (1U<<_NETTXF_csum_blank) | ||
37 | |||
38 | /* Packet data has been validated against protocol checksum. */ | ||
39 | #define _NETTXF_data_validated (1) | ||
40 | #define NETTXF_data_validated (1U<<_NETTXF_data_validated) | ||
41 | |||
42 | /* Packet continues in the next request descriptor. */ | ||
43 | #define _NETTXF_more_data (2) | ||
44 | #define NETTXF_more_data (1U<<_NETTXF_more_data) | ||
45 | |||
46 | /* Packet to be followed by extra descriptor(s). */ | ||
47 | #define _NETTXF_extra_info (3) | ||
48 | #define NETTXF_extra_info (1U<<_NETTXF_extra_info) | ||
49 | |||
50 | struct xen_netif_tx_request { | ||
51 | grant_ref_t gref; /* Reference to buffer page */ | ||
52 | uint16_t offset; /* Offset within buffer page */ | ||
53 | uint16_t flags; /* NETTXF_* */ | ||
54 | uint16_t id; /* Echoed in response message. */ | ||
55 | uint16_t size; /* Packet size in bytes. */ | ||
56 | }; | ||
57 | |||
58 | /* Types of netif_extra_info descriptors. */ | ||
59 | #define XEN_NETIF_EXTRA_TYPE_NONE (0) /* Never used - invalid */ | ||
60 | #define XEN_NETIF_EXTRA_TYPE_GSO (1) /* u.gso */ | ||
61 | #define XEN_NETIF_EXTRA_TYPE_MAX (2) | ||
62 | |||
63 | /* netif_extra_info flags. */ | ||
64 | #define _XEN_NETIF_EXTRA_FLAG_MORE (0) | ||
65 | #define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE) | ||
66 | |||
67 | /* GSO types - only TCPv4 currently supported. */ | ||
68 | #define XEN_NETIF_GSO_TYPE_TCPV4 (1) | ||
69 | |||
70 | /* | ||
71 | * This structure needs to fit within both netif_tx_request and | ||
72 | * netif_rx_response for compatibility. | ||
73 | */ | ||
74 | struct xen_netif_extra_info { | ||
75 | uint8_t type; /* XEN_NETIF_EXTRA_TYPE_* */ | ||
76 | uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */ | ||
77 | |||
78 | union { | ||
79 | struct { | ||
80 | /* | ||
81 | * Maximum payload size of each segment. For | ||
82 | * example, for TCP this is just the path MSS. | ||
83 | */ | ||
84 | uint16_t size; | ||
85 | |||
86 | /* | ||
87 | * GSO type. This determines the protocol of | ||
88 | * the packet and any extra features required | ||
89 | * to segment the packet properly. | ||
90 | */ | ||
91 | uint8_t type; /* XEN_NETIF_GSO_TYPE_* */ | ||
92 | |||
93 | /* Future expansion. */ | ||
94 | uint8_t pad; | ||
95 | |||
96 | /* | ||
97 | * GSO features. This specifies any extra GSO | ||
98 | * features required to process this packet, | ||
99 | * such as ECN support for TCPv4. | ||
100 | */ | ||
101 | uint16_t features; /* XEN_NETIF_GSO_FEAT_* */ | ||
102 | } gso; | ||
103 | |||
104 | uint16_t pad[3]; | ||
105 | } u; | ||
106 | }; | ||
107 | |||
108 | struct xen_netif_tx_response { | ||
109 | uint16_t id; | ||
110 | int16_t status; /* NETIF_RSP_* */ | ||
111 | }; | ||
112 | |||
113 | struct xen_netif_rx_request { | ||
114 | uint16_t id; /* Echoed in response message. */ | ||
115 | grant_ref_t gref; /* Reference to incoming granted frame */ | ||
116 | }; | ||
117 | |||
118 | /* Packet data has been validated against protocol checksum. */ | ||
119 | #define _NETRXF_data_validated (0) | ||
120 | #define NETRXF_data_validated (1U<<_NETRXF_data_validated) | ||
121 | |||
122 | /* Protocol checksum field is blank in the packet (hardware offload)? */ | ||
123 | #define _NETRXF_csum_blank (1) | ||
124 | #define NETRXF_csum_blank (1U<<_NETRXF_csum_blank) | ||
125 | |||
126 | /* Packet continues in the next request descriptor. */ | ||
127 | #define _NETRXF_more_data (2) | ||
128 | #define NETRXF_more_data (1U<<_NETRXF_more_data) | ||
129 | |||
130 | /* Packet to be followed by extra descriptor(s). */ | ||
131 | #define _NETRXF_extra_info (3) | ||
132 | #define NETRXF_extra_info (1U<<_NETRXF_extra_info) | ||
133 | |||
134 | struct xen_netif_rx_response { | ||
135 | uint16_t id; | ||
136 | uint16_t offset; /* Offset in page of start of received packet */ | ||
137 | uint16_t flags; /* NETRXF_* */ | ||
138 | int16_t status; /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */ | ||
139 | }; | ||
140 | |||
141 | /* | ||
142 | * Generate netif ring structures and types. | ||
143 | */ | ||
144 | |||
145 | DEFINE_RING_TYPES(xen_netif_tx, | ||
146 | struct xen_netif_tx_request, | ||
147 | struct xen_netif_tx_response); | ||
148 | DEFINE_RING_TYPES(xen_netif_rx, | ||
149 | struct xen_netif_rx_request, | ||
150 | struct xen_netif_rx_response); | ||
151 | |||
152 | #define NETIF_RSP_DROPPED -2 | ||
153 | #define NETIF_RSP_ERROR -1 | ||
154 | #define NETIF_RSP_OKAY 0 | ||
155 | /* No response: used for auxiliary requests (e.g., netif_tx_extra). */ | ||
156 | #define NETIF_RSP_NULL 1 | ||
157 | |||
158 | #endif /* __XEN_PUBLIC_IO_NETIF_H__ */ | ||
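
A sketch of filling a single-slot transmit descriptor according to the wire format above, using RING_GET_REQUEST() from ring.h and the front-ring type generated by the macros above (gref, data, skb_len and slot_id are hypothetical):

static void queue_tx(struct xen_netif_tx_front_ring *tx, grant_ref_t gref,
		     void *data, uint16_t skb_len, uint16_t slot_id)
{
	struct xen_netif_tx_request *txreq =
		RING_GET_REQUEST(tx, tx->req_prod_pvt);

	txreq->gref   = gref;			/* grant covering the buffer */
	txreq->offset = offset_in_page(data);
	txreq->size   = skb_len;		/* total packet size in bytes */
	txreq->id     = slot_id;		/* echoed in the tx response */
	txreq->flags  = NETTXF_csum_blank | NETTXF_data_validated;
	tx->req_prod_pvt++;
}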
diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h new file mode 100644 index 000000000000..e8cbf431c8cc --- /dev/null +++ b/include/xen/interface/io/ring.h | |||
@@ -0,0 +1,260 @@ | |||
1 | /****************************************************************************** | ||
2 | * ring.h | ||
3 | * | ||
4 | * Shared producer-consumer ring macros. | ||
5 | * | ||
6 | * Tim Deegan and Andrew Warfield November 2004. | ||
7 | */ | ||
8 | |||
9 | #ifndef __XEN_PUBLIC_IO_RING_H__ | ||
10 | #define __XEN_PUBLIC_IO_RING_H__ | ||
11 | |||
12 | typedef unsigned int RING_IDX; | ||
13 | |||
14 | /* Round a 32-bit unsigned constant down to the nearest power of two. */ | ||
15 | #define __RD2(_x) (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1)) | ||
16 | #define __RD4(_x) (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2 : __RD2(_x)) | ||
17 | #define __RD8(_x) (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4 : __RD4(_x)) | ||
18 | #define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8 : __RD8(_x)) | ||
19 | #define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x)) | ||
20 | |||
21 | /* | ||
22 | * Calculate size of a shared ring, given the total available space for the | ||
23 | * ring and indexes (_sz), and the name tag of the request/response structure. | ||
24 | * A ring contains as many entries as will fit, rounded down to the nearest | ||
25 | * power of two (so we can mask with (size-1) to loop around). | ||
26 | */ | ||
27 | #define __RING_SIZE(_s, _sz) \ | ||
28 | (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0]))) | ||
29 | |||
30 | /* | ||
31 | * Macros to make the correct C datatypes for a new kind of ring. | ||
32 | * | ||
33 | * To make a new ring datatype, you need to have two message structures, | ||
34 | * let's say struct request, and struct response already defined. | ||
35 | * | ||
36 | * In a header where you want the ring datatype declared, you then do: | ||
37 | * | ||
38 | * DEFINE_RING_TYPES(mytag, struct request, struct response); | ||
39 | * | ||
40 | * These expand out to give you a set of types, as you can see below. | ||
41 | * The most important of these are: | ||
42 | * | ||
43 | * struct mytag_sring - The shared ring. | ||
44 | * struct mytag_front_ring - The 'front' half of the ring. | ||
45 | * struct mytag_back_ring - The 'back' half of the ring. | ||
46 | * | ||
47 | * To initialize a ring in your code you need to know the location and size | ||
48 | * of the shared memory area (PAGE_SIZE, for instance). To initialise | ||
49 | * the front half: | ||
50 | * | ||
51 | * struct mytag_front_ring front_ring; | ||
52 | * SHARED_RING_INIT((struct mytag_sring *)shared_page); | ||
53 | * FRONT_RING_INIT(&front_ring, (struct mytag_sring *)shared_page, | ||
54 | * PAGE_SIZE); | ||
55 | * | ||
56 | * Initializing the back follows similarly (note that only the front | ||
57 | * initializes the shared ring): | ||
58 | * | ||
59 | * struct mytag_back_ring back_ring; | ||
60 | * BACK_RING_INIT(&back_ring, (struct mytag_sring *)shared_page, | ||
61 | * PAGE_SIZE); | ||
62 | */ | ||
63 | |||
64 | #define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \ | ||
65 | \ | ||
66 | /* Shared ring entry */ \ | ||
67 | union __name##_sring_entry { \ | ||
68 | __req_t req; \ | ||
69 | __rsp_t rsp; \ | ||
70 | }; \ | ||
71 | \ | ||
72 | /* Shared ring page */ \ | ||
73 | struct __name##_sring { \ | ||
74 | RING_IDX req_prod, req_event; \ | ||
75 | RING_IDX rsp_prod, rsp_event; \ | ||
76 | uint8_t pad[48]; \ | ||
77 | union __name##_sring_entry ring[1]; /* variable-length */ \ | ||
78 | }; \ | ||
79 | \ | ||
80 | /* "Front" end's private variables */ \ | ||
81 | struct __name##_front_ring { \ | ||
82 | RING_IDX req_prod_pvt; \ | ||
83 | RING_IDX rsp_cons; \ | ||
84 | unsigned int nr_ents; \ | ||
85 | struct __name##_sring *sring; \ | ||
86 | }; \ | ||
87 | \ | ||
88 | /* "Back" end's private variables */ \ | ||
89 | struct __name##_back_ring { \ | ||
90 | RING_IDX rsp_prod_pvt; \ | ||
91 | RING_IDX req_cons; \ | ||
92 | unsigned int nr_ents; \ | ||
93 | struct __name##_sring *sring; \ | ||
94 | }; | ||
95 | |||
96 | /* | ||
97 | * Macros for manipulating rings. | ||
98 | * | ||
99 | * FRONT_RING_whatever works on the "front end" of a ring: here | ||
100 | * requests are pushed on to the ring and responses taken off it. | ||
101 | * | ||
102 | * BACK_RING_whatever works on the "back end" of a ring: here | ||
103 | * requests are taken off the ring and responses put on. | ||
104 | * | ||
105 | * N.B. these macros do NO INTERLOCKS OR FLOW CONTROL. | ||
106 | * This is OK in 1-for-1 request-response situations where the | ||
107 | * requestor (front end) never has more than RING_SIZE()-1 | ||
108 | * outstanding requests. | ||
109 | */ | ||
110 | |||
111 | /* Initialising empty rings */ | ||
112 | #define SHARED_RING_INIT(_s) do { \ | ||
113 | (_s)->req_prod = (_s)->rsp_prod = 0; \ | ||
114 | (_s)->req_event = (_s)->rsp_event = 1; \ | ||
115 | memset((_s)->pad, 0, sizeof((_s)->pad)); \ | ||
116 | } while(0) | ||
117 | |||
118 | #define FRONT_RING_INIT(_r, _s, __size) do { \ | ||
119 | (_r)->req_prod_pvt = 0; \ | ||
120 | (_r)->rsp_cons = 0; \ | ||
121 | (_r)->nr_ents = __RING_SIZE(_s, __size); \ | ||
122 | (_r)->sring = (_s); \ | ||
123 | } while (0) | ||
124 | |||
125 | #define BACK_RING_INIT(_r, _s, __size) do { \ | ||
126 | (_r)->rsp_prod_pvt = 0; \ | ||
127 | (_r)->req_cons = 0; \ | ||
128 | (_r)->nr_ents = __RING_SIZE(_s, __size); \ | ||
129 | (_r)->sring = (_s); \ | ||
130 | } while (0) | ||
131 | |||
132 | /* Initialize to existing shared indexes -- for recovery */ | ||
133 | #define FRONT_RING_ATTACH(_r, _s, __size) do { \ | ||
134 | (_r)->sring = (_s); \ | ||
135 | (_r)->req_prod_pvt = (_s)->req_prod; \ | ||
136 | (_r)->rsp_cons = (_s)->rsp_prod; \ | ||
137 | (_r)->nr_ents = __RING_SIZE(_s, __size); \ | ||
138 | } while (0) | ||
139 | |||
140 | #define BACK_RING_ATTACH(_r, _s, __size) do { \ | ||
141 | (_r)->sring = (_s); \ | ||
142 | (_r)->rsp_prod_pvt = (_s)->rsp_prod; \ | ||
143 | (_r)->req_cons = (_s)->req_prod; \ | ||
144 | (_r)->nr_ents = __RING_SIZE(_s, __size); \ | ||
145 | } while (0) | ||
146 | |||
147 | /* How big is this ring? */ | ||
148 | #define RING_SIZE(_r) \ | ||
149 | ((_r)->nr_ents) | ||
150 | |||
151 | /* Number of free requests (for use on front side only). */ | ||
152 | #define RING_FREE_REQUESTS(_r) \ | ||
153 | (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons)) | ||
154 | |||
155 | /* Test if there is an empty slot available on the front ring. | ||
156 | * (This is only meaningful from the front.) | ||
157 | */ | ||
158 | #define RING_FULL(_r) \ | ||
159 | (RING_FREE_REQUESTS(_r) == 0) | ||
160 | |||
161 | /* Test if there are outstanding messages to be processed on a ring. */ | ||
162 | #define RING_HAS_UNCONSUMED_RESPONSES(_r) \ | ||
163 | ((_r)->sring->rsp_prod - (_r)->rsp_cons) | ||
164 | |||
165 | #define RING_HAS_UNCONSUMED_REQUESTS(_r) \ | ||
166 | ({ \ | ||
167 | unsigned int req = (_r)->sring->req_prod - (_r)->req_cons; \ | ||
168 | unsigned int rsp = RING_SIZE(_r) - \ | ||
169 | ((_r)->req_cons - (_r)->rsp_prod_pvt); \ | ||
170 | req < rsp ? req : rsp; \ | ||
171 | }) | ||
172 | |||
173 | /* Direct access to individual ring elements, by index. */ | ||
174 | #define RING_GET_REQUEST(_r, _idx) \ | ||
175 | (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req)) | ||
176 | |||
177 | #define RING_GET_RESPONSE(_r, _idx) \ | ||
178 | (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp)) | ||
179 | |||
180 | /* Loop termination condition: Would the specified index overflow the ring? */ | ||
181 | #define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ | ||
182 | (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) | ||
183 | |||
184 | #define RING_PUSH_REQUESTS(_r) do { \ | ||
185 | wmb(); /* back sees requests /before/ updated producer index */ \ | ||
186 | (_r)->sring->req_prod = (_r)->req_prod_pvt; \ | ||
187 | } while (0) | ||
188 | |||
189 | #define RING_PUSH_RESPONSES(_r) do { \ | ||
190 | wmb(); /* front sees responses /before/ updated producer index */ \ | ||
191 | (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \ | ||
192 | } while (0) | ||
193 | |||
194 | /* | ||
195 | * Notification hold-off (req_event and rsp_event): | ||
196 | * | ||
197 | * When queueing requests or responses on a shared ring, it may not always be | ||
198 | * necessary to notify the remote end. For example, if requests are in flight | ||
199 | * in a backend, the front may be able to queue further requests without | ||
200 | * notifying the back (if the back checks for new requests when it queues | ||
201 | * responses). | ||
202 | * | ||
203 | * When enqueuing requests or responses: | ||
204 | * | ||
205 | * Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The second argument | ||
206 | * is a boolean return value. True indicates that the receiver requires an | ||
207 | * asynchronous notification. | ||
208 | * | ||
209 | * After dequeuing requests or responses (before sleeping the connection): | ||
210 | * | ||
211 | * Use RING_FINAL_CHECK_FOR_REQUESTS() or RING_FINAL_CHECK_FOR_RESPONSES(). | ||
212 | * The second argument is a boolean return value. True indicates that there | ||
213 | * are pending messages on the ring (i.e., the connection should not be put | ||
214 | * to sleep). | ||
215 | * | ||
216 | * These macros will set the req_event/rsp_event field to trigger a | ||
217 | * notification on the very next message that is enqueued. If you want to | ||
218 | * create batches of work (i.e., only receive a notification after several | ||
219 | * messages have been enqueued) then you will need to create a customised | ||
220 | * version of the FINAL_CHECK macro in your own code, which sets the event | ||
221 | * field appropriately. | ||
222 | */ | ||
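As a sketch of the producer side (the mytag types, the request fields, the event channel and the notify_remote_via_evtchn() helper are all illustrative assumptions, not part of this header):

	struct mytag_request *req;
	int notify;

	req = RING_GET_REQUEST(&front_ring, front_ring.req_prod_pvt);
	req->id = id;				/* protocol-specific fields */
	front_ring.req_prod_pvt++;
	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&front_ring, notify);
	if (notify)
		notify_remote_via_evtchn(evtchn);	/* assumed helper */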
223 | |||
224 | #define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \ | ||
225 | RING_IDX __old = (_r)->sring->req_prod; \ | ||
226 | RING_IDX __new = (_r)->req_prod_pvt; \ | ||
227 | wmb(); /* back sees requests /before/ updated producer index */ \ | ||
228 | (_r)->sring->req_prod = __new; \ | ||
229 | mb(); /* back sees new requests /before/ we check req_event */ \ | ||
230 | (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \ | ||
231 | (RING_IDX)(__new - __old)); \ | ||
232 | } while (0) | ||
233 | |||
234 | #define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \ | ||
235 | RING_IDX __old = (_r)->sring->rsp_prod; \ | ||
236 | RING_IDX __new = (_r)->rsp_prod_pvt; \ | ||
237 | wmb(); /* front sees responses /before/ updated producer index */ \ | ||
238 | (_r)->sring->rsp_prod = __new; \ | ||
239 | mb(); /* front sees new responses /before/ we check rsp_event */ \ | ||
240 | (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \ | ||
241 | (RING_IDX)(__new - __old)); \ | ||
242 | } while (0) | ||
243 | |||
244 | #define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \ | ||
245 | (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \ | ||
246 | if (_work_to_do) break; \ | ||
247 | (_r)->sring->req_event = (_r)->req_cons + 1; \ | ||
248 | mb(); \ | ||
249 | (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \ | ||
250 | } while (0) | ||
251 | |||
252 | #define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \ | ||
253 | (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ | ||
254 | if (_work_to_do) break; \ | ||
255 | (_r)->sring->rsp_event = (_r)->rsp_cons + 1; \ | ||
256 | mb(); \ | ||
257 | (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ | ||
258 | } while (0) | ||
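And the matching consumer side: a minimal back-end service loop that drains requests and uses the final check before sleeping (the mytag types and handle_one_request() are assumptions):

	static void service_requests(struct mytag_back_ring *ring)
	{
		int more_to_do;

		do {
			RING_IDX cons = ring->req_cons;
			RING_IDX prod = ring->sring->req_prod;

			rmb();	/* see the requests before reading them */
			while (cons != prod &&
			       !RING_REQUEST_CONS_OVERFLOW(ring, cons)) {
				struct mytag_request *req =
					RING_GET_REQUEST(ring, cons++);
				handle_one_request(ring, req);	/* assumed */
			}
			ring->req_cons = cons;
			RING_FINAL_CHECK_FOR_REQUESTS(ring, more_to_do);
		} while (more_to_do);
	}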
259 | |||
260 | #endif /* __XEN_PUBLIC_IO_RING_H__ */ | ||
diff --git a/include/xen/interface/io/xenbus.h b/include/xen/interface/io/xenbus.h new file mode 100644 index 000000000000..46508c7fa399 --- /dev/null +++ b/include/xen/interface/io/xenbus.h | |||
@@ -0,0 +1,44 @@ | |||
1 | /***************************************************************************** | ||
2 | * xenbus.h | ||
3 | * | ||
4 | * Xenbus protocol details. | ||
5 | * | ||
6 | * Copyright (C) 2005 XenSource Ltd. | ||
7 | */ | ||
8 | |||
9 | #ifndef _XEN_PUBLIC_IO_XENBUS_H | ||
10 | #define _XEN_PUBLIC_IO_XENBUS_H | ||
11 | |||
12 | /* The state of either end of the Xenbus, i.e. the current communication | ||
13 | status of initialisation across the bus. States here imply nothing about | ||
14 | the state of the connection between the driver and the kernel's device | ||
15 | layers. */ | ||
16 | enum xenbus_state | ||
17 | { | ||
18 | XenbusStateUnknown = 0, | ||
19 | XenbusStateInitialising = 1, | ||
20 | XenbusStateInitWait = 2, /* Finished early | ||
21 | initialisation, but waiting | ||
22 | for information from the peer | ||
23 | or hotplug scripts. */ | ||
24 | XenbusStateInitialised = 3, /* Initialised and waiting for a | ||
25 | connection from the peer. */ | ||
26 | XenbusStateConnected = 4, | ||
27 | XenbusStateClosing = 5, /* The device is being closed | ||
28 | due to an error or an unplug | ||
29 | event. */ | ||
30 | XenbusStateClosed = 6 | ||
31 | |||
32 | }; | ||
33 | |||
34 | #endif /* _XEN_PUBLIC_IO_XENBUS_H */ | ||
35 | |||
36 | /* | ||
37 | * Local variables: | ||
38 | * c-file-style: "linux" | ||
39 | * indent-tabs-mode: t | ||
40 | * c-indent-level: 8 | ||
41 | * c-basic-offset: 8 | ||
42 | * tab-width: 8 | ||
43 | * End: | ||
44 | */ | ||
diff --git a/include/xen/interface/io/xs_wire.h b/include/xen/interface/io/xs_wire.h new file mode 100644 index 000000000000..99fcffb372d1 --- /dev/null +++ b/include/xen/interface/io/xs_wire.h | |||
@@ -0,0 +1,87 @@ | |||
1 | /* | ||
2 | * Details of the "wire" protocol between Xen Store Daemon and client | ||
3 | * library or guest kernel. | ||
4 | * Copyright (C) 2005 Rusty Russell IBM Corporation | ||
5 | */ | ||
6 | |||
7 | #ifndef _XS_WIRE_H | ||
8 | #define _XS_WIRE_H | ||
9 | |||
10 | enum xsd_sockmsg_type | ||
11 | { | ||
12 | XS_DEBUG, | ||
13 | XS_DIRECTORY, | ||
14 | XS_READ, | ||
15 | XS_GET_PERMS, | ||
16 | XS_WATCH, | ||
17 | XS_UNWATCH, | ||
18 | XS_TRANSACTION_START, | ||
19 | XS_TRANSACTION_END, | ||
20 | XS_INTRODUCE, | ||
21 | XS_RELEASE, | ||
22 | XS_GET_DOMAIN_PATH, | ||
23 | XS_WRITE, | ||
24 | XS_MKDIR, | ||
25 | XS_RM, | ||
26 | XS_SET_PERMS, | ||
27 | XS_WATCH_EVENT, | ||
28 | XS_ERROR, | ||
29 | XS_IS_DOMAIN_INTRODUCED | ||
30 | }; | ||
31 | |||
32 | #define XS_WRITE_NONE "NONE" | ||
33 | #define XS_WRITE_CREATE "CREATE" | ||
34 | #define XS_WRITE_CREATE_EXCL "CREATE|EXCL" | ||
35 | |||
36 | /* We hand errors as strings, for portability. */ | ||
37 | struct xsd_errors | ||
38 | { | ||
39 | int errnum; | ||
40 | const char *errstring; | ||
41 | }; | ||
42 | #define XSD_ERROR(x) { x, #x } | ||
43 | static struct xsd_errors xsd_errors[] __attribute__((unused)) = { | ||
44 | XSD_ERROR(EINVAL), | ||
45 | XSD_ERROR(EACCES), | ||
46 | XSD_ERROR(EEXIST), | ||
47 | XSD_ERROR(EISDIR), | ||
48 | XSD_ERROR(ENOENT), | ||
49 | XSD_ERROR(ENOMEM), | ||
50 | XSD_ERROR(ENOSPC), | ||
51 | XSD_ERROR(EIO), | ||
52 | XSD_ERROR(ENOTEMPTY), | ||
53 | XSD_ERROR(ENOSYS), | ||
54 | XSD_ERROR(EROFS), | ||
55 | XSD_ERROR(EBUSY), | ||
56 | XSD_ERROR(EAGAIN), | ||
57 | XSD_ERROR(EISCONN) | ||
58 | }; | ||
59 | |||
60 | struct xsd_sockmsg | ||
61 | { | ||
62 | uint32_t type; /* XS_??? */ | ||
63 | uint32_t req_id;/* Request identifier, echoed in daemon's response. */ | ||
64 | uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */ | ||
65 | uint32_t len; /* Length of data following this. */ | ||
66 | |||
67 | /* Generally followed by nul-terminated string(s). */ | ||
68 | }; | ||
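On the wire, a request is simply this header followed by its payload. For example, an XS_READ of a store path might be laid out like this (a sketch; the path and req_id values are arbitrary):

	const char path[] = "device/vif/0/mac";
	struct xsd_sockmsg msg = {
		.type   = XS_READ,
		.req_id = 1,		/* echoed in the daemon's reply */
		.tx_id  = 0,		/* not part of a transaction */
		.len    = sizeof(path),	/* includes the terminating nul */
	};
	/* transmit sizeof(msg) header bytes, then msg.len payload bytes */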
69 | |||
70 | enum xs_watch_type | ||
71 | { | ||
72 | XS_WATCH_PATH = 0, | ||
73 | XS_WATCH_TOKEN | ||
74 | }; | ||
75 | |||
76 | /* Inter-domain shared memory communications. */ | ||
77 | #define XENSTORE_RING_SIZE 1024 | ||
78 | typedef uint32_t XENSTORE_RING_IDX; | ||
79 | #define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE-1)) | ||
80 | struct xenstore_domain_interface { | ||
81 | char req[XENSTORE_RING_SIZE]; /* Requests to xenstore daemon. */ | ||
82 | char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */ | ||
83 | XENSTORE_RING_IDX req_cons, req_prod; | ||
84 | XENSTORE_RING_IDX rsp_cons, rsp_prod; | ||
85 | }; | ||
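A minimal producer for the request ring might look like the following (a sketch only; a real implementation pairs this with an event-channel notification, and wmb() is the kernel's write barrier):

	static int xs_ring_write(struct xenstore_domain_interface *intf,
				 const char *data, uint32_t len)
	{
		XENSTORE_RING_IDX prod = intf->req_prod;

		if (len > XENSTORE_RING_SIZE - (prod - intf->req_cons))
			return -1;	/* not enough free space */
		while (len--)
			intf->req[MASK_XENSTORE_IDX(prod++)] = *data++;
		wmb();	/* make the bytes visible before moving the producer */
		intf->req_prod = prod;
		return 0;
	}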
86 | |||
87 | #endif /* _XS_WIRE_H */ | ||
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h new file mode 100644 index 000000000000..af36ead16817 --- /dev/null +++ b/include/xen/interface/memory.h | |||
@@ -0,0 +1,145 @@ | |||
1 | /****************************************************************************** | ||
2 | * memory.h | ||
3 | * | ||
4 | * Memory reservation and information. | ||
5 | * | ||
6 | * Copyright (c) 2005, Keir Fraser <keir@xensource.com> | ||
7 | */ | ||
8 | |||
9 | #ifndef __XEN_PUBLIC_MEMORY_H__ | ||
10 | #define __XEN_PUBLIC_MEMORY_H__ | ||
11 | |||
12 | /* | ||
13 | * Increase or decrease the specified domain's memory reservation. Returns a | ||
14 | * -ve errcode on failure, or the # extents successfully allocated or freed. | ||
15 | * arg == addr of struct xen_memory_reservation. | ||
16 | */ | ||
17 | #define XENMEM_increase_reservation 0 | ||
18 | #define XENMEM_decrease_reservation 1 | ||
19 | #define XENMEM_populate_physmap 6 | ||
20 | struct xen_memory_reservation { | ||
21 | |||
22 | /* | ||
23 | * XENMEM_increase_reservation: | ||
24 | * OUT: MFN (*not* GMFN) bases of extents that were allocated | ||
25 | * XENMEM_decrease_reservation: | ||
26 | * IN: GMFN bases of extents to free | ||
27 | * XENMEM_populate_physmap: | ||
28 | * IN: GPFN bases of extents to populate with memory | ||
29 | * OUT: GMFN bases of extents that were allocated | ||
30 | * (NB. This command also updates the mach_to_phys translation table) | ||
31 | */ | ||
32 | GUEST_HANDLE(ulong) extent_start; | ||
33 | |||
34 | /* Number of extents, and size/alignment of each (2^extent_order pages). */ | ||
35 | unsigned long nr_extents; | ||
36 | unsigned int extent_order; | ||
37 | |||
38 | /* | ||
39 | * Maximum # bits addressable by the user of the allocated region (e.g., | ||
40 | * I/O devices often have a 32-bit limitation even in 64-bit systems). If | ||
41 | * zero then the user has no addressing restriction. | ||
42 | * This field is not used by XENMEM_decrease_reservation. | ||
43 | */ | ||
44 | unsigned int address_bits; | ||
45 | |||
46 | /* | ||
47 | * Domain whose reservation is being changed. | ||
48 | * Unprivileged domains can specify only DOMID_SELF. | ||
49 | */ | ||
50 | domid_t domid; | ||
51 | |||
52 | }; | ||
53 | DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation); | ||
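For instance, a balloon-style release of a single page could be issued as follows (a sketch: HYPERVISOR_memory_op() and set_xen_guest_handle() are the usual wrappers, and page_gmfn is an assumed variable):

	unsigned long gmfn = page_gmfn;	/* frame being handed back */
	struct xen_memory_reservation reservation = {
		.nr_extents   = 1,
		.extent_order = 0,	/* single page */
		.address_bits = 0,	/* no addressing restriction */
		.domid        = DOMID_SELF,
	};

	set_xen_guest_handle(reservation.extent_start, &gmfn);
	if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
				 &reservation) != 1)
		;	/* fewer extents were freed than requested */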
54 | |||
55 | /* | ||
56 | * Returns the maximum machine frame number of mapped RAM in this system. | ||
57 | * This command always succeeds (it never returns an error code). | ||
58 | * arg == NULL. | ||
59 | */ | ||
60 | #define XENMEM_maximum_ram_page 2 | ||
61 | |||
62 | /* | ||
63 | * Returns the current or maximum memory reservation, in pages, of the | ||
64 | * specified domain (may be DOMID_SELF). Returns -ve errcode on failure. | ||
65 | * arg == addr of domid_t. | ||
66 | */ | ||
67 | #define XENMEM_current_reservation 3 | ||
68 | #define XENMEM_maximum_reservation 4 | ||
69 | |||
70 | /* | ||
71 | * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys | ||
72 | * mapping table. Architectures which do not have a m2p table do not implement | ||
73 | * this command. | ||
74 | * arg == addr of xen_machphys_mfn_list_t. | ||
75 | */ | ||
76 | #define XENMEM_machphys_mfn_list 5 | ||
77 | struct xen_machphys_mfn_list { | ||
78 | /* | ||
79 | * Size of the 'extent_start' array. Fewer entries will be filled if the | ||
80 | * machphys table is smaller than max_extents * 2MB. | ||
81 | */ | ||
82 | unsigned int max_extents; | ||
83 | |||
84 | /* | ||
85 | * Pointer to buffer to fill with list of extent starts. If there are | ||
86 | * any large discontiguities in the machine address space, 2MB gaps in | ||
87 | * the machphys table will be represented by an MFN base of zero. | ||
88 | */ | ||
89 | GUEST_HANDLE(ulong) extent_start; | ||
90 | |||
91 | /* | ||
92 | * Number of extents written to the above array. This will be smaller | ||
93 | * than 'max_extents' if the machphys table is smaller than max_extents * 2MB. | ||
94 | */ | ||
95 | unsigned int nr_extents; | ||
96 | }; | ||
97 | DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list); | ||
98 | |||
99 | /* | ||
100 | * Sets the GPFN at which a particular page appears in the specified guest's | ||
101 | * pseudophysical address space. | ||
102 | * arg == addr of xen_add_to_physmap_t. | ||
103 | */ | ||
104 | #define XENMEM_add_to_physmap 7 | ||
105 | struct xen_add_to_physmap { | ||
106 | /* Which domain to change the mapping for. */ | ||
107 | domid_t domid; | ||
108 | |||
109 | /* Source mapping space. */ | ||
110 | #define XENMAPSPACE_shared_info 0 /* shared info page */ | ||
111 | #define XENMAPSPACE_grant_table 1 /* grant table page */ | ||
112 | unsigned int space; | ||
113 | |||
114 | /* Index into source mapping space. */ | ||
115 | unsigned long idx; | ||
116 | |||
117 | /* GPFN where the source mapping page should appear. */ | ||
118 | unsigned long gpfn; | ||
119 | }; | ||
120 | DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap); | ||
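A typical use, sketched here, is placing the shared-info page at a guest-chosen GPFN at boot (shared_info_pfn is an assumption):

	struct xen_add_to_physmap xatp = {
		.domid = DOMID_SELF,
		.space = XENMAPSPACE_shared_info,
		.idx   = 0,
		.gpfn  = shared_info_pfn,
	};

	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
		BUG();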
121 | |||
122 | /* | ||
123 | * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error | ||
124 | * code on failure. This call only works for auto-translated guests. | ||
125 | */ | ||
126 | #define XENMEM_translate_gpfn_list 8 | ||
127 | struct xen_translate_gpfn_list { | ||
128 | /* Which domain to translate for? */ | ||
129 | domid_t domid; | ||
130 | |||
131 | /* Length of list. */ | ||
132 | unsigned long nr_gpfns; | ||
133 | |||
134 | /* List of GPFNs to translate. */ | ||
135 | GUEST_HANDLE(ulong) gpfn_list; | ||
136 | |||
137 | /* | ||
138 | * Output list to contain MFN translations. May be the same as the input | ||
139 | * list (in which case each input GPFN is overwritten with the output MFN). | ||
140 | */ | ||
141 | GUEST_HANDLE(ulong) mfn_list; | ||
142 | }; | ||
143 | DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list); | ||
144 | |||
145 | #endif /* __XEN_PUBLIC_MEMORY_H__ */ | ||
diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h new file mode 100644 index 000000000000..cd6939147cb6 --- /dev/null +++ b/include/xen/interface/physdev.h | |||
@@ -0,0 +1,145 @@ | |||
1 | /* | ||
2 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
3 | * of this software and associated documentation files (the "Software"), to | ||
4 | * deal in the Software without restriction, including without limitation the | ||
5 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or | ||
6 | * sell copies of the Software, and to permit persons to whom the Software is | ||
7 | * furnished to do so, subject to the following conditions: | ||
8 | * | ||
9 | * The above copyright notice and this permission notice shall be included in | ||
10 | * all copies or substantial portions of the Software. | ||
11 | * | ||
12 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
13 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
14 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
15 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
16 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
17 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||
18 | * DEALINGS IN THE SOFTWARE. | ||
19 | */ | ||
20 | |||
21 | #ifndef __XEN_PUBLIC_PHYSDEV_H__ | ||
22 | #define __XEN_PUBLIC_PHYSDEV_H__ | ||
23 | |||
24 | /* | ||
25 | * Prototype for this hypercall is: | ||
26 | * int physdev_op(int cmd, void *args) | ||
27 | * @cmd == PHYSDEVOP_??? (physdev operation). | ||
28 | * @args == Operation-specific extra arguments (NULL if none). | ||
29 | */ | ||
30 | |||
31 | /* | ||
32 | * Notify end-of-interrupt (EOI) for the specified IRQ. | ||
33 | * @arg == pointer to physdev_eoi structure. | ||
34 | */ | ||
35 | #define PHYSDEVOP_eoi 12 | ||
36 | struct physdev_eoi { | ||
37 | /* IN */ | ||
38 | uint32_t irq; | ||
39 | }; | ||
40 | |||
41 | /* | ||
42 | * Query the status of an IRQ line. | ||
43 | * @arg == pointer to physdev_irq_status_query structure. | ||
44 | */ | ||
45 | #define PHYSDEVOP_irq_status_query 5 | ||
46 | struct physdev_irq_status_query { | ||
47 | /* IN */ | ||
48 | uint32_t irq; | ||
49 | /* OUT */ | ||
50 | uint32_t flags; /* XENIRQSTAT_* */ | ||
51 | }; | ||
52 | |||
53 | /* Need to call PHYSDEVOP_eoi when the IRQ has been serviced? */ | ||
54 | #define _XENIRQSTAT_needs_eoi (0) | ||
55 | #define XENIRQSTAT_needs_eoi (1U<<_XENIRQSTAT_needs_eoi) | ||
56 | |||
57 | /* IRQ shared by multiple guests? */ | ||
58 | #define _XENIRQSTAT_shared (1) | ||
59 | #define XENIRQSTAT_shared (1U<<_XENIRQSTAT_shared) | ||
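Taken together, a guest's IRQ path can query a line once and then EOI only when Xen asks for it (a sketch using the HYPERVISOR_physdev_op() wrapper):

	struct physdev_irq_status_query query = { .irq = irq };

	if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &query) == 0 &&
	    (query.flags & XENIRQSTAT_needs_eoi)) {
		struct physdev_eoi eoi = { .irq = irq };

		HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
	}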
60 | |||
61 | /* | ||
62 | * Set the current VCPU's I/O privilege level. | ||
63 | * @arg == pointer to physdev_set_iopl structure. | ||
64 | */ | ||
65 | #define PHYSDEVOP_set_iopl 6 | ||
66 | struct physdev_set_iopl { | ||
67 | /* IN */ | ||
68 | uint32_t iopl; | ||
69 | }; | ||
70 | |||
71 | /* | ||
72 | * Set the current VCPU's I/O-port permissions bitmap. | ||
73 | * @arg == pointer to physdev_set_iobitmap structure. | ||
74 | */ | ||
75 | #define PHYSDEVOP_set_iobitmap 7 | ||
76 | struct physdev_set_iobitmap { | ||
77 | /* IN */ | ||
78 | uint8_t * bitmap; | ||
79 | uint32_t nr_ports; | ||
80 | }; | ||
81 | |||
82 | /* | ||
83 | * Read or write an IO-APIC register. | ||
84 | * @arg == pointer to physdev_apic structure. | ||
85 | */ | ||
86 | #define PHYSDEVOP_apic_read 8 | ||
87 | #define PHYSDEVOP_apic_write 9 | ||
88 | struct physdev_apic { | ||
89 | /* IN */ | ||
90 | unsigned long apic_physbase; | ||
91 | uint32_t reg; | ||
92 | /* IN or OUT */ | ||
93 | uint32_t value; | ||
94 | }; | ||
95 | |||
96 | /* | ||
97 | * Allocate or free a physical upcall vector for the specified IRQ line. | ||
98 | * @arg == pointer to physdev_irq structure. | ||
99 | */ | ||
100 | #define PHYSDEVOP_alloc_irq_vector 10 | ||
101 | #define PHYSDEVOP_free_irq_vector 11 | ||
102 | struct physdev_irq { | ||
103 | /* IN */ | ||
104 | uint32_t irq; | ||
105 | /* IN or OUT */ | ||
106 | uint32_t vector; | ||
107 | }; | ||
108 | |||
109 | /* | ||
110 | * Argument to physdev_op_compat() hypercall. Superseded by new physdev_op() | ||
111 | * hypercall since 0x00030202. | ||
112 | */ | ||
113 | struct physdev_op { | ||
114 | uint32_t cmd; | ||
115 | union { | ||
116 | struct physdev_irq_status_query irq_status_query; | ||
117 | struct physdev_set_iopl set_iopl; | ||
118 | struct physdev_set_iobitmap set_iobitmap; | ||
119 | struct physdev_apic apic_op; | ||
120 | struct physdev_irq irq_op; | ||
121 | } u; | ||
122 | }; | ||
123 | |||
124 | /* | ||
125 | * Notify that some PIRQ-bound event channels have been unmasked. | ||
126 | * ** This command is obsolete since interface version 0x00030202 and is ** | ||
127 | * ** unsupported by newer versions of Xen. ** | ||
128 | */ | ||
129 | #define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4 | ||
130 | |||
131 | /* | ||
132 | * These all-capitals physdev operation names are superseded by the new names | ||
133 | * (defined above) since interface version 0x00030202. | ||
134 | */ | ||
135 | #define PHYSDEVOP_IRQ_STATUS_QUERY PHYSDEVOP_irq_status_query | ||
136 | #define PHYSDEVOP_SET_IOPL PHYSDEVOP_set_iopl | ||
137 | #define PHYSDEVOP_SET_IOBITMAP PHYSDEVOP_set_iobitmap | ||
138 | #define PHYSDEVOP_APIC_READ PHYSDEVOP_apic_read | ||
139 | #define PHYSDEVOP_APIC_WRITE PHYSDEVOP_apic_write | ||
140 | #define PHYSDEVOP_ASSIGN_VECTOR PHYSDEVOP_alloc_irq_vector | ||
141 | #define PHYSDEVOP_FREE_VECTOR PHYSDEVOP_free_irq_vector | ||
142 | #define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi | ||
143 | #define PHYSDEVOP_IRQ_SHARED XENIRQSTAT_shared | ||
144 | |||
145 | #endif /* __XEN_PUBLIC_PHYSDEV_H__ */ | ||
diff --git a/include/xen/interface/sched.h b/include/xen/interface/sched.h new file mode 100644 index 000000000000..5fec575a800a --- /dev/null +++ b/include/xen/interface/sched.h | |||
@@ -0,0 +1,77 @@ | |||
1 | /****************************************************************************** | ||
2 | * sched.h | ||
3 | * | ||
4 | * Scheduler state interactions | ||
5 | * | ||
6 | * Copyright (c) 2005, Keir Fraser <keir@xensource.com> | ||
7 | */ | ||
8 | |||
9 | #ifndef __XEN_PUBLIC_SCHED_H__ | ||
10 | #define __XEN_PUBLIC_SCHED_H__ | ||
11 | |||
12 | #include "event_channel.h" | ||
13 | |||
14 | /* | ||
15 | * The prototype for this hypercall is: | ||
16 | * long sched_op_new(int cmd, void *arg) | ||
17 | * @cmd == SCHEDOP_??? (scheduler operation). | ||
18 | * @arg == Operation-specific extra argument(s), as described below. | ||
19 | * | ||
20 | * **NOTE**: | ||
21 | * Versions of Xen prior to 3.0.2 provide only the following legacy version | ||
22 | * of this hypercall, supporting only the commands yield, block and shutdown: | ||
23 | * long sched_op(int cmd, unsigned long arg) | ||
24 | * @cmd == SCHEDOP_??? (scheduler operation). | ||
25 | * @arg == 0 (SCHEDOP_yield and SCHEDOP_block) | ||
26 | * == SHUTDOWN_* code (SCHEDOP_shutdown) | ||
27 | */ | ||
28 | |||
29 | /* | ||
30 | * Voluntarily yield the CPU. | ||
31 | * @arg == NULL. | ||
32 | */ | ||
33 | #define SCHEDOP_yield 0 | ||
34 | |||
35 | /* | ||
36 | * Block execution of this VCPU until an event is received for processing. | ||
37 | * If called with event upcalls masked, this operation will atomically | ||
38 | * reenable event delivery and check for pending events before blocking the | ||
39 | * VCPU. This avoids a "wakeup waiting" race. | ||
40 | * @arg == NULL. | ||
41 | */ | ||
42 | #define SCHEDOP_block 1 | ||
43 | |||
44 | /* | ||
45 | * Halt execution of this domain (all VCPUs) and notify the system controller. | ||
46 | * @arg == pointer to sched_shutdown structure. | ||
47 | */ | ||
48 | #define SCHEDOP_shutdown 2 | ||
49 | struct sched_shutdown { | ||
50 | unsigned int reason; /* SHUTDOWN_* */ | ||
51 | }; | ||
52 | DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown); | ||
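For example, a clean reboot request via the new-style hypercall looks like this (assuming a HYPERVISOR_sched_op() wrapper that takes the argument pointer):

	struct sched_shutdown arg = { .reason = SHUTDOWN_reboot };

	HYPERVISOR_sched_op(SCHEDOP_shutdown, &arg);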
53 | |||
54 | /* | ||
55 | * Poll a set of event-channel ports. Return when one or more are pending. An | ||
56 | * optional timeout may be specified. | ||
57 | * @arg == pointer to sched_poll structure. | ||
58 | */ | ||
59 | #define SCHEDOP_poll 3 | ||
60 | struct sched_poll { | ||
61 | GUEST_HANDLE(evtchn_port_t) ports; | ||
62 | unsigned int nr_ports; | ||
63 | uint64_t timeout; | ||
64 | }; | ||
65 | DEFINE_GUEST_HANDLE_STRUCT(sched_poll); | ||
66 | |||
67 | /* | ||
68 | * Reason codes for SCHEDOP_shutdown. These may be interpreted by control | ||
69 | * software to determine the appropriate action. For the most part, Xen does | ||
70 | * not care about the shutdown code. | ||
71 | */ | ||
72 | #define SHUTDOWN_poweroff 0 /* Domain exited normally. Clean up and kill. */ | ||
73 | #define SHUTDOWN_reboot 1 /* Clean up, kill, and then restart. */ | ||
74 | #define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */ | ||
75 | #define SHUTDOWN_crash 3 /* Tell controller we've crashed. */ | ||
76 | |||
77 | #endif /* __XEN_PUBLIC_SCHED_H__ */ | ||
diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h new file mode 100644 index 000000000000..ff61ea365997 --- /dev/null +++ b/include/xen/interface/vcpu.h | |||
@@ -0,0 +1,167 @@ | |||
1 | /****************************************************************************** | ||
2 | * vcpu.h | ||
3 | * | ||
4 | * VCPU initialisation, query, and hotplug. | ||
5 | * | ||
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
7 | * of this software and associated documentation files (the "Software"), to | ||
8 | * deal in the Software without restriction, including without limitation the | ||
9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or | ||
10 | * sell copies of the Software, and to permit persons to whom the Software is | ||
11 | * furnished to do so, subject to the following conditions: | ||
12 | * | ||
13 | * The above copyright notice and this permission notice shall be included in | ||
14 | * all copies or substantial portions of the Software. | ||
15 | * | ||
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||
22 | * DEALINGS IN THE SOFTWARE. | ||
23 | * | ||
24 | * Copyright (c) 2005, Keir Fraser <keir@xensource.com> | ||
25 | */ | ||
26 | |||
27 | #ifndef __XEN_PUBLIC_VCPU_H__ | ||
28 | #define __XEN_PUBLIC_VCPU_H__ | ||
29 | |||
30 | /* | ||
31 | * Prototype for this hypercall is: | ||
32 | * int vcpu_op(int cmd, int vcpuid, void *extra_args) | ||
33 | * @cmd == VCPUOP_??? (VCPU operation). | ||
34 | * @vcpuid == VCPU to operate on. | ||
35 | * @extra_args == Operation-specific extra arguments (NULL if none). | ||
36 | */ | ||
37 | |||
38 | /* | ||
39 | * Initialise a VCPU. Each VCPU can be initialised only once. A | ||
40 | * newly-initialised VCPU will not run until it is brought up by VCPUOP_up. | ||
41 | * | ||
42 | * @extra_arg == pointer to vcpu_guest_context structure containing initial | ||
43 | * state for the VCPU. | ||
44 | */ | ||
45 | #define VCPUOP_initialise 0 | ||
46 | |||
47 | /* | ||
48 | * Bring up a VCPU. This makes the VCPU runnable. This operation will fail | ||
49 | * if the VCPU has not been initialised (VCPUOP_initialise). | ||
50 | */ | ||
51 | #define VCPUOP_up 1 | ||
52 | |||
53 | /* | ||
54 | * Bring down a VCPU (i.e., make it non-runnable). | ||
55 | * There are a few caveats that callers should observe: | ||
56 | * 1. This operation may return, and VCPU_is_up may return false, before the | ||
57 | * VCPU stops running (i.e., the command is asynchronous). It is a good | ||
58 | * idea to ensure that the VCPU has entered a non-critical loop before | ||
59 | * bringing it down. Alternatively, this operation is guaranteed | ||
60 | * synchronous if invoked by the VCPU itself. | ||
61 | * 2. After a VCPU is initialised, there is currently no way to drop all its | ||
62 | * references to domain memory. Even a VCPU that is down still holds | ||
63 | * memory references via its pagetable base pointer and GDT. It is good | ||
64 | * practice to move a VCPU onto an 'idle' or default page table, LDT and | ||
65 | * GDT before bringing it down. | ||
66 | */ | ||
67 | #define VCPUOP_down 2 | ||
68 | |||
69 | /* Returns 1 if the given VCPU is up. */ | ||
70 | #define VCPUOP_is_up 3 | ||
71 | |||
72 | /* | ||
73 | * Return information about the state and running time of a VCPU. | ||
74 | * @extra_arg == pointer to vcpu_runstate_info structure. | ||
75 | */ | ||
76 | #define VCPUOP_get_runstate_info 4 | ||
77 | struct vcpu_runstate_info { | ||
78 | /* VCPU's current state (RUNSTATE_*). */ | ||
79 | int state; | ||
80 | /* When was current state entered (system time, ns)? */ | ||
81 | uint64_t state_entry_time; | ||
82 | /* | ||
83 | * Time spent in each RUNSTATE_* (ns). The sum of these times is | ||
84 | * guaranteed not to drift from system time. | ||
85 | */ | ||
86 | uint64_t time[4]; | ||
87 | }; | ||
88 | |||
89 | /* VCPU is currently running on a physical CPU. */ | ||
90 | #define RUNSTATE_running 0 | ||
91 | |||
92 | /* VCPU is runnable, but not currently scheduled on any physical CPU. */ | ||
93 | #define RUNSTATE_runnable 1 | ||
94 | |||
95 | /* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */ | ||
96 | #define RUNSTATE_blocked 2 | ||
97 | |||
98 | /* | ||
99 | * VCPU is not runnable, but it is not blocked. | ||
100 | * This is a 'catch all' state for things like hotplug and pauses by the | ||
101 | * system administrator (or for critical sections in the hypervisor). | ||
102 | * RUNSTATE_blocked dominates this state (it is the preferred state). | ||
103 | */ | ||
104 | #define RUNSTATE_offline 3 | ||
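As an illustration, a guest can estimate stolen time for the current VCPU from these counters (HYPERVISOR_vcpu_op() and smp_processor_id() are the usual kernel helpers):

	struct vcpu_runstate_info info;

	if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info,
			       smp_processor_id(), &info) == 0)
		printk("stolen: %llu ns\n",
		       (unsigned long long)(info.time[RUNSTATE_runnable] +
					    info.time[RUNSTATE_offline]));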
105 | |||
106 | /* | ||
107 | * Register a shared memory area from which the guest may obtain its own | ||
108 | * runstate information without needing to execute a hypercall. | ||
109 | * Notes: | ||
110 | * 1. The registered address may be virtual or physical, depending on the | ||
111 | * platform. The virtual address should be registered on x86 systems. | ||
112 | * 2. Only one shared area may be registered per VCPU. The shared area is | ||
113 | * updated by the hypervisor each time the VCPU is scheduled. Thus | ||
114 | * runstate.state will always be RUNSTATE_running and | ||
115 | * runstate.state_entry_time will indicate the system time at which the | ||
116 | * VCPU was last scheduled to run. | ||
117 | * @extra_arg == pointer to vcpu_register_runstate_memory_area structure. | ||
118 | */ | ||
119 | #define VCPUOP_register_runstate_memory_area 5 | ||
120 | struct vcpu_register_runstate_memory_area { | ||
121 | union { | ||
122 | struct vcpu_runstate_info *v; | ||
123 | uint64_t p; | ||
124 | } addr; | ||
125 | }; | ||
126 | |||
127 | /* | ||
128 | * Set or stop a VCPU's periodic timer. Every VCPU has one periodic timer | ||
129 | * which can be set via these commands. Periods smaller than one millisecond | ||
130 | * may not be supported. | ||
131 | */ | ||
132 | #define VCPUOP_set_periodic_timer 6 /* arg == vcpu_set_periodic_timer_t */ | ||
133 | #define VCPUOP_stop_periodic_timer 7 /* arg == NULL */ | ||
134 | struct vcpu_set_periodic_timer { | ||
135 | uint64_t period_ns; | ||
136 | }; | ||
137 | |||
138 | /* | ||
139 | * Set or stop a VCPU's single-shot timer. Every VCPU has one single-shot | ||
140 | * timer which can be set via these commands. | ||
141 | */ | ||
142 | #define VCPUOP_set_singleshot_timer 8 /* arg == vcpu_set_singleshot_timer_t */ | ||
143 | #define VCPUOP_stop_singleshot_timer 9 /* arg == NULL */ | ||
144 | struct vcpu_set_singleshot_timer { | ||
145 | uint64_t timeout_abs_ns; | ||
146 | uint32_t flags; /* VCPU_SSHOTTMR_??? */ | ||
147 | }; | ||
148 | |||
149 | /* Flags to VCPUOP_set_singleshot_timer. */ | ||
150 | /* Require the timeout to be in the future (return -ETIME if it's passed). */ | ||
151 | #define _VCPU_SSHOTTMR_future (0) | ||
152 | #define VCPU_SSHOTTMR_future (1U << _VCPU_SSHOTTMR_future) | ||
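Arming a one-shot timer one millisecond out might then look like this (a sketch; now_ns stands in for the guest's current system time):

	struct vcpu_set_singleshot_timer single = {
		.timeout_abs_ns = now_ns + 1000000ULL,
		.flags          = VCPU_SSHOTTMR_future,
	};

	if (HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer,
			       smp_processor_id(), &single) == -ETIME)
		;	/* deadline already passed: run the expiry path now */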
153 | |||
154 | /* | ||
155 | * Register a memory location in the guest address space for the | ||
156 | * vcpu_info structure. This allows the guest to place the vcpu_info | ||
157 | * structure in a convenient place, such as in a per-cpu data area. | ||
158 | * The pointer need not be page aligned, but the structure must not | ||
159 | * cross a page boundary. | ||
160 | */ | ||
161 | #define VCPUOP_register_vcpu_info 10 /* arg == struct vcpu_info */ | ||
162 | struct vcpu_register_vcpu_info { | ||
163 | uint32_t mfn; /* mfn of page to place vcpu_info */ | ||
164 | uint32_t offset; /* offset within page */ | ||
165 | }; | ||
166 | |||
167 | #endif /* __XEN_PUBLIC_VCPU_H__ */ | ||
diff --git a/include/xen/interface/version.h b/include/xen/interface/version.h new file mode 100644 index 000000000000..453235e923f0 --- /dev/null +++ b/include/xen/interface/version.h | |||
@@ -0,0 +1,60 @@ | |||
1 | /****************************************************************************** | ||
2 | * version.h | ||
3 | * | ||
4 | * Xen version, type, and compile information. | ||
5 | * | ||
6 | * Copyright (c) 2005, Nguyen Anh Quynh <aquynh@gmail.com> | ||
7 | * Copyright (c) 2005, Keir Fraser <keir@xensource.com> | ||
8 | */ | ||
9 | |||
10 | #ifndef __XEN_PUBLIC_VERSION_H__ | ||
11 | #define __XEN_PUBLIC_VERSION_H__ | ||
12 | |||
13 | /* NB. All ops return zero on success, except XENVER_version. */ | ||
14 | |||
15 | /* arg == NULL; returns major:minor (16:16). */ | ||
16 | #define XENVER_version 0 | ||
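For example (assuming the usual HYPERVISOR_xen_version() wrapper), unpacking the packed major:minor value:

	int version = HYPERVISOR_xen_version(XENVER_version, NULL);

	printk("running on Xen %d.%d\n", version >> 16, version & 0xffff);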
17 | |||
18 | /* arg == xen_extraversion_t. */ | ||
19 | #define XENVER_extraversion 1 | ||
20 | struct xen_extraversion { | ||
21 | char extraversion[16]; | ||
22 | }; | ||
23 | #define XEN_EXTRAVERSION_LEN (sizeof(struct xen_extraversion)) | ||
24 | |||
25 | /* arg == xen_compile_info_t. */ | ||
26 | #define XENVER_compile_info 2 | ||
27 | struct xen_compile_info { | ||
28 | char compiler[64]; | ||
29 | char compile_by[16]; | ||
30 | char compile_domain[32]; | ||
31 | char compile_date[32]; | ||
32 | }; | ||
33 | |||
34 | #define XENVER_capabilities 3 | ||
35 | struct xen_capabilities_info { | ||
36 | char info[1024]; | ||
37 | }; | ||
38 | #define XEN_CAPABILITIES_INFO_LEN (sizeof(struct xen_capabilities_info)) | ||
39 | |||
40 | #define XENVER_changeset 4 | ||
41 | struct xen_changeset_info { | ||
42 | char info[64]; | ||
43 | }; | ||
44 | #define XEN_CHANGESET_INFO_LEN (sizeof(struct xen_changeset_info)) | ||
45 | |||
46 | #define XENVER_platform_parameters 5 | ||
47 | struct xen_platform_parameters { | ||
48 | unsigned long virt_start; | ||
49 | }; | ||
50 | |||
51 | #define XENVER_get_features 6 | ||
52 | struct xen_feature_info { | ||
53 | unsigned int submap_idx; /* IN: which 32-bit submap to return */ | ||
54 | uint32_t submap; /* OUT: 32-bit submap */ | ||
55 | }; | ||
56 | |||
57 | /* Declares the features reported by XENVER_get_features. */ | ||
58 | #include "features.h" | ||
59 | |||
60 | #endif /* __XEN_PUBLIC_VERSION_H__ */ | ||
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h new file mode 100644 index 000000000000..518a5bf79ed3 --- /dev/null +++ b/include/xen/interface/xen.h | |||
@@ -0,0 +1,447 @@ | |||
1 | /****************************************************************************** | ||
2 | * xen.h | ||
3 | * | ||
4 | * Guest OS interface to Xen. | ||
5 | * | ||
6 | * Copyright (c) 2004, K A Fraser | ||
7 | */ | ||
8 | |||
9 | #ifndef __XEN_PUBLIC_XEN_H__ | ||
10 | #define __XEN_PUBLIC_XEN_H__ | ||
11 | |||
12 | #include <asm/xen/interface.h> | ||
13 | |||
14 | /* | ||
15 | * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS). | ||
16 | */ | ||
17 | |||
18 | /* | ||
19 | * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5. | ||
20 | * EAX = return value | ||
21 | * (argument registers may be clobbered on return) | ||
22 | * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6. | ||
23 | * RAX = return value | ||
24 | * (argument registers not clobbered on return; RCX, R11 are) | ||
25 | */ | ||
26 | #define __HYPERVISOR_set_trap_table 0 | ||
27 | #define __HYPERVISOR_mmu_update 1 | ||
28 | #define __HYPERVISOR_set_gdt 2 | ||
29 | #define __HYPERVISOR_stack_switch 3 | ||
30 | #define __HYPERVISOR_set_callbacks 4 | ||
31 | #define __HYPERVISOR_fpu_taskswitch 5 | ||
32 | #define __HYPERVISOR_sched_op 6 | ||
33 | #define __HYPERVISOR_dom0_op 7 | ||
34 | #define __HYPERVISOR_set_debugreg 8 | ||
35 | #define __HYPERVISOR_get_debugreg 9 | ||
36 | #define __HYPERVISOR_update_descriptor 10 | ||
37 | #define __HYPERVISOR_memory_op 12 | ||
38 | #define __HYPERVISOR_multicall 13 | ||
39 | #define __HYPERVISOR_update_va_mapping 14 | ||
40 | #define __HYPERVISOR_set_timer_op 15 | ||
41 | #define __HYPERVISOR_event_channel_op_compat 16 | ||
42 | #define __HYPERVISOR_xen_version 17 | ||
43 | #define __HYPERVISOR_console_io 18 | ||
44 | #define __HYPERVISOR_physdev_op_compat 19 | ||
45 | #define __HYPERVISOR_grant_table_op 20 | ||
46 | #define __HYPERVISOR_vm_assist 21 | ||
47 | #define __HYPERVISOR_update_va_mapping_otherdomain 22 | ||
48 | #define __HYPERVISOR_iret 23 /* x86 only */ | ||
49 | #define __HYPERVISOR_vcpu_op 24 | ||
50 | #define __HYPERVISOR_set_segment_base 25 /* x86/64 only */ | ||
51 | #define __HYPERVISOR_mmuext_op 26 | ||
52 | #define __HYPERVISOR_acm_op 27 | ||
53 | #define __HYPERVISOR_nmi_op 28 | ||
54 | #define __HYPERVISOR_sched_op_new 29 | ||
55 | #define __HYPERVISOR_callback_op 30 | ||
56 | #define __HYPERVISOR_xenoprof_op 31 | ||
57 | #define __HYPERVISOR_event_channel_op 32 | ||
58 | #define __HYPERVISOR_physdev_op 33 | ||
59 | #define __HYPERVISOR_hvm_op 34 | ||
60 | |||
61 | /* | ||
62 | * VIRTUAL INTERRUPTS | ||
63 | * | ||
64 | * Virtual interrupts that a guest OS may receive from Xen. | ||
65 | */ | ||
66 | #define VIRQ_TIMER 0 /* Timebase update, and/or requested timeout. */ | ||
67 | #define VIRQ_DEBUG 1 /* Request guest to dump debug info. */ | ||
68 | #define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */ | ||
69 | #define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */ | ||
70 | #define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */ | ||
71 | #define NR_VIRQS 8 | ||
72 | |||
73 | /* | ||
74 | * MMU-UPDATE REQUESTS | ||
75 | * | ||
76 | * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs. | ||
77 | * A foreigndom (FD) can be specified (or DOMID_SELF for none). | ||
78 | * Where the FD has some effect, it is described below. | ||
79 | * ptr[1:0] specifies the appropriate MMU_* command. | ||
80 | * | ||
81 | * ptr[1:0] == MMU_NORMAL_PT_UPDATE: | ||
82 | * Updates an entry in a page table. If updating an L1 table, and the new | ||
83 | * table entry is valid/present, the mapped frame must belong to the FD, if | ||
84 | * an FD has been specified. If attempting to map an I/O page then the | ||
85 | * caller assumes the privilege of the FD. | ||
86 | * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller. | ||
87 | * FD == DOMID_XEN: Map restricted areas of Xen's heap space. | ||
88 | * ptr[:2] -- Machine address of the page-table entry to modify. | ||
89 | * val -- Value to write. | ||
90 | * | ||
91 | * ptr[1:0] == MMU_MACHPHYS_UPDATE: | ||
92 | * Updates an entry in the machine->pseudo-physical mapping table. | ||
93 | * ptr[:2] -- Machine address within the frame whose mapping to modify. | ||
94 | * The frame must belong to the FD, if one is specified. | ||
95 | * val -- Value to write into the mapping entry. | ||
96 | */ | ||
97 | #define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ | ||
98 | #define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */ | ||
99 | |||
100 | /* | ||
101 | * MMU EXTENDED OPERATIONS | ||
102 | * | ||
103 | * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures. | ||
104 | * A foreigndom (FD) can be specified (or DOMID_SELF for none). | ||
105 | * Where the FD has some effect, it is described below. | ||
106 | * | ||
107 | * cmd: MMUEXT_(UN)PIN_*_TABLE | ||
108 | * mfn: Machine frame number to be (un)pinned as a p.t. page. | ||
109 | * The frame must belong to the FD, if one is specified. | ||
110 | * | ||
111 | * cmd: MMUEXT_NEW_BASEPTR | ||
112 | * mfn: Machine frame number of new page-table base to install in MMU. | ||
113 | * | ||
114 | * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only] | ||
115 | * mfn: Machine frame number of new page-table base to install in MMU | ||
116 | * when in user space. | ||
117 | * | ||
118 | * cmd: MMUEXT_TLB_FLUSH_LOCAL | ||
119 | * No additional arguments. Flushes local TLB. | ||
120 | * | ||
121 | * cmd: MMUEXT_INVLPG_LOCAL | ||
122 | * linear_addr: Linear address to be flushed from the local TLB. | ||
123 | * | ||
124 | * cmd: MMUEXT_TLB_FLUSH_MULTI | ||
125 | * vcpumask: Pointer to bitmap of VCPUs to be flushed. | ||
126 | * | ||
127 | * cmd: MMUEXT_INVLPG_MULTI | ||
128 | * linear_addr: Linear address to be flushed. | ||
129 | * vcpumask: Pointer to bitmap of VCPUs to be flushed. | ||
130 | * | ||
131 | * cmd: MMUEXT_TLB_FLUSH_ALL | ||
132 | * No additional arguments. Flushes all VCPUs' TLBs. | ||
133 | * | ||
134 | * cmd: MMUEXT_INVLPG_ALL | ||
135 | * linear_addr: Linear address to be flushed from all VCPUs' TLBs. | ||
136 | * | ||
137 | * cmd: MMUEXT_FLUSH_CACHE | ||
138 | * No additional arguments. Writes back and flushes cache contents. | ||
139 | * | ||
140 | * cmd: MMUEXT_SET_LDT | ||
141 | * linear_addr: Linear address of LDT base (NB. must be page-aligned). | ||
142 | * nr_ents: Number of entries in LDT. | ||
143 | */ | ||
144 | #define MMUEXT_PIN_L1_TABLE 0 | ||
145 | #define MMUEXT_PIN_L2_TABLE 1 | ||
146 | #define MMUEXT_PIN_L3_TABLE 2 | ||
147 | #define MMUEXT_PIN_L4_TABLE 3 | ||
148 | #define MMUEXT_UNPIN_TABLE 4 | ||
149 | #define MMUEXT_NEW_BASEPTR 5 | ||
150 | #define MMUEXT_TLB_FLUSH_LOCAL 6 | ||
151 | #define MMUEXT_INVLPG_LOCAL 7 | ||
152 | #define MMUEXT_TLB_FLUSH_MULTI 8 | ||
153 | #define MMUEXT_INVLPG_MULTI 9 | ||
154 | #define MMUEXT_TLB_FLUSH_ALL 10 | ||
155 | #define MMUEXT_INVLPG_ALL 11 | ||
156 | #define MMUEXT_FLUSH_CACHE 12 | ||
157 | #define MMUEXT_SET_LDT 13 | ||
158 | #define MMUEXT_NEW_USER_BASEPTR 15 | ||
159 | |||
160 | #ifndef __ASSEMBLY__ | ||
161 | struct mmuext_op { | ||
162 | unsigned int cmd; | ||
163 | union { | ||
164 | /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */ | ||
165 | unsigned long mfn; | ||
166 | /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */ | ||
167 | unsigned long linear_addr; | ||
168 | } arg1; | ||
169 | union { | ||
170 | /* SET_LDT */ | ||
171 | unsigned int nr_ents; | ||
172 | /* TLB_FLUSH_MULTI, INVLPG_MULTI */ | ||
173 | void *vcpumask; | ||
174 | } arg2; | ||
175 | }; | ||
176 | DEFINE_GUEST_HANDLE_STRUCT(mmuext_op); | ||
177 | #endif | ||
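For example, pinning a new L2 page table could be requested like this (a sketch; pgd_mfn is an assumed machine frame number and HYPERVISOR_mmuext_op() the usual wrapper):

	struct mmuext_op op = {
		.cmd      = MMUEXT_PIN_L2_TABLE,
		.arg1.mfn = pgd_mfn,
	};

	HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);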
178 | |||
179 | /* These are passed as 'flags' to update_va_mapping. They can be ORed. */ | ||
180 | /* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap. */ | ||
181 | /* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer. */ | ||
182 | #define UVMF_NONE (0UL<<0) /* No flushing at all. */ | ||
183 | #define UVMF_TLB_FLUSH (1UL<<0) /* Flush entire TLB(s). */ | ||
184 | #define UVMF_INVLPG (2UL<<0) /* Flush only one entry. */ | ||
185 | #define UVMF_FLUSHTYPE_MASK (3UL<<0) | ||
186 | #define UVMF_MULTI (0UL<<2) /* Flush subset of TLBs. */ | ||
187 | #define UVMF_LOCAL (0UL<<2) /* Flush local TLB. */ | ||
188 | #define UVMF_ALL (1UL<<2) /* Flush all TLBs. */ | ||
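So a single-PTE update that also flushes just that entry on the local CPU would be issued as (a sketch; vaddr and pte are assumptions, and the wrapper's pte argument type varies with PAE):

	HYPERVISOR_update_va_mapping(vaddr, pte, UVMF_INVLPG | UVMF_LOCAL);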
189 | |||
190 | /* | ||
191 | * Commands to HYPERVISOR_console_io(). | ||
192 | */ | ||
193 | #define CONSOLEIO_write 0 | ||
194 | #define CONSOLEIO_read 1 | ||
195 | |||
196 | /* | ||
197 | * Commands to HYPERVISOR_vm_assist(). | ||
198 | */ | ||
199 | #define VMASST_CMD_enable 0 | ||
200 | #define VMASST_CMD_disable 1 | ||
201 | #define VMASST_TYPE_4gb_segments 0 | ||
202 | #define VMASST_TYPE_4gb_segments_notify 1 | ||
203 | #define VMASST_TYPE_writable_pagetables 2 | ||
204 | #define VMASST_TYPE_pae_extended_cr3 3 | ||
205 | #define MAX_VMASST_TYPE 3 | ||
206 | |||
207 | #ifndef __ASSEMBLY__ | ||
208 | |||
209 | typedef uint16_t domid_t; | ||
210 | |||
211 | /* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */ | ||
212 | #define DOMID_FIRST_RESERVED (0x7FF0U) | ||
213 | |||
214 | /* DOMID_SELF is used in certain contexts to refer to oneself. */ | ||
215 | #define DOMID_SELF (0x7FF0U) | ||
216 | |||
217 | /* | ||
218 | * DOMID_IO is used to restrict page-table updates to mapping I/O memory. | ||
219 | * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO | ||
220 | * is useful to ensure that no mappings to the OS's own heap are accidentally | ||
221 | * installed. (e.g., in Linux this could cause havoc as reference counts | ||
222 | * aren't adjusted on the I/O-mapping code path). | ||
223 | * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can | ||
224 | * be specified by any calling domain. | ||
225 | */ | ||
226 | #define DOMID_IO (0x7FF1U) | ||
227 | |||
228 | /* | ||
229 | * DOMID_XEN is used to allow privileged domains to map restricted parts of | ||
230 | * Xen's heap space (e.g., the machine_to_phys table). | ||
231 | * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if | ||
232 | * the caller is privileged. | ||
233 | */ | ||
234 | #define DOMID_XEN (0x7FF2U) | ||
235 | |||
236 | /* | ||
237 | * Send an array of these to HYPERVISOR_mmu_update(). | ||
238 | * NB. The fields are natural pointer/address size for this architecture. | ||
239 | */ | ||
240 | struct mmu_update { | ||
241 | uint64_t ptr; /* Machine address of PTE. */ | ||
242 | uint64_t val; /* New contents of PTE. */ | ||
243 | }; | ||
244 | DEFINE_GUEST_HANDLE_STRUCT(mmu_update); | ||
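A single checked PTE write then becomes (a sketch; pte_machine_addr and new_pte_val are assumed values):

	struct mmu_update u = {
		.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE,
		.val = new_pte_val,
	};

	HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);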
245 | |||
246 | /* | ||
247 | * Send an array of these to HYPERVISOR_multicall(). | ||
248 | * NB. The fields are natural register size for this architecture. | ||
249 | */ | ||
250 | struct multicall_entry { | ||
251 | unsigned long op; | ||
252 | long result; | ||
253 | unsigned long args[6]; | ||
254 | }; | ||
255 | DEFINE_GUEST_HANDLE_STRUCT(multicall_entry); | ||
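Batching the mmu_update above with, say, an fpu_taskswitch into one hypervisor entry might look like this (illustrative only; Xen fills in each entry's result field):

	struct multicall_entry batch[2] = {
		{ .op   = __HYPERVISOR_mmu_update,
		  .args = { (unsigned long)&u, 1, 0, DOMID_SELF } },
		{ .op   = __HYPERVISOR_fpu_taskswitch,
		  .args = { 1 } },	/* set CR0.TS */
	};

	HYPERVISOR_multicall(batch, 2);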
256 | |||
257 | /* | ||
258 | * Event channel endpoints per domain: | ||
259 | * 1024 if a long is 32 bits; 4096 if a long is 64 bits. | ||
260 | */ | ||
261 | #define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64) | ||
262 | |||
263 | struct vcpu_time_info { | ||
264 | /* | ||
265 | * Updates to the following values are preceded and followed | ||
266 | * by an increment of 'version'. The guest can therefore | ||
267 | * detect updates by looking for changes to 'version'. If the | ||
268 | * least-significant bit of the version number is set then an | ||
269 | * update is in progress and the guest must wait to read a | ||
270 | * consistent set of values. The correct way to interact with | ||
271 | * the version number is similar to Linux's seqlock: see the | ||
272 | * implementations of read_seqbegin/read_seqretry. | ||
273 | */ | ||
274 | uint32_t version; | ||
275 | uint32_t pad0; | ||
276 | uint64_t tsc_timestamp; /* TSC at last update of time vals. */ | ||
277 | uint64_t system_time; /* Time, in nanosecs, since boot. */ | ||
278 | /* | ||
279 | * Current system time: | ||
280 | * system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul | ||
281 | * CPU frequency (Hz): | ||
282 | * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift | ||
283 | */ | ||
284 | uint32_t tsc_to_system_mul; | ||
285 | int8_t tsc_shift; | ||
286 | int8_t pad1[3]; | ||
287 | }; /* 32 bytes */ | ||
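The seqlock-style read the comment above describes amounts to the following (a sketch; rmb() is the kernel's read barrier):

	static uint64_t read_system_time(const volatile struct vcpu_time_info *t)
	{
		uint32_t version;
		uint64_t system_time;

		do {
			version = t->version;
			rmb();		/* version before payload */
			system_time = t->system_time;
			rmb();		/* payload before the re-check */
		} while ((version & 1) || version != t->version);

		/* callers then scale (tsc - tsc_timestamp) as shown above */
		return system_time;
	}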
288 | |||
289 | struct vcpu_info { | ||
290 | /* | ||
291 | * 'evtchn_upcall_pending' is written non-zero by Xen to indicate | ||
292 | * a pending notification for a particular VCPU. It is then cleared | ||
293 | * by the guest OS /before/ checking for pending work, thus avoiding | ||
294 | * a set-and-check race. Note that the mask is only accessed by Xen | ||
295 | * on the CPU that is currently hosting the VCPU. This means that the | ||
296 | * pending and mask flags can be updated by the guest without special | ||
297 | * synchronisation (i.e., no need for the x86 LOCK prefix). | ||
298 | * This may seem suboptimal because if the pending flag is set by | ||
299 | * a different CPU then an IPI may be scheduled even when the mask | ||
300 | * is set. However, note: | ||
301 | * 1. The task of 'interrupt holdoff' is covered by the per-event- | ||
302 | * channel mask bits. A 'noisy' event that is continually being | ||
303 | * triggered can be masked at source at this very precise | ||
304 | * granularity. | ||
305 | * 2. The main purpose of the per-VCPU mask is therefore to restrict | ||
306 | * reentrant execution: whether for concurrency control, or to | ||
307 | * prevent unbounded stack usage. Whatever the purpose, we expect | ||
308 | * that the mask will be asserted only for short periods at a time, | ||
309 | * and so the likelihood of a 'spurious' IPI is suitably small. | ||
310 | * The mask is read before making an event upcall to the guest: a | ||
311 | * non-zero mask therefore guarantees that the VCPU will not receive | ||
312 | * an upcall activation. The mask is cleared when the VCPU requests | ||
313 | * to block: this avoids wakeup-waiting races. | ||
314 | */ | ||
315 | uint8_t evtchn_upcall_pending; | ||
316 | uint8_t evtchn_upcall_mask; | ||
317 | unsigned long evtchn_pending_sel; | ||
318 | struct arch_vcpu_info arch; | ||
319 | struct vcpu_time_info time; | ||
320 | }; /* 64 bytes (x86) */ | ||
321 | |||
322 | /* | ||
323 | * Xen/kernel shared data -- pointer provided in start_info. | ||
324 | * NB. We expect that this struct is smaller than a page. | ||
325 | */ | ||
326 | struct shared_info { | ||
327 | struct vcpu_info vcpu_info[MAX_VIRT_CPUS]; | ||
328 | |||
329 | /* | ||
330 | * A domain can create "event channels" on which it can send and receive | ||
331 | * asynchronous event notifications. There are three classes of event that | ||
332 | * are delivered by this mechanism: | ||
333 | * 1. Bi-directional inter- and intra-domain connections. Domains must | ||
334 | * arrange out-of-band to set up a connection (usually by allocating | ||
335 | * an unbound 'listener' port and advertising that via a storage service | ||
336 | * such as xenstore). | ||
337 | * 2. Physical interrupts. A domain with suitable hardware-access | ||
338 | * privileges can bind an event-channel port to a physical interrupt | ||
339 | * source. | ||
340 | * 3. Virtual interrupts ('events'). A domain can bind an event-channel | ||
341 | * port to a virtual interrupt source, such as the virtual-timer | ||
342 | * device or the emergency console. | ||
343 | * | ||
344 | * Event channels are addressed by a "port index". Each channel is | ||
345 | * associated with two bits of information: | ||
346 | * 1. PENDING -- notifies the domain that there is a pending notification | ||
347 | * to be processed. This bit is cleared by the guest. | ||
348 | * 2. MASK -- if this bit is clear then a 0->1 transition of PENDING | ||
349 | * will cause an asynchronous upcall to be scheduled. This bit is only | ||
350 | * updated by the guest. It is read-only within Xen. If a channel | ||
351 | * becomes pending while the channel is masked then the 'edge' is lost | ||
352 | * (i.e., when the channel is unmasked, the guest must manually handle | ||
353 | * pending notifications as no upcall will be scheduled by Xen). | ||
354 | * | ||
355 | * To expedite scanning of pending notifications, any 0->1 pending | ||
356 | * transition on an unmasked channel causes a corresponding bit in a | ||
357 | * per-vcpu selector word to be set. Each bit in the selector covers a | ||
358 | * 'C long' in the PENDING bitfield array. | ||
359 | */ | ||
360 | unsigned long evtchn_pending[sizeof(unsigned long) * 8]; | ||
361 | unsigned long evtchn_mask[sizeof(unsigned long) * 8]; | ||
362 | |||
363 | /* | ||
364 | * Wallclock time: updated only by control software. Guests should base | ||
365 | * their gettimeofday() syscall on this wallclock-base value. | ||
366 | */ | ||
367 | uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */ | ||
368 | uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */ | ||
369 | uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */ | ||
370 | |||
371 | struct arch_shared_info arch; | ||
372 | |||
373 | }; | ||
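An event-delivery loop built on these bits might scan as follows (a sketch under stated assumptions: vcpu and shared point at this VCPU's vcpu_info and the shared_info page, handle_port() is hypothetical, and a real implementation also clears PENDING bits with atomic operations):

	unsigned long sel = xchg(&vcpu->evtchn_pending_sel, 0);

	while (sel) {
		unsigned int word = __ffs(sel);
		unsigned long pending;

		sel &= ~(1UL << word);
		pending = shared->evtchn_pending[word] &
			  ~shared->evtchn_mask[word];
		while (pending) {
			unsigned int bit = __ffs(pending);

			pending &= ~(1UL << bit);
			handle_port(word * BITS_PER_LONG + bit);
		}
	}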
374 | |||
375 | /* | ||
376 | * Start-of-day memory layout for the initial domain (DOM0): | ||
377 | * 1. The domain is started within contiguous virtual-memory region. | ||
378 | * 2. The contiguous region begins and ends on an aligned 4MB boundary. | ||
379 | * 3. The region start corresponds to the load address of the OS image. | ||
380 | * If the load address is not 4MB aligned then the address is rounded down. | ||
381 | * 4. This is the order of bootstrap elements in the initial virtual region: | ||
382 | * a. relocated kernel image | ||
383 | * b. initial ram disk [mod_start, mod_len] | ||
384 | * c. list of allocated page frames [mfn_list, nr_pages] | ||
385 | * d. start_info_t structure [register ESI (x86)] | ||
386 | * e. bootstrap page tables [pt_base, CR3 (x86)] | ||
387 | * f. bootstrap stack [register ESP (x86)] | ||
388 | * 5. Bootstrap elements are packed together, but each is 4kB-aligned. | ||
389 | * 6. The initial ram disk may be omitted. | ||
390 | * 7. The list of page frames forms a contiguous 'pseudo-physical' memory | ||
391 | * layout for the domain. In particular, the bootstrap virtual-memory | ||
392 | * region is a 1:1 mapping to the first section of the pseudo-physical map. | ||
393 | * 8. All bootstrap elements are mapped read-writable for the guest OS. The | ||
394 | * only exception is the bootstrap page table, which is mapped read-only. | ||
395 | * 9. There is guaranteed to be at least 512kB padding after the final | ||
396 | * bootstrap element. If necessary, the bootstrap virtual region is | ||
397 | * extended by an extra 4MB to ensure this. | ||
398 | */ | ||
399 | |||
400 | #define MAX_GUEST_CMDLINE 1024 | ||
401 | struct start_info { | ||
402 | /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */ | ||
403 | char magic[32]; /* "xen-<version>-<platform>". */ | ||
404 | unsigned long nr_pages; /* Total pages allocated to this domain. */ | ||
405 | unsigned long shared_info; /* MACHINE address of shared info struct. */ | ||
406 | uint32_t flags; /* SIF_xxx flags. */ | ||
407 | unsigned long store_mfn; /* MACHINE page number of shared page. */ | ||
408 | uint32_t store_evtchn; /* Event channel for store communication. */ | ||
409 | union { | ||
410 | struct { | ||
411 | unsigned long mfn; /* MACHINE page number of console page. */ | ||
412 | uint32_t evtchn; /* Event channel for console page. */ | ||
413 | } domU; | ||
414 | struct { | ||
415 | uint32_t info_off; /* Offset of console_info struct. */ | ||
416 | uint32_t info_size; /* Size of console_info struct from start.*/ | ||
417 | } dom0; | ||
418 | } console; | ||
419 | /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */ | ||
420 | unsigned long pt_base; /* VIRTUAL address of page directory. */ | ||
421 | unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. */ | ||
422 | unsigned long mfn_list; /* VIRTUAL address of page-frame list. */ | ||
423 | unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */ | ||
424 | unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ | ||
425 | int8_t cmd_line[MAX_GUEST_CMDLINE]; | ||
426 | }; | ||
427 | |||
428 | /* These flags are passed in the 'flags' field of start_info_t. */ | ||
429 | #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ | ||
430 | #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ | ||
431 | |||
432 | typedef uint64_t cpumap_t; | ||
433 | |||
434 | typedef uint8_t xen_domain_handle_t[16]; | ||
435 | |||
436 | /* Turn a plain number into a C unsigned long constant. */ | ||
437 | #define __mk_unsigned_long(x) x ## UL | ||
438 | #define mk_unsigned_long(x) __mk_unsigned_long(x) | ||
439 | |||
440 | #else /* __ASSEMBLY__ */ | ||
441 | |||
442 | /* In assembly code we cannot use C numeric constant suffixes. */ | ||
443 | #define mk_unsigned_long(x) x | ||
444 | |||
445 | #endif /* !__ASSEMBLY__ */ | ||
446 | |||
447 | #endif /* __XEN_PUBLIC_XEN_H__ */ | ||
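The event-channel comments above imply a two-level scan: the per-vcpu selector word picks out which words of evtchn_pending[] need attention, and evtchn_mask[] filters individual channels within each word. A minimal sketch of such a scan, assuming a mapped shared_info, a vcpu_info exposing the usual evtchn_pending_sel selector word, a hypothetical handle_port() callback, and sync_clear_bit() as the hypervisor-safe atomic bitop:

    static void scan_pending_events(struct shared_info *s, struct vcpu_info *v)
    {
            /* Atomically grab and reset the selector word. */
            unsigned long sel = xchg(&v->evtchn_pending_sel, 0);

            while (sel) {
                    unsigned long word = __ffs(sel);
                    unsigned long pending;

                    sel &= ~(1UL << word);

                    /* Masked channels never raise an upcall. */
                    pending = s->evtchn_pending[word] & ~s->evtchn_mask[word];
                    while (pending) {
                            unsigned long bit = __ffs(pending);

                            pending &= ~(1UL << bit);
                            /* PENDING is cleared by the guest, per the
                             * comment above. */
                            sync_clear_bit(bit, &s->evtchn_pending[word]);
                            handle_port(word * BITS_PER_LONG + bit); /* hypothetical */
                    }
            }
    }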
diff --git a/include/xen/page.h b/include/xen/page.h new file mode 100644 index 000000000000..1df6c1930578 --- /dev/null +++ b/include/xen/page.h | |||
@@ -0,0 +1,179 @@ | |||
1 | #ifndef __XEN_PAGE_H | ||
2 | #define __XEN_PAGE_H | ||
3 | |||
4 | #include <linux/pfn.h> | ||
5 | |||
6 | #include <asm/uaccess.h> | ||
7 | |||
8 | #include <xen/features.h> | ||
9 | |||
10 | #ifdef CONFIG_X86_PAE | ||
11 | /* Xen machine address */ | ||
12 | typedef struct xmaddr { | ||
13 | unsigned long long maddr; | ||
14 | } xmaddr_t; | ||
15 | |||
16 | /* Xen pseudo-physical address */ | ||
17 | typedef struct xpaddr { | ||
18 | unsigned long long paddr; | ||
19 | } xpaddr_t; | ||
20 | #else | ||
21 | /* Xen machine address */ | ||
22 | typedef struct xmaddr { | ||
23 | unsigned long maddr; | ||
24 | } xmaddr_t; | ||
25 | |||
26 | /* Xen pseudo-physical address */ | ||
27 | typedef struct xpaddr { | ||
28 | unsigned long paddr; | ||
29 | } xpaddr_t; | ||
30 | #endif | ||
31 | |||
32 | #define XMADDR(x) ((xmaddr_t) { .maddr = (x) }) | ||
33 | #define XPADDR(x) ((xpaddr_t) { .paddr = (x) }) | ||
34 | |||
35 | /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ | ||
36 | #define INVALID_P2M_ENTRY (~0UL) | ||
37 | #define FOREIGN_FRAME_BIT (1UL<<31) | ||
38 | #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) | ||
39 | |||
40 | extern unsigned long *phys_to_machine_mapping; | ||
41 | |||
42 | static inline unsigned long pfn_to_mfn(unsigned long pfn) | ||
43 | { | ||
44 | if (xen_feature(XENFEAT_auto_translated_physmap)) | ||
45 | return pfn; | ||
46 | |||
47 | return phys_to_machine_mapping[(unsigned int)(pfn)] & | ||
48 | ~FOREIGN_FRAME_BIT; | ||
49 | } | ||
50 | |||
51 | static inline int phys_to_machine_mapping_valid(unsigned long pfn) | ||
52 | { | ||
53 | if (xen_feature(XENFEAT_auto_translated_physmap)) | ||
54 | return 1; | ||
55 | |||
56 | return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY); | ||
57 | } | ||
58 | |||
59 | static inline unsigned long mfn_to_pfn(unsigned long mfn) | ||
60 | { | ||
61 | unsigned long pfn; | ||
62 | |||
63 | if (xen_feature(XENFEAT_auto_translated_physmap)) | ||
64 | return mfn; | ||
65 | |||
66 | #if 0 | ||
67 | if (unlikely((mfn >> machine_to_phys_order) != 0)) | ||
68 | return max_mapnr; | ||
69 | #endif | ||
70 | |||
71 | pfn = 0; | ||
72 | /* | ||
73 | * The array access can fail (e.g., device space beyond end of RAM). | ||
74 | * In such cases it doesn't matter what we return (we return garbage), | ||
75 | * but we must handle the fault without crashing! | ||
76 | */ | ||
77 | __get_user(pfn, &machine_to_phys_mapping[mfn]); | ||
78 | |||
79 | return pfn; | ||
80 | } | ||
81 | |||
82 | static inline xmaddr_t phys_to_machine(xpaddr_t phys) | ||
83 | { | ||
84 | unsigned offset = phys.paddr & ~PAGE_MASK; | ||
85 | return XMADDR(PFN_PHYS((u64)pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset); | ||
86 | } | ||
87 | |||
88 | static inline xpaddr_t machine_to_phys(xmaddr_t machine) | ||
89 | { | ||
90 | unsigned offset = machine.maddr & ~PAGE_MASK; | ||
91 | return XPADDR(PFN_PHYS((u64)mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * We detect special mappings in one of two ways: | ||
96 | * 1. If the MFN is an I/O page then Xen will set the m2p entry | ||
97 | * to be outside our maximum possible pseudophys range. | ||
98 | * 2. If the MFN belongs to a different domain then we will certainly | ||
99 | * not have MFN in our p2m table. Conversely, if the page is ours, | ||
100 | * then we'll have p2m(m2p(MFN))==MFN. | ||
101 | * If we detect a special mapping then it doesn't have a 'struct page'. | ||
102 | * We force !pfn_valid() by returning an out-of-range pointer. | ||
103 | * | ||
104 | * NB. These checks require that, for any MFN that is not in our reservation, | ||
105 | * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if | ||
106 | * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN. | ||
107 | * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. | ||
108 | * | ||
109 | * NB2. When deliberately mapping foreign pages into the p2m table, you *must* | ||
110 | * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we | ||
111 | * require. In all the cases we care about, the FOREIGN_FRAME bit is | ||
112 | * masked (e.g., pfn_to_mfn()) so behaviour there is correct. | ||
113 | */ | ||
114 | static inline unsigned long mfn_to_local_pfn(unsigned long mfn) | ||
115 | { | ||
116 | extern unsigned long max_mapnr; | ||
117 | unsigned long pfn = mfn_to_pfn(mfn); | ||
118 | if ((pfn < max_mapnr) | ||
119 | && !xen_feature(XENFEAT_auto_translated_physmap) | ||
120 | && (phys_to_machine_mapping[pfn] != mfn)) | ||
121 | return max_mapnr; /* force !pfn_valid() */ | ||
122 | return pfn; | ||
123 | } | ||
124 | |||
125 | static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) | ||
126 | { | ||
127 | if (xen_feature(XENFEAT_auto_translated_physmap)) { | ||
128 | BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); | ||
129 | return; | ||
130 | } | ||
131 | phys_to_machine_mapping[pfn] = mfn; | ||
132 | } | ||
133 | |||
134 | /* VIRT <-> MACHINE conversion */ | ||
135 | #define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) | ||
136 | #define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v)))) | ||
137 | #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) | ||
138 | |||
139 | #ifdef CONFIG_X86_PAE | ||
140 | #define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \ | ||
141 | (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT))) | ||
142 | |||
143 | static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot) | ||
144 | { | ||
145 | pte_t pte; | ||
146 | |||
147 | pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | | ||
148 | (pgprot_val(pgprot) >> 32); | ||
149 | pte.pte_high &= (__supported_pte_mask >> 32); | ||
150 | pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); | ||
151 | pte.pte_low &= __supported_pte_mask; | ||
152 | |||
153 | return pte; | ||
154 | } | ||
155 | |||
156 | static inline unsigned long long pte_val_ma(pte_t x) | ||
157 | { | ||
158 | return ((unsigned long long)x.pte_high << 32) | x.pte_low; | ||
159 | } | ||
160 | #define pmd_val_ma(v) ((v).pmd) | ||
161 | #define pud_val_ma(v) ((v).pgd.pgd) | ||
162 | #define __pte_ma(x) ((pte_t) { .pte_low = (x), .pte_high = (x)>>32 } ) | ||
163 | #define __pmd_ma(x) ((pmd_t) { (x) } ) | ||
164 | #else /* !X86_PAE */ | ||
165 | #define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT) | ||
166 | #define mfn_pte(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) | ||
167 | #define pte_val_ma(x) ((x).pte_low) | ||
168 | #define pmd_val_ma(v) ((v).pud.pgd.pgd) | ||
169 | #define __pte_ma(x) ((pte_t) { (x) } ) | ||
170 | #endif /* CONFIG_X86_PAE */ | ||
171 | |||
172 | #define pgd_val_ma(x) ((x).pgd) | ||
173 | |||
174 | |||
175 | xmaddr_t arbitrary_virt_to_machine(unsigned long address); | ||
176 | void make_lowmem_page_readonly(void *vaddr); | ||
177 | void make_lowmem_page_readwrite(void *vaddr); | ||
178 | |||
179 | #endif /* __XEN_PAGE_H */ | ||
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h new file mode 100644 index 000000000000..6f7c290651ae --- /dev/null +++ b/include/xen/xenbus.h | |||
@@ -0,0 +1,234 @@ | |||
1 | /****************************************************************************** | ||
2 | * xenbus.h | ||
3 | * | ||
4 | * Talks to Xen Store to figure out what devices we have. | ||
5 | * | ||
6 | * Copyright (C) 2005 Rusty Russell, IBM Corporation | ||
7 | * Copyright (C) 2005 XenSource Ltd. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License version 2 | ||
11 | * as published by the Free Software Foundation; or, when distributed | ||
12 | * separately from the Linux kernel or incorporated into other | ||
13 | * software packages, subject to the following license: | ||
14 | * | ||
15 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
16 | * of this source file (the "Software"), to deal in the Software without | ||
17 | * restriction, including without limitation the rights to use, copy, modify, | ||
18 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
19 | * and to permit persons to whom the Software is furnished to do so, subject to | ||
20 | * the following conditions: | ||
21 | * | ||
22 | * The above copyright notice and this permission notice shall be included in | ||
23 | * all copies or substantial portions of the Software. | ||
24 | * | ||
25 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
26 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
27 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
28 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
29 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
30 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
31 | * IN THE SOFTWARE. | ||
32 | */ | ||
33 | |||
34 | #ifndef _XEN_XENBUS_H | ||
35 | #define _XEN_XENBUS_H | ||
36 | |||
37 | #include <linux/device.h> | ||
38 | #include <linux/notifier.h> | ||
39 | #include <linux/mutex.h> | ||
40 | #include <linux/completion.h> | ||
41 | #include <linux/init.h> | ||
42 | #include <xen/interface/xen.h> | ||
43 | #include <xen/interface/grant_table.h> | ||
44 | #include <xen/interface/io/xenbus.h> | ||
45 | #include <xen/interface/io/xs_wire.h> | ||
46 | |||
47 | /* Register callback to watch this node. */ | ||
48 | struct xenbus_watch | ||
49 | { | ||
50 | struct list_head list; | ||
51 | |||
52 | /* Path being watched. */ | ||
53 | const char *node; | ||
54 | |||
55 | /* Callback (executed in a process context with no locks held). */ | ||
56 | void (*callback)(struct xenbus_watch *, | ||
57 | const char **vec, unsigned int len); | ||
58 | }; | ||
59 | |||
60 | |||
61 | /* A xenbus device. */ | ||
62 | struct xenbus_device { | ||
63 | const char *devicetype; | ||
64 | const char *nodename; | ||
65 | const char *otherend; | ||
66 | int otherend_id; | ||
67 | struct xenbus_watch otherend_watch; | ||
68 | struct device dev; | ||
69 | enum xenbus_state state; | ||
70 | struct completion down; | ||
71 | }; | ||
72 | |||
73 | static inline struct xenbus_device *to_xenbus_device(struct device *dev) | ||
74 | { | ||
75 | return container_of(dev, struct xenbus_device, dev); | ||
76 | } | ||
77 | |||
78 | struct xenbus_device_id | ||
79 | { | ||
80 | /* .../device/<device_type>/<identifier> */ | ||
81 | char devicetype[32]; /* General class of device. */ | ||
82 | }; | ||
83 | |||
84 | /* A xenbus driver. */ | ||
85 | struct xenbus_driver { | ||
86 | char *name; | ||
87 | struct module *owner; | ||
88 | const struct xenbus_device_id *ids; | ||
89 | int (*probe)(struct xenbus_device *dev, | ||
90 | const struct xenbus_device_id *id); | ||
91 | void (*otherend_changed)(struct xenbus_device *dev, | ||
92 | enum xenbus_state backend_state); | ||
93 | int (*remove)(struct xenbus_device *dev); | ||
94 | int (*suspend)(struct xenbus_device *dev); | ||
95 | int (*suspend_cancel)(struct xenbus_device *dev); | ||
96 | int (*resume)(struct xenbus_device *dev); | ||
97 | int (*uevent)(struct xenbus_device *, char **, int, char *, int); | ||
98 | struct device_driver driver; | ||
99 | int (*read_otherend_details)(struct xenbus_device *dev); | ||
100 | }; | ||
101 | |||
102 | static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv) | ||
103 | { | ||
104 | return container_of(drv, struct xenbus_driver, driver); | ||
105 | } | ||
106 | |||
107 | int __must_check __xenbus_register_frontend(struct xenbus_driver *drv, | ||
108 | struct module *owner, | ||
109 | const char *mod_name); | ||
110 | |||
111 | static inline int __must_check | ||
112 | xenbus_register_frontend(struct xenbus_driver *drv) | ||
113 | { | ||
114 | WARN_ON(drv->owner != THIS_MODULE); | ||
115 | return __xenbus_register_frontend(drv, THIS_MODULE, KBUILD_MODNAME); | ||
116 | } | ||
117 | |||
118 | int __must_check __xenbus_register_backend(struct xenbus_driver *drv, | ||
119 | struct module *owner, | ||
120 | const char *mod_name); | ||
121 | static inline int __must_check | ||
122 | xenbus_register_backend(struct xenbus_driver *drv) | ||
123 | { | ||
124 | WARN_ON(drv->owner != THIS_MODULE); | ||
125 | return __xenbus_register_backend(drv, THIS_MODULE, KBUILD_MODNAME); | ||
126 | } | ||
127 | |||
128 | void xenbus_unregister_driver(struct xenbus_driver *drv); | ||
129 | |||
130 | struct xenbus_transaction | ||
131 | { | ||
132 | u32 id; | ||
133 | }; | ||
134 | |||
135 | /* Nil transaction ID. */ | ||
136 | #define XBT_NIL ((struct xenbus_transaction) { 0 }) | ||
137 | |||
138 | int __init xenbus_dev_init(void); | ||
139 | |||
140 | char **xenbus_directory(struct xenbus_transaction t, | ||
141 | const char *dir, const char *node, unsigned int *num); | ||
142 | void *xenbus_read(struct xenbus_transaction t, | ||
143 | const char *dir, const char *node, unsigned int *len); | ||
144 | int xenbus_write(struct xenbus_transaction t, | ||
145 | const char *dir, const char *node, const char *string); | ||
146 | int xenbus_mkdir(struct xenbus_transaction t, | ||
147 | const char *dir, const char *node); | ||
148 | int xenbus_exists(struct xenbus_transaction t, | ||
149 | const char *dir, const char *node); | ||
150 | int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node); | ||
151 | int xenbus_transaction_start(struct xenbus_transaction *t); | ||
152 | int xenbus_transaction_end(struct xenbus_transaction t, int abort); | ||
153 | |||
154 | /* Single read and scanf: returns -errno or num scanned if > 0. */ | ||
155 | int xenbus_scanf(struct xenbus_transaction t, | ||
156 | const char *dir, const char *node, const char *fmt, ...) | ||
157 | __attribute__((format(scanf, 4, 5))); | ||
158 | |||
159 | /* Single printf and write: returns -errno or 0. */ | ||
160 | int xenbus_printf(struct xenbus_transaction t, | ||
161 | const char *dir, const char *node, const char *fmt, ...) | ||
162 | __attribute__((format(printf, 4, 5))); | ||
163 | |||
164 | /* Generic read function: NULL-terminated triples of name, | ||
165 | * sprintf-style type string, and pointer. Returns 0 or -errno. */ | ||
166 | int xenbus_gather(struct xenbus_transaction t, const char *dir, ...); | ||
167 | |||
168 | /* notifier routines for when the xenstore comes up */ | ||
169 | extern int xenstored_ready; | ||
170 | int register_xenstore_notifier(struct notifier_block *nb); | ||
171 | void unregister_xenstore_notifier(struct notifier_block *nb); | ||
172 | |||
173 | int register_xenbus_watch(struct xenbus_watch *watch); | ||
174 | void unregister_xenbus_watch(struct xenbus_watch *watch); | ||
175 | void xs_suspend(void); | ||
176 | void xs_resume(void); | ||
177 | void xs_suspend_cancel(void); | ||
178 | |||
179 | /* Used by xenbus_dev to borrow kernel's store connection. */ | ||
180 | void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg); | ||
181 | |||
182 | struct work_struct; | ||
183 | |||
184 | /* Prepare for domain suspend: then resume or cancel the suspend. */ | ||
185 | void xenbus_suspend(void); | ||
186 | void xenbus_resume(void); | ||
187 | void xenbus_probe(struct work_struct *); | ||
188 | void xenbus_suspend_cancel(void); | ||
189 | |||
190 | #define XENBUS_IS_ERR_READ(str) ({ \ | ||
191 | if (!IS_ERR(str) && strlen(str) == 0) { \ | ||
192 | kfree(str); \ | ||
193 | str = ERR_PTR(-ERANGE); \ | ||
194 | } \ | ||
195 | IS_ERR(str); \ | ||
196 | }) | ||
197 | |||
198 | #define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE) | ||
199 | |||
200 | int xenbus_watch_path(struct xenbus_device *dev, const char *path, | ||
201 | struct xenbus_watch *watch, | ||
202 | void (*callback)(struct xenbus_watch *, | ||
203 | const char **, unsigned int)); | ||
204 | int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, | ||
205 | void (*callback)(struct xenbus_watch *, | ||
206 | const char **, unsigned int), | ||
207 | const char *pathfmt, ...) | ||
208 | __attribute__ ((format (printf, 4, 5))); | ||
209 | |||
210 | int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state); | ||
211 | int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn); | ||
212 | int xenbus_map_ring_valloc(struct xenbus_device *dev, | ||
213 | int gnt_ref, void **vaddr); | ||
214 | int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref, | ||
215 | grant_handle_t *handle, void *vaddr); | ||
216 | |||
217 | int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr); | ||
218 | int xenbus_unmap_ring(struct xenbus_device *dev, | ||
219 | grant_handle_t handle, void *vaddr); | ||
220 | |||
221 | int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port); | ||
222 | int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port); | ||
223 | int xenbus_free_evtchn(struct xenbus_device *dev, int port); | ||
224 | |||
225 | enum xenbus_state xenbus_read_driver_state(const char *path); | ||
226 | |||
227 | void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...); | ||
228 | void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...); | ||
229 | |||
230 | const char *xenbus_strstate(enum xenbus_state state); | ||
231 | int xenbus_dev_is_online(struct xenbus_device *dev); | ||
232 | int xenbus_frontend_closed(struct xenbus_device *dev); | ||
233 | |||
234 | #endif /* _XEN_XENBUS_H */ | ||
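As a sketch of how the declarations above fit together, here is the smallest plausible frontend built on this header; the "demo" device type and the trivial callbacks are illustrative, not part of the patch:

    #include <linux/module.h>
    #include <xen/xenbus.h>

    static int demo_probe(struct xenbus_device *dev,
                          const struct xenbus_device_id *id)
    {
            int err = xenbus_switch_state(dev, XenbusStateInitialised);
            if (err)
                    xenbus_dev_fatal(dev, err, "switching state");
            return err;
    }

    static void demo_otherend_changed(struct xenbus_device *dev,
                                      enum xenbus_state backend_state)
    {
            /* Follow the backend down when it starts closing. */
            if (backend_state == XenbusStateClosing)
                    xenbus_frontend_closed(dev);
    }

    static const struct xenbus_device_id demo_ids[] = {
            { "demo" },
            { "" }
    };

    static struct xenbus_driver demo_driver = {
            .name             = "demo",
            .owner            = THIS_MODULE,
            .ids              = demo_ids,
            .probe            = demo_probe,
            .otherend_changed = demo_otherend_changed,
    };

    static int __init demo_init(void)
    {
            return xenbus_register_frontend(&demo_driver);
    }
    module_init(demo_init);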
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b4796d850140..57e6448b171e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf) | |||
516 | envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | 516 | envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; |
517 | envp[i] = NULL; | 517 | envp[i] = NULL; |
518 | 518 | ||
519 | call_usermodehelper(argv[0], argv, envp, 0); | 519 | call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); |
520 | kfree(pathbuf); | 520 | kfree(pathbuf); |
521 | } | 521 | } |
522 | 522 | ||
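Here UMH_WAIT_EXEC preserves the old wait == 0 semantics: queue the helper, wait only until the fork/exec has been attempted, and return without waiting for the helper itself to exit.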
diff --git a/kernel/kmod.c b/kernel/kmod.c index 4d32eb077179..78d365c524ed 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -119,9 +119,10 @@ struct subprocess_info { | |||
119 | char **argv; | 119 | char **argv; |
120 | char **envp; | 120 | char **envp; |
121 | struct key *ring; | 121 | struct key *ring; |
122 | int wait; | 122 | enum umh_wait wait; |
123 | int retval; | 123 | int retval; |
124 | struct file *stdin; | 124 | struct file *stdin; |
125 | void (*cleanup)(char **argv, char **envp); | ||
125 | }; | 126 | }; |
126 | 127 | ||
127 | /* | 128 | /* |
@@ -180,6 +181,14 @@ static int ____call_usermodehelper(void *data) | |||
180 | do_exit(0); | 181 | do_exit(0); |
181 | } | 182 | } |
182 | 183 | ||
184 | void call_usermodehelper_freeinfo(struct subprocess_info *info) | ||
185 | { | ||
186 | if (info->cleanup) | ||
187 | (*info->cleanup)(info->argv, info->envp); | ||
188 | kfree(info); | ||
189 | } | ||
190 | EXPORT_SYMBOL(call_usermodehelper_freeinfo); | ||
191 | |||
183 | /* Keventd can't block, but this (a child) can. */ | 192 | /* Keventd can't block, but this (a child) can. */ |
184 | static int wait_for_helper(void *data) | 193 | static int wait_for_helper(void *data) |
185 | { | 194 | { |
@@ -216,8 +225,8 @@ static int wait_for_helper(void *data) | |||
216 | sub_info->retval = ret; | 225 | sub_info->retval = ret; |
217 | } | 226 | } |
218 | 227 | ||
219 | if (sub_info->wait < 0) | 228 | if (sub_info->wait == UMH_NO_WAIT) |
220 | kfree(sub_info); | 229 | call_usermodehelper_freeinfo(sub_info); |
221 | else | 230 | else |
222 | complete(sub_info->complete); | 231 | complete(sub_info->complete); |
223 | return 0; | 232 | return 0; |
@@ -229,34 +238,122 @@ static void __call_usermodehelper(struct work_struct *work) | |||
229 | struct subprocess_info *sub_info = | 238 | struct subprocess_info *sub_info = |
230 | container_of(work, struct subprocess_info, work); | 239 | container_of(work, struct subprocess_info, work); |
231 | pid_t pid; | 240 | pid_t pid; |
232 | int wait = sub_info->wait; | 241 | enum umh_wait wait = sub_info->wait; |
233 | 242 | ||
234 | /* CLONE_VFORK: wait until the usermode helper has execve'd | 243 | /* CLONE_VFORK: wait until the usermode helper has execve'd |
235 | * successfully. We need the data structures to stay around | 244 | * successfully. We need the data structures to stay around |
236 | * until that is done. */ | 245 | * until that is done. */ |
237 | if (wait) | 246 | if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) |
238 | pid = kernel_thread(wait_for_helper, sub_info, | 247 | pid = kernel_thread(wait_for_helper, sub_info, |
239 | CLONE_FS | CLONE_FILES | SIGCHLD); | 248 | CLONE_FS | CLONE_FILES | SIGCHLD); |
240 | else | 249 | else |
241 | pid = kernel_thread(____call_usermodehelper, sub_info, | 250 | pid = kernel_thread(____call_usermodehelper, sub_info, |
242 | CLONE_VFORK | SIGCHLD); | 251 | CLONE_VFORK | SIGCHLD); |
243 | 252 | ||
244 | if (wait < 0) | 253 | switch (wait) { |
245 | return; | 254 | case UMH_NO_WAIT: |
255 | break; | ||
246 | 256 | ||
247 | if (pid < 0) { | 257 | case UMH_WAIT_PROC: |
258 | if (pid > 0) | ||
259 | break; | ||
248 | sub_info->retval = pid; | 260 | sub_info->retval = pid; |
261 | /* FALLTHROUGH */ | ||
262 | |||
263 | case UMH_WAIT_EXEC: | ||
249 | complete(sub_info->complete); | 264 | complete(sub_info->complete); |
250 | } else if (!wait) | 265 | } |
251 | complete(sub_info->complete); | 266 | } |
267 | |||
268 | /** | ||
269 | * call_usermodehelper_setup - prepare to call a usermode helper | ||
270 | * @path: path to usermode executable | ||
271 | * @argv: arg vector for process | ||
272 | * @envp: environment for process | ||
273 | * | ||
274 | * Returns either NULL on allocation failure, or a subprocess_info | ||
275 | * structure. This should be passed to call_usermodehelper_exec to | ||
276 | * exec the process and free the structure. | ||
277 | */ | ||
278 | struct subprocess_info *call_usermodehelper_setup(char *path, | ||
279 | char **argv, char **envp) | ||
280 | { | ||
281 | struct subprocess_info *sub_info; | ||
282 | sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); | ||
283 | if (!sub_info) | ||
284 | goto out; | ||
285 | |||
286 | INIT_WORK(&sub_info->work, __call_usermodehelper); | ||
287 | sub_info->path = path; | ||
288 | sub_info->argv = argv; | ||
289 | sub_info->envp = envp; | ||
290 | |||
291 | out: | ||
292 | return sub_info; | ||
252 | } | 293 | } |
294 | EXPORT_SYMBOL(call_usermodehelper_setup); | ||
253 | 295 | ||
254 | /** | 296 | /** |
255 | * call_usermodehelper_keys - start a usermode application | 297 | * call_usermodehelper_setkeys - set the session keys for usermode helper |
256 | * @path: pathname for the application | 298 | * @info: a subprocess_info returned by call_usermodehelper_setup |
257 | * @argv: null-terminated argument list | 299 | * @session_keyring: the session keyring for the process |
258 | * @envp: null-terminated environment list | 300 | */ |
259 | * @session_keyring: session keyring for process (NULL for an empty keyring) | 301 | void call_usermodehelper_setkeys(struct subprocess_info *info, |
302 | struct key *session_keyring) | ||
303 | { | ||
304 | info->ring = session_keyring; | ||
305 | } | ||
306 | EXPORT_SYMBOL(call_usermodehelper_setkeys); | ||
307 | |||
308 | /** | ||
309 | * call_usermodehelper_setcleanup - set a cleanup function | ||
310 | * @info: a subprocess_info returned by call_usermodehelper_setup | ||
311 | * @cleanup: a cleanup function | ||
312 | * | ||
313 | * The cleanup function is called just before the subprocess_info is | ||
314 | * freed.  This can be used for freeing the argv and envp.  The | ||
315 | * function must be runnable in either a process context or the | ||
316 | * context in which call_usermodehelper_exec is called. | ||
317 | */ | ||
318 | void call_usermodehelper_setcleanup(struct subprocess_info *info, | ||
319 | void (*cleanup)(char **argv, char **envp)) | ||
320 | { | ||
321 | info->cleanup = cleanup; | ||
322 | } | ||
323 | EXPORT_SYMBOL(call_usermodehelper_setcleanup); | ||
324 | |||
325 | /** | ||
326 | * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin | ||
327 | * @sub_info: a subprocess_info returned by call_usermodehelper_setup | ||
328 | * @filp: set to the write-end of a pipe | ||
329 | * | ||
330 | * This constructs a pipe, and sets the read end to be the stdin of the | ||
331 | * subprocess, and returns the write-end in *@filp. | ||
332 | */ | ||
333 | int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, | ||
334 | struct file **filp) | ||
335 | { | ||
336 | struct file *f; | ||
337 | |||
338 | f = create_write_pipe(); | ||
339 | if (IS_ERR(f)) | ||
340 | return PTR_ERR(f); | ||
341 | *filp = f; | ||
342 | |||
343 | f = create_read_pipe(f); | ||
344 | if (IS_ERR(f)) { | ||
345 | free_write_pipe(*filp); | ||
346 | return PTR_ERR(f); | ||
347 | } | ||
348 | sub_info->stdin = f; | ||
349 | |||
350 | return 0; | ||
351 | } | ||
352 | EXPORT_SYMBOL(call_usermodehelper_stdinpipe); | ||
353 | |||
354 | /** | ||
355 | * call_usermodehelper_exec - start a usermode application | ||
356 | * @sub_info: information about the subprocess | ||
260 | * @wait: wait for the application to finish and return status. | 357 | * @wait: wait for the application to finish and return status. |
261 | * when -1 don't wait at all, but you get no useful error back when | 358 | * when -1 don't wait at all, but you get no useful error back when |
262 | * the program couldn't be exec'ed. This makes it safe to call | 359 | * the program couldn't be exec'ed. This makes it safe to call |
@@ -265,81 +362,68 @@ static void __call_usermodehelper(struct work_struct *work) | |||
265 | * Runs a user-space application. The application is started | 362 | * Runs a user-space application. The application is started |
266 | * asynchronously if wait is not set, and runs as a child of keventd. | 363 | * asynchronously if wait is not set, and runs as a child of keventd. |
267 | * (ie. it runs with full root capabilities). | 364 | * (ie. it runs with full root capabilities). |
268 | * | ||
269 | * Must be called from process context. Returns a negative error code | ||
270 | * if program was not execed successfully, or 0. | ||
271 | */ | 365 | */ |
272 | int call_usermodehelper_keys(char *path, char **argv, char **envp, | 366 | int call_usermodehelper_exec(struct subprocess_info *sub_info, |
273 | struct key *session_keyring, int wait) | 367 | enum umh_wait wait) |
274 | { | 368 | { |
275 | DECLARE_COMPLETION_ONSTACK(done); | 369 | DECLARE_COMPLETION_ONSTACK(done); |
276 | struct subprocess_info *sub_info; | ||
277 | int retval; | 370 | int retval; |
278 | 371 | ||
279 | if (!khelper_wq) | 372 | if (sub_info->path[0] == '\0') { |
280 | return -EBUSY; | 373 | retval = 0; |
281 | 374 | goto out; | |
282 | if (path[0] == '\0') | 375 | } |
283 | return 0; | ||
284 | 376 | ||
285 | sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); | 377 | if (!khelper_wq) { |
286 | if (!sub_info) | 378 | retval = -EBUSY; |
287 | return -ENOMEM; | 379 | goto out; |
380 | } | ||
288 | 381 | ||
289 | INIT_WORK(&sub_info->work, __call_usermodehelper); | ||
290 | sub_info->complete = &done; | 382 | sub_info->complete = &done; |
291 | sub_info->path = path; | ||
292 | sub_info->argv = argv; | ||
293 | sub_info->envp = envp; | ||
294 | sub_info->ring = session_keyring; | ||
295 | sub_info->wait = wait; | 383 | sub_info->wait = wait; |
296 | 384 | ||
297 | queue_work(khelper_wq, &sub_info->work); | 385 | queue_work(khelper_wq, &sub_info->work); |
298 | if (wait < 0) /* task has freed sub_info */ | 386 | if (wait == UMH_NO_WAIT) /* task has freed sub_info */ |
299 | return 0; | 387 | return 0; |
300 | wait_for_completion(&done); | 388 | wait_for_completion(&done); |
301 | retval = sub_info->retval; | 389 | retval = sub_info->retval; |
302 | kfree(sub_info); | 390 | |
391 | out: | ||
392 | call_usermodehelper_freeinfo(sub_info); | ||
303 | return retval; | 393 | return retval; |
304 | } | 394 | } |
305 | EXPORT_SYMBOL(call_usermodehelper_keys); | 395 | EXPORT_SYMBOL(call_usermodehelper_exec); |
306 | 396 | ||
397 | /** | ||
398 | * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin | ||
399 | * @path: path to usermode executable | ||
400 | * @argv: arg vector for process | ||
401 | * @envp: environment for process | ||
402 | * @filp: set to the write-end of a pipe | ||
403 | * | ||
404 | * This is a simple wrapper which executes a usermode-helper function | ||
405 | * with a pipe as stdin. It is implemented entirely in terms of | ||
406 | * lower-level call_usermodehelper_* functions. | ||
407 | */ | ||
307 | int call_usermodehelper_pipe(char *path, char **argv, char **envp, | 408 | int call_usermodehelper_pipe(char *path, char **argv, char **envp, |
308 | struct file **filp) | 409 | struct file **filp) |
309 | { | 410 | { |
310 | DECLARE_COMPLETION(done); | 411 | struct subprocess_info *sub_info; |
311 | struct subprocess_info sub_info = { | 412 | int ret; |
312 | .work = __WORK_INITIALIZER(sub_info.work, | ||
313 | __call_usermodehelper), | ||
314 | .complete = &done, | ||
315 | .path = path, | ||
316 | .argv = argv, | ||
317 | .envp = envp, | ||
318 | .retval = 0, | ||
319 | }; | ||
320 | struct file *f; | ||
321 | 413 | ||
322 | if (!khelper_wq) | 414 | sub_info = call_usermodehelper_setup(path, argv, envp); |
323 | return -EBUSY; | 415 | if (sub_info == NULL) |
416 | return -ENOMEM; | ||
324 | 417 | ||
325 | if (path[0] == '\0') | 418 | ret = call_usermodehelper_stdinpipe(sub_info, filp); |
326 | return 0; | 419 | if (ret < 0) |
420 | goto out; | ||
327 | 421 | ||
328 | f = create_write_pipe(); | 422 | return call_usermodehelper_exec(sub_info, UMH_WAIT_PROC); |
329 | if (IS_ERR(f)) | ||
330 | return PTR_ERR(f); | ||
331 | *filp = f; | ||
332 | |||
333 | f = create_read_pipe(f); | ||
334 | if (IS_ERR(f)) { | ||
335 | free_write_pipe(*filp); | ||
336 | return PTR_ERR(f); | ||
337 | } | ||
338 | sub_info.stdin = f; | ||
339 | 423 | ||
340 | queue_work(khelper_wq, &sub_info.work); | 424 | out: |
341 | wait_for_completion(&done); | 425 | call_usermodehelper_freeinfo(sub_info); |
342 | return sub_info.retval; | 426 | return ret; |
343 | } | 427 | } |
344 | EXPORT_SYMBOL(call_usermodehelper_pipe); | 428 | EXPORT_SYMBOL(call_usermodehelper_pipe); |
345 | 429 | ||
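The kmod.c refactoring above turns the old monolithic entry points into a setup/decorate/exec pipeline. A hedged sketch of the resulting calling convention (run_helper() is illustrative; argv/envp are caller-owned here, so no cleanup callback is installed):

    #include <linux/kmod.h>

    static int run_helper(char *path, char **argv, char **envp)
    {
            struct subprocess_info *info;

            info = call_usermodehelper_setup(path, argv, envp);
            if (info == NULL)
                    return -ENOMEM;

            /* exec frees info on success and failure alike; with
             * UMH_NO_WAIT the helper thread would free it instead. */
            return call_usermodehelper_exec(info, UMH_WAIT_EXEC);
    }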
diff --git a/kernel/sys.c b/kernel/sys.c index 4d141ae3e802..18987c7f6add 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -2286,3 +2286,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, | |||
2286 | } | 2286 | } |
2287 | return err ? -EFAULT : 0; | 2287 | return err ? -EFAULT : 0; |
2288 | } | 2288 | } |
2289 | |||
2290 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; | ||
2291 | |||
2292 | static void argv_cleanup(char **argv, char **envp) | ||
2293 | { | ||
2294 | argv_free(argv); | ||
2295 | } | ||
2296 | |||
2297 | /** | ||
2298 | * orderly_poweroff - Trigger an orderly system poweroff | ||
2299 | * @force: force poweroff if command execution fails | ||
2300 | * | ||
2301 | * This may be called from any context to trigger a system shutdown. | ||
2302 | * If the orderly shutdown fails, it will force an immediate shutdown. | ||
2303 | */ | ||
2304 | int orderly_poweroff(bool force) | ||
2305 | { | ||
2306 | int argc; | ||
2307 | char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); | ||
2308 | static char *envp[] = { | ||
2309 | "HOME=/", | ||
2310 | "PATH=/sbin:/bin:/usr/sbin:/usr/bin", | ||
2311 | NULL | ||
2312 | }; | ||
2313 | int ret = -ENOMEM; | ||
2314 | struct subprocess_info *info; | ||
2315 | |||
2316 | if (argv == NULL) { | ||
2317 | printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", | ||
2318 | __func__, poweroff_cmd); | ||
2319 | goto out; | ||
2320 | } | ||
2321 | |||
2322 | info = call_usermodehelper_setup(argv[0], argv, envp); | ||
2323 | if (info == NULL) { | ||
2324 | argv_free(argv); | ||
2325 | goto out; | ||
2326 | } | ||
2327 | |||
2328 | call_usermodehelper_setcleanup(info, argv_cleanup); | ||
2329 | |||
2330 | ret = call_usermodehelper_exec(info, UMH_NO_WAIT); | ||
2331 | |||
2332 | out: | ||
2333 | if (ret && force) { | ||
2334 | printk(KERN_WARNING "Failed to start orderly shutdown: " | ||
2335 | "forcing the issue\n"); | ||
2336 | |||
2337 | /* I guess this should try to kick off some daemon to | ||
2338 | sync and poweroff asap. Or not even bother syncing | ||
2339 | if we're doing an emergency shutdown? */ | ||
2340 | emergency_sync(); | ||
2341 | kernel_power_off(); | ||
2342 | } | ||
2343 | |||
2344 | return ret; | ||
2345 | } | ||
2346 | EXPORT_SYMBOL_GPL(orderly_poweroff); | ||
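Because the argv is split with GFP_ATOMIC and the helper is launched with UMH_NO_WAIT, orderly_poweroff() never sleeps waiting on the helper, which is what makes the "from any context" claim above workable. An illustrative caller (the thermal scenario is hypothetical):

    #include <linux/reboot.h>

    static void thermal_critical(void)
    {
            /* Run the configured poweroff command; fall back to an
             * immediate kernel_power_off() if it cannot be started. */
            orderly_poweroff(true);
    }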
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7063ebc6db05..44a1d699aad7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -46,6 +46,7 @@ | |||
46 | #include <linux/syscalls.h> | 46 | #include <linux/syscalls.h> |
47 | #include <linux/nfs_fs.h> | 47 | #include <linux/nfs_fs.h> |
48 | #include <linux/acpi.h> | 48 | #include <linux/acpi.h> |
49 | #include <linux/reboot.h> | ||
49 | 50 | ||
50 | #include <asm/uaccess.h> | 51 | #include <asm/uaccess.h> |
51 | #include <asm/processor.h> | 52 | #include <asm/processor.h> |
@@ -705,6 +706,15 @@ static ctl_table kern_table[] = { | |||
705 | .proc_handler = &proc_dointvec, | 706 | .proc_handler = &proc_dointvec, |
706 | }, | 707 | }, |
707 | #endif | 708 | #endif |
709 | { | ||
710 | .ctl_name = CTL_UNNUMBERED, | ||
711 | .procname = "poweroff_cmd", | ||
712 | .data = &poweroff_cmd, | ||
713 | .maxlen = POWEROFF_CMD_PATH_LEN, | ||
714 | .mode = 0644, | ||
715 | .proc_handler = &proc_dostring, | ||
716 | .strategy = &sysctl_string, | ||
717 | }, | ||
708 | 718 | ||
709 | { .ctl_name = 0 } | 719 | { .ctl_name = 0 } |
710 | }; | 720 | }; |
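With this table entry in place, the command used by orderly_poweroff() becomes tunable at run time through /proc/sys/kernel/poweroff_cmd, bounded by POWEROFF_CMD_PATH_LEN; an administrator could, for example, write an alternative helper path (say, /sbin/mypoweroff, an illustrative name) into that file.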
diff --git a/lib/Makefile b/lib/Makefile index da68b2ca0606..614966387402 100644 --- a/lib/Makefile +++ b/lib/Makefile | |||
@@ -5,7 +5,7 @@ | |||
5 | lib-y := ctype.o string.o vsprintf.o cmdline.o \ | 5 | lib-y := ctype.o string.o vsprintf.o cmdline.o \ |
6 | rbtree.o radix-tree.o dump_stack.o \ | 6 | rbtree.o radix-tree.o dump_stack.o \ |
7 | idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \ | 7 | idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \ |
8 | sha1.o irq_regs.o reciprocal_div.o | 8 | sha1.o irq_regs.o reciprocal_div.o argv_split.o |
9 | 9 | ||
10 | lib-$(CONFIG_MMU) += ioremap.o | 10 | lib-$(CONFIG_MMU) += ioremap.o |
11 | lib-$(CONFIG_SMP) += cpumask.o | 11 | lib-$(CONFIG_SMP) += cpumask.o |
diff --git a/lib/argv_split.c b/lib/argv_split.c new file mode 100644 index 000000000000..4096ed42f490 --- /dev/null +++ b/lib/argv_split.c | |||
@@ -0,0 +1,105 @@ | |||
1 | /* | ||
2 | * Helper function for splitting a string into an argv-like array. | ||
3 | */ | ||
4 | |||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/ctype.h> | ||
7 | #include <linux/bug.h> | ||
8 | |||
9 | static const char *skip_sep(const char *cp) | ||
10 | { | ||
11 | while (*cp && isspace(*cp)) | ||
12 | cp++; | ||
13 | |||
14 | return cp; | ||
15 | } | ||
16 | |||
17 | static const char *skip_arg(const char *cp) | ||
18 | { | ||
19 | while (*cp && !isspace(*cp)) | ||
20 | cp++; | ||
21 | |||
22 | return cp; | ||
23 | } | ||
24 | |||
25 | static int count_argc(const char *str) | ||
26 | { | ||
27 | int count = 0; | ||
28 | |||
29 | while (*str) { | ||
30 | str = skip_sep(str); | ||
31 | if (*str) { | ||
32 | count++; | ||
33 | str = skip_arg(str); | ||
34 | } | ||
35 | } | ||
36 | |||
37 | return count; | ||
38 | } | ||
39 | |||
40 | /** | ||
41 | * argv_free - free an argv | ||
42 | * @argv: the argument vector to be freed | ||
43 | * | ||
44 | * Frees an argv and the strings it points to. | ||
45 | */ | ||
46 | void argv_free(char **argv) | ||
47 | { | ||
48 | char **p; | ||
49 | for (p = argv; *p; p++) | ||
50 | kfree(*p); | ||
51 | |||
52 | kfree(argv); | ||
53 | } | ||
54 | EXPORT_SYMBOL(argv_free); | ||
55 | |||
56 | /** | ||
57 | * argv_split - split a string at whitespace, returning an argv | ||
58 | * @gfp: the GFP mask used to allocate memory | ||
59 | * @str: the string to be split | ||
60 | * @argcp: returned argument count | ||
61 | * | ||
62 | * Returns an array of pointers to strings which are split out from | ||
63 | * @str. This is performed by strictly splitting on white-space; no | ||
64 | * quote processing is performed. Multiple whitespace characters are | ||
65 | * considered to be a single argument separator. The returned array | ||
66 | * is always NULL-terminated. Returns NULL on memory allocation | ||
67 | * failure. | ||
68 | */ | ||
69 | char **argv_split(gfp_t gfp, const char *str, int *argcp) | ||
70 | { | ||
71 | int argc = count_argc(str); | ||
72 | char **argv = kzalloc(sizeof(*argv) * (argc+1), gfp); | ||
73 | char **argvp; | ||
74 | |||
75 | if (argv == NULL) | ||
76 | goto out; | ||
77 | |||
78 | *argcp = argc; | ||
79 | argvp = argv; | ||
80 | |||
81 | while (*str) { | ||
82 | str = skip_sep(str); | ||
83 | |||
84 | if (*str) { | ||
85 | const char *p = str; | ||
86 | char *t; | ||
87 | |||
88 | str = skip_arg(str); | ||
89 | |||
90 | t = kstrndup(p, str-p, gfp); | ||
91 | if (t == NULL) | ||
92 | goto fail; | ||
93 | *argvp++ = t; | ||
94 | } | ||
95 | } | ||
96 | *argvp = NULL; | ||
97 | |||
98 | out: | ||
99 | return argv; | ||
100 | |||
101 | fail: | ||
102 | argv_free(argv); | ||
103 | return NULL; | ||
104 | } | ||
105 | EXPORT_SYMBOL(argv_split); | ||
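A hedged usage sketch for the two helpers above; the command string and the printk() reporting are illustrative, and the declarations are assumed to live in <linux/string.h>:

    #include <linux/kernel.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    static int demo_split(void)
    {
            int argc, i;
            char **argv = argv_split(GFP_KERNEL, "/sbin/poweroff -f", &argc);

            if (argv == NULL)
                    return -ENOMEM;

            for (i = 0; i < argc; i++)
                    printk(KERN_INFO "argv[%d] = '%s'\n", i, argv[i]);

            argv_free(argv);        /* frees the strings and the array */
            return 0;
    }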
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 12e311dc664c..bd5ecbbafab1 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c | |||
@@ -208,7 +208,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, | |||
208 | argv [0] = uevent_helper; | 208 | argv [0] = uevent_helper; |
209 | argv [1] = (char *)subsystem; | 209 | argv [1] = (char *)subsystem; |
210 | argv [2] = NULL; | 210 | argv [2] = NULL; |
211 | call_usermodehelper (argv[0], argv, envp, 0); | 211 | call_usermodehelper (argv[0], argv, envp, UMH_WAIT_EXEC); |
212 | } | 212 | } |
213 | 213 | ||
214 | exit: | 214 | exit: |
@@ -6,7 +6,6 @@ | |||
6 | 6 | ||
7 | /** | 7 | /** |
8 | * kstrdup - allocate space for and copy an existing string | 8 | * kstrdup - allocate space for and copy an existing string |
9 | * | ||
10 | * @s: the string to duplicate | 9 | * @s: the string to duplicate |
11 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory | 10 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory |
12 | */ | 11 | */ |
@@ -27,6 +26,30 @@ char *kstrdup(const char *s, gfp_t gfp) | |||
27 | EXPORT_SYMBOL(kstrdup); | 26 | EXPORT_SYMBOL(kstrdup); |
28 | 27 | ||
29 | /** | 28 | /** |
29 | * kstrndup - allocate space for and copy an existing string | ||
30 | * @s: the string to duplicate | ||
31 | * @max: read at most @max chars from @s | ||
32 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory | ||
33 | */ | ||
34 | char *kstrndup(const char *s, size_t max, gfp_t gfp) | ||
35 | { | ||
36 | size_t len; | ||
37 | char *buf; | ||
38 | |||
39 | if (!s) | ||
40 | return NULL; | ||
41 | |||
42 | len = strnlen(s, max); | ||
43 | buf = kmalloc_track_caller(len+1, gfp); | ||
44 | if (buf) { | ||
45 | memcpy(buf, s, len); | ||
46 | buf[len] = '\0'; | ||
47 | } | ||
48 | return buf; | ||
49 | } | ||
50 | EXPORT_SYMBOL(kstrndup); | ||
51 | |||
52 | /** | ||
30 | * kmemdup - duplicate region of memory | 53 | * kmemdup - duplicate region of memory |
31 | * | 54 | * |
32 | * @src: memory region to duplicate | 55 | * @src: memory region to duplicate |
@@ -80,7 +103,6 @@ EXPORT_SYMBOL(krealloc); | |||
80 | 103 | ||
81 | /* | 104 | /* |
82 | * strndup_user - duplicate an existing string from user space | 105 | * strndup_user - duplicate an existing string from user space |
83 | * | ||
84 | * @s: The string to duplicate | 106 | * @s: The string to duplicate |
85 | * @n: Maximum number of bytes to copy, including the trailing NUL. | 107 | * @n: Maximum number of bytes to copy, including the trailing NUL. |
86 | */ | 108 | */ |
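kstrndup() mirrors kstrdup() but bounds the copy with strnlen(), so a long source is truncated and a short one is never over-read; the result is always NUL-terminated. A hedged sketch of the truncation semantics:

    #include <linux/slab.h>
    #include <linux/string.h>

    static int demo_kstrndup(void)
    {
            char *s = kstrndup("hello, world", 5, GFP_KERNEL);

            if (s == NULL)
                    return -ENOMEM;

            /* The copy was bounded to five bytes: s is "hello",
             * with s[5] == '\0'. */
            kfree(s);
            return 0;
    }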
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8e05a11155c9..3130c343088f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -767,3 +767,56 @@ EXPORT_SYMBOL(remap_vmalloc_range); | |||
767 | void __attribute__((weak)) vmalloc_sync_all(void) | 767 | void __attribute__((weak)) vmalloc_sync_all(void) |
768 | { | 768 | { |
769 | } | 769 | } |
770 | |||
771 | |||
772 | static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) | ||
773 | { | ||
774 | /* apply_to_page_range() does all the hard work. */ | ||
775 | return 0; | ||
776 | } | ||
777 | |||
778 | /** | ||
779 | * alloc_vm_area - allocate a range of kernel address space | ||
780 | * @size: size of the area | ||
781 | * @returns: NULL on failure, vm_struct on success | ||
782 | * | ||
783 | * This function reserves a range of kernel address space, and | ||
784 | * allocates pagetables to map that range. No actual mappings | ||
785 | * are created. If the kernel address space is not shared | ||
786 | * between processes, it syncs the pagetable across all | ||
787 | * processes. | ||
788 | */ | ||
789 | struct vm_struct *alloc_vm_area(size_t size) | ||
790 | { | ||
791 | struct vm_struct *area; | ||
792 | |||
793 | area = get_vm_area(size, VM_IOREMAP); | ||
794 | if (area == NULL) | ||
795 | return NULL; | ||
796 | |||
797 | /* | ||
798 | * This ensures that page tables are constructed for this region | ||
799 | * of kernel virtual address space and mapped into init_mm. | ||
800 | */ | ||
801 | if (apply_to_page_range(&init_mm, (unsigned long)area->addr, | ||
802 | area->size, f, NULL)) { | ||
803 | free_vm_area(area); | ||
804 | return NULL; | ||
805 | } | ||
806 | |||
807 | /* Make sure the pagetables are constructed in process kernel | ||
808 | mappings */ | ||
809 | vmalloc_sync_all(); | ||
810 | |||
811 | return area; | ||
812 | } | ||
813 | EXPORT_SYMBOL_GPL(alloc_vm_area); | ||
814 | |||
815 | void free_vm_area(struct vm_struct *area) | ||
816 | { | ||
817 | struct vm_struct *ret; | ||
818 | ret = remove_vm_area(area->addr); | ||
819 | BUG_ON(ret != area); | ||
820 | kfree(area); | ||
821 | } | ||
822 | EXPORT_SYMBOL_GPL(free_vm_area); | ||
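alloc_vm_area()/free_vm_area() hand out kernel address space whose page tables exist but whose ptes are empty, leaving population to the caller (in this series, the Xen code). A hedged sketch of the intended pattern; map_foreign_page() is a hypothetical stand-in for whatever installs the ptes, such as a hypercall:

    #include <linux/vmalloc.h>

    static int map_foreign_page(void *vaddr);       /* hypothetical */

    static void *reserve_and_map(void)
    {
            struct vm_struct *area = alloc_vm_area(PAGE_SIZE);

            if (area == NULL)
                    return NULL;

            /* Page tables for area->addr now exist in init_mm;
             * fill in the actual mappings. */
            if (map_foreign_page(area->addr) < 0) {
                    free_vm_area(area);
                    return NULL;
            }
            return area->addr;
    }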
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index a786e7863200..1ea2f86f7683 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c | |||
@@ -125,7 +125,7 @@ static void br_stp_start(struct net_bridge *br) | |||
125 | char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL }; | 125 | char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL }; |
126 | char *envp[] = { NULL }; | 126 | char *envp[] = { NULL }; |
127 | 127 | ||
128 | r = call_usermodehelper(BR_STP_PROG, argv, envp, 1); | 128 | r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); |
129 | if (r == 0) { | 129 | if (r == 0) { |
130 | br->stp_enabled = BR_USER_STP; | 130 | br->stp_enabled = BR_USER_STP; |
131 | printk(KERN_INFO "%s: userspace STP started\n", br->dev->name); | 131 | printk(KERN_INFO "%s: userspace STP started\n", br->dev->name); |
diff --git a/net/irda/irias_object.c b/net/irda/irias_object.c index 4adaae242b9e..cf302457097b 100644 --- a/net/irda/irias_object.c +++ b/net/irda/irias_object.c | |||
@@ -36,39 +36,6 @@ hashbin_t *irias_objects; | |||
36 | */ | 36 | */ |
37 | struct ias_value irias_missing = { IAS_MISSING, 0, 0, 0, {0}}; | 37 | struct ias_value irias_missing = { IAS_MISSING, 0, 0, 0, {0}}; |
38 | 38 | ||
39 | /* | ||
40 | * Function strndup (str, max) | ||
41 | * | ||
42 | * My own kernel version of strndup! | ||
43 | * | ||
44 | * Faster, check boundary... Jean II | ||
45 | */ | ||
46 | static char *strndup(char *str, size_t max) | ||
47 | { | ||
48 | char *new_str; | ||
49 | int len; | ||
50 | |||
51 | /* Check string */ | ||
52 | if (str == NULL) | ||
53 | return NULL; | ||
54 | /* Check length, truncate */ | ||
55 | len = strlen(str); | ||
56 | if(len > max) | ||
57 | len = max; | ||
58 | |||
59 | /* Allocate new string */ | ||
60 | new_str = kmalloc(len + 1, GFP_ATOMIC); | ||
61 | if (new_str == NULL) { | ||
62 | IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__); | ||
63 | return NULL; | ||
64 | } | ||
65 | |||
66 | /* Copy and truncate */ | ||
67 | memcpy(new_str, str, len); | ||
68 | new_str[len] = '\0'; | ||
69 | |||
70 | return new_str; | ||
71 | } | ||
72 | 39 | ||
73 | /* | 40 | /* |
74 | * Function ias_new_object (name, id) | 41 | * Function ias_new_object (name, id) |
@@ -90,7 +57,7 @@ struct ias_object *irias_new_object( char *name, int id) | |||
90 | } | 57 | } |
91 | 58 | ||
92 | obj->magic = IAS_OBJECT_MAGIC; | 59 | obj->magic = IAS_OBJECT_MAGIC; |
93 | obj->name = strndup(name, IAS_MAX_CLASSNAME); | 60 | obj->name = kstrndup(name, IAS_MAX_CLASSNAME, GFP_ATOMIC); |
94 | if (!obj->name) { | 61 | if (!obj->name) { |
95 | IRDA_WARNING("%s(), Unable to allocate name!\n", | 62 | IRDA_WARNING("%s(), Unable to allocate name!\n", |
96 | __FUNCTION__); | 63 | __FUNCTION__); |
@@ -360,7 +327,7 @@ void irias_add_integer_attrib(struct ias_object *obj, char *name, int value, | |||
360 | } | 327 | } |
361 | 328 | ||
362 | attrib->magic = IAS_ATTRIB_MAGIC; | 329 | attrib->magic = IAS_ATTRIB_MAGIC; |
363 | attrib->name = strndup(name, IAS_MAX_ATTRIBNAME); | 330 | attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC); |
364 | 331 | ||
365 | /* Insert value */ | 332 | /* Insert value */ |
366 | attrib->value = irias_new_integer_value(value); | 333 | attrib->value = irias_new_integer_value(value); |
@@ -404,7 +371,7 @@ void irias_add_octseq_attrib(struct ias_object *obj, char *name, __u8 *octets, | |||
404 | } | 371 | } |
405 | 372 | ||
406 | attrib->magic = IAS_ATTRIB_MAGIC; | 373 | attrib->magic = IAS_ATTRIB_MAGIC; |
407 | attrib->name = strndup(name, IAS_MAX_ATTRIBNAME); | 374 | attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC); |
408 | 375 | ||
409 | attrib->value = irias_new_octseq_value( octets, len); | 376 | attrib->value = irias_new_octseq_value( octets, len); |
410 | if (!attrib->name || !attrib->value) { | 377 | if (!attrib->name || !attrib->value) { |
@@ -446,7 +413,7 @@ void irias_add_string_attrib(struct ias_object *obj, char *name, char *value, | |||
446 | } | 413 | } |
447 | 414 | ||
448 | attrib->magic = IAS_ATTRIB_MAGIC; | 415 | attrib->magic = IAS_ATTRIB_MAGIC; |
449 | attrib->name = strndup(name, IAS_MAX_ATTRIBNAME); | 416 | attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC); |
450 | 417 | ||
451 | attrib->value = irias_new_string_value(value); | 418 | attrib->value = irias_new_string_value(value); |
452 | if (!attrib->name || !attrib->value) { | 419 | if (!attrib->name || !attrib->value) { |
@@ -506,7 +473,7 @@ struct ias_value *irias_new_string_value(char *string) | |||
506 | 473 | ||
507 | value->type = IAS_STRING; | 474 | value->type = IAS_STRING; |
508 | value->charset = CS_ASCII; | 475 | value->charset = CS_ASCII; |
509 | value->t.string = strndup(string, IAS_MAX_STRING); | 476 | value->t.string = kstrndup(string, IAS_MAX_STRING, GFP_ATOMIC); |
510 | if (!value->t.string) { | 477 | if (!value->t.string) { |
511 | IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__); | 478 | IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__); |
512 | kfree(value); | 479 | kfree(value); |
diff --git a/security/keys/request_key.c b/security/keys/request_key.c index f573ac189a0a..557500110a13 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c | |||
@@ -108,7 +108,8 @@ static int call_sbin_request_key(struct key *key, | |||
108 | argv[i] = NULL; | 108 | argv[i] = NULL; |
109 | 109 | ||
110 | /* do it */ | 110 | /* do it */ |
111 | ret = call_usermodehelper_keys(argv[0], argv, envp, keyring, 1); | 111 | ret = call_usermodehelper_keys(argv[0], argv, envp, keyring, |
112 | UMH_WAIT_PROC); | ||
112 | 113 | ||
113 | error_link: | 114 | error_link: |
114 | key_put(keyring); | 115 | key_put(keyring); |
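UMH_WAIT_PROC here keeps the old wait == 1 behaviour for /sbin/request-key: wait_for_helper() reaps the helper, and the caller sleeps until it exits and then sees its exit status in retval.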