Diffstat (limited to 'arch/x86/xen')
-rw-r--r--  arch/x86/xen/Kconfig       |  28
-rw-r--r--  arch/x86/xen/Makefile      |  14
-rw-r--r--  arch/x86/xen/debugfs.c     | 123
-rw-r--r--  arch/x86/xen/debugfs.h     |  10
-rw-r--r--  arch/x86/xen/enlighten.c   | 992
-rw-r--r--  arch/x86/xen/irq.c         | 143
-rw-r--r--  arch/x86/xen/manage.c      | 143
-rw-r--r--  arch/x86/xen/mmu.c         | 840
-rw-r--r--  arch/x86/xen/mmu.h         |  42
-rw-r--r--  arch/x86/xen/multicalls.c  | 156
-rw-r--r--  arch/x86/xen/multicalls.h  |  12
-rw-r--r--  arch/x86/xen/setup.c       | 109
-rw-r--r--  arch/x86/xen/smp.c         | 342
-rw-r--r--  arch/x86/xen/spinlock.c    | 428
-rw-r--r--  arch/x86/xen/suspend.c     |  48
-rw-r--r--  arch/x86/xen/time.c        |  29
-rw-r--r--  arch/x86/xen/xen-asm_32.S (renamed from arch/x86/xen/xen-asm.S) |   2
-rw-r--r--  arch/x86/xen/xen-asm_64.S  | 285
-rw-r--r--  arch/x86/xen/xen-head.S    |  31
-rw-r--r--  arch/x86/xen/xen-ops.h     |  43
20 files changed, 3066 insertions, 754 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 6c388e593bc8..87b9ab166423 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,9 +6,33 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_32 9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
14 Xen hypervisor. 14 Xen hypervisor.
15
16config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes"
18 default 8 if X86_32
19 default 32 if X86_64
20 depends on XEN
21 help
22 The pseudo-physical to machine address array is sized
23 according to the maximum possible memory size of a Xen
24 domain. This array uses 1 page per gigabyte, so there's no
25 need to be too stingy here.
26
27config XEN_SAVE_RESTORE
28 bool
29 depends on XEN && PM
30 default y
31
32config XEN_DEBUG_FS
33 bool "Enable Xen debug and tuning parameters in debugfs"
34 depends on XEN && DEBUG_FS
35 default n
36 help
37 Enable statistics output and various tuning options in debugfs.
38 Enabling this option may incur a significant performance overhead.
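[Editor's note] The XEN_MAX_DOMAIN_MEMORY help text above says the pseudo-physical-to-machine (p2m) array costs one page per gigabyte of possible domain memory. As a rough sketch of how such a config value typically becomes a compile-time bound -- the macro names below are illustrative, not taken from this commit:

/*
 * Illustrative only: turning CONFIG_XEN_MAX_DOMAIN_MEMORY (in GB) into a
 * page-count limit for sizing the p2m array.  Names are hypothetical; the
 * u64 cast avoids overflowing a 32-bit unsigned long for sizes >= 4GB.
 */
#define MAX_DOMAIN_SIZE   ((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024)
#define MAX_DOMAIN_PAGES  ((unsigned long)(MAX_DOMAIN_SIZE / PAGE_SIZE))

/*
 * Per the help text (one page of p2m metadata per GB), the defaults cost
 * only 8 pages on X86_32 and 32 pages on X86_64.
 */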
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3d8df981d5fd..313947940a1a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,12 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1ifdef CONFIG_FTRACE
2 time.o manage.o xen-asm.o grant-table.o 2# Do not profile debug and lowlevel utilities
3CFLAGS_REMOVE_spinlock.o = -pg
4CFLAGS_REMOVE_time.o = -pg
5CFLAGS_REMOVE_irq.o = -pg
6endif
3 7
4obj-$(CONFIG_SMP) += smp.o 8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
9 time.o xen-asm_$(BITS).o grant-table.o suspend.o
10
11obj-$(CONFIG_SMP) += smp.o spinlock.o
12obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
\ No newline at end of file
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
new file mode 100644
index 000000000000..b53225d2cac3
--- /dev/null
+++ b/arch/x86/xen/debugfs.c
@@ -0,0 +1,123 @@
1#include <linux/init.h>
2#include <linux/debugfs.h>
3#include <linux/module.h>
4
5#include "debugfs.h"
6
7static struct dentry *d_xen_debug;
8
9struct dentry * __init xen_init_debugfs(void)
10{
11 if (!d_xen_debug) {
12 d_xen_debug = debugfs_create_dir("xen", NULL);
13
14 if (!d_xen_debug)
15 pr_warning("Could not create 'xen' debugfs directory\n");
16 }
17
18 return d_xen_debug;
19}
20
21struct array_data
22{
23 void *array;
24 unsigned elements;
25};
26
27static int u32_array_open(struct inode *inode, struct file *file)
28{
29 file->private_data = NULL;
30 return nonseekable_open(inode, file);
31}
32
33static size_t format_array(char *buf, size_t bufsize, const char *fmt,
34 u32 *array, unsigned array_size)
35{
36 size_t ret = 0;
37 unsigned i;
38
39 for(i = 0; i < array_size; i++) {
40 size_t len;
41
42 len = snprintf(buf, bufsize, fmt, array[i]);
43 len++; /* ' ' or '\n' */
44 ret += len;
45
46 if (buf) {
47 buf += len;
48 bufsize -= len;
49 buf[-1] = (i == array_size-1) ? '\n' : ' ';
50 }
51 }
52
53 ret++; /* \0 */
54 if (buf)
55 *buf = '\0';
56
57 return ret;
58}
59
60static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
61{
62 size_t len = format_array(NULL, 0, fmt, array, array_size);
63 char *ret;
64
65 ret = kmalloc(len, GFP_KERNEL);
66 if (ret == NULL)
67 return NULL;
68
69 format_array(ret, len, fmt, array, array_size);
70 return ret;
71}
72
73static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
74 loff_t *ppos)
75{
76 struct inode *inode = file->f_path.dentry->d_inode;
77 struct array_data *data = inode->i_private;
78 size_t size;
79
80 if (*ppos == 0) {
81 if (file->private_data) {
82 kfree(file->private_data);
83 file->private_data = NULL;
84 }
85
86 file->private_data = format_array_alloc("%u", data->array, data->elements);
87 }
88
89 size = 0;
90 if (file->private_data)
91 size = strlen(file->private_data);
92
93 return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
94}
95
96static int xen_array_release(struct inode *inode, struct file *file)
97{
98 kfree(file->private_data);
99
100 return 0;
101}
102
103static struct file_operations u32_array_fops = {
104 .owner = THIS_MODULE,
105 .open = u32_array_open,
106 .release= xen_array_release,
107 .read = u32_array_read,
108};
109
110struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
111 struct dentry *parent,
112 u32 *array, unsigned elements)
113{
114 struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
115
116 if (data == NULL)
117 return NULL;
118
119 data->array = array;
120 data->elements = elements;
121
122 return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
123}
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
new file mode 100644
index 000000000000..e28132084832
--- /dev/null
+++ b/arch/x86/xen/debugfs.h
@@ -0,0 +1,10 @@
1#ifndef _XEN_DEBUGFS_H
2#define _XEN_DEBUGFS_H
3
4struct dentry * __init xen_init_debugfs(void);
5
6struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
7 struct dentry *parent,
8 u32 *array, unsigned elements);
9
10#endif /* _XEN_DEBUGFS_H */
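[Editor's note] As a usage illustration of the interface declared above, a hypothetical caller could expose an array of u32 counters under the shared "xen" debugfs directory roughly like this; the stats array and file name are made up, not part of this commit:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/debugfs.h>

#include "debugfs.h"

/* Hypothetical counters, updated elsewhere. */
static u32 example_stats[16];

static int __init example_stats_init(void)
{
	/* Returns (and creates on first use) the shared "xen" directory. */
	struct dentry *d_xen = xen_init_debugfs();

	if (d_xen == NULL)
		return -ENOMEM;

	/*
	 * Reads back as a space-separated list of decimal values followed
	 * by a newline, as produced by format_array() in debugfs.c above.
	 */
	xen_debugfs_create_u32_array("example_stats", 0444, d_xen,
				     example_stats, ARRAY_SIZE(example_stats));
	return 0;
}
fs_initcall(example_stats_init);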
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f09c1c69c37a..0013a729b41d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -30,16 +30,18 @@
30#include <xen/interface/xen.h> 30#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h> 31#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h> 32#include <xen/interface/vcpu.h>
33#include <xen/interface/sched.h>
34#include <xen/features.h> 33#include <xen/features.h>
35#include <xen/page.h> 34#include <xen/page.h>
35#include <xen/hvc-console.h>
36 36
37#include <asm/paravirt.h> 37#include <asm/paravirt.h>
38#include <asm/apic.h>
38#include <asm/page.h> 39#include <asm/page.h>
39#include <asm/xen/hypercall.h> 40#include <asm/xen/hypercall.h>
40#include <asm/xen/hypervisor.h> 41#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h> 42#include <asm/fixmap.h>
42#include <asm/processor.h> 43#include <asm/processor.h>
44#include <asm/msr-index.h>
43#include <asm/setup.h> 45#include <asm/setup.h>
44#include <asm/desc.h> 46#include <asm/desc.h>
45#include <asm/pgtable.h> 47#include <asm/pgtable.h>
@@ -55,6 +57,21 @@ EXPORT_SYMBOL_GPL(hypercall_page);
55DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); 57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
56DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
57 59
60enum xen_domain_type xen_domain_type = XEN_NATIVE;
61EXPORT_SYMBOL_GPL(xen_domain_type);
62
63/*
64 * Identity map, in addition to plain kernel map. This needs to be
65 * large enough to allocate page table pages to allocate the rest.
66 * Each page can map 2MB.
67 */
68static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
69
70#ifdef CONFIG_X86_64
71/* l3 pud for userspace vsyscall mapping */
72static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
73#endif /* CONFIG_X86_64 */
74
58/* 75/*
59 * Note about cr3 (pagetable base) values: 76 * Note about cr3 (pagetable base) values:
60 * 77 *
@@ -75,13 +92,13 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
75struct start_info *xen_start_info; 92struct start_info *xen_start_info;
76EXPORT_SYMBOL_GPL(xen_start_info); 93EXPORT_SYMBOL_GPL(xen_start_info);
77 94
78static /* __initdata */ struct shared_info dummy_shared_info; 95struct shared_info xen_dummy_shared_info;
79 96
80/* 97/*
81 * Point at some empty memory to start with. We map the real shared_info 98 * Point at some empty memory to start with. We map the real shared_info
82 * page as soon as fixmap is up and running. 99 * page as soon as fixmap is up and running.
83 */ 100 */
84struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; 101struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
85 102
86/* 103/*
87 * Flag to determine whether vcpu info placement is available on all 104 * Flag to determine whether vcpu info placement is available on all
@@ -96,15 +113,22 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
96 * 113 *
97 * 0: not available, 1: available 114 * 0: not available, 1: available
98 */ 115 */
99static int have_vcpu_info_placement = 1; 116static int have_vcpu_info_placement =
117#ifdef CONFIG_X86_32
118 1
119#else
120 0
121#endif
122 ;
100 123
101static void __init xen_vcpu_setup(int cpu) 124
125static void xen_vcpu_setup(int cpu)
102{ 126{
103 struct vcpu_register_vcpu_info info; 127 struct vcpu_register_vcpu_info info;
104 int err; 128 int err;
105 struct vcpu_info *vcpup; 129 struct vcpu_info *vcpup;
106 130
107 BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info); 131 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
108 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 132 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
109 133
110 if (!have_vcpu_info_placement) 134 if (!have_vcpu_info_placement)
@@ -136,11 +160,45 @@ static void __init xen_vcpu_setup(int cpu)
136 } 160 }
137} 161}
138 162
163/*
164 * On restore, set the vcpu placement up again.
165 * If it fails, then we're in a bad state, since
166 * we can't back out from using it...
167 */
168void xen_vcpu_restore(void)
169{
170 if (have_vcpu_info_placement) {
171 int cpu;
172
173 for_each_online_cpu(cpu) {
174 bool other_cpu = (cpu != smp_processor_id());
175
176 if (other_cpu &&
177 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
178 BUG();
179
180 xen_vcpu_setup(cpu);
181
182 if (other_cpu &&
183 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
184 BUG();
185 }
186
187 BUG_ON(!have_vcpu_info_placement);
188 }
189}
190
139static void __init xen_banner(void) 191static void __init xen_banner(void)
140{ 192{
193 unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
194 struct xen_extraversion extra;
195 HYPERVISOR_xen_version(XENVER_extraversion, &extra);
196
141 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 197 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
142 pv_info.name); 198 pv_info.name);
143 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); 199 printk(KERN_INFO "Xen version: %d.%d%s%s\n",
200 version >> 16, version & 0xffff, extra.extraversion,
201 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
144} 202}
145 203
146static void xen_cpuid(unsigned int *ax, unsigned int *bx, 204static void xen_cpuid(unsigned int *ax, unsigned int *bx,
@@ -178,103 +236,68 @@ static unsigned long xen_get_debugreg(int reg)
178 return HYPERVISOR_get_debugreg(reg); 236 return HYPERVISOR_get_debugreg(reg);
179} 237}
180 238
181static unsigned long xen_save_fl(void) 239static void xen_leave_lazy(void)
182{
183 struct vcpu_info *vcpu;
184 unsigned long flags;
185
186 vcpu = x86_read_percpu(xen_vcpu);
187
188 /* flag has opposite sense of mask */
189 flags = !vcpu->evtchn_upcall_mask;
190
191 /* convert to IF type flag
192 -0 -> 0x00000000
193 -1 -> 0xffffffff
194 */
195 return (-flags) & X86_EFLAGS_IF;
196}
197
198static void xen_restore_fl(unsigned long flags)
199{ 240{
200 struct vcpu_info *vcpu; 241 paravirt_leave_lazy(paravirt_get_lazy_mode());
201 242 xen_mc_flush();
202 /* convert from IF type flag */
203 flags = !(flags & X86_EFLAGS_IF);
204
205 /* There's a one instruction preempt window here. We need to
206 make sure we're don't switch CPUs between getting the vcpu
207 pointer and updating the mask. */
208 preempt_disable();
209 vcpu = x86_read_percpu(xen_vcpu);
210 vcpu->evtchn_upcall_mask = flags;
211 preempt_enable_no_resched();
212
213 /* Doesn't matter if we get preempted here, because any
214 pending event will get dealt with anyway. */
215
216 if (flags == 0) {
217 preempt_check_resched();
218 barrier(); /* unmask then check (avoid races) */
219 if (unlikely(vcpu->evtchn_upcall_pending))
220 force_evtchn_callback();
221 }
222} 243}
223 244
224static void xen_irq_disable(void) 245static unsigned long xen_store_tr(void)
225{ 246{
226 /* There's a one instruction preempt window here. We need to 247 return 0;
227 make sure we're don't switch CPUs between getting the vcpu
228 pointer and updating the mask. */
229 preempt_disable();
230 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
231 preempt_enable_no_resched();
232} 248}
233 249
234static void xen_irq_enable(void) 250/*
251 * Set the page permissions for a particular virtual address. If the
252 * address is a vmalloc mapping (or other non-linear mapping), then
253 * find the linear mapping of the page and also set its protections to
254 * match.
255 */
256static void set_aliased_prot(void *v, pgprot_t prot)
235{ 257{
236 struct vcpu_info *vcpu; 258 int level;
259 pte_t *ptep;
260 pte_t pte;
261 unsigned long pfn;
262 struct page *page;
237 263
238 /* There's a one instruction preempt window here. We need to 264 ptep = lookup_address((unsigned long)v, &level);
239 make sure we're don't switch CPUs between getting the vcpu 265 BUG_ON(ptep == NULL);
240 pointer and updating the mask. */
241 preempt_disable();
242 vcpu = x86_read_percpu(xen_vcpu);
243 vcpu->evtchn_upcall_mask = 0;
244 preempt_enable_no_resched();
245 266
246 /* Doesn't matter if we get preempted here, because any 267 pfn = pte_pfn(*ptep);
247 pending event will get dealt with anyway. */ 268 page = pfn_to_page(pfn);
248 269
249 barrier(); /* unmask then check (avoid races) */ 270 pte = pfn_pte(pfn, prot);
250 if (unlikely(vcpu->evtchn_upcall_pending))
251 force_evtchn_callback();
252}
253 271
254static void xen_safe_halt(void) 272 if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
255{
256 /* Blocking includes an implicit local_irq_enable(). */
257 if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
258 BUG(); 273 BUG();
259}
260 274
261static void xen_halt(void) 275 if (!PageHighMem(page)) {
262{ 276 void *av = __va(PFN_PHYS(pfn));
263 if (irqs_disabled()) 277
264 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); 278 if (av != v)
265 else 279 if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
266 xen_safe_halt(); 280 BUG();
281 } else
282 kmap_flush_unused();
267} 283}
268 284
269static void xen_leave_lazy(void) 285static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
270{ 286{
271 paravirt_leave_lazy(paravirt_get_lazy_mode()); 287 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
272 xen_mc_flush(); 288 int i;
289
290 for(i = 0; i < entries; i += entries_per_page)
291 set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
273} 292}
274 293
275static unsigned long xen_store_tr(void) 294static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
276{ 295{
277 return 0; 296 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
297 int i;
298
299 for(i = 0; i < entries; i += entries_per_page)
300 set_aliased_prot(ldt + i, PAGE_KERNEL);
278} 301}
279 302
280static void xen_set_ldt(const void *addr, unsigned entries) 303static void xen_set_ldt(const void *addr, unsigned entries)
@@ -332,14 +355,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
332 355
333static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 356static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
334{ 357{
335 xen_mc_batch();
336
337 load_TLS_descriptor(t, cpu, 0);
338 load_TLS_descriptor(t, cpu, 1);
339 load_TLS_descriptor(t, cpu, 2);
340
341 xen_mc_issue(PARAVIRT_LAZY_CPU);
342
343 /* 358 /*
344 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 359 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
345 * it means we're in a context switch, and %gs has just been 360 * it means we're in a context switch, and %gs has just been
@@ -348,16 +363,44 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
348 * Either way, it has been saved, and the new value will get 363 * Either way, it has been saved, and the new value will get
349 * loaded properly. This will go away as soon as Xen has been 364 * loaded properly. This will go away as soon as Xen has been
350 * modified to not save/restore %gs for normal hypercalls. 365 * modified to not save/restore %gs for normal hypercalls.
366 *
367 * On x86_64, this hack is not used for %gs, because gs points
368 * to KERNEL_GS_BASE (and uses it for PDA references), so we
369 * must not zero %gs on x86_64
370 *
371 * For x86_64, we need to zero %fs, otherwise we may get an
372 * exception between the new %fs descriptor being loaded and
373 * %fs being effectively cleared at __switch_to().
351 */ 374 */
352 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) 375 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
376#ifdef CONFIG_X86_32
353 loadsegment(gs, 0); 377 loadsegment(gs, 0);
378#else
379 loadsegment(fs, 0);
380#endif
381 }
382
383 xen_mc_batch();
384
385 load_TLS_descriptor(t, cpu, 0);
386 load_TLS_descriptor(t, cpu, 1);
387 load_TLS_descriptor(t, cpu, 2);
388
389 xen_mc_issue(PARAVIRT_LAZY_CPU);
390}
391
392#ifdef CONFIG_X86_64
393static void xen_load_gs_index(unsigned int idx)
394{
395 if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
396 BUG();
354} 397}
398#endif
355 399
356static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 400static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
357 const void *ptr) 401 const void *ptr)
358{ 402{
359 unsigned long lp = (unsigned long)&dt[entrynum]; 403 xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
360 xmaddr_t mach_lp = virt_to_machine(lp);
361 u64 entry = *(u64 *)ptr; 404 u64 entry = *(u64 *)ptr;
362 405
363 preempt_disable(); 406 preempt_disable();
@@ -369,23 +412,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
369 preempt_enable(); 412 preempt_enable();
370} 413}
371 414
372static int cvt_gate_to_trap(int vector, u32 low, u32 high, 415static int cvt_gate_to_trap(int vector, const gate_desc *val,
373 struct trap_info *info) 416 struct trap_info *info)
374{ 417{
375 u8 type, dpl; 418 if (val->type != 0xf && val->type != 0xe)
376
377 type = (high >> 8) & 0x1f;
378 dpl = (high >> 13) & 3;
379
380 if (type != 0xf && type != 0xe)
381 return 0; 419 return 0;
382 420
383 info->vector = vector; 421 info->vector = vector;
384 info->address = (high & 0xffff0000) | (low & 0x0000ffff); 422 info->address = gate_offset(*val);
385 info->cs = low >> 16; 423 info->cs = gate_segment(*val);
386 info->flags = dpl; 424 info->flags = val->dpl;
387 /* interrupt gates clear IF */ 425 /* interrupt gates clear IF */
388 if (type == 0xe) 426 if (val->type == 0xe)
389 info->flags |= 4; 427 info->flags |= 4;
390 428
391 return 1; 429 return 1;
@@ -412,11 +450,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
412 450
413 if (p >= start && (p + 8) <= end) { 451 if (p >= start && (p + 8) <= end) {
414 struct trap_info info[2]; 452 struct trap_info info[2];
415 u32 *desc = (u32 *)g;
416 453
417 info[1].address = 0; 454 info[1].address = 0;
418 455
419 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) 456 if (cvt_gate_to_trap(entrynum, g, &info[0]))
420 if (HYPERVISOR_set_trap_table(info)) 457 if (HYPERVISOR_set_trap_table(info))
421 BUG(); 458 BUG();
422 } 459 }
@@ -429,13 +466,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
429{ 466{
430 unsigned in, out, count; 467 unsigned in, out, count;
431 468
432 count = (desc->size+1) / 8; 469 count = (desc->size+1) / sizeof(gate_desc);
433 BUG_ON(count > 256); 470 BUG_ON(count > 256);
434 471
435 for (in = out = 0; in < count; in++) { 472 for (in = out = 0; in < count; in++) {
436 const u32 *entry = (u32 *)(desc->address + in * 8); 473 gate_desc *entry = (gate_desc*)(desc->address) + in;
437 474
438 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) 475 if (cvt_gate_to_trap(in, entry, &traps[out]))
439 out++; 476 out++;
440 } 477 }
441 traps[out].address = 0; 478 traps[out].address = 0;
@@ -496,7 +533,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
496} 533}
497 534
498static void xen_load_sp0(struct tss_struct *tss, 535static void xen_load_sp0(struct tss_struct *tss,
499 struct thread_struct *thread) 536 struct thread_struct *thread)
500{ 537{
501 struct multicall_space mcs = xen_mc_entry(0); 538 struct multicall_space mcs = xen_mc_entry(0);
502 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); 539 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
@@ -517,16 +554,47 @@ static void xen_io_delay(void)
517} 554}
518 555
519#ifdef CONFIG_X86_LOCAL_APIC 556#ifdef CONFIG_X86_LOCAL_APIC
520static u32 xen_apic_read(unsigned long reg) 557static u32 xen_apic_read(u32 reg)
521{ 558{
522 return 0; 559 return 0;
523} 560}
524 561
525static void xen_apic_write(unsigned long reg, u32 val) 562static void xen_apic_write(u32 reg, u32 val)
526{ 563{
527 /* Warn to see if there's any stray references */ 564 /* Warn to see if there's any stray references */
528 WARN_ON(1); 565 WARN_ON(1);
529} 566}
567
568static u64 xen_apic_icr_read(void)
569{
570 return 0;
571}
572
573static void xen_apic_icr_write(u32 low, u32 id)
574{
575 /* Warn to see if there's any stray references */
576 WARN_ON(1);
577}
578
579static void xen_apic_wait_icr_idle(void)
580{
581 return;
582}
583
584static u32 xen_safe_apic_wait_icr_idle(void)
585{
586 return 0;
587}
588
589static struct apic_ops xen_basic_apic_ops = {
590 .read = xen_apic_read,
591 .write = xen_apic_write,
592 .icr_read = xen_apic_icr_read,
593 .icr_write = xen_apic_icr_write,
594 .wait_icr_idle = xen_apic_wait_icr_idle,
595 .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
596};
597
530#endif 598#endif
531 599
532static void xen_flush_tlb(void) 600static void xen_flush_tlb(void)
@@ -607,6 +675,30 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
607 xen_mc_issue(PARAVIRT_LAZY_MMU); 675 xen_mc_issue(PARAVIRT_LAZY_MMU);
608} 676}
609 677
678static void xen_clts(void)
679{
680 struct multicall_space mcs;
681
682 mcs = xen_mc_entry(0);
683
684 MULTI_fpu_taskswitch(mcs.mc, 0);
685
686 xen_mc_issue(PARAVIRT_LAZY_CPU);
687}
688
689static void xen_write_cr0(unsigned long cr0)
690{
691 struct multicall_space mcs;
692
693 /* Only pay attention to cr0.TS; everything else is
694 ignored. */
695 mcs = xen_mc_entry(0);
696
697 MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
698
699 xen_mc_issue(PARAVIRT_LAZY_CPU);
700}
701
610static void xen_write_cr2(unsigned long cr2) 702static void xen_write_cr2(unsigned long cr2)
611{ 703{
612 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; 704 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
@@ -624,8 +716,10 @@ static unsigned long xen_read_cr2_direct(void)
624 716
625static void xen_write_cr4(unsigned long cr4) 717static void xen_write_cr4(unsigned long cr4)
626{ 718{
627 /* Just ignore cr4 changes; Xen doesn't allow us to do 719 cr4 &= ~X86_CR4_PGE;
628 anything anyway. */ 720 cr4 &= ~X86_CR4_PSE;
721
722 native_write_cr4(cr4);
629} 723}
630 724
631static unsigned long xen_read_cr3(void) 725static unsigned long xen_read_cr3(void)
@@ -638,36 +732,105 @@ static void set_current_cr3(void *v)
638 x86_write_percpu(xen_current_cr3, (unsigned long)v); 732 x86_write_percpu(xen_current_cr3, (unsigned long)v);
639} 733}
640 734
641static void xen_write_cr3(unsigned long cr3) 735static void __xen_write_cr3(bool kernel, unsigned long cr3)
642{ 736{
643 struct mmuext_op *op; 737 struct mmuext_op *op;
644 struct multicall_space mcs; 738 struct multicall_space mcs;
645 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 739 unsigned long mfn;
646 740
647 BUG_ON(preemptible()); 741 if (cr3)
742 mfn = pfn_to_mfn(PFN_DOWN(cr3));
743 else
744 mfn = 0;
648 745
649 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 746 WARN_ON(mfn == 0 && kernel);
650 747
651 /* Update while interrupts are disabled, so its atomic with 748 mcs = __xen_mc_entry(sizeof(*op));
652 respect to ipis */
653 x86_write_percpu(xen_cr3, cr3);
654 749
655 op = mcs.args; 750 op = mcs.args;
656 op->cmd = MMUEXT_NEW_BASEPTR; 751 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
657 op->arg1.mfn = mfn; 752 op->arg1.mfn = mfn;
658 753
659 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 754 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
660 755
661 /* Update xen_update_cr3 once the batch has actually 756 if (kernel) {
662 been submitted. */ 757 x86_write_percpu(xen_cr3, cr3);
663 xen_mc_callback(set_current_cr3, (void *)cr3); 758
759 /* Update xen_current_cr3 once the batch has actually
760 been submitted. */
761 xen_mc_callback(set_current_cr3, (void *)cr3);
762 }
763}
764
765static void xen_write_cr3(unsigned long cr3)
766{
767 BUG_ON(preemptible());
768
769 xen_mc_batch(); /* disables interrupts */
770
771 /* Update while interrupts are disabled, so its atomic with
772 respect to ipis */
773 x86_write_percpu(xen_cr3, cr3);
774
775 __xen_write_cr3(true, cr3);
776
777#ifdef CONFIG_X86_64
778 {
779 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
780 if (user_pgd)
781 __xen_write_cr3(false, __pa(user_pgd));
782 else
783 __xen_write_cr3(false, 0);
784 }
785#endif
664 786
665 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 787 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
666} 788}
667 789
790static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
791{
792 int ret;
793
794 ret = 0;
795
796 switch(msr) {
797#ifdef CONFIG_X86_64
798 unsigned which;
799 u64 base;
800
801 case MSR_FS_BASE: which = SEGBASE_FS; goto set;
802 case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
803 case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
804
805 set:
806 base = ((u64)high << 32) | low;
807 if (HYPERVISOR_set_segment_base(which, base) != 0)
808 ret = -EFAULT;
809 break;
810#endif
811
812 case MSR_STAR:
813 case MSR_CSTAR:
814 case MSR_LSTAR:
815 case MSR_SYSCALL_MASK:
816 case MSR_IA32_SYSENTER_CS:
817 case MSR_IA32_SYSENTER_ESP:
818 case MSR_IA32_SYSENTER_EIP:
819 /* Fast syscall setup is all done in hypercalls, so
820 these are all ignored. Stub them out here to stop
821 Xen console noise. */
822 break;
823
824 default:
825 ret = native_write_msr_safe(msr, low, high);
826 }
827
828 return ret;
829}
830
668/* Early in boot, while setting up the initial pagetable, assume 831/* Early in boot, while setting up the initial pagetable, assume
669 everything is pinned. */ 832 everything is pinned. */
670static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 833static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
671{ 834{
672#ifdef CONFIG_FLATMEM 835#ifdef CONFIG_FLATMEM
673 BUG_ON(mem_map); /* should only be used early */ 836 BUG_ON(mem_map); /* should only be used early */
@@ -677,7 +840,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
677 840
678/* Early release_pte assumes that all pts are pinned, since there's 841/* Early release_pte assumes that all pts are pinned, since there's
679 only init_mm and anything attached to that is pinned. */ 842 only init_mm and anything attached to that is pinned. */
680static void xen_release_pte_init(u32 pfn) 843static void xen_release_pte_init(unsigned long pfn)
681{ 844{
682 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 845 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
683} 846}
@@ -693,7 +856,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
693 856
694/* This needs to make sure the new pte page is pinned iff its being 857/* This needs to make sure the new pte page is pinned iff its being
695 attached to a pinned pagetable. */ 858 attached to a pinned pagetable. */
696static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) 859static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
697{ 860{
698 struct page *page = pfn_to_page(pfn); 861 struct page *page = pfn_to_page(pfn);
699 862
@@ -701,8 +864,8 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
701 SetPagePinned(page); 864 SetPagePinned(page);
702 865
703 if (!PageHighMem(page)) { 866 if (!PageHighMem(page)) {
704 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 867 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
705 if (level == PT_PTE) 868 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
706 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 869 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
707 } else 870 } else
708 /* make sure there are no stray mappings of 871 /* make sure there are no stray mappings of
@@ -711,24 +874,66 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
711 } 874 }
712} 875}
713 876
714static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) 877static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
715{ 878{
716 xen_alloc_ptpage(mm, pfn, PT_PTE); 879 xen_alloc_ptpage(mm, pfn, PT_PTE);
717} 880}
718 881
719static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) 882static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
720{ 883{
721 xen_alloc_ptpage(mm, pfn, PT_PMD); 884 xen_alloc_ptpage(mm, pfn, PT_PMD);
722} 885}
723 886
887static int xen_pgd_alloc(struct mm_struct *mm)
888{
889 pgd_t *pgd = mm->pgd;
890 int ret = 0;
891
892 BUG_ON(PagePinned(virt_to_page(pgd)));
893
894#ifdef CONFIG_X86_64
895 {
896 struct page *page = virt_to_page(pgd);
897 pgd_t *user_pgd;
898
899 BUG_ON(page->private != 0);
900
901 ret = -ENOMEM;
902
903 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
904 page->private = (unsigned long)user_pgd;
905
906 if (user_pgd != NULL) {
907 user_pgd[pgd_index(VSYSCALL_START)] =
908 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
909 ret = 0;
910 }
911
912 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
913 }
914#endif
915
916 return ret;
917}
918
919static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
920{
921#ifdef CONFIG_X86_64
922 pgd_t *user_pgd = xen_get_user_pgd(pgd);
923
924 if (user_pgd)
925 free_page((unsigned long)user_pgd);
926#endif
927}
928
724/* This should never happen until we're OK to use struct page */ 929/* This should never happen until we're OK to use struct page */
725static void xen_release_ptpage(u32 pfn, unsigned level) 930static void xen_release_ptpage(unsigned long pfn, unsigned level)
726{ 931{
727 struct page *page = pfn_to_page(pfn); 932 struct page *page = pfn_to_page(pfn);
728 933
729 if (PagePinned(page)) { 934 if (PagePinned(page)) {
730 if (!PageHighMem(page)) { 935 if (!PageHighMem(page)) {
731 if (level == PT_PTE) 936 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
732 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 937 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
733 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 938 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
734 } 939 }
@@ -736,16 +941,28 @@ static void xen_release_ptpage(u32 pfn, unsigned level)
736 } 941 }
737} 942}
738 943
739static void xen_release_pte(u32 pfn) 944static void xen_release_pte(unsigned long pfn)
740{ 945{
741 xen_release_ptpage(pfn, PT_PTE); 946 xen_release_ptpage(pfn, PT_PTE);
742} 947}
743 948
744static void xen_release_pmd(u32 pfn) 949static void xen_release_pmd(unsigned long pfn)
745{ 950{
746 xen_release_ptpage(pfn, PT_PMD); 951 xen_release_ptpage(pfn, PT_PMD);
747} 952}
748 953
954#if PAGETABLE_LEVELS == 4
955static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
956{
957 xen_alloc_ptpage(mm, pfn, PT_PUD);
958}
959
960static void xen_release_pud(unsigned long pfn)
961{
962 xen_release_ptpage(pfn, PT_PUD);
963}
964#endif
965
749#ifdef CONFIG_HIGHPTE 966#ifdef CONFIG_HIGHPTE
750static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) 967static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
751{ 968{
@@ -763,6 +980,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
763} 980}
764#endif 981#endif
765 982
983#ifdef CONFIG_X86_32
766static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 984static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
767{ 985{
768 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 986 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
@@ -781,71 +999,20 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
781 999
782 xen_set_pte(ptep, pte); 1000 xen_set_pte(ptep, pte);
783} 1001}
1002#endif
784 1003
785static __init void xen_pagetable_setup_start(pgd_t *base) 1004static __init void xen_pagetable_setup_start(pgd_t *base)
786{ 1005{
787 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
788 int i;
789
790 /* special set_pte for pagetable initialization */
791 pv_mmu_ops.set_pte = xen_set_pte_init;
792
793 init_mm.pgd = base;
794 /*
795 * copy top-level of Xen-supplied pagetable into place. This
796 * is a stand-in while we copy the pmd pages.
797 */
798 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
799
800 /*
801 * For PAE, need to allocate new pmds, rather than
802 * share Xen's, since Xen doesn't like pmd's being
803 * shared between address spaces.
804 */
805 for (i = 0; i < PTRS_PER_PGD; i++) {
806 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
807 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
808
809 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
810 PAGE_SIZE);
811
812 make_lowmem_page_readonly(pmd);
813
814 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
815 } else
816 pgd_clear(&base[i]);
817 }
818
819 /* make sure zero_page is mapped RO so we can use it in pagetables */
820 make_lowmem_page_readonly(empty_zero_page);
821 make_lowmem_page_readonly(base);
822 /*
823 * Switch to new pagetable. This is done before
824 * pagetable_init has done anything so that the new pages
825 * added to the table can be prepared properly for Xen.
826 */
827 xen_write_cr3(__pa(base));
828
829 /* Unpin initial Xen pagetable */
830 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
831 PFN_DOWN(__pa(xen_start_info->pt_base)));
832} 1006}
833 1007
834static __init void setup_shared_info(void) 1008void xen_setup_shared_info(void)
835{ 1009{
836 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1010 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
837 unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); 1011 set_fixmap(FIX_PARAVIRT_BOOTMAP,
838 1012 xen_start_info->shared_info);
839 /* 1013
840 * Create a mapping for the shared info page. 1014 HYPERVISOR_shared_info =
841 * Should be set_fixmap(), but shared_info is a machine 1015 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
842 * address with no corresponding pseudo-phys address.
843 */
844 set_pte_mfn(addr,
845 PFN_DOWN(xen_start_info->shared_info),
846 PAGE_KERNEL);
847
848 HYPERVISOR_shared_info = (struct shared_info *)addr;
849 } else 1016 } else
850 HYPERVISOR_shared_info = 1017 HYPERVISOR_shared_info =
851 (struct shared_info *)__va(xen_start_info->shared_info); 1018 (struct shared_info *)__va(xen_start_info->shared_info);
@@ -854,27 +1021,43 @@ static __init void setup_shared_info(void)
854 /* In UP this is as good a place as any to set up shared info */ 1021 /* In UP this is as good a place as any to set up shared info */
855 xen_setup_vcpu_info_placement(); 1022 xen_setup_vcpu_info_placement();
856#endif 1023#endif
1024
1025 xen_setup_mfn_list_list();
857} 1026}
858 1027
859static __init void xen_pagetable_setup_done(pgd_t *base) 1028static __init void xen_pagetable_setup_done(pgd_t *base)
860{ 1029{
1030 xen_setup_shared_info();
1031}
1032
1033static __init void xen_post_allocator_init(void)
1034{
1035 pv_mmu_ops.set_pte = xen_set_pte;
1036 pv_mmu_ops.set_pmd = xen_set_pmd;
1037 pv_mmu_ops.set_pud = xen_set_pud;
1038#if PAGETABLE_LEVELS == 4
1039 pv_mmu_ops.set_pgd = xen_set_pgd;
1040#endif
1041
861 /* This will work as long as patching hasn't happened yet 1042 /* This will work as long as patching hasn't happened yet
862 (which it hasn't) */ 1043 (which it hasn't) */
863 pv_mmu_ops.alloc_pte = xen_alloc_pte; 1044 pv_mmu_ops.alloc_pte = xen_alloc_pte;
864 pv_mmu_ops.alloc_pmd = xen_alloc_pmd; 1045 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
865 pv_mmu_ops.release_pte = xen_release_pte; 1046 pv_mmu_ops.release_pte = xen_release_pte;
866 pv_mmu_ops.release_pmd = xen_release_pmd; 1047 pv_mmu_ops.release_pmd = xen_release_pmd;
867 pv_mmu_ops.set_pte = xen_set_pte; 1048#if PAGETABLE_LEVELS == 4
868 1049 pv_mmu_ops.alloc_pud = xen_alloc_pud;
869 setup_shared_info(); 1050 pv_mmu_ops.release_pud = xen_release_pud;
1051#endif
870 1052
871 /* Actually pin the pagetable down, but we can't set PG_pinned 1053#ifdef CONFIG_X86_64
872 yet because the page structures don't exist yet. */ 1054 SetPagePinned(virt_to_page(level3_user_vsyscall));
873 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); 1055#endif
1056 xen_mark_init_mm_pinned();
874} 1057}
875 1058
876/* This is called once we have the cpu_possible_map */ 1059/* This is called once we have the cpu_possible_map */
877void __init xen_setup_vcpu_info_placement(void) 1060void xen_setup_vcpu_info_placement(void)
878{ 1061{
879 int cpu; 1062 int cpu;
880 1063
@@ -947,6 +1130,49 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
947 return ret; 1130 return ret;
948} 1131}
949 1132
1133static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1134{
1135 pte_t pte;
1136
1137 phys >>= PAGE_SHIFT;
1138
1139 switch (idx) {
1140 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1141#ifdef CONFIG_X86_F00F_BUG
1142 case FIX_F00F_IDT:
1143#endif
1144#ifdef CONFIG_X86_32
1145 case FIX_WP_TEST:
1146 case FIX_VDSO:
1147# ifdef CONFIG_HIGHMEM
1148 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1149# endif
1150#else
1151 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1152#endif
1153#ifdef CONFIG_X86_LOCAL_APIC
1154 case FIX_APIC_BASE: /* maps dummy local APIC */
1155#endif
1156 pte = pfn_pte(phys, prot);
1157 break;
1158
1159 default:
1160 pte = mfn_pte(phys, prot);
1161 break;
1162 }
1163
1164 __native_set_fixmap(idx, pte);
1165
1166#ifdef CONFIG_X86_64
1167 /* Replicate changes to map the vsyscall page into the user
1168 pagetable vsyscall mapping. */
1169 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1170 unsigned long vaddr = __fix_to_virt(idx);
1171 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1172 }
1173#endif
1174}
1175
950static const struct pv_info xen_info __initdata = { 1176static const struct pv_info xen_info __initdata = {
951 .paravirt_enabled = 1, 1177 .paravirt_enabled = 1,
952 .shared_kernel_pmd = 0, 1178 .shared_kernel_pmd = 0,
@@ -960,7 +1186,7 @@ static const struct pv_init_ops xen_init_ops __initdata = {
960 .banner = xen_banner, 1186 .banner = xen_banner,
961 .memory_setup = xen_memory_setup, 1187 .memory_setup = xen_memory_setup,
962 .arch_setup = xen_arch_setup, 1188 .arch_setup = xen_arch_setup,
963 .post_allocator_init = xen_mark_init_mm_pinned, 1189 .post_allocator_init = xen_post_allocator_init,
964}; 1190};
965 1191
966static const struct pv_time_ops xen_time_ops __initdata = { 1192static const struct pv_time_ops xen_time_ops __initdata = {
@@ -968,7 +1194,7 @@ static const struct pv_time_ops xen_time_ops __initdata = {
968 1194
969 .set_wallclock = xen_set_wallclock, 1195 .set_wallclock = xen_set_wallclock,
970 .get_wallclock = xen_get_wallclock, 1196 .get_wallclock = xen_get_wallclock,
971 .get_cpu_khz = xen_cpu_khz, 1197 .get_tsc_khz = xen_tsc_khz,
972 .sched_clock = xen_sched_clock, 1198 .sched_clock = xen_sched_clock,
973}; 1199};
974 1200
@@ -978,10 +1204,10 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
978 .set_debugreg = xen_set_debugreg, 1204 .set_debugreg = xen_set_debugreg,
979 .get_debugreg = xen_get_debugreg, 1205 .get_debugreg = xen_get_debugreg,
980 1206
981 .clts = native_clts, 1207 .clts = xen_clts,
982 1208
983 .read_cr0 = native_read_cr0, 1209 .read_cr0 = native_read_cr0,
984 .write_cr0 = native_write_cr0, 1210 .write_cr0 = xen_write_cr0,
985 1211
986 .read_cr4 = native_read_cr4, 1212 .read_cr4 = native_read_cr4,
987 .read_cr4_safe = native_read_cr4_safe, 1213 .read_cr4_safe = native_read_cr4_safe,
@@ -990,18 +1216,28 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
990 .wbinvd = native_wbinvd, 1216 .wbinvd = native_wbinvd,
991 1217
992 .read_msr = native_read_msr_safe, 1218 .read_msr = native_read_msr_safe,
993 .write_msr = native_write_msr_safe, 1219 .write_msr = xen_write_msr_safe,
994 .read_tsc = native_read_tsc, 1220 .read_tsc = native_read_tsc,
995 .read_pmc = native_read_pmc, 1221 .read_pmc = native_read_pmc,
996 1222
997 .iret = xen_iret, 1223 .iret = xen_iret,
998 .irq_enable_syscall_ret = xen_sysexit, 1224 .irq_enable_sysexit = xen_sysexit,
1225#ifdef CONFIG_X86_64
1226 .usergs_sysret32 = xen_sysret32,
1227 .usergs_sysret64 = xen_sysret64,
1228#endif
999 1229
1000 .load_tr_desc = paravirt_nop, 1230 .load_tr_desc = paravirt_nop,
1001 .set_ldt = xen_set_ldt, 1231 .set_ldt = xen_set_ldt,
1002 .load_gdt = xen_load_gdt, 1232 .load_gdt = xen_load_gdt,
1003 .load_idt = xen_load_idt, 1233 .load_idt = xen_load_idt,
1004 .load_tls = xen_load_tls, 1234 .load_tls = xen_load_tls,
1235#ifdef CONFIG_X86_64
1236 .load_gs_index = xen_load_gs_index,
1237#endif
1238
1239 .alloc_ldt = xen_alloc_ldt,
1240 .free_ldt = xen_free_ldt,
1005 1241
1006 .store_gdt = native_store_gdt, 1242 .store_gdt = native_store_gdt,
1007 .store_idt = native_store_idt, 1243 .store_idt = native_store_idt,
@@ -1015,27 +1251,17 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1015 .set_iopl_mask = xen_set_iopl_mask, 1251 .set_iopl_mask = xen_set_iopl_mask,
1016 .io_delay = xen_io_delay, 1252 .io_delay = xen_io_delay,
1017 1253
1254 /* Xen takes care of %gs when switching to usermode for us */
1255 .swapgs = paravirt_nop,
1256
1018 .lazy_mode = { 1257 .lazy_mode = {
1019 .enter = paravirt_enter_lazy_cpu, 1258 .enter = paravirt_enter_lazy_cpu,
1020 .leave = xen_leave_lazy, 1259 .leave = xen_leave_lazy,
1021 }, 1260 },
1022}; 1261};
1023 1262
1024static const struct pv_irq_ops xen_irq_ops __initdata = {
1025 .init_IRQ = xen_init_IRQ,
1026 .save_fl = xen_save_fl,
1027 .restore_fl = xen_restore_fl,
1028 .irq_disable = xen_irq_disable,
1029 .irq_enable = xen_irq_enable,
1030 .safe_halt = xen_safe_halt,
1031 .halt = xen_halt,
1032};
1033
1034static const struct pv_apic_ops xen_apic_ops __initdata = { 1263static const struct pv_apic_ops xen_apic_ops __initdata = {
1035#ifdef CONFIG_X86_LOCAL_APIC 1264#ifdef CONFIG_X86_LOCAL_APIC
1036 .apic_write = xen_apic_write,
1037 .apic_write_atomic = xen_apic_write,
1038 .apic_read = xen_apic_read,
1039 .setup_boot_clock = paravirt_nop, 1265 .setup_boot_clock = paravirt_nop,
1040 .setup_secondary_clock = paravirt_nop, 1266 .setup_secondary_clock = paravirt_nop,
1041 .startup_ipi_hook = paravirt_nop, 1267 .startup_ipi_hook = paravirt_nop,
@@ -1060,6 +1286,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1060 .pte_update = paravirt_nop, 1286 .pte_update = paravirt_nop,
1061 .pte_update_defer = paravirt_nop, 1287 .pte_update_defer = paravirt_nop,
1062 1288
1289 .pgd_alloc = xen_pgd_alloc,
1290 .pgd_free = xen_pgd_free,
1291
1063 .alloc_pte = xen_alloc_pte_init, 1292 .alloc_pte = xen_alloc_pte_init,
1064 .release_pte = xen_release_pte_init, 1293 .release_pte = xen_release_pte_init,
1065 .alloc_pmd = xen_alloc_pte_init, 1294 .alloc_pmd = xen_alloc_pte_init,
@@ -1070,25 +1299,44 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1070 .kmap_atomic_pte = xen_kmap_atomic_pte, 1299 .kmap_atomic_pte = xen_kmap_atomic_pte,
1071#endif 1300#endif
1072 1301
1073 .set_pte = NULL, /* see xen_pagetable_setup_* */ 1302#ifdef CONFIG_X86_64
1303 .set_pte = xen_set_pte,
1304#else
1305 .set_pte = xen_set_pte_init,
1306#endif
1074 .set_pte_at = xen_set_pte_at, 1307 .set_pte_at = xen_set_pte_at,
1075 .set_pmd = xen_set_pmd, 1308 .set_pmd = xen_set_pmd_hyper,
1309
1310 .ptep_modify_prot_start = __ptep_modify_prot_start,
1311 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1076 1312
1077 .pte_val = xen_pte_val, 1313 .pte_val = xen_pte_val,
1314 .pte_flags = native_pte_flags,
1078 .pgd_val = xen_pgd_val, 1315 .pgd_val = xen_pgd_val,
1079 1316
1080 .make_pte = xen_make_pte, 1317 .make_pte = xen_make_pte,
1081 .make_pgd = xen_make_pgd, 1318 .make_pgd = xen_make_pgd,
1082 1319
1320#ifdef CONFIG_X86_PAE
1083 .set_pte_atomic = xen_set_pte_atomic, 1321 .set_pte_atomic = xen_set_pte_atomic,
1084 .set_pte_present = xen_set_pte_at, 1322 .set_pte_present = xen_set_pte_at,
1085 .set_pud = xen_set_pud,
1086 .pte_clear = xen_pte_clear, 1323 .pte_clear = xen_pte_clear,
1087 .pmd_clear = xen_pmd_clear, 1324 .pmd_clear = xen_pmd_clear,
1325#endif /* CONFIG_X86_PAE */
1326 .set_pud = xen_set_pud_hyper,
1088 1327
1089 .make_pmd = xen_make_pmd, 1328 .make_pmd = xen_make_pmd,
1090 .pmd_val = xen_pmd_val, 1329 .pmd_val = xen_pmd_val,
1091 1330
1331#if PAGETABLE_LEVELS == 4
1332 .pud_val = xen_pud_val,
1333 .make_pud = xen_make_pud,
1334 .set_pgd = xen_set_pgd_hyper,
1335
1336 .alloc_pud = xen_alloc_pte_init,
1337 .release_pud = xen_release_pte_init,
1338#endif /* PAGETABLE_LEVELS == 4 */
1339
1092 .activate_mm = xen_activate_mm, 1340 .activate_mm = xen_activate_mm,
1093 .dup_mmap = xen_dup_mmap, 1341 .dup_mmap = xen_dup_mmap,
1094 .exit_mmap = xen_exit_mmap, 1342 .exit_mmap = xen_exit_mmap,
@@ -1097,28 +1345,19 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1097 .enter = paravirt_enter_lazy_mmu, 1345 .enter = paravirt_enter_lazy_mmu,
1098 .leave = xen_leave_lazy, 1346 .leave = xen_leave_lazy,
1099 }, 1347 },
1100};
1101 1348
1102#ifdef CONFIG_SMP 1349 .set_fixmap = xen_set_fixmap,
1103static const struct smp_ops xen_smp_ops __initdata = {
1104 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1105 .smp_prepare_cpus = xen_smp_prepare_cpus,
1106 .cpu_up = xen_cpu_up,
1107 .smp_cpus_done = xen_smp_cpus_done,
1108
1109 .smp_send_stop = xen_smp_send_stop,
1110 .smp_send_reschedule = xen_smp_send_reschedule,
1111 .smp_call_function_mask = xen_smp_call_function_mask,
1112}; 1350};
1113#endif /* CONFIG_SMP */
1114 1351
1115static void xen_reboot(int reason) 1352static void xen_reboot(int reason)
1116{ 1353{
1354 struct sched_shutdown r = { .reason = reason };
1355
1117#ifdef CONFIG_SMP 1356#ifdef CONFIG_SMP
1118 smp_send_stop(); 1357 smp_send_stop();
1119#endif 1358#endif
1120 1359
1121 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) 1360 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
1122 BUG(); 1361 BUG();
1123} 1362}
1124 1363
@@ -1154,15 +1393,219 @@ static const struct machine_ops __initdata xen_machine_ops = {
1154 1393
1155static void __init xen_reserve_top(void) 1394static void __init xen_reserve_top(void)
1156{ 1395{
1396#ifdef CONFIG_X86_32
1157 unsigned long top = HYPERVISOR_VIRT_START; 1397 unsigned long top = HYPERVISOR_VIRT_START;
1158 struct xen_platform_parameters pp; 1398 struct xen_platform_parameters pp;
1159 1399
1160 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) 1400 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1161 top = pp.virt_start; 1401 top = pp.virt_start;
1162 1402
1163 reserve_top_address(-top + 2 * PAGE_SIZE); 1403 reserve_top_address(-top);
1404#endif /* CONFIG_X86_32 */
1405}
1406
1407/*
1408 * Like __va(), but returns address in the kernel mapping (which is
1409 * all we have until the physical memory mapping has been set up.
1410 */
1411static void *__ka(phys_addr_t paddr)
1412{
1413#ifdef CONFIG_X86_64
1414 return (void *)(paddr + __START_KERNEL_map);
1415#else
1416 return __va(paddr);
1417#endif
1418}
1419
1420/* Convert a machine address to physical address */
1421static unsigned long m2p(phys_addr_t maddr)
1422{
1423 phys_addr_t paddr;
1424
1425 maddr &= PTE_PFN_MASK;
1426 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1427
1428 return paddr;
1164} 1429}
1165 1430
1431/* Convert a machine address to kernel virtual */
1432static void *m2v(phys_addr_t maddr)
1433{
1434 return __ka(m2p(maddr));
1435}
1436
1437static void set_page_prot(void *addr, pgprot_t prot)
1438{
1439 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1440 pte_t pte = pfn_pte(pfn, prot);
1441
1442 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1443 BUG();
1444}
1445
1446static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1447{
1448 unsigned pmdidx, pteidx;
1449 unsigned ident_pte;
1450 unsigned long pfn;
1451
1452 ident_pte = 0;
1453 pfn = 0;
1454 for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1455 pte_t *pte_page;
1456
1457 /* Reuse or allocate a page of ptes */
1458 if (pmd_present(pmd[pmdidx]))
1459 pte_page = m2v(pmd[pmdidx].pmd);
1460 else {
1461 /* Check for free pte pages */
1462 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1463 break;
1464
1465 pte_page = &level1_ident_pgt[ident_pte];
1466 ident_pte += PTRS_PER_PTE;
1467
1468 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1469 }
1470
1471 /* Install mappings */
1472 for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1473 pte_t pte;
1474
1475 if (pfn > max_pfn_mapped)
1476 max_pfn_mapped = pfn;
1477
1478 if (!pte_none(pte_page[pteidx]))
1479 continue;
1480
1481 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1482 pte_page[pteidx] = pte;
1483 }
1484 }
1485
1486 for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1487 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1488
1489 set_page_prot(pmd, PAGE_KERNEL_RO);
1490}
1491
1492#ifdef CONFIG_X86_64
1493static void convert_pfn_mfn(void *v)
1494{
1495 pte_t *pte = v;
1496 int i;
1497
1498 /* All levels are converted the same way, so just treat them
1499 as ptes. */
1500 for(i = 0; i < PTRS_PER_PTE; i++)
1501 pte[i] = xen_make_pte(pte[i].pte);
1502}
1503
1504/*
1505 * Set up the inital kernel pagetable.
1506 *
1507 * We can construct this by grafting the Xen provided pagetable into
1508 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1509 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1510 * means that only the kernel has a physical mapping to start with -
1511 * but that's enough to get __va working. We need to fill in the rest
1512 * of the physical mapping once some sort of allocator has been set
1513 * up.
1514 */
1515static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1516{
1517 pud_t *l3;
1518 pmd_t *l2;
1519
1520 /* Zap identity mapping */
1521 init_level4_pgt[0] = __pgd(0);
1522
1523 /* Pre-constructed entries are in pfn, so convert to mfn */
1524 convert_pfn_mfn(init_level4_pgt);
1525 convert_pfn_mfn(level3_ident_pgt);
1526 convert_pfn_mfn(level3_kernel_pgt);
1527
1528 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1529 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1530
1531 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1532 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1533
1534 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1535 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1536 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1537
1538 /* Set up identity map */
1539 xen_map_identity_early(level2_ident_pgt, max_pfn);
1540
1541 /* Make pagetable pieces RO */
1542 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1543 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1544 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1545 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1546 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1547 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1548
1549 /* Pin down new L4 */
1550 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1551 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1552
1553 /* Unpin Xen-provided one */
1554 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1555
1556 /* Switch over */
1557 pgd = init_level4_pgt;
1558
1559 /*
1560 * At this stage there can be no user pgd, and no page
1561 * structure to attach it to, so make sure we just set kernel
1562 * pgd.
1563 */
1564 xen_mc_batch();
1565 __xen_write_cr3(true, __pa(pgd));
1566 xen_mc_issue(PARAVIRT_LAZY_CPU);
1567
1568 reserve_early(__pa(xen_start_info->pt_base),
1569 __pa(xen_start_info->pt_base +
1570 xen_start_info->nr_pt_frames * PAGE_SIZE),
1571 "XEN PAGETABLES");
1572
1573 return pgd;
1574}
1575#else /* !CONFIG_X86_64 */
1576static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1577
1578static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1579{
1580 pmd_t *kernel_pmd;
1581
1582 init_pg_tables_start = __pa(pgd);
1583 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1584 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1585
1586 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1587 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1588
1589 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1590
1591 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1592 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1593 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1594
1595 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1596 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1597 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1598
1599 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1600
1601 xen_write_cr3(__pa(swapper_pg_dir));
1602
1603 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1604
1605 return swapper_pg_dir;
1606}
1607#endif /* CONFIG_X86_64 */
1608
1166/* First C function to be called on Xen boot */ 1609/* First C function to be called on Xen boot */
1167asmlinkage void __init xen_start_kernel(void) 1610asmlinkage void __init xen_start_kernel(void)
1168{ 1611{
@@ -1171,70 +1614,99 @@ asmlinkage void __init xen_start_kernel(void)
1171 if (!xen_start_info) 1614 if (!xen_start_info)
1172 return; 1615 return;
1173 1616
1617 xen_domain_type = XEN_PV_DOMAIN;
1618
1174 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); 1619 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
1175 1620
1621 xen_setup_features();
1622
1176 /* Install Xen paravirt ops */ 1623 /* Install Xen paravirt ops */
1177 pv_info = xen_info; 1624 pv_info = xen_info;
1178 pv_init_ops = xen_init_ops; 1625 pv_init_ops = xen_init_ops;
1179 pv_time_ops = xen_time_ops; 1626 pv_time_ops = xen_time_ops;
1180 pv_cpu_ops = xen_cpu_ops; 1627 pv_cpu_ops = xen_cpu_ops;
1181 pv_irq_ops = xen_irq_ops;
1182 pv_apic_ops = xen_apic_ops; 1628 pv_apic_ops = xen_apic_ops;
1183 pv_mmu_ops = xen_mmu_ops; 1629 pv_mmu_ops = xen_mmu_ops;
1184 1630
1631 xen_init_irq_ops();
1632
1633#ifdef CONFIG_X86_LOCAL_APIC
1634 /*
1635 * set up the basic apic ops.
1636 */
1637 apic_ops = &xen_basic_apic_ops;
1638#endif
1639
1640 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
1641 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
1642 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
1643 }
1644
1185 machine_ops = xen_machine_ops; 1645 machine_ops = xen_machine_ops;
1186 1646
1187#ifdef CONFIG_SMP 1647#ifdef CONFIG_X86_64
1188 smp_ops = xen_smp_ops; 1648 /* Disable until direct per-cpu data access. */
1649 have_vcpu_info_placement = 0;
1650 x86_64_init_pda();
1189#endif 1651#endif
1190 1652
1191 xen_setup_features(); 1653 xen_smp_init();
1192 1654
1193 /* Get mfn list */ 1655 /* Get mfn list */
1194 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1656 if (!xen_feature(XENFEAT_auto_translated_physmap))
1195 phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; 1657 xen_build_dynamic_phys_to_machine();
1196 1658
1197 pgd = (pgd_t *)xen_start_info->pt_base; 1659 pgd = (pgd_t *)xen_start_info->pt_base;
1198 1660
1199 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1661 /* Prevent unwanted bits from being set in PTEs. */
1200 1662 __supported_pte_mask &= ~_PAGE_GLOBAL;
1201 init_mm.pgd = pgd; /* use the Xen pagetables to start */ 1663 if (!xen_initial_domain())
1202 1664 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1203 /* keep using Xen gdt for now; no urgent need to change it */
1204
1205 x86_write_percpu(xen_cr3, __pa(pgd));
1206 x86_write_percpu(xen_current_cr3, __pa(pgd));
1207 1665
1208 /* Don't do the full vcpu_info placement stuff until we have a 1666 /* Don't do the full vcpu_info placement stuff until we have a
1209 possible map and a non-dummy shared_info. */ 1667 possible map and a non-dummy shared_info. */
1210 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1668 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1211 1669
1670 xen_raw_console_write("mapping kernel into physical memory\n");
1671 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1672
1673 init_mm.pgd = pgd;
1674
1675 /* keep using Xen gdt for now; no urgent need to change it */
1676
1212 pv_info.kernel_rpl = 1; 1677 pv_info.kernel_rpl = 1;
1213 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1678 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1214 pv_info.kernel_rpl = 0; 1679 pv_info.kernel_rpl = 0;
1215 1680
1216 /* Prevent unwanted bits from being set in PTEs. */
1217 __supported_pte_mask &= ~_PAGE_GLOBAL;
1218 if (!is_initial_xendomain())
1219 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1220
1221 /* set the limit of our address space */ 1681 /* set the limit of our address space */
1222 xen_reserve_top(); 1682 xen_reserve_top();
1223 1683
1684#ifdef CONFIG_X86_32
1224 /* set up basic CPUID stuff */ 1685 /* set up basic CPUID stuff */
1225 cpu_detect(&new_cpu_data); 1686 cpu_detect(&new_cpu_data);
1226 new_cpu_data.hard_math = 1; 1687 new_cpu_data.hard_math = 1;
1227 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1688 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1689#endif
1228 1690
1229 /* Poke various useful things into boot_params */ 1691 /* Poke various useful things into boot_params */
1230 boot_params.hdr.type_of_loader = (9 << 4) | 0; 1692 boot_params.hdr.type_of_loader = (9 << 4) | 0;
1231 boot_params.hdr.ramdisk_image = xen_start_info->mod_start 1693 boot_params.hdr.ramdisk_image = xen_start_info->mod_start
1232 ? __pa(xen_start_info->mod_start) : 0; 1694 ? __pa(xen_start_info->mod_start) : 0;
1233 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1695 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1696 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1234 1697
1235 if (!is_initial_xendomain()) 1698 if (!xen_initial_domain()) {
1699 add_preferred_console("xenboot", 0, NULL);
1700 add_preferred_console("tty", 0, NULL);
1236 add_preferred_console("hvc", 0, NULL); 1701 add_preferred_console("hvc", 0, NULL);
1702 }
1703
1704 xen_raw_console_write("about to get started...\n");
1237 1705
1238 /* Start the world */ 1706 /* Start the world */
1239 start_kernel(); 1707#ifdef CONFIG_X86_32
1708 i386_start_kernel();
1709#else
1710 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1711#endif
1240} 1712}
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
new file mode 100644
index 000000000000..28b85ab8422e
--- /dev/null
+++ b/arch/x86/xen/irq.c
@@ -0,0 +1,143 @@
1#include <linux/hardirq.h>
2
3#include <xen/interface/xen.h>
4#include <xen/interface/sched.h>
5#include <xen/interface/vcpu.h>
6
7#include <asm/xen/hypercall.h>
8#include <asm/xen/hypervisor.h>
9
10#include "xen-ops.h"
11
12/*
13 * Force a proper event-channel callback from Xen after clearing the
14 * callback mask. We do this in a very simple manner, by making a call
15 * down into Xen. The pending flag will be checked by Xen on return.
16 */
17void xen_force_evtchn_callback(void)
18{
19 (void)HYPERVISOR_xen_version(0, NULL);
20}
21
22static void __init __xen_init_IRQ(void)
23{
24#ifdef CONFIG_X86_64
25 int i;
26
27 /* Create identity vector->irq map */
28 for(i = 0; i < NR_VECTORS; i++) {
29 int cpu;
30
31 for_each_possible_cpu(cpu)
32 per_cpu(vector_irq, cpu)[i] = i;
33 }
34#endif /* CONFIG_X86_64 */
35
36 xen_init_IRQ();
37}
38
39static unsigned long xen_save_fl(void)
40{
41 struct vcpu_info *vcpu;
42 unsigned long flags;
43
44 vcpu = x86_read_percpu(xen_vcpu);
45
46 /* flag has opposite sense of mask */
47 flags = !vcpu->evtchn_upcall_mask;
48
49 /* convert to IF type flag
50 -0 -> 0x00000000
51 -1 -> 0xffffffff
52 */
53 return (-flags) & X86_EFLAGS_IF;
54}
55
56static void xen_restore_fl(unsigned long flags)
57{
58 struct vcpu_info *vcpu;
59
60 /* convert from IF type flag */
61 flags = !(flags & X86_EFLAGS_IF);
62
63 /* There's a one instruction preempt window here. We need to
64 make sure we don't switch CPUs between getting the vcpu
65 pointer and updating the mask. */
66 preempt_disable();
67 vcpu = x86_read_percpu(xen_vcpu);
68 vcpu->evtchn_upcall_mask = flags;
69 preempt_enable_no_resched();
70
71 /* Doesn't matter if we get preempted here, because any
72 pending event will get dealt with anyway. */
73
74 if (flags == 0) {
75 preempt_check_resched();
76 barrier(); /* unmask then check (avoid races) */
77 if (unlikely(vcpu->evtchn_upcall_pending))
78 xen_force_evtchn_callback();
79 }
80}
81
82static void xen_irq_disable(void)
83{
84 /* There's a one instruction preempt window here. We need to
85 make sure we don't switch CPUs between getting the vcpu
86 pointer and updating the mask. */
87 preempt_disable();
88 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
89 preempt_enable_no_resched();
90}
91
92static void xen_irq_enable(void)
93{
94 struct vcpu_info *vcpu;
95
96 /* We don't need to worry about being preempted here, since
97 either a) interrupts are disabled, so no preemption, or b)
98 the caller is confused and is trying to re-enable interrupts
99 on an indeterminate processor. */
100
101 vcpu = x86_read_percpu(xen_vcpu);
102 vcpu->evtchn_upcall_mask = 0;
103
104 /* Doesn't matter if we get preempted here, because any
105 pending event will get dealt with anyway. */
106
107 barrier(); /* unmask then check (avoid races) */
108 if (unlikely(vcpu->evtchn_upcall_pending))
109 xen_force_evtchn_callback();
110}
111
112static void xen_safe_halt(void)
113{
114 /* Blocking includes an implicit local_irq_enable(). */
115 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
116 BUG();
117}
118
119static void xen_halt(void)
120{
121 if (irqs_disabled())
122 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
123 else
124 xen_safe_halt();
125}
126
127static const struct pv_irq_ops xen_irq_ops __initdata = {
128 .init_IRQ = __xen_init_IRQ,
129 .save_fl = xen_save_fl,
130 .restore_fl = xen_restore_fl,
131 .irq_disable = xen_irq_disable,
132 .irq_enable = xen_irq_enable,
133 .safe_halt = xen_safe_halt,
134 .halt = xen_halt,
135#ifdef CONFIG_X86_64
136 .adjust_exception_frame = xen_adjust_exception_frame,
137#endif
138};
139
140void __init xen_init_irq_ops()
141{
142 pv_irq_ops = xen_irq_ops;
143}
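The flag handling in xen_save_fl()/xen_restore_fl() above maps Xen's per-vcpu evtchn_upcall_mask (1 = events blocked) onto the x86 EFLAGS.IF convention (bit 9 set = interrupts enabled). A minimal user-space sketch of that conversion, assuming X86_EFLAGS_IF is 0x200 as on x86; this is an illustration, not part of the patch:

#include <stdio.h>

#define X86_EFLAGS_IF 0x200UL	/* interrupt-enable flag, bit 9 */

/* evtchn_upcall_mask: 1 = events masked, 0 = deliverable */
static unsigned long mask_to_flags(unsigned char upcall_mask)
{
	unsigned long enabled = !upcall_mask;	/* opposite sense of the mask */
	return (-enabled) & X86_EFLAGS_IF;	/* -0 -> 0, -1 -> all ones, keep IF */
}

static unsigned char flags_to_mask(unsigned long flags)
{
	return !(flags & X86_EFLAGS_IF);	/* IF set -> unmasked (0) */
}

int main(void)
{
	printf("mask=0 -> flags=%#lx\n", mask_to_flags(0));	/* 0x200 */
	printf("mask=1 -> flags=%#lx\n", mask_to_flags(1));	/* 0 */
	printf("flags=IF -> mask=%u\n", (unsigned)flags_to_mask(X86_EFLAGS_IF));	/* 0 */
	return 0;
}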
diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c
deleted file mode 100644
index aa7af9e6abc0..000000000000
--- a/arch/x86/xen/manage.c
+++ /dev/null
@@ -1,143 +0,0 @@
1/*
2 * Handle extern requests for shutdown, reboot and sysrq
3 */
4#include <linux/kernel.h>
5#include <linux/err.h>
6#include <linux/reboot.h>
7#include <linux/sysrq.h>
8
9#include <xen/xenbus.h>
10
11#define SHUTDOWN_INVALID -1
12#define SHUTDOWN_POWEROFF 0
13#define SHUTDOWN_SUSPEND 2
14/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
15 * report a crash, not be instructed to crash!
16 * HALT is the same as POWEROFF, as far as we're concerned. The tools use
17 * the distinction when we return the reason code to them.
18 */
19#define SHUTDOWN_HALT 4
20
21/* Ignore multiple shutdown requests. */
22static int shutting_down = SHUTDOWN_INVALID;
23
24static void shutdown_handler(struct xenbus_watch *watch,
25 const char **vec, unsigned int len)
26{
27 char *str;
28 struct xenbus_transaction xbt;
29 int err;
30
31 if (shutting_down != SHUTDOWN_INVALID)
32 return;
33
34 again:
35 err = xenbus_transaction_start(&xbt);
36 if (err)
37 return;
38
39 str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
40 /* Ignore read errors and empty reads. */
41 if (XENBUS_IS_ERR_READ(str)) {
42 xenbus_transaction_end(xbt, 1);
43 return;
44 }
45
46 xenbus_write(xbt, "control", "shutdown", "");
47
48 err = xenbus_transaction_end(xbt, 0);
49 if (err == -EAGAIN) {
50 kfree(str);
51 goto again;
52 }
53
54 if (strcmp(str, "poweroff") == 0 ||
55 strcmp(str, "halt") == 0)
56 orderly_poweroff(false);
57 else if (strcmp(str, "reboot") == 0)
58 ctrl_alt_del();
59 else {
60 printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
61 shutting_down = SHUTDOWN_INVALID;
62 }
63
64 kfree(str);
65}
66
67static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
68 unsigned int len)
69{
70 char sysrq_key = '\0';
71 struct xenbus_transaction xbt;
72 int err;
73
74 again:
75 err = xenbus_transaction_start(&xbt);
76 if (err)
77 return;
78 if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
79 printk(KERN_ERR "Unable to read sysrq code in "
80 "control/sysrq\n");
81 xenbus_transaction_end(xbt, 1);
82 return;
83 }
84
85 if (sysrq_key != '\0')
86 xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
87
88 err = xenbus_transaction_end(xbt, 0);
89 if (err == -EAGAIN)
90 goto again;
91
92 if (sysrq_key != '\0')
93 handle_sysrq(sysrq_key, NULL);
94}
95
96static struct xenbus_watch shutdown_watch = {
97 .node = "control/shutdown",
98 .callback = shutdown_handler
99};
100
101static struct xenbus_watch sysrq_watch = {
102 .node = "control/sysrq",
103 .callback = sysrq_handler
104};
105
106static int setup_shutdown_watcher(void)
107{
108 int err;
109
110 err = register_xenbus_watch(&shutdown_watch);
111 if (err) {
112 printk(KERN_ERR "Failed to set shutdown watcher\n");
113 return err;
114 }
115
116 err = register_xenbus_watch(&sysrq_watch);
117 if (err) {
118 printk(KERN_ERR "Failed to set sysrq watcher\n");
119 return err;
120 }
121
122 return 0;
123}
124
125static int shutdown_event(struct notifier_block *notifier,
126 unsigned long event,
127 void *data)
128{
129 setup_shutdown_watcher();
130 return NOTIFY_DONE;
131}
132
133static int __init setup_shutdown_event(void)
134{
135 static struct notifier_block xenstore_notifier = {
136 .notifier_call = shutdown_event
137 };
138 register_xenstore_notifier(&xenstore_notifier);
139
140 return 0;
141}
142
143subsys_initcall(setup_shutdown_event);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index df40bf74ea75..ae173f6edd8b 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -40,12 +40,15 @@
40 */ 40 */
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/debugfs.h>
43#include <linux/bug.h> 44#include <linux/bug.h>
44 45
45#include <asm/pgtable.h> 46#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/fixmap.h>
47#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
48#include <asm/paravirt.h> 50#include <asm/paravirt.h>
51#include <asm/linkage.h>
49 52
50#include <asm/xen/hypercall.h> 53#include <asm/xen/hypercall.h>
51#include <asm/xen/hypervisor.h> 54#include <asm/xen/hypervisor.h>
@@ -55,16 +58,200 @@
55 58
56#include "multicalls.h" 59#include "multicalls.h"
57#include "mmu.h" 60#include "mmu.h"
61#include "debugfs.h"
58 62
59xmaddr_t arbitrary_virt_to_machine(unsigned long address) 63#define MMU_UPDATE_HISTO 30
64
65#ifdef CONFIG_XEN_DEBUG_FS
66
67static struct {
68 u32 pgd_update;
69 u32 pgd_update_pinned;
70 u32 pgd_update_batched;
71
72 u32 pud_update;
73 u32 pud_update_pinned;
74 u32 pud_update_batched;
75
76 u32 pmd_update;
77 u32 pmd_update_pinned;
78 u32 pmd_update_batched;
79
80 u32 pte_update;
81 u32 pte_update_pinned;
82 u32 pte_update_batched;
83
84 u32 mmu_update;
85 u32 mmu_update_extended;
86 u32 mmu_update_histo[MMU_UPDATE_HISTO];
87
88 u32 prot_commit;
89 u32 prot_commit_batched;
90
91 u32 set_pte_at;
92 u32 set_pte_at_batched;
93 u32 set_pte_at_pinned;
94 u32 set_pte_at_current;
95 u32 set_pte_at_kernel;
96} mmu_stats;
97
98static u8 zero_stats;
99
100static inline void check_zero(void)
101{
102 if (unlikely(zero_stats)) {
103 memset(&mmu_stats, 0, sizeof(mmu_stats));
104 zero_stats = 0;
105 }
106}
107
108#define ADD_STATS(elem, val) \
109 do { check_zero(); mmu_stats.elem += (val); } while(0)
110
111#else /* !CONFIG_XEN_DEBUG_FS */
112
113#define ADD_STATS(elem, val) do { (void)(val); } while(0)
114
115#endif /* CONFIG_XEN_DEBUG_FS */
116
117/*
118 * Just beyond the highest usermode address. STACK_TOP_MAX has a
119 * redzone above it, so round it up to a PGD boundary.
120 */
121#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
122
123
124#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
125#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
126
127/* Placeholder for holes in the address space */
128static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
129 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
130
131 /* Array of pointers to pages containing p2m entries */
132static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
133 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
134
135/* Arrays of p2m arrays expressed in mfns used for save/restore */
136static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
137
138static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
139 __page_aligned_bss;
140
141static inline unsigned p2m_top_index(unsigned long pfn)
142{
143 BUG_ON(pfn >= MAX_DOMAIN_PAGES);
144 return pfn / P2M_ENTRIES_PER_PAGE;
145}
146
147static inline unsigned p2m_index(unsigned long pfn)
148{
149 return pfn % P2M_ENTRIES_PER_PAGE;
150}
151
152/* Build the parallel p2m_top_mfn structures */
153void xen_setup_mfn_list_list(void)
60{ 154{
155 unsigned pfn, idx;
156
157 for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
158 unsigned topidx = p2m_top_index(pfn);
159
160 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
161 }
162
163 for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
164 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
165 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
166 }
167
168 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
169
170 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
171 virt_to_mfn(p2m_top_mfn_list);
172 HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
173}
174
175/* Set up p2m_top to point to the domain-builder provided p2m pages */
176void __init xen_build_dynamic_phys_to_machine(void)
177{
178 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
179 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
180 unsigned pfn;
181
182 for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
183 unsigned topidx = p2m_top_index(pfn);
184
185 p2m_top[topidx] = &mfn_list[pfn];
186 }
187}
188
189unsigned long get_phys_to_machine(unsigned long pfn)
190{
191 unsigned topidx, idx;
192
193 if (unlikely(pfn >= MAX_DOMAIN_PAGES))
194 return INVALID_P2M_ENTRY;
195
196 topidx = p2m_top_index(pfn);
197 idx = p2m_index(pfn);
198 return p2m_top[topidx][idx];
199}
200EXPORT_SYMBOL_GPL(get_phys_to_machine);
201
202static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
203{
204 unsigned long *p;
205 unsigned i;
206
207 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
208 BUG_ON(p == NULL);
209
210 for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
211 p[i] = INVALID_P2M_ENTRY;
212
213 if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
214 free_page((unsigned long)p);
215 else
216 *mfnp = virt_to_mfn(p);
217}
218
219void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
220{
221 unsigned topidx, idx;
222
223 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
224 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
225 return;
226 }
227
228 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
229 BUG_ON(mfn != INVALID_P2M_ENTRY);
230 return;
231 }
232
233 topidx = p2m_top_index(pfn);
234 if (p2m_top[topidx] == p2m_missing) {
235 /* no need to allocate a page to store an invalid entry */
236 if (mfn == INVALID_P2M_ENTRY)
237 return;
238 alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
239 }
240
241 idx = p2m_index(pfn);
242 p2m_top[topidx][idx] = mfn;
243}
244
245xmaddr_t arbitrary_virt_to_machine(void *vaddr)
246{
247 unsigned long address = (unsigned long)vaddr;
61 unsigned int level; 248 unsigned int level;
62 pte_t *pte = lookup_address(address, &level); 249 pte_t *pte = lookup_address(address, &level);
63 unsigned offset = address & ~PAGE_MASK; 250 unsigned offset = address & ~PAGE_MASK;
64 251
65 BUG_ON(pte == NULL); 252 BUG_ON(pte == NULL);
66 253
67 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); 254 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
68} 255}
69 256
70void make_lowmem_page_readonly(void *vaddr) 257void make_lowmem_page_readonly(void *vaddr)
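The phys-to-machine table built above is a simple two-level structure: p2m_top[] selects one page of mfn entries, and an index into that page selects the entry itself. A stand-alone sketch of the index arithmetic, assuming 4 KiB pages and an 8-byte unsigned long (so 512 entries per p2m page); illustration only, not part of the patch:

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))	/* 512 on 64-bit */

int main(void)
{
	unsigned long pfn = 0x12345;	/* an example guest page frame number */
	unsigned long topidx = pfn / P2M_ENTRIES_PER_PAGE;	/* which p2m page */
	unsigned long idx = pfn % P2M_ENTRIES_PER_PAGE;		/* slot within it */

	/* the kernel code above then reads mfn = p2m_top[topidx][idx] */
	printf("pfn %#lx -> p2m_top[%lu][%lu]\n", pfn, topidx, idx);
	return 0;
}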
@@ -98,59 +285,84 @@ void make_lowmem_page_readwrite(void *vaddr)
98} 285}
99 286
100 287
101void xen_set_pmd(pmd_t *ptr, pmd_t val) 288static bool xen_page_pinned(void *ptr)
289{
290 struct page *page = virt_to_page(ptr);
291
292 return PagePinned(page);
293}
294
295static void xen_extend_mmu_update(const struct mmu_update *update)
102{ 296{
103 struct multicall_space mcs; 297 struct multicall_space mcs;
104 struct mmu_update *u; 298 struct mmu_update *u;
105 299
106 preempt_disable(); 300 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
301
302 if (mcs.mc != NULL) {
303 ADD_STATS(mmu_update_extended, 1);
304 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
305
306 mcs.mc->args[1]++;
307
308 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
309 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
310 else
311 ADD_STATS(mmu_update_histo[0], 1);
312 } else {
313 ADD_STATS(mmu_update, 1);
314 mcs = __xen_mc_entry(sizeof(*u));
315 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
316 ADD_STATS(mmu_update_histo[1], 1);
317 }
107 318
108 mcs = xen_mc_entry(sizeof(*u));
109 u = mcs.args; 319 u = mcs.args;
110 u->ptr = virt_to_machine(ptr).maddr; 320 *u = *update;
111 u->val = pmd_val_ma(val); 321}
112 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); 322
323void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
324{
325 struct mmu_update u;
326
327 preempt_disable();
328
329 xen_mc_batch();
330
331 /* ptr may be ioremapped for 64-bit pagetable setup */
332 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
333 u.val = pmd_val_ma(val);
334 xen_extend_mmu_update(&u);
335
336 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
113 337
114 xen_mc_issue(PARAVIRT_LAZY_MMU); 338 xen_mc_issue(PARAVIRT_LAZY_MMU);
115 339
116 preempt_enable(); 340 preempt_enable();
117} 341}
118 342
343void xen_set_pmd(pmd_t *ptr, pmd_t val)
344{
345 ADD_STATS(pmd_update, 1);
346
347 /* If page is not pinned, we can just update the entry
348 directly */
349 if (!xen_page_pinned(ptr)) {
350 *ptr = val;
351 return;
352 }
353
354 ADD_STATS(pmd_update_pinned, 1);
355
356 xen_set_pmd_hyper(ptr, val);
357}
358
119/* 359/*
120 * Associate a virtual page frame with a given physical page frame 360 * Associate a virtual page frame with a given physical page frame
121 * and protection flags for that frame. 361 * and protection flags for that frame.
122 */ 362 */
123void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 363void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
124{ 364{
125 pgd_t *pgd; 365 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
126 pud_t *pud;
127 pmd_t *pmd;
128 pte_t *pte;
129
130 pgd = swapper_pg_dir + pgd_index(vaddr);
131 if (pgd_none(*pgd)) {
132 BUG();
133 return;
134 }
135 pud = pud_offset(pgd, vaddr);
136 if (pud_none(*pud)) {
137 BUG();
138 return;
139 }
140 pmd = pmd_offset(pud, vaddr);
141 if (pmd_none(*pmd)) {
142 BUG();
143 return;
144 }
145 pte = pte_offset_kernel(pmd, vaddr);
146 /* <mfn,flags> stored as-is, to permit clearing entries */
147 xen_set_pte(pte, mfn_pte(mfn, flags));
148
149 /*
150 * It's enough to flush this one mapping.
151 * (PGE mappings get flushed as well)
152 */
153 __flush_tlb_one(vaddr);
154} 366}
155 367
156void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 368void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -160,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
160 if (mm == &init_mm) 372 if (mm == &init_mm)
161 preempt_disable(); 373 preempt_disable();
162 374
375 ADD_STATS(set_pte_at, 1);
376// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
377 ADD_STATS(set_pte_at_current, mm == current->mm);
378 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
379
163 if (mm == current->mm || mm == &init_mm) { 380 if (mm == current->mm || mm == &init_mm) {
164 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 381 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
165 struct multicall_space mcs; 382 struct multicall_space mcs;
166 mcs = xen_mc_entry(0); 383 mcs = xen_mc_entry(0);
167 384
168 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); 385 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
386 ADD_STATS(set_pte_at_batched, 1);
169 xen_mc_issue(PARAVIRT_LAZY_MMU); 387 xen_mc_issue(PARAVIRT_LAZY_MMU);
170 goto out; 388 goto out;
171 } else 389 } else
@@ -179,13 +397,36 @@ out:
179 preempt_enable(); 397 preempt_enable();
180} 398}
181 399
400pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
401{
402 /* Just return the pte as-is. We preserve the bits on commit */
403 return *ptep;
404}
405
406void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
407 pte_t *ptep, pte_t pte)
408{
409 struct mmu_update u;
410
411 xen_mc_batch();
412
413 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
414 u.val = pte_val_ma(pte);
415 xen_extend_mmu_update(&u);
416
417 ADD_STATS(prot_commit, 1);
418 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
419
420 xen_mc_issue(PARAVIRT_LAZY_MMU);
421}
422
182/* Assume pteval_t is equivalent to all the other *val_t types. */ 423/* Assume pteval_t is equivalent to all the other *val_t types. */
183static pteval_t pte_mfn_to_pfn(pteval_t val) 424static pteval_t pte_mfn_to_pfn(pteval_t val)
184{ 425{
185 if (val & _PAGE_PRESENT) { 426 if (val & _PAGE_PRESENT) {
186 unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; 427 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
187 pteval_t flags = val & ~PTE_MASK; 428 pteval_t flags = val & PTE_FLAGS_MASK;
188 val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; 429 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
189 } 430 }
190 431
191 return val; 432 return val;
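xen_ptep_modify_prot_start()/commit() above implement a read-modify-write protocol for a live pte: the start call returns the current value unchanged, and the commit sends a single batched mmu_update tagged MMU_PT_UPDATE_PRESERVE_AD so the hypervisor keeps any Accessed/Dirty bits set in between. A rough sketch of the calling side, assuming the generic ptep_modify_prot_start()/ptep_modify_prot_commit() helpers and pte_modify(); change_prot_example() and newprot are hypothetical names used only for illustration:

/* Illustrative caller, not part of this patch. */
static void change_prot_example(struct mm_struct *mm, unsigned long addr,
				pte_t *ptep, pgprot_t newprot)
{
	pte_t ptent;

	ptent = ptep_modify_prot_start(mm, addr, ptep);	/* read current pte */
	ptent = pte_modify(ptent, newprot);		/* caller's change */
	ptep_modify_prot_commit(mm, addr, ptep, ptent);	/* A/D bits preserved */
}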
@@ -194,9 +435,9 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
194static pteval_t pte_pfn_to_mfn(pteval_t val) 435static pteval_t pte_pfn_to_mfn(pteval_t val)
195{ 436{
196 if (val & _PAGE_PRESENT) { 437 if (val & _PAGE_PRESENT) {
197 unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; 438 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
198 pteval_t flags = val & ~PTE_MASK; 439 pteval_t flags = val & PTE_FLAGS_MASK;
199 val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; 440 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
200 } 441 }
201 442
202 return val; 443 return val;
@@ -229,34 +470,61 @@ pmdval_t xen_pmd_val(pmd_t pmd)
229 return pte_mfn_to_pfn(pmd.pmd); 470 return pte_mfn_to_pfn(pmd.pmd);
230} 471}
231 472
232void xen_set_pud(pud_t *ptr, pud_t val) 473void xen_set_pud_hyper(pud_t *ptr, pud_t val)
233{ 474{
234 struct multicall_space mcs; 475 struct mmu_update u;
235 struct mmu_update *u;
236 476
237 preempt_disable(); 477 preempt_disable();
238 478
239 mcs = xen_mc_entry(sizeof(*u)); 479 xen_mc_batch();
240 u = mcs.args; 480
241 u->ptr = virt_to_machine(ptr).maddr; 481 /* ptr may be ioremapped for 64-bit pagetable setup */
242 u->val = pud_val_ma(val); 482 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
243 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); 483 u.val = pud_val_ma(val);
484 xen_extend_mmu_update(&u);
485
486 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
244 487
245 xen_mc_issue(PARAVIRT_LAZY_MMU); 488 xen_mc_issue(PARAVIRT_LAZY_MMU);
246 489
247 preempt_enable(); 490 preempt_enable();
248} 491}
249 492
493void xen_set_pud(pud_t *ptr, pud_t val)
494{
495 ADD_STATS(pud_update, 1);
496
497 /* If page is not pinned, we can just update the entry
498 directly */
499 if (!xen_page_pinned(ptr)) {
500 *ptr = val;
501 return;
502 }
503
504 ADD_STATS(pud_update_pinned, 1);
505
506 xen_set_pud_hyper(ptr, val);
507}
508
250void xen_set_pte(pte_t *ptep, pte_t pte) 509void xen_set_pte(pte_t *ptep, pte_t pte)
251{ 510{
511 ADD_STATS(pte_update, 1);
512// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
513 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
514
515#ifdef CONFIG_X86_PAE
252 ptep->pte_high = pte.pte_high; 516 ptep->pte_high = pte.pte_high;
253 smp_wmb(); 517 smp_wmb();
254 ptep->pte_low = pte.pte_low; 518 ptep->pte_low = pte.pte_low;
519#else
520 *ptep = pte;
521#endif
255} 522}
256 523
524#ifdef CONFIG_X86_PAE
257void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 525void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
258{ 526{
259 set_64bit((u64 *)ptep, pte_val_ma(pte)); 527 set_64bit((u64 *)ptep, native_pte_val(pte));
260} 528}
261 529
262void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 530void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -268,8 +536,9 @@ void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
268 536
269void xen_pmd_clear(pmd_t *pmdp) 537void xen_pmd_clear(pmd_t *pmdp)
270{ 538{
271 xen_set_pmd(pmdp, __pmd(0)); 539 set_pmd(pmdp, __pmd(0));
272} 540}
541#endif /* CONFIG_X86_PAE */
273 542
274pmd_t xen_make_pmd(pmdval_t pmd) 543pmd_t xen_make_pmd(pmdval_t pmd)
275{ 544{
@@ -277,95 +546,218 @@ pmd_t xen_make_pmd(pmdval_t pmd)
277 return native_make_pmd(pmd); 546 return native_make_pmd(pmd);
278} 547}
279 548
549#if PAGETABLE_LEVELS == 4
550pudval_t xen_pud_val(pud_t pud)
551{
552 return pte_mfn_to_pfn(pud.pud);
553}
554
555pud_t xen_make_pud(pudval_t pud)
556{
557 pud = pte_pfn_to_mfn(pud);
558
559 return native_make_pud(pud);
560}
561
562pgd_t *xen_get_user_pgd(pgd_t *pgd)
563{
564 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
565 unsigned offset = pgd - pgd_page;
566 pgd_t *user_ptr = NULL;
567
568 if (offset < pgd_index(USER_LIMIT)) {
569 struct page *page = virt_to_page(pgd_page);
570 user_ptr = (pgd_t *)page->private;
571 if (user_ptr)
572 user_ptr += offset;
573 }
574
575 return user_ptr;
576}
577
578static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
579{
580 struct mmu_update u;
581
582 u.ptr = virt_to_machine(ptr).maddr;
583 u.val = pgd_val_ma(val);
584 xen_extend_mmu_update(&u);
585}
586
587/*
588 * Raw hypercall-based set_pgd, intended for use in early boot before
589 * there's a page structure. This implies:
590 * 1. The only existing pagetable is the kernel's
591 * 2. It is always pinned
592 * 3. It has no user pagetable attached to it
593 */
594void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
595{
596 preempt_disable();
597
598 xen_mc_batch();
599
600 __xen_set_pgd_hyper(ptr, val);
601
602 xen_mc_issue(PARAVIRT_LAZY_MMU);
603
604 preempt_enable();
605}
606
607void xen_set_pgd(pgd_t *ptr, pgd_t val)
608{
609 pgd_t *user_ptr = xen_get_user_pgd(ptr);
610
611 ADD_STATS(pgd_update, 1);
612
613 /* If page is not pinned, we can just update the entry
614 directly */
615 if (!xen_page_pinned(ptr)) {
616 *ptr = val;
617 if (user_ptr) {
618 WARN_ON(xen_page_pinned(user_ptr));
619 *user_ptr = val;
620 }
621 return;
622 }
623
624 ADD_STATS(pgd_update_pinned, 1);
625 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
626
627 /* If it's pinned, then we can at least batch the kernel and
628 user updates together. */
629 xen_mc_batch();
630
631 __xen_set_pgd_hyper(ptr, val);
632 if (user_ptr)
633 __xen_set_pgd_hyper(user_ptr, val);
634
635 xen_mc_issue(PARAVIRT_LAZY_MMU);
636}
637#endif /* PAGETABLE_LEVELS == 4 */
638
280/* 639/*
281 (Yet another) pagetable walker. This one is intended for pinning a 640 * (Yet another) pagetable walker. This one is intended for pinning a
282 pagetable. This means that it walks a pagetable and calls the 641 * pagetable. This means that it walks a pagetable and calls the
283 callback function on each page it finds making up the page table, 642 * callback function on each page it finds making up the page table,
284 at every level. It walks the entire pagetable, but it only bothers 643 * at every level. It walks the entire pagetable, but it only bothers
285 pinning pte pages which are below pte_limit. In the normal case 644 * pinning pte pages which are below limit. In the normal case this
286 this will be TASK_SIZE, but at boot we need to pin up to 645 * will be STACK_TOP_MAX, but at boot we need to pin up to
287 FIXADDR_TOP. But the important bit is that we don't pin beyond 646 * FIXADDR_TOP.
288 there, because then we start getting into Xen's ptes. 647 *
289*/ 648 * For 32-bit the important bit is that we don't pin beyond there,
290static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), 649 * because then we start getting into Xen's ptes.
291 unsigned long limit) 650 *
292{ 651 * For 64-bit, we must skip the Xen hole in the middle of the address
293 pgd_t *pgd = pgd_base; 652 * space, just after the big x86-64 virtual hole.
653 */
654static int xen_pgd_walk(struct mm_struct *mm,
655 int (*func)(struct mm_struct *mm, struct page *,
656 enum pt_level),
657 unsigned long limit)
658{
659 pgd_t *pgd = mm->pgd;
294 int flush = 0; 660 int flush = 0;
295 unsigned long addr = 0; 661 unsigned hole_low, hole_high;
296 unsigned long pgd_next; 662 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
663 unsigned pgdidx, pudidx, pmdidx;
297 664
298 BUG_ON(limit > FIXADDR_TOP); 665 /* The limit is the last byte to be touched */
666 limit--;
667 BUG_ON(limit >= FIXADDR_TOP);
299 668
300 if (xen_feature(XENFEAT_auto_translated_physmap)) 669 if (xen_feature(XENFEAT_auto_translated_physmap))
301 return 0; 670 return 0;
302 671
303 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { 672 /*
673 * 64-bit has a great big hole in the middle of the address
674 * space, which contains the Xen mappings. On 32-bit these
675 * will end up making a zero-sized hole and so is a no-op.
676 */
677 hole_low = pgd_index(USER_LIMIT);
678 hole_high = pgd_index(PAGE_OFFSET);
679
680 pgdidx_limit = pgd_index(limit);
681#if PTRS_PER_PUD > 1
682 pudidx_limit = pud_index(limit);
683#else
684 pudidx_limit = 0;
685#endif
686#if PTRS_PER_PMD > 1
687 pmdidx_limit = pmd_index(limit);
688#else
689 pmdidx_limit = 0;
690#endif
691
692 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
304 pud_t *pud; 693 pud_t *pud;
305 unsigned long pud_limit, pud_next;
306 694
307 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); 695 if (pgdidx >= hole_low && pgdidx < hole_high)
696 continue;
308 697
309 if (!pgd_val(*pgd)) 698 if (!pgd_val(pgd[pgdidx]))
310 continue; 699 continue;
311 700
312 pud = pud_offset(pgd, 0); 701 pud = pud_offset(&pgd[pgdidx], 0);
313 702
314 if (PTRS_PER_PUD > 1) /* not folded */ 703 if (PTRS_PER_PUD > 1) /* not folded */
315 flush |= (*func)(virt_to_page(pud), PT_PUD); 704 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
316 705
317 for (; addr != pud_limit; pud++, addr = pud_next) { 706 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
318 pmd_t *pmd; 707 pmd_t *pmd;
319 unsigned long pmd_limit;
320
321 pud_next = pud_addr_end(addr, pud_limit);
322 708
323 if (pud_next < limit) 709 if (pgdidx == pgdidx_limit &&
324 pmd_limit = pud_next; 710 pudidx > pudidx_limit)
325 else 711 goto out;
326 pmd_limit = limit;
327 712
328 if (pud_none(*pud)) 713 if (pud_none(pud[pudidx]))
329 continue; 714 continue;
330 715
331 pmd = pmd_offset(pud, 0); 716 pmd = pmd_offset(&pud[pudidx], 0);
332 717
333 if (PTRS_PER_PMD > 1) /* not folded */ 718 if (PTRS_PER_PMD > 1) /* not folded */
334 flush |= (*func)(virt_to_page(pmd), PT_PMD); 719 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
720
721 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
722 struct page *pte;
335 723
336 for (; addr != pmd_limit; pmd++) { 724 if (pgdidx == pgdidx_limit &&
337 addr += (PAGE_SIZE * PTRS_PER_PTE); 725 pudidx == pudidx_limit &&
338 if ((pmd_limit-1) < (addr-1)) { 726 pmdidx > pmdidx_limit)
339 addr = pmd_limit; 727 goto out;
340 break;
341 }
342 728
343 if (pmd_none(*pmd)) 729 if (pmd_none(pmd[pmdidx]))
344 continue; 730 continue;
345 731
346 flush |= (*func)(pmd_page(*pmd), PT_PTE); 732 pte = pmd_page(pmd[pmdidx]);
733 flush |= (*func)(mm, pte, PT_PTE);
347 } 734 }
348 } 735 }
349 } 736 }
350 737
351 flush |= (*func)(virt_to_page(pgd_base), PT_PGD); 738out:
739 /* Do the top level last, so that the callbacks can use it as
740 a cue to do final things like tlb flushes. */
741 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
352 742
353 return flush; 743 return flush;
354} 744}
355 745
356static spinlock_t *lock_pte(struct page *page) 746/* If we're using split pte locks, then take the page's lock and
747 return a pointer to it. Otherwise return NULL. */
748static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
357{ 749{
358 spinlock_t *ptl = NULL; 750 spinlock_t *ptl = NULL;
359 751
360#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 752#if USE_SPLIT_PTLOCKS
361 ptl = __pte_lockptr(page); 753 ptl = __pte_lockptr(page);
362 spin_lock(ptl); 754 spin_lock_nest_lock(ptl, &mm->page_table_lock);
363#endif 755#endif
364 756
365 return ptl; 757 return ptl;
366} 758}
367 759
368static void do_unlock(void *v) 760static void xen_pte_unlock(void *v)
369{ 761{
370 spinlock_t *ptl = v; 762 spinlock_t *ptl = v;
371 spin_unlock(ptl); 763 spin_unlock(ptl);
@@ -383,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
383 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 775 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
384} 776}
385 777
386static int pin_page(struct page *page, enum pt_level level) 778static int xen_pin_page(struct mm_struct *mm, struct page *page,
779 enum pt_level level)
387{ 780{
388 unsigned pgfl = TestSetPagePinned(page); 781 unsigned pgfl = TestSetPagePinned(page);
389 int flush; 782 int flush;
@@ -402,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level)
402 795
403 flush = 0; 796 flush = 0;
404 797
798 /*
799 * We need to hold the pagetable lock between the time
800 * we make the pagetable RO and when we actually pin
801 * it. If we don't, then other users may come in and
802 * attempt to update the pagetable by writing it,
803 * which will fail because the memory is RO but not
804 * pinned, so Xen won't do the trap'n'emulate.
805 *
806 * If we're using split pte locks, we can't hold the
807 * entire pagetable's worth of locks during the
808 * traverse, because we may wrap the preempt count (8
809 * bits). The solution is to mark RO and pin each PTE
810 * page while holding the lock. This means the number
811 * of locks we end up holding is never more than a
812 * batch size (~32 entries, at present).
813 *
814 * If we're not using split pte locks, we needn't pin
815 * the PTE pages independently, because we're
816 * protected by the overall pagetable lock.
817 */
405 ptl = NULL; 818 ptl = NULL;
406 if (level == PT_PTE) 819 if (level == PT_PTE)
407 ptl = lock_pte(page); 820 ptl = xen_pte_lock(page, mm);
408 821
409 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 822 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
410 pfn_pte(pfn, PAGE_KERNEL_RO), 823 pfn_pte(pfn, PAGE_KERNEL_RO),
411 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 824 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
412 825
413 if (level == PT_PTE) 826 if (ptl) {
414 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); 827 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
415 828
416 if (ptl) {
417 /* Queue a deferred unlock for when this batch 829 /* Queue a deferred unlock for when this batch
418 is completed. */ 830 is completed. */
419 xen_mc_callback(do_unlock, ptl); 831 xen_mc_callback(xen_pte_unlock, ptl);
420 } 832 }
421 } 833 }
422 834
@@ -426,25 +838,78 @@ static int pin_page(struct page *page, enum pt_level level)
426/* This is called just after a mm has been created, but it has not 838/* This is called just after a mm has been created, but it has not
427 been used yet. We need to make sure that its pagetable is all 839 been used yet. We need to make sure that its pagetable is all
428 read-only, and can be pinned. */ 840 read-only, and can be pinned. */
429void xen_pgd_pin(pgd_t *pgd) 841static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
430{ 842{
431 xen_mc_batch(); 843 xen_mc_batch();
432 844
433 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 845 if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
434 /* re-enable interrupts for kmap_flush_unused */ 846 /* re-enable interrupts for kmap_flush_unused */
435 xen_mc_issue(0); 847 xen_mc_issue(0);
436 kmap_flush_unused(); 848 kmap_flush_unused();
437 xen_mc_batch(); 849 xen_mc_batch();
438 } 850 }
439 851
852#ifdef CONFIG_X86_64
853 {
854 pgd_t *user_pgd = xen_get_user_pgd(pgd);
855
856 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
857
858 if (user_pgd) {
859 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
860 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
861 }
862 }
863#else /* CONFIG_X86_32 */
864#ifdef CONFIG_X86_PAE
865 /* Need to make sure unshared kernel PMD is pinnable */
866 xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
867 PT_PMD);
868#endif
440 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 869 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
870#endif /* CONFIG_X86_64 */
441 xen_mc_issue(0); 871 xen_mc_issue(0);
442} 872}
443 873
444/* The init_mm pagetable is really pinned as soon as its created, but 874static void xen_pgd_pin(struct mm_struct *mm)
445 that's before we have page structures to store the bits. So do all 875{
446 the book-keeping now. */ 876 __xen_pgd_pin(mm, mm->pgd);
447static __init int mark_pinned(struct page *page, enum pt_level level) 877}
878
879/*
880 * On save, we need to pin all pagetables to make sure they get their
881 * mfns turned into pfns. Search the list for any unpinned pgds and pin
882 * them (unpinned pgds are not currently in use, probably because the
883 * process is under construction or destruction).
884 *
885 * Expected to be called in stop_machine() ("equivalent to taking
886 * every spinlock in the system"), so the locking doesn't really
887 * matter all that much.
888 */
889void xen_mm_pin_all(void)
890{
891 unsigned long flags;
892 struct page *page;
893
894 spin_lock_irqsave(&pgd_lock, flags);
895
896 list_for_each_entry(page, &pgd_list, lru) {
897 if (!PagePinned(page)) {
898 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
899 SetPageSavePinned(page);
900 }
901 }
902
903 spin_unlock_irqrestore(&pgd_lock, flags);
904}
905
906/*
907 * The init_mm pagetable is really pinned as soon as it's created, but
908 * that's before we have page structures to store the bits. So do all
909 * the book-keeping now.
910 */
911static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
912 enum pt_level level)
448{ 913{
449 SetPagePinned(page); 914 SetPagePinned(page);
450 return 0; 915 return 0;
@@ -452,10 +917,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level)
452 917
453void __init xen_mark_init_mm_pinned(void) 918void __init xen_mark_init_mm_pinned(void)
454{ 919{
455 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); 920 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
456} 921}
457 922
458static int unpin_page(struct page *page, enum pt_level level) 923static int xen_unpin_page(struct mm_struct *mm, struct page *page,
924 enum pt_level level)
459{ 925{
460 unsigned pgfl = TestClearPagePinned(page); 926 unsigned pgfl = TestClearPagePinned(page);
461 927
@@ -465,10 +931,18 @@ static int unpin_page(struct page *page, enum pt_level level)
465 spinlock_t *ptl = NULL; 931 spinlock_t *ptl = NULL;
466 struct multicall_space mcs; 932 struct multicall_space mcs;
467 933
934 /*
935 * Do the converse to pin_page. If we're using split
936 * pte locks, we must be holding the lock while
937 * the pte page is unpinned but still RO to prevent
938 * concurrent updates from seeing it in this
939 * partially-pinned state.
940 */
468 if (level == PT_PTE) { 941 if (level == PT_PTE) {
469 ptl = lock_pte(page); 942 ptl = xen_pte_lock(page, mm);
470 943
471 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 944 if (ptl)
945 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
472 } 946 }
473 947
474 mcs = __xen_mc_entry(0); 948 mcs = __xen_mc_entry(0);
@@ -479,7 +953,7 @@ static int unpin_page(struct page *page, enum pt_level level)
479 953
480 if (ptl) { 954 if (ptl) {
481 /* unlock when batch completed */ 955 /* unlock when batch completed */
482 xen_mc_callback(do_unlock, ptl); 956 xen_mc_callback(xen_pte_unlock, ptl);
483 } 957 }
484 } 958 }
485 959
@@ -487,28 +961,72 @@ static int unpin_page(struct page *page, enum pt_level level)
487} 961}
488 962
489/* Release a pagetable's pages back as normal RW */ 963static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
490static void xen_pgd_unpin(pgd_t *pgd) 964static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
491{ 965{
492 xen_mc_batch(); 966 xen_mc_batch();
493 967
494 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 968 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
495 969
496 pgd_walk(pgd, unpin_page, TASK_SIZE); 970#ifdef CONFIG_X86_64
971 {
972 pgd_t *user_pgd = xen_get_user_pgd(pgd);
973
974 if (user_pgd) {
975 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
976 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
977 }
978 }
979#endif
980
981#ifdef CONFIG_X86_PAE
982 /* Need to make sure unshared kernel PMD is unpinned */
983 xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
984 PT_PMD);
985#endif
986
987 xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
497 988
498 xen_mc_issue(0); 989 xen_mc_issue(0);
499} 990}
500 991
992static void xen_pgd_unpin(struct mm_struct *mm)
993{
994 __xen_pgd_unpin(mm, mm->pgd);
995}
996
997/*
998 * On resume, undo any pinning done at save, so that the rest of the
999 * kernel doesn't see any unexpected pinned pagetables.
1000 */
1001void xen_mm_unpin_all(void)
1002{
1003 unsigned long flags;
1004 struct page *page;
1005
1006 spin_lock_irqsave(&pgd_lock, flags);
1007
1008 list_for_each_entry(page, &pgd_list, lru) {
1009 if (PageSavePinned(page)) {
1010 BUG_ON(!PagePinned(page));
1011 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1012 ClearPageSavePinned(page);
1013 }
1014 }
1015
1016 spin_unlock_irqrestore(&pgd_lock, flags);
1017}
1018
501void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1019void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
502{ 1020{
503 spin_lock(&next->page_table_lock); 1021 spin_lock(&next->page_table_lock);
504 xen_pgd_pin(next->pgd); 1022 xen_pgd_pin(next);
505 spin_unlock(&next->page_table_lock); 1023 spin_unlock(&next->page_table_lock);
506} 1024}
507 1025
508void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 1026void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
509{ 1027{
510 spin_lock(&mm->page_table_lock); 1028 spin_lock(&mm->page_table_lock);
511 xen_pgd_pin(mm->pgd); 1029 xen_pgd_pin(mm);
512 spin_unlock(&mm->page_table_lock); 1030 spin_unlock(&mm->page_table_lock);
513} 1031}
514 1032
@@ -519,8 +1037,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
519static void drop_other_mm_ref(void *info) 1037static void drop_other_mm_ref(void *info)
520{ 1038{
521 struct mm_struct *mm = info; 1039 struct mm_struct *mm = info;
1040 struct mm_struct *active_mm;
1041
1042#ifdef CONFIG_X86_64
1043 active_mm = read_pda(active_mm);
1044#else
1045 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
1046#endif
522 1047
523 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 1048 if (active_mm == mm)
524 leave_mm(smp_processor_id()); 1049 leave_mm(smp_processor_id());
525 1050
526 /* If this cpu still has a stale cr3 reference, then make sure 1051 /* If this cpu still has a stale cr3 reference, then make sure
@@ -531,7 +1056,7 @@ static void drop_other_mm_ref(void *info)
531 } 1056 }
532} 1057}
533 1058
534static void drop_mm_ref(struct mm_struct *mm) 1059static void xen_drop_mm_ref(struct mm_struct *mm)
535{ 1060{
536 cpumask_t mask; 1061 cpumask_t mask;
537 unsigned cpu; 1062 unsigned cpu;
@@ -558,10 +1083,10 @@ static void drop_mm_ref(struct mm_struct *mm)
558 } 1083 }
559 1084
560 if (!cpus_empty(mask)) 1085 if (!cpus_empty(mask))
561 xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 1086 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
562} 1087}
563#else 1088#else
564static void drop_mm_ref(struct mm_struct *mm) 1089static void xen_drop_mm_ref(struct mm_struct *mm)
565{ 1090{
566 if (current->active_mm == mm) 1091 if (current->active_mm == mm)
567 load_cr3(swapper_pg_dir); 1092 load_cr3(swapper_pg_dir);
@@ -585,14 +1110,77 @@ static void drop_mm_ref(struct mm_struct *mm)
585void xen_exit_mmap(struct mm_struct *mm) 1110void xen_exit_mmap(struct mm_struct *mm)
586{ 1111{
587 get_cpu(); /* make sure we don't move around */ 1112 get_cpu(); /* make sure we don't move around */
588 drop_mm_ref(mm); 1113 xen_drop_mm_ref(mm);
589 put_cpu(); 1114 put_cpu();
590 1115
591 spin_lock(&mm->page_table_lock); 1116 spin_lock(&mm->page_table_lock);
592 1117
593 /* pgd may not be pinned in the error exit path of execve */ 1118 /* pgd may not be pinned in the error exit path of execve */
594 if (PagePinned(virt_to_page(mm->pgd))) 1119 if (xen_page_pinned(mm->pgd))
595 xen_pgd_unpin(mm->pgd); 1120 xen_pgd_unpin(mm);
596 1121
597 spin_unlock(&mm->page_table_lock); 1122 spin_unlock(&mm->page_table_lock);
598} 1123}
1124
1125#ifdef CONFIG_XEN_DEBUG_FS
1126
1127static struct dentry *d_mmu_debug;
1128
1129static int __init xen_mmu_debugfs(void)
1130{
1131 struct dentry *d_xen = xen_init_debugfs();
1132
1133 if (d_xen == NULL)
1134 return -ENOMEM;
1135
1136 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1137
1138 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1139
1140 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1141 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1142 &mmu_stats.pgd_update_pinned);
1143 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
1144 &mmu_stats.pgd_update_batched);
1145
1146 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1147 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1148 &mmu_stats.pud_update_pinned);
1149 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
1150 &mmu_stats.pud_update_batched);
1151
1152 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1153 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1154 &mmu_stats.pmd_update_pinned);
1155 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
1156 &mmu_stats.pmd_update_batched);
1157
1158 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1159// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
1160// &mmu_stats.pte_update_pinned);
1161 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
1162 &mmu_stats.pte_update_batched);
1163
1164 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
1165 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
1166 &mmu_stats.mmu_update_extended);
1167 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
1168 mmu_stats.mmu_update_histo, 20);
1169
1170 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
1171 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
1172 &mmu_stats.set_pte_at_batched);
1173 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
1174 &mmu_stats.set_pte_at_current);
1175 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
1176 &mmu_stats.set_pte_at_kernel);
1177
1178 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
1179 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
1180 &mmu_stats.prot_commit_batched);
1181
1182 return 0;
1183}
1184fs_initcall(xen_mmu_debugfs);
1185
1186#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 5fe961caffd4..98d71659da5a 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,33 +10,14 @@ enum pt_level {
10 PT_PTE 10 PT_PTE
11}; 11};
12 12
13/*
14 * Page-directory addresses above 4GB do not fit into architectural %cr3.
15 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
16 * must use the following accessor macros to pack/unpack valid MFNs.
17 *
18 * Note that Xen is using the fact that the pagetable base is always
19 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
20 * of cr3.
21 */
22#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
23#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
24
25 13
26void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 14void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
27 15
28void xen_set_pte(pte_t *ptep, pte_t pteval);
29void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
30 pte_t *ptep, pte_t pteval);
31void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
32 16
33void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); 17void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
34void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); 18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
35void xen_exit_mmap(struct mm_struct *mm); 19void xen_exit_mmap(struct mm_struct *mm);
36 20
37void xen_pgd_pin(pgd_t *pgd);
38//void xen_pgd_unpin(pgd_t *pgd);
39
40pteval_t xen_pte_val(pte_t); 21pteval_t xen_pte_val(pte_t);
41pmdval_t xen_pmd_val(pmd_t); 22pmdval_t xen_pmd_val(pmd_t);
42pgdval_t xen_pgd_val(pgd_t); 23pgdval_t xen_pgd_val(pgd_t);
@@ -45,11 +26,32 @@ pte_t xen_make_pte(pteval_t);
45pmd_t xen_make_pmd(pmdval_t); 26pmd_t xen_make_pmd(pmdval_t);
46pgd_t xen_make_pgd(pgdval_t); 27pgd_t xen_make_pgd(pgdval_t);
47 28
29void xen_set_pte(pte_t *ptep, pte_t pteval);
48void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 30void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
49 pte_t *ptep, pte_t pteval); 31 pte_t *ptep, pte_t pteval);
32
33#ifdef CONFIG_X86_PAE
50void xen_set_pte_atomic(pte_t *ptep, pte_t pte); 34void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
51void xen_set_pud(pud_t *ptr, pud_t val);
52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 35void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
53void xen_pmd_clear(pmd_t *pmdp); 36void xen_pmd_clear(pmd_t *pmdp);
37#endif /* CONFIG_X86_PAE */
38
39void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
40void xen_set_pud(pud_t *ptr, pud_t val);
41void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
42void xen_set_pud_hyper(pud_t *ptr, pud_t val);
43
44#if PAGETABLE_LEVELS == 4
45pudval_t xen_pud_val(pud_t pud);
46pud_t xen_make_pud(pudval_t pudval);
47void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
48void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
49#endif
50
51pgd_t *xen_get_user_pgd(pgd_t *pgd);
52
53pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
55 pte_t *ptep, pte_t pte);
54 56
55#endif /* _XEN_MMU_H */ 57#endif /* _XEN_MMU_H */
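The xen_pfn_to_cr3()/xen_cr3_to_pfn() macros dropped from this header encode a pagetable base that may lie above 4GB into a 32-bit %cr3 image by rotating the frame number: the 12 always-zero low bits of the page-aligned base make room for the top 12 bits of the mfn. A user-space round trip of the removed macros (illustration only):

#include <stdio.h>

#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))

int main(void)
{
	unsigned pfn = 0x123456;		/* frame sitting above the 4GB boundary */
	unsigned cr3 = xen_pfn_to_cr3(pfn);

	printf("pfn %#x -> cr3 %#x -> pfn %#x\n",
	       pfn, cr3, xen_cr3_to_pfn(cr3));	/* round-trips to the same pfn */
	return 0;
}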
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 5791eb2e3750..8ea8a0d0b0de 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -21,22 +21,26 @@
21 */ 21 */
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/hardirq.h> 23#include <linux/hardirq.h>
24#include <linux/debugfs.h>
24 25
25#include <asm/xen/hypercall.h> 26#include <asm/xen/hypercall.h>
26 27
27#include "multicalls.h" 28#include "multicalls.h"
29#include "debugfs.h"
30
31#define MC_BATCH 32
28 32
29#define MC_DEBUG 1 33#define MC_DEBUG 1
30 34
31#define MC_BATCH 32 35#define MC_ARGS (MC_BATCH * 16)
32#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) 36
33 37
34struct mc_buffer { 38struct mc_buffer {
35 struct multicall_entry entries[MC_BATCH]; 39 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG 40#if MC_DEBUG
37 struct multicall_entry debug[MC_BATCH]; 41 struct multicall_entry debug[MC_BATCH];
38#endif 42#endif
39 u64 args[MC_ARGS]; 43 unsigned char args[MC_ARGS];
40 struct callback { 44 struct callback {
41 void (*fn)(void *); 45 void (*fn)(void *);
42 void *data; 46 void *data;
@@ -47,6 +51,76 @@ struct mc_buffer {
47static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); 51static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
48DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); 52DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
49 53
54/* flush reasons 0- slots, 1- args, 2- callbacks */
55enum flush_reasons
56{
57 FL_SLOTS,
58 FL_ARGS,
59 FL_CALLBACKS,
60
61 FL_N_REASONS
62};
63
64#ifdef CONFIG_XEN_DEBUG_FS
65#define NHYPERCALLS 40 /* not really */
66
67static struct {
68 unsigned histo[MC_BATCH+1];
69
70 unsigned issued;
71 unsigned arg_total;
72 unsigned hypercalls;
73 unsigned histo_hypercalls[NHYPERCALLS];
74
75 unsigned flush[FL_N_REASONS];
76} mc_stats;
77
78static u8 zero_stats;
79
80static inline void check_zero(void)
81{
82 if (unlikely(zero_stats)) {
83 memset(&mc_stats, 0, sizeof(mc_stats));
84 zero_stats = 0;
85 }
86}
87
88static void mc_add_stats(const struct mc_buffer *mc)
89{
90 int i;
91
92 check_zero();
93
94 mc_stats.issued++;
95 mc_stats.hypercalls += mc->mcidx;
96 mc_stats.arg_total += mc->argidx;
97
98 mc_stats.histo[mc->mcidx]++;
99 for(i = 0; i < mc->mcidx; i++) {
100 unsigned op = mc->entries[i].op;
101 if (op < NHYPERCALLS)
102 mc_stats.histo_hypercalls[op]++;
103 }
104}
105
106static void mc_stats_flush(enum flush_reasons idx)
107{
108 check_zero();
109
110 mc_stats.flush[idx]++;
111}
112
113#else /* !CONFIG_XEN_DEBUG_FS */
114
115static inline void mc_add_stats(const struct mc_buffer *mc)
116{
117}
118
119static inline void mc_stats_flush(enum flush_reasons idx)
120{
121}
122#endif /* CONFIG_XEN_DEBUG_FS */
123
50void xen_mc_flush(void) 124void xen_mc_flush(void)
51{ 125{
52 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 126 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
@@ -60,6 +134,8 @@ void xen_mc_flush(void)
60 something in the middle */ 134 something in the middle */
61 local_irq_save(flags); 135 local_irq_save(flags);
62 136
137 mc_add_stats(b);
138
63 if (b->mcidx) { 139 if (b->mcidx) {
64#if MC_DEBUG 140#if MC_DEBUG
65 memcpy(b->debug, b->entries, 141 memcpy(b->debug, b->entries,
@@ -76,6 +152,7 @@ void xen_mc_flush(void)
76 if (ret) { 152 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", 153 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id()); 154 ret, smp_processor_id());
155 dump_stack();
79 for (i = 0; i < b->mcidx; i++) { 156 for (i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 157 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx, 158 i+1, b->mcidx,
@@ -107,20 +184,49 @@ struct multicall_space __xen_mc_entry(size_t args)
107{ 184{
108 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 185 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
109 struct multicall_space ret; 186 struct multicall_space ret;
110 unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); 187 unsigned argidx = roundup(b->argidx, sizeof(u64));
111 188
112 BUG_ON(preemptible()); 189 BUG_ON(preemptible());
113 BUG_ON(argspace > MC_ARGS); 190 BUG_ON(b->argidx > MC_ARGS);
114 191
115 if (b->mcidx == MC_BATCH || 192 if (b->mcidx == MC_BATCH ||
116 (b->argidx + argspace) > MC_ARGS) 193 (argidx + args) > MC_ARGS) {
194 mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
117 xen_mc_flush(); 195 xen_mc_flush();
196 argidx = roundup(b->argidx, sizeof(u64));
197 }
118 198
119 ret.mc = &b->entries[b->mcidx]; 199 ret.mc = &b->entries[b->mcidx];
120 b->mcidx++; 200 b->mcidx++;
201 ret.args = &b->args[argidx];
202 b->argidx = argidx + args;
203
204 BUG_ON(b->argidx > MC_ARGS);
205 return ret;
206}
207
208struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
209{
210 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
211 struct multicall_space ret = { NULL, NULL };
212
213 BUG_ON(preemptible());
214 BUG_ON(b->argidx > MC_ARGS);
215
216 if (b->mcidx == 0)
217 return ret;
218
219 if (b->entries[b->mcidx - 1].op != op)
220 return ret;
221
222 if ((b->argidx + size) > MC_ARGS)
223 return ret;
224
225 ret.mc = &b->entries[b->mcidx - 1];
121 ret.args = &b->args[b->argidx]; 226 ret.args = &b->args[b->argidx];
122 b->argidx += argspace; 227 b->argidx += size;
123 228
229 BUG_ON(b->argidx > MC_ARGS);
124 return ret; 230 return ret;
125} 231}
126 232
@@ -129,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data)
129 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 235 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
130 struct callback *cb; 236 struct callback *cb;
131 237
132 if (b->cbidx == MC_BATCH) 238 if (b->cbidx == MC_BATCH) {
239 mc_stats_flush(FL_CALLBACKS);
133 xen_mc_flush(); 240 xen_mc_flush();
241 }
134 242
135 cb = &b->callbacks[b->cbidx++]; 243 cb = &b->callbacks[b->cbidx++];
136 cb->fn = fn; 244 cb->fn = fn;
137 cb->data = data; 245 cb->data = data;
138} 246}
247
248#ifdef CONFIG_XEN_DEBUG_FS
249
250static struct dentry *d_mc_debug;
251
252static int __init xen_mc_debugfs(void)
253{
254 struct dentry *d_xen = xen_init_debugfs();
255
256 if (d_xen == NULL)
257 return -ENOMEM;
258
259 d_mc_debug = debugfs_create_dir("multicalls", d_xen);
260
261 debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
262
263 debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
264 debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
265 debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
266
267 xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
268 mc_stats.histo, MC_BATCH);
269 xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
270 mc_stats.histo_hypercalls, NHYPERCALLS);
271 xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
272 mc_stats.flush, FL_N_REASONS);
273
274 return 0;
275}
276fs_initcall(xen_mc_debugfs);
277
278#endif /* CONFIG_XEN_DEBUG_FS */
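
[Editor's note: for context, a minimal sketch of the caller-side batching pattern these statistics measure, modelled on the pagetable code in mmu.c and assuming its includes. queue_mmu_update() is a hypothetical name; xen_mc_entry(), MULTI_mmu_update() and xen_mc_issue() are the existing helpers the counters above hook into.]

/* Hypothetical caller: queue one mmu_update in the per-cpu multicall
 * batch.  xen_mc_entry() may call xen_mc_flush() first (counted above as
 * FL_SLOTS or FL_ARGS); xen_mc_issue() flushes immediately unless we are
 * inside a lazy-MMU section. */
static void queue_mmu_update(pmd_t *ptr, pmd_t val)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*u));
	u = mcs.args;
	u->ptr = virt_to_machine(ptr).maddr;
	u->val = pmd_val_ma(val);
	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}
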
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 8bae996d99a3..858938241616 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -45,4 +45,16 @@ static inline void xen_mc_issue(unsigned mode)
45/* Set up a callback to be called when the current batch is flushed */ 45/* Set up a callback to be called when the current batch is flushed */
46void xen_mc_callback(void (*fn)(void *), void *data); 46void xen_mc_callback(void (*fn)(void *), void *data);
47 47
48/*
49 * Try to extend the arguments of the previous multicall command. The
50 * previous command's op must match. If it does, then it attempts to
51 * extend the argument space allocated to the multicall entry by
52 * arg_size bytes.
53 *
 54 * On success, the returned multicall_space has mc pointing to the
 55 * command and args pointing to the newly allocated space; on
 56 * failure, mc is NULL.
57 */
58struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
59
48#endif /* _XEN_MULTICALLS_H */ 60#endif /* _XEN_MULTICALLS_H */
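
[Editor's note: to make the comment above concrete, here is a sketch of the intended call pattern for xen_mc_extend_args(): append another mmu_update request to the previous __HYPERVISOR_mmu_update entry when possible, otherwise open a fresh entry. next_mmu_update_slot() is a made-up name for illustration; the other helpers are the ones declared in this header.]

/* Illustrative only: grow the previous mmu_update multicall if its op
 * matches and argument space remains, else start a new one. */
static struct mmu_update *next_mmu_update_slot(void)
{
	struct multicall_space mcs;

	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update,
				 sizeof(struct mmu_update));
	if (mcs.mc != NULL) {
		/* extended: bump the request count of the existing call */
		mcs.mc->args[1]++;
	} else {
		mcs = __xen_mc_entry(sizeof(struct mmu_update));
		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
	}

	return mcs.args;
}
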
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 82517e4a752a..d67901083888 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -13,9 +13,11 @@
13#include <asm/vdso.h> 13#include <asm/vdso.h>
14#include <asm/e820.h> 14#include <asm/e820.h>
15#include <asm/setup.h> 15#include <asm/setup.h>
16#include <asm/acpi.h>
16#include <asm/xen/hypervisor.h> 17#include <asm/xen/hypervisor.h>
17#include <asm/xen/hypercall.h> 18#include <asm/xen/hypercall.h>
18 19
20#include <xen/page.h>
19#include <xen/interface/callback.h> 21#include <xen/interface/callback.h>
20#include <xen/interface/physdev.h> 22#include <xen/interface/physdev.h>
21#include <xen/features.h> 23#include <xen/features.h>
@@ -27,8 +29,6 @@
27extern const char xen_hypervisor_callback[]; 29extern const char xen_hypervisor_callback[];
28extern const char xen_failsafe_callback[]; 30extern const char xen_failsafe_callback[];
29 31
30unsigned long *phys_to_machine_mapping;
31EXPORT_SYMBOL(phys_to_machine_mapping);
32 32
33/** 33/**
34 * machine_specific_memory_setup - Hook for machine specific memory setup. 34 * machine_specific_memory_setup - Hook for machine specific memory setup.
@@ -38,9 +38,31 @@ char * __init xen_memory_setup(void)
38{ 38{
39 unsigned long max_pfn = xen_start_info->nr_pages; 39 unsigned long max_pfn = xen_start_info->nr_pages;
40 40
41 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
42
41 e820.nr_map = 0; 43 e820.nr_map = 0;
42 add_memory_region(0, LOWMEMSIZE(), E820_RAM); 44
43 add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); 45 e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
46
47 /*
48 * Even though this is normal, usable memory under Xen, reserve
49 * ISA memory anyway because too many things think they can poke
50 * about in there.
51 */
52 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
53 E820_RESERVED);
54
55 /*
56 * Reserve Xen bits:
57 * - mfn_list
58 * - xen_start_info
59 * See comment above "struct start_info" in <xen/interface/xen.h>
60 */
61 e820_add_region(__pa(xen_start_info->mfn_list),
62 xen_start_info->pt_base - xen_start_info->mfn_list,
63 E820_RESERVED);
64
65 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
44 66
45 return "Xen"; 67 return "Xen";
46} 68}
@@ -61,30 +83,72 @@ static void xen_idle(void)
61 83
62/* 84/*
63 * Set the bit indicating "nosegneg" library variants should be used. 85 * Set the bit indicating "nosegneg" library variants should be used.
86 * We only need to bother in pure 32-bit mode; compat 32-bit processes
87 * can have un-truncated segments, so wrapping around is allowed.
64 */ 88 */
65static void __init fiddle_vdso(void) 89static void __init fiddle_vdso(void)
66{ 90{
67 extern const char vdso32_default_start; 91#ifdef CONFIG_X86_32
68 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); 92 u32 *mask;
93 mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
94 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
95 mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
69 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 96 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
97#endif
70} 98}
71 99
72void xen_enable_sysenter(void) 100static __cpuinit int register_callback(unsigned type, const void *func)
73{ 101{
74 int cpu = smp_processor_id(); 102 struct callback_register callback = {
75 extern void xen_sysenter_target(void); 103 .type = type,
76 /* Mask events on entry, even though they get enabled immediately */ 104 .address = XEN_CALLBACK(__KERNEL_CS, func),
77 static struct callback_register sysenter = {
78 .type = CALLBACKTYPE_sysenter,
79 .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
80 .flags = CALLBACKF_mask_events, 105 .flags = CALLBACKF_mask_events,
81 }; 106 };
82 107
83 if (!boot_cpu_has(X86_FEATURE_SEP) || 108 return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
84 HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { 109}
85 clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); 110
86 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); 111void __cpuinit xen_enable_sysenter(void)
112{
113 extern void xen_sysenter_target(void);
114 int ret;
115 unsigned sysenter_feature;
116
117#ifdef CONFIG_X86_32
118 sysenter_feature = X86_FEATURE_SEP;
119#else
120 sysenter_feature = X86_FEATURE_SYSENTER32;
121#endif
122
123 if (!boot_cpu_has(sysenter_feature))
124 return;
125
126 ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
 127	if (ret != 0)
128 setup_clear_cpu_cap(sysenter_feature);
129}
130
131void __cpuinit xen_enable_syscall(void)
132{
133#ifdef CONFIG_X86_64
134 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) {
140 printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
141 /* Pretty fatal; 64-bit userspace has no other
142 mechanism for syscalls. */
143 }
144
145 if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
146 ret = register_callback(CALLBACKTYPE_syscall32,
147 xen_syscall32_target);
148 if (ret != 0)
149 setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
87 } 150 }
151#endif /* CONFIG_X86_64 */
88} 152}
89 153
90void __init xen_arch_setup(void) 154void __init xen_arch_setup(void)
@@ -98,10 +162,12 @@ void __init xen_arch_setup(void)
98 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
99 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
100 164
101 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, 165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
102 __KERNEL_CS, (unsigned long)xen_failsafe_callback); 166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
167 BUG();
103 168
104 xen_enable_sysenter(); 169 xen_enable_sysenter();
170 xen_enable_syscall();
105 171
106 set_iopl.iopl = 1; 172 set_iopl.iopl = 1;
107 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 173 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -121,11 +187,6 @@ void __init xen_arch_setup(void)
121 187
122 pm_idle = xen_idle; 188 pm_idle = xen_idle;
123 189
124#ifdef CONFIG_SMP
125 /* fill cpus_possible with all available cpus */
126 xen_fill_possible_map();
127#endif
128
129 paravirt_disable_iospace(); 190 paravirt_disable_iospace();
130 191
131 fiddle_vdso(); 192 fiddle_vdso();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 94e69000f982..d77da613b1d2 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -11,8 +11,6 @@
11 * useful topology information for the kernel to make use of. As a 11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and 12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded. 13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */ 14 */
17#include <linux/sched.h> 15#include <linux/sched.h>
18#include <linux/err.h> 16#include <linux/err.h>
@@ -35,28 +33,15 @@
35#include "xen-ops.h" 33#include "xen-ops.h"
36#include "mmu.h" 34#include "mmu.h"
37 35
38static cpumask_t xen_cpu_initialized_map; 36cpumask_t xen_cpu_initialized_map;
39static DEFINE_PER_CPU(int, resched_irq) = -1;
40static DEFINE_PER_CPU(int, callfunc_irq) = -1;
41static DEFINE_PER_CPU(int, debug_irq) = -1;
42 37
43/* 38static DEFINE_PER_CPU(int, resched_irq);
44 * Structure and data for smp_call_function(). This is designed to minimise 39static DEFINE_PER_CPU(int, callfunc_irq);
45 * static memory requirements. It also looks cleaner. 40static DEFINE_PER_CPU(int, callfuncsingle_irq);
46 */ 41static DEFINE_PER_CPU(int, debug_irq) = -1;
47static DEFINE_SPINLOCK(call_lock);
48
49struct call_data_struct {
50 void (*func) (void *info);
51 void *info;
52 atomic_t started;
53 atomic_t finished;
54 int wait;
55};
56 42
57static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); 43static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
58 44static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
59static struct call_data_struct *call_data;
60 45
61/* 46/*
62 * Reschedule call back. Nothing to do, 47 * Reschedule call back. Nothing to do,
@@ -65,25 +50,46 @@ static struct call_data_struct *call_data;
65 */ 50 */
66static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
67{ 52{
53#ifdef CONFIG_X86_32
54 __get_cpu_var(irq_stat).irq_resched_count++;
55#else
56 add_pda(irq_resched_count, 1);
57#endif
58
68 return IRQ_HANDLED; 59 return IRQ_HANDLED;
69} 60}
70 61
71static __cpuinit void cpu_bringup_and_idle(void) 62static __cpuinit void cpu_bringup(void)
72{ 63{
73 int cpu = smp_processor_id(); 64 int cpu = smp_processor_id();
74 65
75 cpu_init(); 66 cpu_init();
67 touch_softlockup_watchdog();
68 preempt_disable();
69
76 xen_enable_sysenter(); 70 xen_enable_sysenter();
71 xen_enable_syscall();
77 72
78 preempt_disable(); 73 cpu = smp_processor_id();
79 per_cpu(cpu_state, cpu) = CPU_ONLINE; 74 smp_store_cpu_info(cpu);
75 cpu_data(cpu).x86_max_cores = 1;
76 set_cpu_sibling_map(cpu);
80 77
81 xen_setup_cpu_clockevents(); 78 xen_setup_cpu_clockevents();
82 79
80 cpu_set(cpu, cpu_online_map);
81 x86_write_percpu(cpu_state, CPU_ONLINE);
82 wmb();
83
83 /* We can take interrupts now: we're officially "up". */ 84 /* We can take interrupts now: we're officially "up". */
84 local_irq_enable(); 85 local_irq_enable();
85 86
86 wmb(); /* make sure everything is out */ 87 wmb(); /* make sure everything is out */
88}
89
90static __cpuinit void cpu_bringup_and_idle(void)
91{
92 cpu_bringup();
87 cpu_idle(); 93 cpu_idle();
88} 94}
89 95
@@ -122,6 +128,17 @@ static int xen_smp_intr_init(unsigned int cpu)
122 goto fail; 128 goto fail;
123 per_cpu(debug_irq, cpu) = rc; 129 per_cpu(debug_irq, cpu) = rc;
124 130
131 callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
132 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
133 cpu,
134 xen_call_function_single_interrupt,
135 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
136 callfunc_name,
137 NULL);
138 if (rc < 0)
139 goto fail;
140 per_cpu(callfuncsingle_irq, cpu) = rc;
141
125 return 0; 142 return 0;
126 143
127 fail: 144 fail:
@@ -131,59 +148,45 @@ static int xen_smp_intr_init(unsigned int cpu)
131 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); 148 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
132 if (per_cpu(debug_irq, cpu) >= 0) 149 if (per_cpu(debug_irq, cpu) >= 0)
133 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); 150 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
151 if (per_cpu(callfuncsingle_irq, cpu) >= 0)
152 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
153
134 return rc; 154 return rc;
135} 155}
136 156
137void __init xen_fill_possible_map(void) 157static void __init xen_fill_possible_map(void)
138{ 158{
139 int i, rc; 159 int i, rc;
140 160
141 for (i = 0; i < NR_CPUS; i++) { 161 for (i = 0; i < NR_CPUS; i++) {
142 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 162 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
143 if (rc >= 0) 163 if (rc >= 0) {
164 num_processors++;
144 cpu_set(i, cpu_possible_map); 165 cpu_set(i, cpu_possible_map);
166 }
145 } 167 }
146} 168}
147 169
148void __init xen_smp_prepare_boot_cpu(void) 170static void __init xen_smp_prepare_boot_cpu(void)
149{ 171{
150 int cpu;
151
152 BUG_ON(smp_processor_id() != 0); 172 BUG_ON(smp_processor_id() != 0);
153 native_smp_prepare_boot_cpu(); 173 native_smp_prepare_boot_cpu();
154 174
155 /* We've switched to the "real" per-cpu gdt, so make sure the 175 /* We've switched to the "real" per-cpu gdt, so make sure the
156 old memory can be recycled */ 176 old memory can be recycled */
157 make_lowmem_page_readwrite(&per_cpu__gdt_page); 177 make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
158
159 for_each_possible_cpu(cpu) {
160 cpus_clear(per_cpu(cpu_sibling_map, cpu));
161 /*
162 * cpu_core_map lives in a per cpu area that is cleared
163 * when the per cpu array is allocated.
164 *
165 * cpus_clear(per_cpu(cpu_core_map, cpu));
166 */
167 }
168 178
169 xen_setup_vcpu_info_placement(); 179 xen_setup_vcpu_info_placement();
170} 180}
171 181
172void __init xen_smp_prepare_cpus(unsigned int max_cpus) 182static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
173{ 183{
174 unsigned cpu; 184 unsigned cpu;
175 185
176 for_each_possible_cpu(cpu) { 186 xen_init_lock_cpu(0);
177 cpus_clear(per_cpu(cpu_sibling_map, cpu));
178 /*
179 * cpu_core_ map will be zeroed when the per
180 * cpu area is allocated.
181 *
182 * cpus_clear(per_cpu(cpu_core_map, cpu));
183 */
184 }
185 187
186 smp_store_cpu_info(0); 188 smp_store_cpu_info(0);
189 cpu_data(0).x86_max_cores = 1;
187 set_cpu_sibling_map(0); 190 set_cpu_sibling_map(0);
188 191
189 if (xen_smp_intr_init(0)) 192 if (xen_smp_intr_init(0))
@@ -210,15 +213,13 @@ void __init xen_smp_prepare_cpus(unsigned int max_cpus)
210 213
211 cpu_set(cpu, cpu_present_map); 214 cpu_set(cpu, cpu_present_map);
212 } 215 }
213
214 //init_xenbus_allowed_cpumask();
215} 216}
216 217
217static __cpuinit int 218static __cpuinit int
218cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 219cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
219{ 220{
220 struct vcpu_guest_context *ctxt; 221 struct vcpu_guest_context *ctxt;
221 struct gdt_page *gdt = &per_cpu(gdt_page, cpu); 222 struct desc_struct *gdt;
222 223
223 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 224 if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
224 return 0; 225 return 0;
@@ -227,12 +228,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
227 if (ctxt == NULL) 228 if (ctxt == NULL)
228 return -ENOMEM; 229 return -ENOMEM;
229 230
231 gdt = get_cpu_gdt_table(cpu);
232
230 ctxt->flags = VGCF_IN_KERNEL; 233 ctxt->flags = VGCF_IN_KERNEL;
231 ctxt->user_regs.ds = __USER_DS; 234 ctxt->user_regs.ds = __USER_DS;
232 ctxt->user_regs.es = __USER_DS; 235 ctxt->user_regs.es = __USER_DS;
233 ctxt->user_regs.fs = __KERNEL_PERCPU;
234 ctxt->user_regs.gs = 0;
235 ctxt->user_regs.ss = __KERNEL_DS; 236 ctxt->user_regs.ss = __KERNEL_DS;
237#ifdef CONFIG_X86_32
238 ctxt->user_regs.fs = __KERNEL_PERCPU;
239#endif
236 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 240 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
237 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 241 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
238 242
@@ -242,11 +246,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
242 246
243 ctxt->ldt_ents = 0; 247 ctxt->ldt_ents = 0;
244 248
245 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); 249 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
246 make_lowmem_page_readonly(gdt->gdt); 250 make_lowmem_page_readonly(gdt);
247 251
248 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); 252 ctxt->gdt_frames[0] = virt_to_mfn(gdt);
249 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); 253 ctxt->gdt_ents = GDT_ENTRIES;
250 254
251 ctxt->user_regs.cs = __KERNEL_CS; 255 ctxt->user_regs.cs = __KERNEL_CS;
252 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 256 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -254,9 +258,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
254 ctxt->kernel_ss = __KERNEL_DS; 258 ctxt->kernel_ss = __KERNEL_DS;
255 ctxt->kernel_sp = idle->thread.sp0; 259 ctxt->kernel_sp = idle->thread.sp0;
256 260
261#ifdef CONFIG_X86_32
257 ctxt->event_callback_cs = __KERNEL_CS; 262 ctxt->event_callback_cs = __KERNEL_CS;
258 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
259 ctxt->failsafe_callback_cs = __KERNEL_CS; 263 ctxt->failsafe_callback_cs = __KERNEL_CS;
264#endif
265 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
260 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; 266 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
261 267
262 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); 268 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -269,21 +275,33 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
269 return 0; 275 return 0;
270} 276}
271 277
272int __cpuinit xen_cpu_up(unsigned int cpu) 278static int __cpuinit xen_cpu_up(unsigned int cpu)
273{ 279{
274 struct task_struct *idle = idle_task(cpu); 280 struct task_struct *idle = idle_task(cpu);
275 int rc; 281 int rc;
276 282
277#if 0 283#ifdef CONFIG_X86_64
278 rc = cpu_up_check(cpu); 284 /* Allocate node local memory for AP pdas */
279 if (rc) 285 WARN_ON(cpu == 0);
280 return rc; 286 if (cpu > 0) {
287 rc = get_local_pda(cpu);
288 if (rc)
289 return rc;
290 }
281#endif 291#endif
282 292
293#ifdef CONFIG_X86_32
283 init_gdt(cpu); 294 init_gdt(cpu);
284 per_cpu(current_task, cpu) = idle; 295 per_cpu(current_task, cpu) = idle;
285 irq_ctx_init(cpu); 296 irq_ctx_init(cpu);
297#else
298 cpu_pda(cpu)->pcurrent = idle;
299 clear_tsk_thread_flag(idle, TIF_FORK);
300#endif
286 xen_setup_timer(cpu); 301 xen_setup_timer(cpu);
302 xen_init_lock_cpu(cpu);
303
304 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
287 305
288 /* make sure interrupts start blocked */ 306 /* make sure interrupts start blocked */
289 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 307 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -299,23 +317,75 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
299 if (rc) 317 if (rc)
300 return rc; 318 return rc;
301 319
302 smp_store_cpu_info(cpu);
303 set_cpu_sibling_map(cpu);
304 /* This must be done before setting cpu_online_map */
305 wmb();
306
307 cpu_set(cpu, cpu_online_map);
308
309 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 320 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
310 BUG_ON(rc); 321 BUG_ON(rc);
311 322
 323	while (per_cpu(cpu_state, cpu) != CPU_ONLINE) {
324 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
325 barrier();
326 }
327
328 return 0;
329}
330
331static void xen_smp_cpus_done(unsigned int max_cpus)
332{
333}
334
335#ifdef CONFIG_HOTPLUG_CPU
336static int xen_cpu_disable(void)
337{
338 unsigned int cpu = smp_processor_id();
339 if (cpu == 0)
340 return -EBUSY;
341
342 cpu_disable_common();
343
344 load_cr3(swapper_pg_dir);
312 return 0; 345 return 0;
313} 346}
314 347
315void xen_smp_cpus_done(unsigned int max_cpus) 348static void xen_cpu_die(unsigned int cpu)
349{
350 while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
351 current->state = TASK_UNINTERRUPTIBLE;
352 schedule_timeout(HZ/10);
353 }
354 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
355 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
356 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
357 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
358 xen_uninit_lock_cpu(cpu);
359 xen_teardown_timer(cpu);
360
361 if (num_online_cpus() == 1)
362 alternatives_smp_switch(0);
363}
364
365static void xen_play_dead(void)
366{
367 play_dead_common();
368 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
369 cpu_bringup();
370}
371
372#else /* !CONFIG_HOTPLUG_CPU */
373static int xen_cpu_disable(void)
316{ 374{
375 return -ENOSYS;
317} 376}
318 377
378static void xen_cpu_die(unsigned int cpu)
379{
380 BUG();
381}
382
383static void xen_play_dead(void)
384{
385 BUG();
386}
387
388#endif
319static void stop_self(void *v) 389static void stop_self(void *v)
320{ 390{
321 int cpu = smp_processor_id(); 391 int cpu = smp_processor_id();
@@ -328,104 +398,94 @@ static void stop_self(void *v)
328 BUG(); 398 BUG();
329} 399}
330 400
331void xen_smp_send_stop(void) 401static void xen_smp_send_stop(void)
332{ 402{
333 smp_call_function(stop_self, NULL, 0, 0); 403 smp_call_function(stop_self, NULL, 0);
334} 404}
335 405
336void xen_smp_send_reschedule(int cpu) 406static void xen_smp_send_reschedule(int cpu)
337{ 407{
338 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 408 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
339} 409}
340 410
341
342static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) 411static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
343{ 412{
344 unsigned cpu; 413 unsigned cpu;
345 414
346 cpus_and(mask, mask, cpu_online_map); 415 cpus_and(mask, mask, cpu_online_map);
347 416
348 for_each_cpu_mask(cpu, mask) 417 for_each_cpu_mask_nr(cpu, mask)
349 xen_send_IPI_one(cpu, vector); 418 xen_send_IPI_one(cpu, vector);
350} 419}
351 420
421static void xen_smp_send_call_function_ipi(cpumask_t mask)
422{
423 int cpu;
424
425 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
426
427 /* Make sure other vcpus get a chance to run if they need to. */
428 for_each_cpu_mask_nr(cpu, mask) {
429 if (xen_vcpu_stolen(cpu)) {
430 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
431 break;
432 }
433 }
434}
435
436static void xen_smp_send_call_function_single_ipi(int cpu)
437{
438 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
439}
440
352static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) 441static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
353{ 442{
354 void (*func) (void *info) = call_data->func;
355 void *info = call_data->info;
356 int wait = call_data->wait;
357
358 /*
359 * Notify initiating CPU that I've grabbed the data and am
360 * about to execute the function
361 */
362 mb();
363 atomic_inc(&call_data->started);
364 /*
365 * At this point the info structure may be out of scope unless wait==1
366 */
367 irq_enter(); 443 irq_enter();
368 (*func)(info); 444 generic_smp_call_function_interrupt();
445#ifdef CONFIG_X86_32
369 __get_cpu_var(irq_stat).irq_call_count++; 446 __get_cpu_var(irq_stat).irq_call_count++;
447#else
448 add_pda(irq_call_count, 1);
449#endif
370 irq_exit(); 450 irq_exit();
371 451
372 if (wait) {
373 mb(); /* commit everything before setting finished */
374 atomic_inc(&call_data->finished);
375 }
376
377 return IRQ_HANDLED; 452 return IRQ_HANDLED;
378} 453}
379 454
380int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), 455static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
381 void *info, int wait)
382{ 456{
383 struct call_data_struct data; 457 irq_enter();
384 int cpus, cpu; 458 generic_smp_call_function_single_interrupt();
385 bool yield; 459#ifdef CONFIG_X86_32
386 460 __get_cpu_var(irq_stat).irq_call_count++;
387 /* Holding any lock stops cpus from going down. */ 461#else
388 spin_lock(&call_lock); 462 add_pda(irq_call_count, 1);
389 463#endif
390 cpu_clear(smp_processor_id(), mask); 464 irq_exit();
391
392 cpus = cpus_weight(mask);
393 if (!cpus) {
394 spin_unlock(&call_lock);
395 return 0;
396 }
397
398 /* Can deadlock when called with interrupts disabled */
399 WARN_ON(irqs_disabled());
400
401 data.func = func;
402 data.info = info;
403 atomic_set(&data.started, 0);
404 data.wait = wait;
405 if (wait)
406 atomic_set(&data.finished, 0);
407
408 call_data = &data;
409 mb(); /* write everything before IPI */
410 465
411 /* Send a message to other CPUs and wait for them to respond */ 466 return IRQ_HANDLED;
412 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 467}
413 468
414 /* Make sure other vcpus get a chance to run if they need to. */ 469static const struct smp_ops xen_smp_ops __initdata = {
415 yield = false; 470 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
416 for_each_cpu_mask(cpu, mask) 471 .smp_prepare_cpus = xen_smp_prepare_cpus,
417 if (xen_vcpu_stolen(cpu)) 472 .smp_cpus_done = xen_smp_cpus_done,
418 yield = true;
419 473
420 if (yield) 474 .cpu_up = xen_cpu_up,
421 HYPERVISOR_sched_op(SCHEDOP_yield, 0); 475 .cpu_die = xen_cpu_die,
476 .cpu_disable = xen_cpu_disable,
477 .play_dead = xen_play_dead,
422 478
423 /* Wait for response */ 479 .smp_send_stop = xen_smp_send_stop,
424 while (atomic_read(&data.started) != cpus || 480 .smp_send_reschedule = xen_smp_send_reschedule,
425 (wait && atomic_read(&data.finished) != cpus))
426 cpu_relax();
427 481
428 spin_unlock(&call_lock); 482 .send_call_func_ipi = xen_smp_send_call_function_ipi,
483 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
484};
429 485
430 return 0; 486void __init xen_smp_init(void)
487{
488 smp_ops = xen_smp_ops;
489 xen_fill_possible_map();
490 xen_init_spinlocks();
431} 491}
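
[Editor's note: as a usage sketch, with the hand-rolled call_data_struct machinery gone, a cross-CPU call now goes through the generic kernel helpers and lands in the handlers above. remote_probe() and example_cross_call() are hypothetical names; smp_call_function_single() is the generic API with its new four-argument signature.]

/* Runs on the target cpu, inside xen_call_function_single_interrupt() */
static void remote_probe(void *info)
{
	*(int *)info = smp_processor_id();
}

static void example_cross_call(int target_cpu)
{
	int answered_on = -1;

	/* Delivered via xen_smp_send_call_function_single_ipi(), i.e. an
	 * event channel bound to XEN_CALL_FUNCTION_SINGLE_VECTOR. */
	smp_call_function_single(target_cpu, remote_probe, &answered_on, 1);
}
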
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
new file mode 100644
index 000000000000..dd71e3a021cd
--- /dev/null
+++ b/arch/x86/xen/spinlock.c
@@ -0,0 +1,428 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/kernel_stat.h>
6#include <linux/spinlock.h>
7#include <linux/debugfs.h>
8#include <linux/log2.h>
9
10#include <asm/paravirt.h>
11
12#include <xen/interface/xen.h>
13#include <xen/events.h>
14
15#include "xen-ops.h"
16#include "debugfs.h"
17
18#ifdef CONFIG_XEN_DEBUG_FS
19static struct xen_spinlock_stats
20{
21 u64 taken;
22 u32 taken_slow;
23 u32 taken_slow_nested;
24 u32 taken_slow_pickup;
25 u32 taken_slow_spurious;
26 u32 taken_slow_irqenable;
27
28 u64 released;
29 u32 released_slow;
30 u32 released_slow_kicked;
31
32#define HISTO_BUCKETS 30
33 u32 histo_spin_total[HISTO_BUCKETS+1];
34 u32 histo_spin_spinning[HISTO_BUCKETS+1];
35 u32 histo_spin_blocked[HISTO_BUCKETS+1];
36
37 u64 time_total;
38 u64 time_spinning;
39 u64 time_blocked;
40} spinlock_stats;
41
42static u8 zero_stats;
43
44static unsigned lock_timeout = 1 << 10;
45#define TIMEOUT lock_timeout
46
47static inline void check_zero(void)
48{
49 if (unlikely(zero_stats)) {
50 memset(&spinlock_stats, 0, sizeof(spinlock_stats));
51 zero_stats = 0;
52 }
53}
54
55#define ADD_STATS(elem, val) \
56 do { check_zero(); spinlock_stats.elem += (val); } while(0)
57
58static inline u64 spin_time_start(void)
59{
60 return xen_clocksource_read();
61}
62
63static void __spin_time_accum(u64 delta, u32 *array)
64{
65 unsigned index = ilog2(delta);
66
67 check_zero();
68
69 if (index < HISTO_BUCKETS)
70 array[index]++;
71 else
72 array[HISTO_BUCKETS]++;
73}
74
75static inline void spin_time_accum_spinning(u64 start)
76{
77 u32 delta = xen_clocksource_read() - start;
78
79 __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
80 spinlock_stats.time_spinning += delta;
81}
82
83static inline void spin_time_accum_total(u64 start)
84{
85 u32 delta = xen_clocksource_read() - start;
86
87 __spin_time_accum(delta, spinlock_stats.histo_spin_total);
88 spinlock_stats.time_total += delta;
89}
90
91static inline void spin_time_accum_blocked(u64 start)
92{
93 u32 delta = xen_clocksource_read() - start;
94
95 __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
96 spinlock_stats.time_blocked += delta;
97}
98#else /* !CONFIG_XEN_DEBUG_FS */
99#define TIMEOUT (1 << 10)
100#define ADD_STATS(elem, val) do { (void)(val); } while(0)
101
102static inline u64 spin_time_start(void)
103{
104 return 0;
105}
106
107static inline void spin_time_accum_total(u64 start)
108{
109}
110static inline void spin_time_accum_spinning(u64 start)
111{
112}
113static inline void spin_time_accum_blocked(u64 start)
114{
115}
116#endif /* CONFIG_XEN_DEBUG_FS */
117
118struct xen_spinlock {
119 unsigned char lock; /* 0 -> free; 1 -> locked */
120 unsigned short spinners; /* count of waiting cpus */
121};
122
123static int xen_spin_is_locked(struct raw_spinlock *lock)
124{
125 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
126
127 return xl->lock != 0;
128}
129
130static int xen_spin_is_contended(struct raw_spinlock *lock)
131{
132 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
133
134 /* Not strictly true; this is only the count of contended
135 lock-takers entering the slow path. */
136 return xl->spinners != 0;
137}
138
139static int xen_spin_trylock(struct raw_spinlock *lock)
140{
141 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
142 u8 old = 1;
143
144 asm("xchgb %b0,%1"
145 : "+q" (old), "+m" (xl->lock) : : "memory");
146
147 return old == 0;
148}
149
150static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
151static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
152
153/*
154 * Mark a cpu as interested in a lock. Returns the CPU's previous
155 * lock of interest, in case we got preempted by an interrupt.
156 */
157static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
158{
159 struct xen_spinlock *prev;
160
161 prev = __get_cpu_var(lock_spinners);
162 __get_cpu_var(lock_spinners) = xl;
163
164 wmb(); /* set lock of interest before count */
165
166 asm(LOCK_PREFIX " incw %0"
167 : "+m" (xl->spinners) : : "memory");
168
169 return prev;
170}
171
172/*
173 * Mark a cpu as no longer interested in a lock. Restores previous
174 * lock of interest (NULL for none).
175 */
176static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
177{
178 asm(LOCK_PREFIX " decw %0"
179 : "+m" (xl->spinners) : : "memory");
180 wmb(); /* decrement count before restoring lock */
181 __get_cpu_var(lock_spinners) = prev;
182}
183
184static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
185{
186 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
187 struct xen_spinlock *prev;
188 int irq = __get_cpu_var(lock_kicker_irq);
189 int ret;
190 unsigned long flags;
191 u64 start;
192
193 /* If kicker interrupts not initialized yet, just spin */
194 if (irq == -1)
195 return 0;
196
197 start = spin_time_start();
198
199 /* announce we're spinning */
200 prev = spinning_lock(xl);
201
202 flags = __raw_local_save_flags();
203 if (irq_enable) {
204 ADD_STATS(taken_slow_irqenable, 1);
205 raw_local_irq_enable();
206 }
207
208 ADD_STATS(taken_slow, 1);
209 ADD_STATS(taken_slow_nested, prev != NULL);
210
211 do {
212 /* clear pending */
213 xen_clear_irq_pending(irq);
214
 215 /* check again to make sure it didn't become free while
216 we weren't looking */
217 ret = xen_spin_trylock(lock);
218 if (ret) {
219 ADD_STATS(taken_slow_pickup, 1);
220
221 /*
222 * If we interrupted another spinlock while it
223 * was blocking, make sure it doesn't block
224 * without rechecking the lock.
225 */
226 if (prev != NULL)
227 xen_set_irq_pending(irq);
228 goto out;
229 }
230
231 /*
232 * Block until irq becomes pending. If we're
233 * interrupted at this point (after the trylock but
234 * before entering the block), then the nested lock
235 * handler guarantees that the irq will be left
236 * pending if there's any chance the lock became free;
237 * xen_poll_irq() returns immediately if the irq is
238 * pending.
239 */
240 xen_poll_irq(irq);
241 ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
242 } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
243
244 kstat_this_cpu.irqs[irq]++;
245
246out:
247 raw_local_irq_restore(flags);
248 unspinning_lock(xl, prev);
249 spin_time_accum_blocked(start);
250
251 return ret;
252}
253
254static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
255{
256 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
257 unsigned timeout;
258 u8 oldval;
259 u64 start_spin;
260
261 ADD_STATS(taken, 1);
262
263 start_spin = spin_time_start();
264
265 do {
266 u64 start_spin_fast = spin_time_start();
267
268 timeout = TIMEOUT;
269
270 asm("1: xchgb %1,%0\n"
271 " testb %1,%1\n"
272 " jz 3f\n"
273 "2: rep;nop\n"
274 " cmpb $0,%0\n"
275 " je 1b\n"
276 " dec %2\n"
277 " jnz 2b\n"
278 "3:\n"
279 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
280 : "1" (1)
281 : "memory");
282
283 spin_time_accum_spinning(start_spin_fast);
284
285 } while (unlikely(oldval != 0 &&
286 (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
287
288 spin_time_accum_total(start_spin);
289}
290
291static void xen_spin_lock(struct raw_spinlock *lock)
292{
293 __xen_spin_lock(lock, false);
294}
295
296static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
297{
298 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
299}
300
301static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
302{
303 int cpu;
304
305 ADD_STATS(released_slow, 1);
306
307 for_each_online_cpu(cpu) {
308 /* XXX should mix up next cpu selection */
309 if (per_cpu(lock_spinners, cpu) == xl) {
310 ADD_STATS(released_slow_kicked, 1);
311 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
312 break;
313 }
314 }
315}
316
317static void xen_spin_unlock(struct raw_spinlock *lock)
318{
319 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
320
321 ADD_STATS(released, 1);
322
323 smp_wmb(); /* make sure no writes get moved after unlock */
324 xl->lock = 0; /* release lock */
325
326 /* make sure unlock happens before kick */
327 barrier();
328
329 if (unlikely(xl->spinners))
330 xen_spin_unlock_slow(xl);
331}
332
333static irqreturn_t dummy_handler(int irq, void *dev_id)
334{
335 BUG();
336 return IRQ_HANDLED;
337}
338
339void __cpuinit xen_init_lock_cpu(int cpu)
340{
341 int irq;
342 const char *name;
343
344 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
345 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
346 cpu,
347 dummy_handler,
348 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
349 name,
350 NULL);
351
352 if (irq >= 0) {
353 disable_irq(irq); /* make sure it's never delivered */
354 per_cpu(lock_kicker_irq, cpu) = irq;
355 }
356
 357 printk(KERN_DEBUG "cpu %d spinlock event irq %d\n", cpu, irq);
358}
359
360void xen_uninit_lock_cpu(int cpu)
361{
362 unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
363}
364
365void __init xen_init_spinlocks(void)
366{
367 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
368 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
369 pv_lock_ops.spin_lock = xen_spin_lock;
370 pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
371 pv_lock_ops.spin_trylock = xen_spin_trylock;
372 pv_lock_ops.spin_unlock = xen_spin_unlock;
373}
374
375#ifdef CONFIG_XEN_DEBUG_FS
376
377static struct dentry *d_spin_debug;
378
379static int __init xen_spinlock_debugfs(void)
380{
381 struct dentry *d_xen = xen_init_debugfs();
382
383 if (d_xen == NULL)
384 return -ENOMEM;
385
386 d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
387
388 debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
389
390 debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
391
392 debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
393 debugfs_create_u32("taken_slow", 0444, d_spin_debug,
394 &spinlock_stats.taken_slow);
395 debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
396 &spinlock_stats.taken_slow_nested);
397 debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
398 &spinlock_stats.taken_slow_pickup);
399 debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
400 &spinlock_stats.taken_slow_spurious);
401 debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
402 &spinlock_stats.taken_slow_irqenable);
403
404 debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
405 debugfs_create_u32("released_slow", 0444, d_spin_debug,
406 &spinlock_stats.released_slow);
407 debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
408 &spinlock_stats.released_slow_kicked);
409
410 debugfs_create_u64("time_spinning", 0444, d_spin_debug,
411 &spinlock_stats.time_spinning);
412 debugfs_create_u64("time_blocked", 0444, d_spin_debug,
413 &spinlock_stats.time_blocked);
414 debugfs_create_u64("time_total", 0444, d_spin_debug,
415 &spinlock_stats.time_total);
416
417 xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
418 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
419 xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
420 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
421 xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
422 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
423
424 return 0;
425}
426fs_initcall(xen_spinlock_debugfs);
427
428#endif /* CONFIG_XEN_DEBUG_FS */
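
[Editor's note: for readability, a rough C rendering of the fast-path inline asm loop in __xen_spin_lock() above. This is a sketch only (the real code stays in asm so the xchg/timeout loop remains branch-tight), and spin_fast() is a made-up name.]

/* Sketch: what the "1: xchgb ... 3:" asm loop does.  Returns true if the
 * byte lock was taken before the timeout expired. */
static bool spin_fast(struct xen_spinlock *xl, unsigned timeout)
{
	do {
		if (xchg(&xl->lock, 1) == 0)
			return true;		/* grabbed the lock */
		while (xl->lock != 0 && --timeout)
			cpu_relax();		/* "rep;nop" while it stays held */
	} while (timeout);

	return false;	/* caller falls back to xen_spin_lock_slow() */
}
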
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
new file mode 100644
index 000000000000..2a234db5949b
--- /dev/null
+++ b/arch/x86/xen/suspend.c
@@ -0,0 +1,48 @@
1#include <linux/types.h>
2
3#include <xen/interface/xen.h>
4#include <xen/grant_table.h>
5#include <xen/events.h>
6
7#include <asm/xen/hypercall.h>
8#include <asm/xen/page.h>
9
10#include "xen-ops.h"
11#include "mmu.h"
12
13void xen_pre_suspend(void)
14{
15 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
16 xen_start_info->console.domU.mfn =
17 mfn_to_pfn(xen_start_info->console.domU.mfn);
18
19 BUG_ON(!irqs_disabled());
20
21 HYPERVISOR_shared_info = &xen_dummy_shared_info;
22 if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
23 __pte_ma(0), 0))
24 BUG();
25}
26
27void xen_post_suspend(int suspend_cancelled)
28{
29 xen_setup_shared_info();
30
31 if (suspend_cancelled) {
32 xen_start_info->store_mfn =
33 pfn_to_mfn(xen_start_info->store_mfn);
34 xen_start_info->console.domU.mfn =
35 pfn_to_mfn(xen_start_info->console.domU.mfn);
36 } else {
37#ifdef CONFIG_SMP
38 xen_cpu_initialized_map = cpu_online_map;
39#endif
40 xen_vcpu_restore();
41 }
42
43}
44
45void xen_arch_resume(void)
46{
47 /* nothing */
48}
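
[Editor's note: for orientation, a simplified sketch of how the suspend driver (drivers/xen/manage.c, also touched by this series) sequences these hooks around the suspend hypercall. Device, IRQ and console suspend/resume steps are omitted, and the exact ordering there may differ slightly.]

/* Approximate suspend sequence (sketch, not the literal manage.c code). */
static int do_suspend_sketch(void)
{
	int cancelled;

	BUG_ON(!irqs_disabled());

	xen_pre_suspend();	/* store/console mfns -> pfns, detach shared info */

	/* Returns non-zero if the suspend was cancelled (e.g. a checkpoint),
	 * zero if we are coming back up in a new domain. */
	cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));

	xen_post_suspend(cancelled);	/* remap shared info, fix up start_info */

	if (!cancelled)
		xen_timer_resume();	/* see the time.c hunk below */

	return 0;
}
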
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 41e217503c96..004ba86326ae 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -30,8 +30,6 @@
30#define TIMER_SLOP 100000 30#define TIMER_SLOP 100000
31#define NS_PER_TICK (1000000000LL / HZ) 31#define NS_PER_TICK (1000000000LL / HZ)
32 32
33static cycle_t xen_clocksource_read(void);
34
35/* runstate info updated by Xen */ 33/* runstate info updated by Xen */
36static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 34static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
37 35
@@ -197,8 +195,8 @@ unsigned long long xen_sched_clock(void)
197} 195}
198 196
199 197
200/* Get the CPU speed from Xen */ 198/* Get the TSC speed from Xen */
201unsigned long xen_cpu_khz(void) 199unsigned long xen_tsc_khz(void)
202{ 200{
203 u64 xen_khz = 1000000ULL << 32; 201 u64 xen_khz = 1000000ULL << 32;
204 const struct pvclock_vcpu_time_info *info = 202 const struct pvclock_vcpu_time_info *info =
@@ -213,7 +211,7 @@ unsigned long xen_cpu_khz(void)
213 return xen_khz; 211 return xen_khz;
214} 212}
215 213
216static cycle_t xen_clocksource_read(void) 214cycle_t xen_clocksource_read(void)
217{ 215{
218 struct pvclock_vcpu_time_info *src; 216 struct pvclock_vcpu_time_info *src;
219 cycle_t ret; 217 cycle_t ret;
@@ -452,6 +450,14 @@ void xen_setup_timer(int cpu)
452 setup_runstate_info(cpu); 450 setup_runstate_info(cpu);
453} 451}
454 452
453void xen_teardown_timer(int cpu)
454{
455 struct clock_event_device *evt;
456 BUG_ON(cpu == 0);
457 evt = &per_cpu(xen_clock_events, cpu);
458 unbind_from_irqhandler(evt->irq, NULL);
459}
460
455void xen_setup_cpu_clockevents(void) 461void xen_setup_cpu_clockevents(void)
456{ 462{
457 BUG_ON(preemptible()); 463 BUG_ON(preemptible());
@@ -459,6 +465,19 @@ void xen_setup_cpu_clockevents(void)
459 clockevents_register_device(&__get_cpu_var(xen_clock_events)); 465 clockevents_register_device(&__get_cpu_var(xen_clock_events));
460} 466}
461 467
468void xen_timer_resume(void)
469{
470 int cpu;
471
472 if (xen_clockevent != &xen_vcpuop_clockevent)
473 return;
474
475 for_each_online_cpu(cpu) {
476 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
477 BUG();
478 }
479}
480
462__init void xen_time_init(void) 481__init void xen_time_init(void)
463{ 482{
464 int cpu = smp_processor_id(); 483 int cpu = smp_processor_id();
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..42786f59d9c0 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -298,7 +298,7 @@ check_events:
298 push %eax 298 push %eax
299 push %ecx 299 push %ecx
300 push %edx 300 push %edx
301 call force_evtchn_callback 301 call xen_force_evtchn_callback
302 pop %edx 302 pop %edx
303 pop %ecx 303 pop %ecx
304 pop %eax 304 pop %eax
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 000000000000..05794c566e87
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,285 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
 9 We only bother with direct forms (i.e., vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h>
19#include <asm/segment.h>
20
21#include <xen/interface/xen.h>
22
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 1
30/*
31 x86-64 does not yet support direct access to percpu variables
32 via a segment override, so we just need to make sure this code
33 never gets used
34 */
35#define BUG ud2a
36#define PER_CPU_VAR(var, off) 0xdeadbeef
37#endif
38
39/*
40 Enable events. This clears the event mask and tests the pending
 41 event status with a single 'and' operation. If there are pending
42 events, then enter the hypervisor to get them handled.
43 */
44ENTRY(xen_irq_enable_direct)
45 BUG
46
47 /* Unmask events */
48 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
49
50 /* Preempt here doesn't matter because that will deal with
51 any pending interrupts. The pending check may end up being
52 run on the wrong CPU, but that doesn't hurt. */
53
54 /* Test for pending */
55 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
56 jz 1f
57
582: call check_events
591:
60ENDPATCH(xen_irq_enable_direct)
61 ret
62 ENDPROC(xen_irq_enable_direct)
63 RELOC(xen_irq_enable_direct, 2b+1)
64
65/*
66 Disabling events is simply a matter of making the event mask
67 non-zero.
68 */
69ENTRY(xen_irq_disable_direct)
70 BUG
71
72 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
73ENDPATCH(xen_irq_disable_direct)
74 ret
75 ENDPROC(xen_irq_disable_direct)
76 RELOC(xen_irq_disable_direct, 0)
77
78/*
79 (xen_)save_fl is used to get the current interrupt enable status.
80 Callers expect the status to be in X86_EFLAGS_IF, and other bits
81 may be set in the return value. We take advantage of this by
82 making sure that X86_EFLAGS_IF has the right value (and other bits
83 in that byte are 0), but other bits in the return value are
84 undefined. We need to toggle the state of the bit, because
85 Xen and x86 use opposite senses (mask vs enable).
86 */
87ENTRY(xen_save_fl_direct)
88 BUG
89
90 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
91 setz %ah
92 addb %ah,%ah
93ENDPATCH(xen_save_fl_direct)
94 ret
95 ENDPROC(xen_save_fl_direct)
96 RELOC(xen_save_fl_direct, 0)
97
98/*
 99 In principle the caller should be passing us a value returned
 100 from xen_save_fl_direct, but for robustness' sake we test only
101 the X86_EFLAGS_IF flag rather than the whole byte. After
102 setting the interrupt mask state, it checks for unmasked
103 pending events and enters the hypervisor to get them delivered
104 if so.
105 */
106ENTRY(xen_restore_fl_direct)
107 BUG
108
109 testb $X86_EFLAGS_IF>>8, %ah
110 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
111 /* Preempt here doesn't matter because that will deal with
112 any pending interrupts. The pending check may end up being
113 run on the wrong CPU, but that doesn't hurt. */
114
115 /* check for unmasked and pending */
116 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
117 jz 1f
1182: call check_events
1191:
120ENDPATCH(xen_restore_fl_direct)
121 ret
122 ENDPROC(xen_restore_fl_direct)
123 RELOC(xen_restore_fl_direct, 2b+1)
124
125
126/*
127 Force an event check by making a hypercall,
128 but preserve regs before making the call.
129 */
130check_events:
131 push %rax
132 push %rcx
133 push %rdx
134 push %rsi
135 push %rdi
136 push %r8
137 push %r9
138 push %r10
139 push %r11
140 call xen_force_evtchn_callback
141 pop %r11
142 pop %r10
143 pop %r9
144 pop %r8
145 pop %rdi
146 pop %rsi
147 pop %rdx
148 pop %rcx
149 pop %rax
150 ret
151
152ENTRY(xen_adjust_exception_frame)
153 mov 8+0(%rsp),%rcx
154 mov 8+8(%rsp),%r11
155 ret $16
156
157hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
158/*
159 Xen64 iret frame:
160
161 ss
162 rsp
163 rflags
164 cs
165 rip <-- standard iret frame
166
167 flags
168
169 rcx }
170 r11 }<-- pushed by hypercall page
171rsp -> rax }
172 */
173ENTRY(xen_iret)
174 pushq $0
1751: jmp hypercall_iret
176ENDPATCH(xen_iret)
177RELOC(xen_iret, 1b+1)
178
179/*
180 sysexit is not used for 64-bit processes, so it's
181 only ever used to return to 32-bit compat userspace.
182 */
183ENTRY(xen_sysexit)
184 pushq $__USER32_DS
185 pushq %rcx
186 pushq $X86_EFLAGS_IF
187 pushq $__USER32_CS
188 pushq %rdx
189
190 pushq $0
1911: jmp hypercall_iret
192ENDPATCH(xen_sysexit)
193RELOC(xen_sysexit, 1b+1)
194
195ENTRY(xen_sysret64)
196 /* We're already on the usermode stack at this point, but still
197 with the kernel gs, so we can easily switch back */
198 movq %rsp, %gs:pda_oldrsp
199 movq %gs:pda_kernelstack,%rsp
200
201 pushq $__USER_DS
202 pushq %gs:pda_oldrsp
203 pushq %r11
204 pushq $__USER_CS
205 pushq %rcx
206
207 pushq $VGCF_in_syscall
2081: jmp hypercall_iret
209ENDPATCH(xen_sysret64)
210RELOC(xen_sysret64, 1b+1)
211
212ENTRY(xen_sysret32)
213 /* We're already on the usermode stack at this point, but still
214 with the kernel gs, so we can easily switch back */
215 movq %rsp, %gs:pda_oldrsp
216 movq %gs:pda_kernelstack, %rsp
217
218 pushq $__USER32_DS
219 pushq %gs:pda_oldrsp
220 pushq %r11
221 pushq $__USER32_CS
222 pushq %rcx
223
224 pushq $VGCF_in_syscall
2251: jmp hypercall_iret
226ENDPATCH(xen_sysret32)
227RELOC(xen_sysret32, 1b+1)
228
229/*
230 Xen handles syscall callbacks much like ordinary exceptions,
231 which means we have:
232 - kernel gs
233 - kernel rsp
234 - an iret-like stack frame on the stack (including rcx and r11):
235 ss
236 rsp
237 rflags
238 cs
239 rip
240 r11
241 rsp-> rcx
242
243 In all the entrypoints, we undo all that to make it look
244 like a CPU-generated syscall/sysenter and jump to the normal
245 entrypoint.
246 */
247
248.macro undo_xen_syscall
249 mov 0*8(%rsp),%rcx
250 mov 1*8(%rsp),%r11
251 mov 5*8(%rsp),%rsp
252.endm
253
254/* Normal 64-bit system call target */
255ENTRY(xen_syscall_target)
256 undo_xen_syscall
257 jmp system_call_after_swapgs
258ENDPROC(xen_syscall_target)
259
260#ifdef CONFIG_IA32_EMULATION
261
262/* 32-bit compat syscall target */
263ENTRY(xen_syscall32_target)
264 undo_xen_syscall
265 jmp ia32_cstar_target
266ENDPROC(xen_syscall32_target)
267
268/* 32-bit compat sysenter target */
269ENTRY(xen_sysenter_target)
270 undo_xen_syscall
271 jmp ia32_sysenter_target
272ENDPROC(xen_sysenter_target)
273
274#else /* !CONFIG_IA32_EMULATION */
275
276ENTRY(xen_syscall32_target)
277ENTRY(xen_sysenter_target)
278 lea 16(%rsp), %rsp /* strip %rcx,%r11 */
279 mov $-ENOSYS, %rax
280 pushq $VGCF_in_syscall
281 jmp hypercall_iret
282ENDPROC(xen_syscall32_target)
283ENDPROC(xen_sysenter_target)
284
285#endif /* CONFIG_IA32_EMULATION */
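
[Editor's note: for reference, a C sketch of the flag handling in xen_save_fl_direct/xen_restore_fl_direct above. Xen's per-vcpu evtchn_upcall_mask has the opposite sense to X86_EFLAGS_IF (non-zero means events masked), which is what the setz/addb pair encodes. The function names here are illustrative, not the code actually patched in.]

/* Sketch of the asm above in C terms. */
static unsigned long save_fl_sketch(const struct vcpu_info *v)
{
	/* setz %ah; addb %ah,%ah == put !mask into bit 9 (X86_EFLAGS_IF) */
	return v->evtchn_upcall_mask ? 0 : X86_EFLAGS_IF;
}

static void restore_fl_sketch(struct vcpu_info *v, unsigned long flags)
{
	v->evtchn_upcall_mask = (flags & X86_EFLAGS_IF) ? 0 : 1;
	barrier();

	/* the "call check_events" path: kick the hypervisor if unmasking
	 * exposed a pending upcall */
	if (v->evtchn_upcall_mask == 0 && v->evtchn_upcall_pending)
		xen_force_evtchn_callback();
}
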
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 6ec3b4f7719b..63d49a523ed3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,14 +5,24 @@
5 5
6#include <linux/elfnote.h> 6#include <linux/elfnote.h>
7#include <linux/init.h> 7#include <linux/init.h>
8
8#include <asm/boot.h> 9#include <asm/boot.h>
10#include <asm/asm.h>
11#include <asm/page.h>
12
9#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
14#include <asm/xen/interface.h>
10 15
11 __INIT 16 __INIT
12ENTRY(startup_xen) 17ENTRY(startup_xen)
13 movl %esi,xen_start_info
14 cld 18 cld
15 movl $(init_thread_union+THREAD_SIZE),%esp 19#ifdef CONFIG_X86_32
20 mov %esi,xen_start_info
21 mov $init_thread_union+THREAD_SIZE,%esp
22#else
23 mov %rsi,xen_start_info
24 mov $init_thread_union+THREAD_SIZE,%rsp
25#endif
16 jmp xen_start_kernel 26 jmp xen_start_kernel
17 27
18 __FINIT 28 __FINIT
@@ -20,17 +30,26 @@ ENTRY(startup_xen)
20.pushsection .text 30.pushsection .text
21 .align PAGE_SIZE_asm 31 .align PAGE_SIZE_asm
22ENTRY(hypercall_page) 32ENTRY(hypercall_page)
23 .skip 0x1000 33 .skip PAGE_SIZE_asm
24.popsection 34.popsection
25 35
26 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
27 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") 37 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
28 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") 38 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
29 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) 39#ifdef CONFIG_X86_32
30 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) 40 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
31 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) 41#else
42 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
43#endif
44 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
45 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
32 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 46 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
33 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 47 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
34 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 48 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
49 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
50 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
51 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
52 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
53 ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
35 54
36#endif /*CONFIG_XEN */ 55#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f1063ae08037..d7422dc2a55c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,7 @@
2#define XEN_OPS_H 2#define XEN_OPS_H
3 3
4#include <linux/init.h> 4#include <linux/init.h>
5#include <linux/clocksource.h>
5#include <linux/irqreturn.h> 6#include <linux/irqreturn.h>
6#include <xen/xen-ops.h> 7#include <xen/xen-ops.h>
7 8
@@ -9,22 +10,34 @@
9extern const char xen_hypervisor_callback[]; 10extern const char xen_hypervisor_callback[];
10extern const char xen_failsafe_callback[]; 11extern const char xen_failsafe_callback[];
11 12
13struct trap_info;
12void xen_copy_trap_info(struct trap_info *traps); 14void xen_copy_trap_info(struct trap_info *traps);
13 15
14DECLARE_PER_CPU(unsigned long, xen_cr3); 16DECLARE_PER_CPU(unsigned long, xen_cr3);
15DECLARE_PER_CPU(unsigned long, xen_current_cr3); 17DECLARE_PER_CPU(unsigned long, xen_current_cr3);
16 18
17extern struct start_info *xen_start_info; 19extern struct start_info *xen_start_info;
20extern struct shared_info xen_dummy_shared_info;
18extern struct shared_info *HYPERVISOR_shared_info; 21extern struct shared_info *HYPERVISOR_shared_info;
19 22
23void xen_setup_mfn_list_list(void);
24void xen_setup_shared_info(void);
25
20char * __init xen_memory_setup(void); 26char * __init xen_memory_setup(void);
21void __init xen_arch_setup(void); 27void __init xen_arch_setup(void);
22void __init xen_init_IRQ(void); 28void __init xen_init_IRQ(void);
23void xen_enable_sysenter(void); 29void xen_enable_sysenter(void);
30void xen_enable_syscall(void);
31void xen_vcpu_restore(void);
32
33void __init xen_build_dynamic_phys_to_machine(void);
24 34
35void xen_init_irq_ops(void);
25void xen_setup_timer(int cpu); 36void xen_setup_timer(int cpu);
37void xen_teardown_timer(int cpu);
38cycle_t xen_clocksource_read(void);
26void xen_setup_cpu_clockevents(void); 39void xen_setup_cpu_clockevents(void);
27unsigned long xen_cpu_khz(void); 40unsigned long xen_tsc_khz(void);
28void __init xen_time_init(void); 41void __init xen_time_init(void);
29unsigned long xen_get_wallclock(void); 42unsigned long xen_get_wallclock(void);
30int xen_set_wallclock(unsigned long time); 43int xen_set_wallclock(unsigned long time);
@@ -36,23 +49,19 @@ bool xen_vcpu_stolen(int vcpu);
36 49
37void xen_mark_init_mm_pinned(void); 50void xen_mark_init_mm_pinned(void);
38 51
39void __init xen_fill_possible_map(void);
40
41void __init xen_setup_vcpu_info_placement(void); 52void __init xen_setup_vcpu_info_placement(void);
42void xen_smp_prepare_boot_cpu(void);
43void xen_smp_prepare_cpus(unsigned int max_cpus);
44int xen_cpu_up(unsigned int cpu);
45void xen_smp_cpus_done(unsigned int max_cpus);
46 53
47void xen_smp_send_stop(void); 54#ifdef CONFIG_SMP
48void xen_smp_send_reschedule(int cpu); 55void xen_smp_init(void);
49int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic, 56
50 int wait); 57void __init xen_init_spinlocks(void);
51int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, 58__cpuinit void xen_init_lock_cpu(int cpu);
52 int nonatomic, int wait); 59void xen_uninit_lock_cpu(int cpu);
53 60
54int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), 61extern cpumask_t xen_cpu_initialized_map;
55 void *info, int wait); 62#else
63static inline void xen_smp_init(void) {}
64#endif
56 65
57 66
58/* Declare an asm function, along with symbols needed to make it 67/* Declare an asm function, along with symbols needed to make it
@@ -67,7 +76,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
67DECL_ASM(unsigned long, xen_save_fl_direct, void); 76DECL_ASM(unsigned long, xen_save_fl_direct, void);
68DECL_ASM(void, xen_restore_fl_direct, unsigned long); 77DECL_ASM(void, xen_restore_fl_direct, unsigned long);
69 78
79/* These are not functions, and cannot be called normally */
70void xen_iret(void); 80void xen_iret(void);
71void xen_sysexit(void); 81void xen_sysexit(void);
82void xen_sysret32(void);
83void xen_sysret64(void);
84void xen_adjust_exception_frame(void);
72 85
73#endif /* XEN_OPS_H */ 86#endif /* XEN_OPS_H */