-rw-r--r--  arch/x86/include/asm/page.h            3
-rw-r--r--  arch/x86/include/asm/paravirt.h      453
-rw-r--r--  arch/x86/include/asm/pgtable.h        38
-rw-r--r--  arch/x86/kernel/entry_64.S             2
-rw-r--r--  arch/x86/kernel/paravirt.c            55
-rw-r--r--  arch/x86/kernel/paravirt_patch_32.c   12
-rw-r--r--  arch/x86/kernel/paravirt_patch_64.c   15
-rw-r--r--  arch/x86/kernel/vmi_32.c               9
-rw-r--r--  arch/x86/kernel/vsmp_64.c             12
-rw-r--r--  arch/x86/lguest/boot.c                13
-rw-r--r--  arch/x86/xen/Makefile                  3
-rw-r--r--  arch/x86/xen/enlighten.c             767
-rw-r--r--  arch/x86/xen/irq.c                    14
-rw-r--r--  arch/x86/xen/mmu.c                   745
-rw-r--r--  arch/x86/xen/mmu.h                     3
-rw-r--r--  arch/x86/xen/smp.c                     6
-rw-r--r--  arch/x86/xen/xen-asm.S               140
-rw-r--r--  arch/x86/xen/xen-asm.h                12
-rw-r--r--  arch/x86/xen/xen-asm_32.S            111
-rw-r--r--  arch/x86/xen/xen-asm_64.S            134
-rw-r--r--  arch/x86/xen/xen-ops.h                10
21 files changed, 1379 insertions, 1178 deletions
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index e9873a2e8695..6b9810859daf 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -147,7 +147,7 @@ static inline pteval_t native_pte_val(pte_t pte)
147 return pte.pte; 147 return pte.pte;
148} 148}
149 149
150static inline pteval_t native_pte_flags(pte_t pte) 150static inline pteval_t pte_flags(pte_t pte)
151{ 151{
152 return native_pte_val(pte) & PTE_FLAGS_MASK; 152 return native_pte_val(pte) & PTE_FLAGS_MASK;
153} 153}
@@ -173,7 +173,6 @@ static inline pteval_t native_pte_flags(pte_t pte)
173#endif 173#endif
174 174
175#define pte_val(x) native_pte_val(x) 175#define pte_val(x) native_pte_val(x)
176#define pte_flags(x) native_pte_flags(x)
177#define __pte(x) native_make_pte(x) 176#define __pte(x) native_make_pte(x)
178 177
179#endif /* CONFIG_PARAVIRT */ 178#endif /* CONFIG_PARAVIRT */
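
The page.h hunks above are the flip side of dropping pv_mmu_ops.pte_flags later in this patch: a hypervisor only rewrites the frame-number bits of a pte, never the flag bits, so pte_flags() can be a plain native helper. A stand-alone sketch of that invariant; the mask value here is illustrative, not the kernel's exact PTE_PFN_MASK:

#include <stdint.h>
#include <assert.h>

typedef uint64_t pteval_t;

#define DEMO_PTE_PFN_MASK   0x000ffffffffff000ULL   /* assumed pfn field */
#define DEMO_PTE_FLAGS_MASK (~DEMO_PTE_PFN_MASK)

/* pte_flags() only looks at bits a hypervisor never rewrites */
static pteval_t demo_pte_flags(pteval_t pte)
{
	return pte & DEMO_PTE_FLAGS_MASK;
}

int main(void)
{
	pteval_t native   = 0x0000000012345067ULL;   /* pfn 0x12345 + flags */
	pteval_t remapped = (native & DEMO_PTE_FLAGS_MASK) | 0x00000000abcde000ULL;

	/* same flags whether or not the frame number was translated */
	assert(demo_pte_flags(native) == demo_pte_flags(remapped));
	return 0;
}
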
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index c26c6bf4da00..c85e7475e171 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -12,21 +12,38 @@
12#define CLBR_EAX (1 << 0) 12#define CLBR_EAX (1 << 0)
13#define CLBR_ECX (1 << 1) 13#define CLBR_ECX (1 << 1)
14#define CLBR_EDX (1 << 2) 14#define CLBR_EDX (1 << 2)
15#define CLBR_EDI (1 << 3)
15 16
16#ifdef CONFIG_X86_64 17#ifdef CONFIG_X86_32
17#define CLBR_RSI (1 << 3) 18/* CLBR_ANY should match all regs platform has. For i386, that's just it */
18#define CLBR_RDI (1 << 4) 19#define CLBR_ANY ((1 << 4) - 1)
20
21#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX)
22#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX)
23#define CLBR_SCRATCH (0)
24#else
25#define CLBR_RAX CLBR_EAX
26#define CLBR_RCX CLBR_ECX
27#define CLBR_RDX CLBR_EDX
28#define CLBR_RDI CLBR_EDI
29#define CLBR_RSI (1 << 4)
19#define CLBR_R8 (1 << 5) 30#define CLBR_R8 (1 << 5)
20#define CLBR_R9 (1 << 6) 31#define CLBR_R9 (1 << 6)
21#define CLBR_R10 (1 << 7) 32#define CLBR_R10 (1 << 7)
22#define CLBR_R11 (1 << 8) 33#define CLBR_R11 (1 << 8)
34
23#define CLBR_ANY ((1 << 9) - 1) 35#define CLBR_ANY ((1 << 9) - 1)
36
37#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \
38 CLBR_RCX | CLBR_R8 | CLBR_R9)
39#define CLBR_RET_REG (CLBR_RAX)
40#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11)
41
24#include <asm/desc_defs.h> 42#include <asm/desc_defs.h>
25#else
26/* CLBR_ANY should match all regs platform has. For i386, that's just it */
27#define CLBR_ANY ((1 << 3) - 1)
28#endif /* X86_64 */ 43#endif /* X86_64 */
29 44
45#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG)
46
30#ifndef __ASSEMBLY__ 47#ifndef __ASSEMBLY__
31#include <linux/types.h> 48#include <linux/types.h>
32#include <linux/cpumask.h> 49#include <linux/cpumask.h>
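
A quick way to see what the new CLBR_CALLEE_SAVE set works out to on each architecture; the bit values below are recomputed from the definitions in the hunk above, and the results match the PV_SAVE_ALL_CALLER_REGS sequences added further down in this header (just %ecx on 32-bit, everything except %rax on 64-bit):

#include <stdio.h>

int main(void)
{
	/* 32-bit bits: EAX=1, ECX=2, EDX=4 (EDI=8 is unused here) */
	unsigned arg32 = 1 | 4 | 2, ret32 = 1 | 4, scratch32 = 0;
	/* 64-bit bits: RAX=1, RCX=2, RDX=4, RDI=8, RSI=16, R8..R11=32..256 */
	unsigned arg64 = 8 | 16 | 4 | 2 | 32 | 64, ret64 = 1, scratch64 = 128 | 256;

	printf("32-bit CLBR_CALLEE_SAVE = %#x (just ECX)\n",
	       (arg32 | scratch32) & ~ret32);               /* 0x2   */
	printf("64-bit CLBR_CALLEE_SAVE = %#x (all but RAX)\n",
	       (arg64 | scratch64) & ~ret64);                /* 0x1fe */
	return 0;
}
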
@@ -40,6 +57,14 @@ struct tss_struct;
40struct mm_struct; 57struct mm_struct;
41struct desc_struct; 58struct desc_struct;
42 59
60/*
61 * Wrapper type for pointers to code which uses the non-standard
62 * calling convention. See PV_CALL_SAVE_REGS_THUNK below.
63 */
64struct paravirt_callee_save {
65 void *func;
66};
67
43/* general info */ 68/* general info */
44struct pv_info { 69struct pv_info {
45 unsigned int kernel_rpl; 70 unsigned int kernel_rpl;
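
struct paravirt_callee_save is more than documentation: wrapping the pointer in a one-member struct gives functions that use the register-preserving convention a distinct C type, so an unwrapped function cannot be dropped into one of these slots by accident. A minimal user-space model of the idea (demo names, not the kernel's):

struct demo_callee_save {
	void *func;
};

#define DEMO_PV_CALLEE_SAVE(f) ((struct demo_callee_save){ (void *)(f) })

struct demo_irq_ops {
	struct demo_callee_save save_fl;   /* register-preserving convention */
	void (*safe_halt)(void);           /* ordinary C convention */
};

static unsigned long demo_native_save_fl(void) { return 0x200; }
static void demo_native_safe_halt(void) { }

int main(void)
{
	struct demo_irq_ops ops = {
		.save_fl   = DEMO_PV_CALLEE_SAVE(demo_native_save_fl),
		/* .save_fl = demo_native_save_fl   <- would not compile */
		.safe_halt = demo_native_safe_halt,
	};
	unsigned long (*fl)(void) = (unsigned long (*)(void))ops.save_fl.func;

	ops.safe_halt();
	return fl() == 0x200 ? 0 : 1;
}
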
@@ -189,11 +214,15 @@ struct pv_irq_ops {
189 * expected to use X86_EFLAGS_IF; all other bits 214 * expected to use X86_EFLAGS_IF; all other bits
190 * returned from save_fl are undefined, and may be ignored by 215 * returned from save_fl are undefined, and may be ignored by
191 * restore_fl. 216 * restore_fl.
217 *
218 * NOTE: These functions callers expect the callee to preserve
219 * more registers than the standard C calling convention.
192 */ 220 */
193 unsigned long (*save_fl)(void); 221 struct paravirt_callee_save save_fl;
194 void (*restore_fl)(unsigned long); 222 struct paravirt_callee_save restore_fl;
195 void (*irq_disable)(void); 223 struct paravirt_callee_save irq_disable;
196 void (*irq_enable)(void); 224 struct paravirt_callee_save irq_enable;
225
197 void (*safe_halt)(void); 226 void (*safe_halt)(void);
198 void (*halt)(void); 227 void (*halt)(void);
199 228
@@ -279,12 +308,11 @@ struct pv_mmu_ops {
279 void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, 308 void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
280 pte_t *ptep, pte_t pte); 309 pte_t *ptep, pte_t pte);
281 310
282 pteval_t (*pte_val)(pte_t); 311 struct paravirt_callee_save pte_val;
283 pteval_t (*pte_flags)(pte_t); 312 struct paravirt_callee_save make_pte;
284 pte_t (*make_pte)(pteval_t pte);
285 313
286 pgdval_t (*pgd_val)(pgd_t); 314 struct paravirt_callee_save pgd_val;
287 pgd_t (*make_pgd)(pgdval_t pgd); 315 struct paravirt_callee_save make_pgd;
288 316
289#if PAGETABLE_LEVELS >= 3 317#if PAGETABLE_LEVELS >= 3
290#ifdef CONFIG_X86_PAE 318#ifdef CONFIG_X86_PAE
@@ -299,12 +327,12 @@ struct pv_mmu_ops {
299 327
300 void (*set_pud)(pud_t *pudp, pud_t pudval); 328 void (*set_pud)(pud_t *pudp, pud_t pudval);
301 329
302 pmdval_t (*pmd_val)(pmd_t); 330 struct paravirt_callee_save pmd_val;
303 pmd_t (*make_pmd)(pmdval_t pmd); 331 struct paravirt_callee_save make_pmd;
304 332
305#if PAGETABLE_LEVELS == 4 333#if PAGETABLE_LEVELS == 4
306 pudval_t (*pud_val)(pud_t); 334 struct paravirt_callee_save pud_val;
307 pud_t (*make_pud)(pudval_t pud); 335 struct paravirt_callee_save make_pud;
308 336
309 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); 337 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
310#endif /* PAGETABLE_LEVELS == 4 */ 338#endif /* PAGETABLE_LEVELS == 4 */
@@ -389,6 +417,8 @@ extern struct pv_lock_ops pv_lock_ops;
389 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") 417 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
390 418
391unsigned paravirt_patch_nop(void); 419unsigned paravirt_patch_nop(void);
420unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
421unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
392unsigned paravirt_patch_ignore(unsigned len); 422unsigned paravirt_patch_ignore(unsigned len);
393unsigned paravirt_patch_call(void *insnbuf, 423unsigned paravirt_patch_call(void *insnbuf,
394 const void *target, u16 tgt_clobbers, 424 const void *target, u16 tgt_clobbers,
@@ -480,25 +510,45 @@ int paravirt_disable_iospace(void);
480 * makes sure the incoming and outgoing types are always correct. 510 * makes sure the incoming and outgoing types are always correct.
481 */ 511 */
482#ifdef CONFIG_X86_32 512#ifdef CONFIG_X86_32
483#define PVOP_VCALL_ARGS unsigned long __eax, __edx, __ecx 513#define PVOP_VCALL_ARGS \
514 unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx
484#define PVOP_CALL_ARGS PVOP_VCALL_ARGS 515#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
516
517#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
518#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x))
519#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x))
520
485#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ 521#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \
486 "=c" (__ecx) 522 "=c" (__ecx)
487#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS 523#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS
524
525#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx)
526#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
527
488#define EXTRA_CLOBBERS 528#define EXTRA_CLOBBERS
489#define VEXTRA_CLOBBERS 529#define VEXTRA_CLOBBERS
490#else 530#else /* CONFIG_X86_64 */
491#define PVOP_VCALL_ARGS unsigned long __edi, __esi, __edx, __ecx 531#define PVOP_VCALL_ARGS \
532 unsigned long __edi = __edi, __esi = __esi, \
533 __edx = __edx, __ecx = __ecx
492#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax 534#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax
535
536#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
537#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
538#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x))
539#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x))
540
493#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ 541#define PVOP_VCALL_CLOBBERS "=D" (__edi), \
494 "=S" (__esi), "=d" (__edx), \ 542 "=S" (__esi), "=d" (__edx), \
495 "=c" (__ecx) 543 "=c" (__ecx)
496
497#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) 544#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
498 545
546#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
547#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
548
499#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" 549#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
500#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" 550#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11"
501#endif 551#endif /* CONFIG_X86_32 */
502 552
503#ifdef CONFIG_PARAVIRT_DEBUG 553#ifdef CONFIG_PARAVIRT_DEBUG
504#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) 554#define PVOP_TEST_NULL(op) BUG_ON(op == NULL)
@@ -506,10 +556,11 @@ int paravirt_disable_iospace(void);
506#define PVOP_TEST_NULL(op) ((void)op) 556#define PVOP_TEST_NULL(op) ((void)op)
507#endif 557#endif
508 558
509#define __PVOP_CALL(rettype, op, pre, post, ...) \ 559#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \
560 pre, post, ...) \
510 ({ \ 561 ({ \
511 rettype __ret; \ 562 rettype __ret; \
512 PVOP_CALL_ARGS; \ 563 PVOP_CALL_ARGS; \
513 PVOP_TEST_NULL(op); \ 564 PVOP_TEST_NULL(op); \
514 /* This is 32-bit specific, but is okay in 64-bit */ \ 565 /* This is 32-bit specific, but is okay in 64-bit */ \
515 /* since this condition will never hold */ \ 566 /* since this condition will never hold */ \
@@ -517,70 +568,113 @@ int paravirt_disable_iospace(void);
517 asm volatile(pre \ 568 asm volatile(pre \
518 paravirt_alt(PARAVIRT_CALL) \ 569 paravirt_alt(PARAVIRT_CALL) \
519 post \ 570 post \
520 : PVOP_CALL_CLOBBERS \ 571 : call_clbr \
521 : paravirt_type(op), \ 572 : paravirt_type(op), \
522 paravirt_clobber(CLBR_ANY), \ 573 paravirt_clobber(clbr), \
523 ##__VA_ARGS__ \ 574 ##__VA_ARGS__ \
524 : "memory", "cc" EXTRA_CLOBBERS); \ 575 : "memory", "cc" extra_clbr); \
525 __ret = (rettype)((((u64)__edx) << 32) | __eax); \ 576 __ret = (rettype)((((u64)__edx) << 32) | __eax); \
526 } else { \ 577 } else { \
527 asm volatile(pre \ 578 asm volatile(pre \
528 paravirt_alt(PARAVIRT_CALL) \ 579 paravirt_alt(PARAVIRT_CALL) \
529 post \ 580 post \
530 : PVOP_CALL_CLOBBERS \ 581 : call_clbr \
531 : paravirt_type(op), \ 582 : paravirt_type(op), \
532 paravirt_clobber(CLBR_ANY), \ 583 paravirt_clobber(clbr), \
533 ##__VA_ARGS__ \ 584 ##__VA_ARGS__ \
534 : "memory", "cc" EXTRA_CLOBBERS); \ 585 : "memory", "cc" extra_clbr); \
535 __ret = (rettype)__eax; \ 586 __ret = (rettype)__eax; \
536 } \ 587 } \
537 __ret; \ 588 __ret; \
538 }) 589 })
539#define __PVOP_VCALL(op, pre, post, ...) \ 590
591#define __PVOP_CALL(rettype, op, pre, post, ...) \
592 ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \
593 EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
594
595#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \
596 ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
597 PVOP_CALLEE_CLOBBERS, , \
598 pre, post, ##__VA_ARGS__)
599
600
601#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \
540 ({ \ 602 ({ \
541 PVOP_VCALL_ARGS; \ 603 PVOP_VCALL_ARGS; \
542 PVOP_TEST_NULL(op); \ 604 PVOP_TEST_NULL(op); \
543 asm volatile(pre \ 605 asm volatile(pre \
544 paravirt_alt(PARAVIRT_CALL) \ 606 paravirt_alt(PARAVIRT_CALL) \
545 post \ 607 post \
546 : PVOP_VCALL_CLOBBERS \ 608 : call_clbr \
547 : paravirt_type(op), \ 609 : paravirt_type(op), \
548 paravirt_clobber(CLBR_ANY), \ 610 paravirt_clobber(clbr), \
549 ##__VA_ARGS__ \ 611 ##__VA_ARGS__ \
550 : "memory", "cc" VEXTRA_CLOBBERS); \ 612 : "memory", "cc" extra_clbr); \
551 }) 613 })
552 614
615#define __PVOP_VCALL(op, pre, post, ...) \
616 ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
617 VEXTRA_CLOBBERS, \
618 pre, post, ##__VA_ARGS__)
619
620#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \
621 ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
622 PVOP_VCALLEE_CLOBBERS, , \
623 pre, post, ##__VA_ARGS__)
624
625
626
553#define PVOP_CALL0(rettype, op) \ 627#define PVOP_CALL0(rettype, op) \
554 __PVOP_CALL(rettype, op, "", "") 628 __PVOP_CALL(rettype, op, "", "")
555#define PVOP_VCALL0(op) \ 629#define PVOP_VCALL0(op) \
556 __PVOP_VCALL(op, "", "") 630 __PVOP_VCALL(op, "", "")
557 631
632#define PVOP_CALLEE0(rettype, op) \
633 __PVOP_CALLEESAVE(rettype, op, "", "")
634#define PVOP_VCALLEE0(op) \
635 __PVOP_VCALLEESAVE(op, "", "")
636
637
558#define PVOP_CALL1(rettype, op, arg1) \ 638#define PVOP_CALL1(rettype, op, arg1) \
559 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1))) 639 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
560#define PVOP_VCALL1(op, arg1) \ 640#define PVOP_VCALL1(op, arg1) \
561 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1))) 641 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1))
642
643#define PVOP_CALLEE1(rettype, op, arg1) \
644 __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
645#define PVOP_VCALLEE1(op, arg1) \
646 __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1))
647
562 648
563#define PVOP_CALL2(rettype, op, arg1, arg2) \ 649#define PVOP_CALL2(rettype, op, arg1, arg2) \
564 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ 650 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
565 "1" ((unsigned long)(arg2))) 651 PVOP_CALL_ARG2(arg2))
566#define PVOP_VCALL2(op, arg1, arg2) \ 652#define PVOP_VCALL2(op, arg1, arg2) \
567 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ 653 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
568 "1" ((unsigned long)(arg2))) 654 PVOP_CALL_ARG2(arg2))
655
656#define PVOP_CALLEE2(rettype, op, arg1, arg2) \
657 __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
658 PVOP_CALL_ARG2(arg2))
659#define PVOP_VCALLEE2(op, arg1, arg2) \
660 __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \
661 PVOP_CALL_ARG2(arg2))
662
569 663
570#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ 664#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
571 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ 665 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
572 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3))) 666 PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
573#define PVOP_VCALL3(op, arg1, arg2, arg3) \ 667#define PVOP_VCALL3(op, arg1, arg2, arg3) \
574 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ 668 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
575 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3))) 669 PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
576 670
577/* This is the only difference in x86_64. We can make it much simpler */ 671/* This is the only difference in x86_64. We can make it much simpler */
578#ifdef CONFIG_X86_32 672#ifdef CONFIG_X86_32
579#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ 673#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
580 __PVOP_CALL(rettype, op, \ 674 __PVOP_CALL(rettype, op, \
581 "push %[_arg4];", "lea 4(%%esp),%%esp;", \ 675 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
582 "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ 676 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
583 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) 677 PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4)))
584#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ 678#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
585 __PVOP_VCALL(op, \ 679 __PVOP_VCALL(op, \
586 "push %[_arg4];", "lea 4(%%esp),%%esp;", \ 680 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
@@ -588,13 +682,13 @@ int paravirt_disable_iospace(void);
588 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) 682 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
589#else 683#else
590#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ 684#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
591 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ 685 __PVOP_CALL(rettype, op, "", "", \
592 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \ 686 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
593 "3"((unsigned long)(arg4))) 687 PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
594#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ 688#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
595 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ 689 __PVOP_VCALL(op, "", "", \
596 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \ 690 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
597 "3"((unsigned long)(arg4))) 691 PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
598#endif 692#endif
599 693
600static inline int paravirt_enabled(void) 694static inline int paravirt_enabled(void)
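
The practical payoff of the *_CALLEE variants is in the clobber lists: PVOP_CALLEE_CLOBBERS names only the return register(s), so the compiler may keep live values in the argument and scratch registers across the call instead of spilling them. A user-space x86-64 sketch of the same pattern; the called op is written in assembly here so it genuinely preserves everything but %rax (in the kernel, the PV_CALLEE_SAVE_REGS_THUNK further down provides that guarantee for plain C functions), and note that kernel builds also use -mno-red-zone, which matters when calling out of inline asm:

/* an "op" that only touches %rax, like a thunked save_fl */
extern unsigned long demo_op(void);
asm(".pushsection .text\n"
    "demo_op:\n\t"
    "movq $0x200, %rax\n\t"
    "ret\n"
    ".popsection");

static inline unsigned long demo_callee_call(unsigned long (*op)(void))
{
	unsigned long ret;

	asm volatile("call *%1"
		     : "=a" (ret)       /* PVOP_CALLEE_CLOBBERS: just the return reg */
		     : "r" (op)
		     : "memory", "cc");
	return ret;
}

int main(void)
{
	return demo_callee_call(demo_op) == 0x200 ? 0 : 1;
}
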
@@ -1061,13 +1155,13 @@ static inline pte_t __pte(pteval_t val)
1061 pteval_t ret; 1155 pteval_t ret;
1062 1156
1063 if (sizeof(pteval_t) > sizeof(long)) 1157 if (sizeof(pteval_t) > sizeof(long))
1064 ret = PVOP_CALL2(pteval_t, 1158 ret = PVOP_CALLEE2(pteval_t,
1065 pv_mmu_ops.make_pte, 1159 pv_mmu_ops.make_pte,
1066 val, (u64)val >> 32); 1160 val, (u64)val >> 32);
1067 else 1161 else
1068 ret = PVOP_CALL1(pteval_t, 1162 ret = PVOP_CALLEE1(pteval_t,
1069 pv_mmu_ops.make_pte, 1163 pv_mmu_ops.make_pte,
1070 val); 1164 val);
1071 1165
1072 return (pte_t) { .pte = ret }; 1166 return (pte_t) { .pte = ret };
1073} 1167}
@@ -1077,29 +1171,12 @@ static inline pteval_t pte_val(pte_t pte)
1077 pteval_t ret; 1171 pteval_t ret;
1078 1172
1079 if (sizeof(pteval_t) > sizeof(long)) 1173 if (sizeof(pteval_t) > sizeof(long))
1080 ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val, 1174 ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val,
1081 pte.pte, (u64)pte.pte >> 32); 1175 pte.pte, (u64)pte.pte >> 32);
1082 else
1083 ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val,
1084 pte.pte);
1085
1086 return ret;
1087}
1088
1089static inline pteval_t pte_flags(pte_t pte)
1090{
1091 pteval_t ret;
1092
1093 if (sizeof(pteval_t) > sizeof(long))
1094 ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags,
1095 pte.pte, (u64)pte.pte >> 32);
1096 else 1176 else
1097 ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags, 1177 ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val,
1098 pte.pte); 1178 pte.pte);
1099 1179
1100#ifdef CONFIG_PARAVIRT_DEBUG
1101 BUG_ON(ret & PTE_PFN_MASK);
1102#endif
1103 return ret; 1180 return ret;
1104} 1181}
1105 1182
@@ -1108,11 +1185,11 @@ static inline pgd_t __pgd(pgdval_t val)
1108 pgdval_t ret; 1185 pgdval_t ret;
1109 1186
1110 if (sizeof(pgdval_t) > sizeof(long)) 1187 if (sizeof(pgdval_t) > sizeof(long))
1111 ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd, 1188 ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd,
1112 val, (u64)val >> 32); 1189 val, (u64)val >> 32);
1113 else 1190 else
1114 ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd, 1191 ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd,
1115 val); 1192 val);
1116 1193
1117 return (pgd_t) { ret }; 1194 return (pgd_t) { ret };
1118} 1195}
@@ -1122,11 +1199,11 @@ static inline pgdval_t pgd_val(pgd_t pgd)
1122 pgdval_t ret; 1199 pgdval_t ret;
1123 1200
1124 if (sizeof(pgdval_t) > sizeof(long)) 1201 if (sizeof(pgdval_t) > sizeof(long))
1125 ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val, 1202 ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val,
1126 pgd.pgd, (u64)pgd.pgd >> 32); 1203 pgd.pgd, (u64)pgd.pgd >> 32);
1127 else 1204 else
1128 ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val, 1205 ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val,
1129 pgd.pgd); 1206 pgd.pgd);
1130 1207
1131 return ret; 1208 return ret;
1132} 1209}
@@ -1190,11 +1267,11 @@ static inline pmd_t __pmd(pmdval_t val)
1190 pmdval_t ret; 1267 pmdval_t ret;
1191 1268
1192 if (sizeof(pmdval_t) > sizeof(long)) 1269 if (sizeof(pmdval_t) > sizeof(long))
1193 ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd, 1270 ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd,
1194 val, (u64)val >> 32); 1271 val, (u64)val >> 32);
1195 else 1272 else
1196 ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd, 1273 ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd,
1197 val); 1274 val);
1198 1275
1199 return (pmd_t) { ret }; 1276 return (pmd_t) { ret };
1200} 1277}
@@ -1204,11 +1281,11 @@ static inline pmdval_t pmd_val(pmd_t pmd)
1204 pmdval_t ret; 1281 pmdval_t ret;
1205 1282
1206 if (sizeof(pmdval_t) > sizeof(long)) 1283 if (sizeof(pmdval_t) > sizeof(long))
1207 ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val, 1284 ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val,
1208 pmd.pmd, (u64)pmd.pmd >> 32); 1285 pmd.pmd, (u64)pmd.pmd >> 32);
1209 else 1286 else
1210 ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val, 1287 ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val,
1211 pmd.pmd); 1288 pmd.pmd);
1212 1289
1213 return ret; 1290 return ret;
1214} 1291}
@@ -1230,11 +1307,11 @@ static inline pud_t __pud(pudval_t val)
1230 pudval_t ret; 1307 pudval_t ret;
1231 1308
1232 if (sizeof(pudval_t) > sizeof(long)) 1309 if (sizeof(pudval_t) > sizeof(long))
1233 ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud, 1310 ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud,
1234 val, (u64)val >> 32); 1311 val, (u64)val >> 32);
1235 else 1312 else
1236 ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud, 1313 ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud,
1237 val); 1314 val);
1238 1315
1239 return (pud_t) { ret }; 1316 return (pud_t) { ret };
1240} 1317}
@@ -1244,11 +1321,11 @@ static inline pudval_t pud_val(pud_t pud)
1244 pudval_t ret; 1321 pudval_t ret;
1245 1322
1246 if (sizeof(pudval_t) > sizeof(long)) 1323 if (sizeof(pudval_t) > sizeof(long))
1247 ret = PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val, 1324 ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.pud_val,
1248 pud.pud, (u64)pud.pud >> 32); 1325 pud.pud, (u64)pud.pud >> 32);
1249 else 1326 else
1250 ret = PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val, 1327 ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.pud_val,
1251 pud.pud); 1328 pud.pud);
1252 1329
1253 return ret; 1330 return ret;
1254} 1331}
@@ -1389,6 +1466,9 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
1389} 1466}
1390 1467
1391void _paravirt_nop(void); 1468void _paravirt_nop(void);
1469u32 _paravirt_ident_32(u32);
1470u64 _paravirt_ident_64(u64);
1471
1392#define paravirt_nop ((void *)_paravirt_nop) 1472#define paravirt_nop ((void *)_paravirt_nop)
1393 1473
1394void paravirt_use_bytelocks(void); 1474void paravirt_use_bytelocks(void);
@@ -1440,12 +1520,37 @@ extern struct paravirt_patch_site __parainstructions[],
1440 __parainstructions_end[]; 1520 __parainstructions_end[];
1441 1521
1442#ifdef CONFIG_X86_32 1522#ifdef CONFIG_X86_32
1443#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;" 1523#define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
1444#define PV_RESTORE_REGS "popl %%edx; popl %%ecx" 1524#define PV_RESTORE_REGS "popl %edx; popl %ecx;"
1525
1526/* save and restore all caller-save registers, except return value */
1527#define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;"
1528#define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;"
1529
1445#define PV_FLAGS_ARG "0" 1530#define PV_FLAGS_ARG "0"
1446#define PV_EXTRA_CLOBBERS 1531#define PV_EXTRA_CLOBBERS
1447#define PV_VEXTRA_CLOBBERS 1532#define PV_VEXTRA_CLOBBERS
1448#else 1533#else
1534/* save and restore all caller-save registers, except return value */
1535#define PV_SAVE_ALL_CALLER_REGS \
1536 "push %rcx;" \
1537 "push %rdx;" \
1538 "push %rsi;" \
1539 "push %rdi;" \
1540 "push %r8;" \
1541 "push %r9;" \
1542 "push %r10;" \
1543 "push %r11;"
1544#define PV_RESTORE_ALL_CALLER_REGS \
1545 "pop %r11;" \
1546 "pop %r10;" \
1547 "pop %r9;" \
1548 "pop %r8;" \
1549 "pop %rdi;" \
1550 "pop %rsi;" \
1551 "pop %rdx;" \
1552 "pop %rcx;"
1553
1449/* We save some registers, but all of them, that's too much. We clobber all 1554/* We save some registers, but all of them, that's too much. We clobber all
1450 * caller saved registers but the argument parameter */ 1555 * caller saved registers but the argument parameter */
1451#define PV_SAVE_REGS "pushq %%rdi;" 1556#define PV_SAVE_REGS "pushq %%rdi;"
@@ -1455,52 +1560,76 @@ extern struct paravirt_patch_site __parainstructions[],
1455#define PV_FLAGS_ARG "D" 1560#define PV_FLAGS_ARG "D"
1456#endif 1561#endif
1457 1562
1563/*
1564 * Generate a thunk around a function which saves all caller-save
1565 * registers except for the return value. This allows C functions to
1566 * be called from assembler code where fewer than normal registers are
1567 * available. It may also help code generation around calls from C
1568 * code if the common case doesn't use many registers.
1569 *
1570 * When a callee is wrapped in a thunk, the caller can assume that all
1571 * arg regs and all scratch registers are preserved across the
1572 * call. The return value in rax/eax will not be saved, even for void
1573 * functions.
1574 */
1575#define PV_CALLEE_SAVE_REGS_THUNK(func) \
1576 extern typeof(func) __raw_callee_save_##func; \
1577 static void *__##func##__ __used = func; \
1578 \
1579 asm(".pushsection .text;" \
1580 "__raw_callee_save_" #func ": " \
1581 PV_SAVE_ALL_CALLER_REGS \
1582 "call " #func ";" \
1583 PV_RESTORE_ALL_CALLER_REGS \
1584 "ret;" \
1585 ".popsection")
1586
1587/* Get a reference to a callee-save function */
1588#define PV_CALLEE_SAVE(func) \
1589 ((struct paravirt_callee_save) { __raw_callee_save_##func })
1590
1591/* Promise that "func" already uses the right calling convention */
1592#define __PV_IS_CALLEE_SAVE(func) \
1593 ((struct paravirt_callee_save) { func })
1594
1458static inline unsigned long __raw_local_save_flags(void) 1595static inline unsigned long __raw_local_save_flags(void)
1459{ 1596{
1460 unsigned long f; 1597 unsigned long f;
1461 1598
1462 asm volatile(paravirt_alt(PV_SAVE_REGS 1599 asm volatile(paravirt_alt(PARAVIRT_CALL)
1463 PARAVIRT_CALL
1464 PV_RESTORE_REGS)
1465 : "=a"(f) 1600 : "=a"(f)
1466 : paravirt_type(pv_irq_ops.save_fl), 1601 : paravirt_type(pv_irq_ops.save_fl),
1467 paravirt_clobber(CLBR_EAX) 1602 paravirt_clobber(CLBR_EAX)
1468 : "memory", "cc" PV_VEXTRA_CLOBBERS); 1603 : "memory", "cc");
1469 return f; 1604 return f;
1470} 1605}
1471 1606
1472static inline void raw_local_irq_restore(unsigned long f) 1607static inline void raw_local_irq_restore(unsigned long f)
1473{ 1608{
1474 asm volatile(paravirt_alt(PV_SAVE_REGS 1609 asm volatile(paravirt_alt(PARAVIRT_CALL)
1475 PARAVIRT_CALL
1476 PV_RESTORE_REGS)
1477 : "=a"(f) 1610 : "=a"(f)
1478 : PV_FLAGS_ARG(f), 1611 : PV_FLAGS_ARG(f),
1479 paravirt_type(pv_irq_ops.restore_fl), 1612 paravirt_type(pv_irq_ops.restore_fl),
1480 paravirt_clobber(CLBR_EAX) 1613 paravirt_clobber(CLBR_EAX)
1481 : "memory", "cc" PV_EXTRA_CLOBBERS); 1614 : "memory", "cc");
1482} 1615}
1483 1616
1484static inline void raw_local_irq_disable(void) 1617static inline void raw_local_irq_disable(void)
1485{ 1618{
1486 asm volatile(paravirt_alt(PV_SAVE_REGS 1619 asm volatile(paravirt_alt(PARAVIRT_CALL)
1487 PARAVIRT_CALL
1488 PV_RESTORE_REGS)
1489 : 1620 :
1490 : paravirt_type(pv_irq_ops.irq_disable), 1621 : paravirt_type(pv_irq_ops.irq_disable),
1491 paravirt_clobber(CLBR_EAX) 1622 paravirt_clobber(CLBR_EAX)
1492 : "memory", "eax", "cc" PV_EXTRA_CLOBBERS); 1623 : "memory", "eax", "cc");
1493} 1624}
1494 1625
1495static inline void raw_local_irq_enable(void) 1626static inline void raw_local_irq_enable(void)
1496{ 1627{
1497 asm volatile(paravirt_alt(PV_SAVE_REGS 1628 asm volatile(paravirt_alt(PARAVIRT_CALL)
1498 PARAVIRT_CALL
1499 PV_RESTORE_REGS)
1500 : 1629 :
1501 : paravirt_type(pv_irq_ops.irq_enable), 1630 : paravirt_type(pv_irq_ops.irq_enable),
1502 paravirt_clobber(CLBR_EAX) 1631 paravirt_clobber(CLBR_EAX)
1503 : "memory", "eax", "cc" PV_EXTRA_CLOBBERS); 1632 : "memory", "eax", "cc");
1504} 1633}
1505 1634
1506static inline unsigned long __raw_local_irq_save(void) 1635static inline unsigned long __raw_local_irq_save(void)
@@ -1543,33 +1672,49 @@ static inline unsigned long __raw_local_irq_save(void)
1543 .popsection 1672 .popsection
1544 1673
1545 1674
1675#define COND_PUSH(set, mask, reg) \
1676 .if ((~(set)) & mask); push %reg; .endif
1677#define COND_POP(set, mask, reg) \
1678 .if ((~(set)) & mask); pop %reg; .endif
1679
1546#ifdef CONFIG_X86_64 1680#ifdef CONFIG_X86_64
1547#define PV_SAVE_REGS \ 1681
1548 push %rax; \ 1682#define PV_SAVE_REGS(set) \
1549 push %rcx; \ 1683 COND_PUSH(set, CLBR_RAX, rax); \
1550 push %rdx; \ 1684 COND_PUSH(set, CLBR_RCX, rcx); \
1551 push %rsi; \ 1685 COND_PUSH(set, CLBR_RDX, rdx); \
1552 push %rdi; \ 1686 COND_PUSH(set, CLBR_RSI, rsi); \
1553 push %r8; \ 1687 COND_PUSH(set, CLBR_RDI, rdi); \
1554 push %r9; \ 1688 COND_PUSH(set, CLBR_R8, r8); \
1555 push %r10; \ 1689 COND_PUSH(set, CLBR_R9, r9); \
1556 push %r11 1690 COND_PUSH(set, CLBR_R10, r10); \
1557#define PV_RESTORE_REGS \ 1691 COND_PUSH(set, CLBR_R11, r11)
1558 pop %r11; \ 1692#define PV_RESTORE_REGS(set) \
1559 pop %r10; \ 1693 COND_POP(set, CLBR_R11, r11); \
1560 pop %r9; \ 1694 COND_POP(set, CLBR_R10, r10); \
1561 pop %r8; \ 1695 COND_POP(set, CLBR_R9, r9); \
1562 pop %rdi; \ 1696 COND_POP(set, CLBR_R8, r8); \
1563 pop %rsi; \ 1697 COND_POP(set, CLBR_RDI, rdi); \
1564 pop %rdx; \ 1698 COND_POP(set, CLBR_RSI, rsi); \
1565 pop %rcx; \ 1699 COND_POP(set, CLBR_RDX, rdx); \
1566 pop %rax 1700 COND_POP(set, CLBR_RCX, rcx); \
1701 COND_POP(set, CLBR_RAX, rax)
1702
1567#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8) 1703#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
1568#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8) 1704#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
1569#define PARA_INDIRECT(addr) *addr(%rip) 1705#define PARA_INDIRECT(addr) *addr(%rip)
1570#else 1706#else
1571#define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx 1707#define PV_SAVE_REGS(set) \
1572#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax 1708 COND_PUSH(set, CLBR_EAX, eax); \
1709 COND_PUSH(set, CLBR_EDI, edi); \
1710 COND_PUSH(set, CLBR_ECX, ecx); \
1711 COND_PUSH(set, CLBR_EDX, edx)
1712#define PV_RESTORE_REGS(set) \
1713 COND_POP(set, CLBR_EDX, edx); \
1714 COND_POP(set, CLBR_ECX, ecx); \
1715 COND_POP(set, CLBR_EDI, edi); \
1716 COND_POP(set, CLBR_EAX, eax)
1717
1573#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4) 1718#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)
1574#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4) 1719#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
1575#define PARA_INDIRECT(addr) *%cs:addr 1720#define PARA_INDIRECT(addr) *%cs:addr
@@ -1581,15 +1726,15 @@ static inline unsigned long __raw_local_irq_save(void)
1581 1726
1582#define DISABLE_INTERRUPTS(clobbers) \ 1727#define DISABLE_INTERRUPTS(clobbers) \
1583 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ 1728 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
1584 PV_SAVE_REGS; \ 1729 PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
1585 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \ 1730 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \
1586 PV_RESTORE_REGS;) \ 1731 PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
1587 1732
1588#define ENABLE_INTERRUPTS(clobbers) \ 1733#define ENABLE_INTERRUPTS(clobbers) \
1589 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \ 1734 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
1590 PV_SAVE_REGS; \ 1735 PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
1591 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ 1736 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
1592 PV_RESTORE_REGS;) 1737 PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
1593 1738
1594#define USERGS_SYSRET32 \ 1739#define USERGS_SYSRET32 \
1595 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \ 1740 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \
@@ -1619,11 +1764,15 @@ static inline unsigned long __raw_local_irq_save(void)
1619 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ 1764 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
1620 swapgs) 1765 swapgs)
1621 1766
1767/*
1768 * Note: swapgs is very special, and in practise is either going to be
1769 * implemented with a single "swapgs" instruction or something very
1770 * special. Either way, we don't need to save any registers for
1771 * it.
1772 */
1622#define SWAPGS \ 1773#define SWAPGS \
1623 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ 1774 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
1624 PV_SAVE_REGS; \ 1775 call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs) \
1625 call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \
1626 PV_RESTORE_REGS \
1627 ) 1776 )
1628 1777
1629#define GET_CR2_INTO_RCX \ 1778#define GET_CR2_INTO_RCX \
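
COND_PUSH and COND_POP make their decision at assembly time, so the DISABLE_INTERRUPTS/ENABLE_INTERRUPTS sites now emit a push only for registers that the surrounding assembly needs preserved and that the callee-save thunk does not already keep intact. A small user-space calculator for the 64-bit case, using the register bits defined at the top of this header (CLBR_CALLEE_SAVE hard-coded to its 64-bit value, 0x1fe):

#include <stdio.h>

#define CLBR_RDI  (1 << 3)
#define CLBR_ANY  ((1 << 9) - 1)
#define CLBR_NONE 0
#define CLBR_CALLEE_SAVE 0x1fe            /* 64-bit: everything except RAX */

static void show(const char *site, unsigned clobbers)
{
	static const char *name[9] = { "rax", "rcx", "rdx", "rdi", "rsi",
				       "r8", "r9", "r10", "r11" };
	unsigned set = clobbers | CLBR_CALLEE_SAVE;   /* as the macros above do */
	int pushed = 0;

	printf("%s pushes:", site);
	for (int i = 0; i < 9; i++)
		if (~set & (1u << i)) {               /* the COND_PUSH test */
			printf(" %%%s", name[i]);
			pushed = 1;
		}
	printf(pushed ? "\n" : " nothing\n");
}

int main(void)
{
	show("DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)", CLBR_ANY & ~CLBR_RDI);
	show("DISABLE_INTERRUPTS(CLBR_NONE)", CLBR_NONE);
	return 0;
}
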
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 06bbcbd66e9c..6ceaef08486f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -240,64 +240,78 @@ static inline int pmd_large(pmd_t pte)
240 (_PAGE_PSE | _PAGE_PRESENT); 240 (_PAGE_PSE | _PAGE_PRESENT);
241} 241}
242 242
243static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
244{
245 pteval_t v = native_pte_val(pte);
246
247 return native_make_pte(v | set);
248}
249
250static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
251{
252 pteval_t v = native_pte_val(pte);
253
254 return native_make_pte(v & ~clear);
255}
256
243static inline pte_t pte_mkclean(pte_t pte) 257static inline pte_t pte_mkclean(pte_t pte)
244{ 258{
245 return __pte(pte_val(pte) & ~_PAGE_DIRTY); 259 return pte_clear_flags(pte, _PAGE_DIRTY);
246} 260}
247 261
248static inline pte_t pte_mkold(pte_t pte) 262static inline pte_t pte_mkold(pte_t pte)
249{ 263{
250 return __pte(pte_val(pte) & ~_PAGE_ACCESSED); 264 return pte_clear_flags(pte, _PAGE_ACCESSED);
251} 265}
252 266
253static inline pte_t pte_wrprotect(pte_t pte) 267static inline pte_t pte_wrprotect(pte_t pte)
254{ 268{
255 return __pte(pte_val(pte) & ~_PAGE_RW); 269 return pte_clear_flags(pte, _PAGE_RW);
256} 270}
257 271
258static inline pte_t pte_mkexec(pte_t pte) 272static inline pte_t pte_mkexec(pte_t pte)
259{ 273{
260 return __pte(pte_val(pte) & ~_PAGE_NX); 274 return pte_clear_flags(pte, _PAGE_NX);
261} 275}
262 276
263static inline pte_t pte_mkdirty(pte_t pte) 277static inline pte_t pte_mkdirty(pte_t pte)
264{ 278{
265 return __pte(pte_val(pte) | _PAGE_DIRTY); 279 return pte_set_flags(pte, _PAGE_DIRTY);
266} 280}
267 281
268static inline pte_t pte_mkyoung(pte_t pte) 282static inline pte_t pte_mkyoung(pte_t pte)
269{ 283{
270 return __pte(pte_val(pte) | _PAGE_ACCESSED); 284 return pte_set_flags(pte, _PAGE_ACCESSED);
271} 285}
272 286
273static inline pte_t pte_mkwrite(pte_t pte) 287static inline pte_t pte_mkwrite(pte_t pte)
274{ 288{
275 return __pte(pte_val(pte) | _PAGE_RW); 289 return pte_set_flags(pte, _PAGE_RW);
276} 290}
277 291
278static inline pte_t pte_mkhuge(pte_t pte) 292static inline pte_t pte_mkhuge(pte_t pte)
279{ 293{
280 return __pte(pte_val(pte) | _PAGE_PSE); 294 return pte_set_flags(pte, _PAGE_PSE);
281} 295}
282 296
283static inline pte_t pte_clrhuge(pte_t pte) 297static inline pte_t pte_clrhuge(pte_t pte)
284{ 298{
285 return __pte(pte_val(pte) & ~_PAGE_PSE); 299 return pte_clear_flags(pte, _PAGE_PSE);
286} 300}
287 301
288static inline pte_t pte_mkglobal(pte_t pte) 302static inline pte_t pte_mkglobal(pte_t pte)
289{ 303{
290 return __pte(pte_val(pte) | _PAGE_GLOBAL); 304 return pte_set_flags(pte, _PAGE_GLOBAL);
291} 305}
292 306
293static inline pte_t pte_clrglobal(pte_t pte) 307static inline pte_t pte_clrglobal(pte_t pte)
294{ 308{
295 return __pte(pte_val(pte) & ~_PAGE_GLOBAL); 309 return pte_clear_flags(pte, _PAGE_GLOBAL);
296} 310}
297 311
298static inline pte_t pte_mkspecial(pte_t pte) 312static inline pte_t pte_mkspecial(pte_t pte)
299{ 313{
300 return __pte(pte_val(pte) | _PAGE_SPECIAL); 314 return pte_set_flags(pte, _PAGE_SPECIAL);
301} 315}
302 316
303extern pteval_t __supported_pte_mask; 317extern pteval_t __supported_pte_mask;
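
With pte_set_flags()/pte_clear_flags(), the pte_mk*() helpers above now work on the native pte representation directly instead of round-tripping through the paravirtualized pte_val()/__pte() pair, which on a CONFIG_PARAVIRT kernel used to mean two indirect calls just to flip one bit. A stand-alone model of the new shape (simplified types; _PAGE_DIRTY is bit 6 as on x86):

#include <stdint.h>
#include <assert.h>

typedef uint64_t pteval_t;
typedef struct { pteval_t pte; } pte_t;

#define _PAGE_DIRTY (1ULL << 6)

static pteval_t native_pte_val(pte_t p) { return p.pte; }
static pte_t native_make_pte(pteval_t v) { return (pte_t){ .pte = v }; }

/* pure bit manipulation, no pv_mmu_ops hooks involved */
static pte_t pte_set_flags(pte_t pte, pteval_t set)
{
	return native_make_pte(native_pte_val(pte) | set);
}

static pte_t pte_mkdirty(pte_t pte)
{
	return pte_set_flags(pte, _PAGE_DIRTY);
}

int main(void)
{
	pte_t p = native_make_pte(0x12345001ULL);        /* present, clean */

	assert(native_pte_val(pte_mkdirty(p)) == (0x12345001ULL | _PAGE_DIRTY));
	return 0;
}
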
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a52703864a16..e4c9710cae52 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1140,7 +1140,7 @@ ENTRY(native_load_gs_index)
1140 CFI_STARTPROC 1140 CFI_STARTPROC
1141 pushf 1141 pushf
1142 CFI_ADJUST_CFA_OFFSET 8 1142 CFI_ADJUST_CFA_OFFSET 8
1143 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) 1143 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1144 SWAPGS 1144 SWAPGS
1145gs_change: 1145gs_change:
1146 movl %edi,%gs 1146 movl %edi,%gs
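
The one-character entry_64.S change fixes an operator slip rather than a cosmetic issue: OR-ing CLBR_ANY with a complemented mask selects every register, while the intent of the site is "anything except %rdi". Plugging in the 64-bit CLBR values from this series:

#include <stdio.h>

#define CLBR_RDI (1 << 3)
#define CLBR_ANY ((1 << 9) - 1)

int main(void)
{
	/* old expression: the complement sets every other bit, so this is "all" */
	printf("CLBR_ANY | ~CLBR_RDI = %#x\n", (unsigned)(CLBR_ANY | ~CLBR_RDI));
	/* fixed expression: everything except RDI */
	printf("CLBR_ANY & ~CLBR_RDI = %#x\n", (unsigned)(CLBR_ANY & ~CLBR_RDI));
	return 0;
}
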
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e4c8fb608873..cea11c8e3049 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -44,6 +44,17 @@ void _paravirt_nop(void)
44{ 44{
45} 45}
46 46
47/* identity function, which can be inlined */
48u32 _paravirt_ident_32(u32 x)
49{
50 return x;
51}
52
53u64 _paravirt_ident_64(u64 x)
54{
55 return x;
56}
57
47static void __init default_banner(void) 58static void __init default_banner(void)
48{ 59{
49 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 60 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -138,9 +149,16 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
138 if (opfunc == NULL) 149 if (opfunc == NULL)
139 /* If there's no function, patch it with a ud2a (BUG) */ 150 /* If there's no function, patch it with a ud2a (BUG) */
140 ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); 151 ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
141 else if (opfunc == paravirt_nop) 152 else if (opfunc == _paravirt_nop)
142 /* If the operation is a nop, then nop the callsite */ 153 /* If the operation is a nop, then nop the callsite */
143 ret = paravirt_patch_nop(); 154 ret = paravirt_patch_nop();
155
156 /* identity functions just return their single argument */
157 else if (opfunc == _paravirt_ident_32)
158 ret = paravirt_patch_ident_32(insnbuf, len);
159 else if (opfunc == _paravirt_ident_64)
160 ret = paravirt_patch_ident_64(insnbuf, len);
161
144 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || 162 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
145 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || 163 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
146 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || 164 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
@@ -292,10 +310,10 @@ struct pv_time_ops pv_time_ops = {
292 310
293struct pv_irq_ops pv_irq_ops = { 311struct pv_irq_ops pv_irq_ops = {
294 .init_IRQ = native_init_IRQ, 312 .init_IRQ = native_init_IRQ,
295 .save_fl = native_save_fl, 313 .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
296 .restore_fl = native_restore_fl, 314 .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
297 .irq_disable = native_irq_disable, 315 .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
298 .irq_enable = native_irq_enable, 316 .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
299 .safe_halt = native_safe_halt, 317 .safe_halt = native_safe_halt,
300 .halt = native_halt, 318 .halt = native_halt,
301#ifdef CONFIG_X86_64 319#ifdef CONFIG_X86_64
@@ -373,6 +391,14 @@ struct pv_apic_ops pv_apic_ops = {
373#endif 391#endif
374}; 392};
375 393
394#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
395/* 32-bit pagetable entries */
396#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
397#else
398/* 64-bit pagetable entries */
399#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
400#endif
401
376struct pv_mmu_ops pv_mmu_ops = { 402struct pv_mmu_ops pv_mmu_ops = {
377#ifndef CONFIG_X86_64 403#ifndef CONFIG_X86_64
378 .pagetable_setup_start = native_pagetable_setup_start, 404 .pagetable_setup_start = native_pagetable_setup_start,
@@ -424,22 +450,23 @@ struct pv_mmu_ops pv_mmu_ops = {
424 .pmd_clear = native_pmd_clear, 450 .pmd_clear = native_pmd_clear,
425#endif 451#endif
426 .set_pud = native_set_pud, 452 .set_pud = native_set_pud,
427 .pmd_val = native_pmd_val, 453
428 .make_pmd = native_make_pmd, 454 .pmd_val = PTE_IDENT,
455 .make_pmd = PTE_IDENT,
429 456
430#if PAGETABLE_LEVELS == 4 457#if PAGETABLE_LEVELS == 4
431 .pud_val = native_pud_val, 458 .pud_val = PTE_IDENT,
432 .make_pud = native_make_pud, 459 .make_pud = PTE_IDENT,
460
433 .set_pgd = native_set_pgd, 461 .set_pgd = native_set_pgd,
434#endif 462#endif
435#endif /* PAGETABLE_LEVELS >= 3 */ 463#endif /* PAGETABLE_LEVELS >= 3 */
436 464
437 .pte_val = native_pte_val, 465 .pte_val = PTE_IDENT,
438 .pte_flags = native_pte_flags, 466 .pgd_val = PTE_IDENT,
439 .pgd_val = native_pgd_val,
440 467
441 .make_pte = native_make_pte, 468 .make_pte = PTE_IDENT,
442 .make_pgd = native_make_pgd, 469 .make_pgd = PTE_IDENT,
443 470
444 .dup_mmap = paravirt_nop, 471 .dup_mmap = paravirt_nop,
445 .exit_mmap = paravirt_nop, 472 .exit_mmap = paravirt_nop,
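
Pointing the native pte/pgd ops at _paravirt_ident_32/_paravirt_ident_64 via PTE_IDENT is legitimate because, on bare metal, a pte_t is only a struct wrapper around the raw value; make_pte and pte_val translate nothing. A minimal stand-alone illustration:

#include <stdint.h>
#include <string.h>
#include <assert.h>

typedef uint64_t pteval_t;
typedef struct { pteval_t pte; } pte_t;        /* same shape as the kernel type */

static pteval_t native_pte_val(pte_t p) { return p.pte; }
static pte_t native_make_pte(pteval_t v) { return (pte_t){ .pte = v }; }

int main(void)
{
	pteval_t raw = 0x80000000deadb067ULL;      /* arbitrary pte bits */
	pte_t p = native_make_pte(raw);

	/* bit-for-bit identical in both directions ... */
	assert(native_pte_val(p) == raw);
	assert(memcmp(&p, &raw, sizeof(raw)) == 0);
	/* ... so the patcher may turn these call sites into a single
	 * register move, or into nothing at all on 32-bit. */
	return 0;
}
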
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 9fe644f4861d..d9f32e6d6ab6 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,18 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
12DEF_NATIVE(pv_cpu_ops, clts, "clts"); 12DEF_NATIVE(pv_cpu_ops, clts, "clts");
13DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); 13DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
14 14
15unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
16{
17 /* arg in %eax, return in %eax */
18 return 0;
19}
20
21unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
22{
23 /* arg in %edx:%eax, return in %edx:%eax */
24 return 0;
25}
26
15unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 27unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
16 unsigned long addr, unsigned len) 28 unsigned long addr, unsigned len)
17{ 29{
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 061d01df9ae6..3f08f34f93eb 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -19,6 +19,21 @@ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
19DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); 19DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
20DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); 20DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
21 21
22DEF_NATIVE(, mov32, "mov %edi, %eax");
23DEF_NATIVE(, mov64, "mov %rdi, %rax");
24
25unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
26{
27 return paravirt_patch_insns(insnbuf, len,
28 start__mov32, end__mov32);
29}
30
31unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
32{
33 return paravirt_patch_insns(insnbuf, len,
34 start__mov64, end__mov64);
35}
36
22unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 37unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
23 unsigned long addr, unsigned len) 38 unsigned long addr, unsigned len)
24{ 39{
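
On 64-bit the identity helpers cannot be patched away entirely, because the argument arrives in %rdi (or %edi) while the result must land in %rax, hence the mov32/mov64 templates above. A user-space model of how such a template gets applied; it follows the same contract as paravirt_patch_insns() (copy the replacement if it fits, otherwise leave the indirect call alone), x86-64 only and simplified from the real code:

#include <stdio.h>
#include <string.h>

extern const char demo_start_mov64[], demo_end_mov64[];
asm(".pushsection .text\n"
    "demo_start_mov64:\n\t"
    "mov %rdi, %rax\n"                 /* arg1 -> return register */
    "demo_end_mov64:\n"
    ".popsection");

static unsigned demo_patch_ident_64(void *insnbuf, unsigned len)
{
	unsigned insn_len = demo_end_mov64 - demo_start_mov64;

	if (insn_len > len)
		return len;            /* doesn't fit: keep the indirect call */
	memcpy(insnbuf, demo_start_mov64, insn_len);
	return insn_len;               /* the caller nop-pads the remainder */
}

int main(void)
{
	unsigned char site[8] = { 0 }; /* stand-in for the call-site buffer */
	unsigned used = demo_patch_ident_64(site, sizeof(site));

	printf("patched %u byte(s): %02x %02x %02x\n",
	       used, site[0], site[1], site[2]);
	return 0;
}
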
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 1d3302cc2ddf..eb9e7347928e 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -670,10 +670,11 @@ static inline int __init activate_vmi(void)
670 para_fill(pv_mmu_ops.write_cr2, SetCR2); 670 para_fill(pv_mmu_ops.write_cr2, SetCR2);
671 para_fill(pv_mmu_ops.write_cr3, SetCR3); 671 para_fill(pv_mmu_ops.write_cr3, SetCR3);
672 para_fill(pv_cpu_ops.write_cr4, SetCR4); 672 para_fill(pv_cpu_ops.write_cr4, SetCR4);
673 para_fill(pv_irq_ops.save_fl, GetInterruptMask); 673
674 para_fill(pv_irq_ops.restore_fl, SetInterruptMask); 674 para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
675 para_fill(pv_irq_ops.irq_disable, DisableInterrupts); 675 para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
676 para_fill(pv_irq_ops.irq_enable, EnableInterrupts); 676 para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
677 para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
677 678
678 para_fill(pv_cpu_ops.wbinvd, WBINVD); 679 para_fill(pv_cpu_ops.wbinvd, WBINVD);
679 para_fill(pv_cpu_ops.read_tsc, RDTSC); 680 para_fill(pv_cpu_ops.read_tsc, RDTSC);
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a688f3bfaec2..c609205df594 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -37,6 +37,7 @@ static unsigned long vsmp_save_fl(void)
37 flags &= ~X86_EFLAGS_IF; 37 flags &= ~X86_EFLAGS_IF;
38 return flags; 38 return flags;
39} 39}
40PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
40 41
41static void vsmp_restore_fl(unsigned long flags) 42static void vsmp_restore_fl(unsigned long flags)
42{ 43{
@@ -46,6 +47,7 @@ static void vsmp_restore_fl(unsigned long flags)
46 flags |= X86_EFLAGS_AC; 47 flags |= X86_EFLAGS_AC;
47 native_restore_fl(flags); 48 native_restore_fl(flags);
48} 49}
50PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
49 51
50static void vsmp_irq_disable(void) 52static void vsmp_irq_disable(void)
51{ 53{
@@ -53,6 +55,7 @@ static void vsmp_irq_disable(void)
53 55
54 native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC); 56 native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
55} 57}
58PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
56 59
57static void vsmp_irq_enable(void) 60static void vsmp_irq_enable(void)
58{ 61{
@@ -60,6 +63,7 @@ static void vsmp_irq_enable(void)
60 63
61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); 64 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
62} 65}
66PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
63 67
64static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, 68static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
65 unsigned long addr, unsigned len) 69 unsigned long addr, unsigned len)
@@ -90,10 +94,10 @@ static void __init set_vsmp_pv_ops(void)
90 cap, ctl); 94 cap, ctl);
91 if (cap & ctl & (1 << 4)) { 95 if (cap & ctl & (1 << 4)) {
92 /* Setup irq ops and turn on vSMP IRQ fastpath handling */ 96 /* Setup irq ops and turn on vSMP IRQ fastpath handling */
93 pv_irq_ops.irq_disable = vsmp_irq_disable; 97 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
94 pv_irq_ops.irq_enable = vsmp_irq_enable; 98 pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable);
95 pv_irq_ops.save_fl = vsmp_save_fl; 99 pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
96 pv_irq_ops.restore_fl = vsmp_restore_fl; 100 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
97 pv_init_ops.patch = vsmp_patch; 101 pv_init_ops.patch = vsmp_patch;
98 102
99 ctl &= ~(1 << 4); 103 ctl &= ~(1 << 4);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 92f1c6f3e19d..19e33b6cd593 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -173,24 +173,29 @@ static unsigned long save_fl(void)
173{ 173{
174 return lguest_data.irq_enabled; 174 return lguest_data.irq_enabled;
175} 175}
176PV_CALLEE_SAVE_REGS_THUNK(save_fl);
176 177
177/* restore_flags() just sets the flags back to the value given. */ 178/* restore_flags() just sets the flags back to the value given. */
178static void restore_fl(unsigned long flags) 179static void restore_fl(unsigned long flags)
179{ 180{
180 lguest_data.irq_enabled = flags; 181 lguest_data.irq_enabled = flags;
181} 182}
183PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
182 184
183/* Interrupts go off... */ 185/* Interrupts go off... */
184static void irq_disable(void) 186static void irq_disable(void)
185{ 187{
186 lguest_data.irq_enabled = 0; 188 lguest_data.irq_enabled = 0;
187} 189}
190PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
188 191
189/* Interrupts go on... */ 192/* Interrupts go on... */
190static void irq_enable(void) 193static void irq_enable(void)
191{ 194{
192 lguest_data.irq_enabled = X86_EFLAGS_IF; 195 lguest_data.irq_enabled = X86_EFLAGS_IF;
193} 196}
197PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
198
194/*:*/ 199/*:*/
195/*M:003 Note that we don't check for outstanding interrupts when we re-enable 200/*M:003 Note that we don't check for outstanding interrupts when we re-enable
196 * them (or when we unmask an interrupt). This seems to work for the moment, 201 * them (or when we unmask an interrupt). This seems to work for the moment,
@@ -984,10 +989,10 @@ __init void lguest_init(void)
984 989
985 /* interrupt-related operations */ 990 /* interrupt-related operations */
986 pv_irq_ops.init_IRQ = lguest_init_IRQ; 991 pv_irq_ops.init_IRQ = lguest_init_IRQ;
987 pv_irq_ops.save_fl = save_fl; 992 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
988 pv_irq_ops.restore_fl = restore_fl; 993 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
989 pv_irq_ops.irq_disable = irq_disable; 994 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
990 pv_irq_ops.irq_enable = irq_enable; 995 pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
991 pv_irq_ops.safe_halt = lguest_safe_halt; 996 pv_irq_ops.safe_halt = lguest_safe_halt;
992 997
993 /* init-time operations */ 998 /* init-time operations */
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 6dcefba7836f..3b767d03fd6a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -6,7 +6,8 @@ CFLAGS_REMOVE_irq.o = -pg
6endif 6endif
7 7
8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
9 time.o xen-asm_$(BITS).o grant-table.o suspend.o 9 time.o xen-asm.o xen-asm_$(BITS).o \
10 grant-table.o suspend.o
10 11
11obj-$(CONFIG_SMP) += smp.o spinlock.o 12obj-$(CONFIG_SMP) += smp.o spinlock.o
12obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file 13obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index fe19c88a5029..37230342c2c4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -61,40 +61,13 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
61enum xen_domain_type xen_domain_type = XEN_NATIVE; 61enum xen_domain_type xen_domain_type = XEN_NATIVE;
62EXPORT_SYMBOL_GPL(xen_domain_type); 62EXPORT_SYMBOL_GPL(xen_domain_type);
63 63
64/*
65 * Identity map, in addition to plain kernel map. This needs to be
66 * large enough to allocate page table pages to allocate the rest.
67 * Each page can map 2MB.
68 */
69static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
70
71#ifdef CONFIG_X86_64
72/* l3 pud for userspace vsyscall mapping */
73static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
74#endif /* CONFIG_X86_64 */
75
76/*
77 * Note about cr3 (pagetable base) values:
78 *
79 * xen_cr3 contains the current logical cr3 value; it contains the
80 * last set cr3. This may not be the current effective cr3, because
81 * its update may be being lazily deferred. However, a vcpu looking
82 * at its own cr3 can use this value knowing that it everything will
83 * be self-consistent.
84 *
85 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
86 * hypercall to set the vcpu cr3 is complete (so it may be a little
87 * out of date, but it will never be set early). If one vcpu is
88 * looking at another vcpu's cr3 value, it should use this variable.
89 */
90DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
91DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
92
93struct start_info *xen_start_info; 64struct start_info *xen_start_info;
94EXPORT_SYMBOL_GPL(xen_start_info); 65EXPORT_SYMBOL_GPL(xen_start_info);
95 66
96struct shared_info xen_dummy_shared_info; 67struct shared_info xen_dummy_shared_info;
97 68
69void *xen_initial_gdt;
70
98/* 71/*
99 * Point at some empty memory to start with. We map the real shared_info 72 * Point at some empty memory to start with. We map the real shared_info
100 * page as soon as fixmap is up and running. 73 * page as soon as fixmap is up and running.
@@ -114,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
114 * 87 *
115 * 0: not available, 1: available 88 * 0: not available, 1: available
116 */ 89 */
117static int have_vcpu_info_placement = 90static int have_vcpu_info_placement = 1;
118#ifdef CONFIG_X86_32
119 1
120#else
121 0
122#endif
123 ;
124
125 91
126static void xen_vcpu_setup(int cpu) 92static void xen_vcpu_setup(int cpu)
127{ 93{
@@ -237,7 +203,7 @@ static unsigned long xen_get_debugreg(int reg)
237 return HYPERVISOR_get_debugreg(reg); 203 return HYPERVISOR_get_debugreg(reg);
238} 204}
239 205
240static void xen_leave_lazy(void) 206void xen_leave_lazy(void)
241{ 207{
242 paravirt_leave_lazy(paravirt_get_lazy_mode()); 208 paravirt_leave_lazy(paravirt_get_lazy_mode());
243 xen_mc_flush(); 209 xen_mc_flush();
@@ -598,76 +564,6 @@ static struct apic_ops xen_basic_apic_ops = {
598 564
599#endif 565#endif
600 566
601static void xen_flush_tlb(void)
602{
603 struct mmuext_op *op;
604 struct multicall_space mcs;
605
606 preempt_disable();
607
608 mcs = xen_mc_entry(sizeof(*op));
609
610 op = mcs.args;
611 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
612 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
613
614 xen_mc_issue(PARAVIRT_LAZY_MMU);
615
616 preempt_enable();
617}
618
619static void xen_flush_tlb_single(unsigned long addr)
620{
621 struct mmuext_op *op;
622 struct multicall_space mcs;
623
624 preempt_disable();
625
626 mcs = xen_mc_entry(sizeof(*op));
627 op = mcs.args;
628 op->cmd = MMUEXT_INVLPG_LOCAL;
629 op->arg1.linear_addr = addr & PAGE_MASK;
630 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
631
632 xen_mc_issue(PARAVIRT_LAZY_MMU);
633
634 preempt_enable();
635}
636
637static void xen_flush_tlb_others(const struct cpumask *cpus,
638 struct mm_struct *mm, unsigned long va)
639{
640 struct {
641 struct mmuext_op op;
642 DECLARE_BITMAP(mask, NR_CPUS);
643 } *args;
644 struct multicall_space mcs;
645
646 BUG_ON(cpumask_empty(cpus));
647 BUG_ON(!mm);
648
649 mcs = xen_mc_entry(sizeof(*args));
650 args = mcs.args;
651 args->op.arg2.vcpumask = to_cpumask(args->mask);
652
653 /* Remove us, and any offline CPUS. */
654 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
655 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
656 if (unlikely(cpumask_empty(to_cpumask(args->mask))))
657 goto issue;
658
659 if (va == TLB_FLUSH_ALL) {
660 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
661 } else {
662 args->op.cmd = MMUEXT_INVLPG_MULTI;
663 args->op.arg1.linear_addr = va;
664 }
665
666 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
667
668issue:
669 xen_mc_issue(PARAVIRT_LAZY_MMU);
670}
671 567
672static void xen_clts(void) 568static void xen_clts(void)
673{ 569{
@@ -693,21 +589,6 @@ static void xen_write_cr0(unsigned long cr0)
693 xen_mc_issue(PARAVIRT_LAZY_CPU); 589 xen_mc_issue(PARAVIRT_LAZY_CPU);
694} 590}
695 591
696static void xen_write_cr2(unsigned long cr2)
697{
698 percpu_read(xen_vcpu)->arch.cr2 = cr2;
699}
700
701static unsigned long xen_read_cr2(void)
702{
703 return percpu_read(xen_vcpu)->arch.cr2;
704}
705
706static unsigned long xen_read_cr2_direct(void)
707{
708 return percpu_read(xen_vcpu_info.arch.cr2);
709}
710
711static void xen_write_cr4(unsigned long cr4) 592static void xen_write_cr4(unsigned long cr4)
712{ 593{
713 cr4 &= ~X86_CR4_PGE; 594 cr4 &= ~X86_CR4_PGE;
@@ -716,71 +597,6 @@ static void xen_write_cr4(unsigned long cr4)
716 native_write_cr4(cr4); 597 native_write_cr4(cr4);
717} 598}
718 599
719static unsigned long xen_read_cr3(void)
720{
721 return percpu_read(xen_cr3);
722}
723
724static void set_current_cr3(void *v)
725{
726 percpu_write(xen_current_cr3, (unsigned long)v);
727}
728
729static void __xen_write_cr3(bool kernel, unsigned long cr3)
730{
731 struct mmuext_op *op;
732 struct multicall_space mcs;
733 unsigned long mfn;
734
735 if (cr3)
736 mfn = pfn_to_mfn(PFN_DOWN(cr3));
737 else
738 mfn = 0;
739
740 WARN_ON(mfn == 0 && kernel);
741
742 mcs = __xen_mc_entry(sizeof(*op));
743
744 op = mcs.args;
745 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
746 op->arg1.mfn = mfn;
747
748 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
749
750 if (kernel) {
751 percpu_write(xen_cr3, cr3);
752
753 /* Update xen_current_cr3 once the batch has actually
754 been submitted. */
755 xen_mc_callback(set_current_cr3, (void *)cr3);
756 }
757}
758
759static void xen_write_cr3(unsigned long cr3)
760{
761 BUG_ON(preemptible());
762
763 xen_mc_batch(); /* disables interrupts */
764
765 /* Update while interrupts are disabled, so its atomic with
766 respect to ipis */
767 percpu_write(xen_cr3, cr3);
768
769 __xen_write_cr3(true, cr3);
770
771#ifdef CONFIG_X86_64
772 {
773 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
774 if (user_pgd)
775 __xen_write_cr3(false, __pa(user_pgd));
776 else
777 __xen_write_cr3(false, 0);
778 }
779#endif
780
781 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
782}
783
784static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) 600static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
785{ 601{
786 int ret; 602 int ret;
@@ -822,185 +638,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
822 return ret; 638 return ret;
823} 639}
824 640
825/* Early in boot, while setting up the initial pagetable, assume
826 everything is pinned. */
827static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
828{
829#ifdef CONFIG_FLATMEM
830 BUG_ON(mem_map); /* should only be used early */
831#endif
832 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
833}
834
835/* Early release_pte assumes that all pts are pinned, since there's
836 only init_mm and anything attached to that is pinned. */
837static void xen_release_pte_init(unsigned long pfn)
838{
839 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
840}
841
842static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
843{
844 struct mmuext_op op;
845 op.cmd = cmd;
846 op.arg1.mfn = pfn_to_mfn(pfn);
847 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
848 BUG();
849}
850
851/* This needs to make sure the new pte page is pinned iff its being
852 attached to a pinned pagetable. */
853static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
854{
855 struct page *page = pfn_to_page(pfn);
856
857 if (PagePinned(virt_to_page(mm->pgd))) {
858 SetPagePinned(page);
859
860 vm_unmap_aliases();
861 if (!PageHighMem(page)) {
862 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
863 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
864 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
865 } else {
866 /* make sure there are no stray mappings of
867 this page */
868 kmap_flush_unused();
869 }
870 }
871}
872
873static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
874{
875 xen_alloc_ptpage(mm, pfn, PT_PTE);
876}
877
878static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
879{
880 xen_alloc_ptpage(mm, pfn, PT_PMD);
881}
882
883static int xen_pgd_alloc(struct mm_struct *mm)
884{
885 pgd_t *pgd = mm->pgd;
886 int ret = 0;
887
888 BUG_ON(PagePinned(virt_to_page(pgd)));
889
890#ifdef CONFIG_X86_64
891 {
892 struct page *page = virt_to_page(pgd);
893 pgd_t *user_pgd;
894
895 BUG_ON(page->private != 0);
896
897 ret = -ENOMEM;
898
899 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
900 page->private = (unsigned long)user_pgd;
901
902 if (user_pgd != NULL) {
903 user_pgd[pgd_index(VSYSCALL_START)] =
904 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
905 ret = 0;
906 }
907
908 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
909 }
910#endif
911
912 return ret;
913}
914
915static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
916{
917#ifdef CONFIG_X86_64
918 pgd_t *user_pgd = xen_get_user_pgd(pgd);
919
920 if (user_pgd)
921 free_page((unsigned long)user_pgd);
922#endif
923}
924
925/* This should never happen until we're OK to use struct page */
926static void xen_release_ptpage(unsigned long pfn, unsigned level)
927{
928 struct page *page = pfn_to_page(pfn);
929
930 if (PagePinned(page)) {
931 if (!PageHighMem(page)) {
932 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
933 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
934 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
935 }
936 ClearPagePinned(page);
937 }
938}
939
940static void xen_release_pte(unsigned long pfn)
941{
942 xen_release_ptpage(pfn, PT_PTE);
943}
944
945static void xen_release_pmd(unsigned long pfn)
946{
947 xen_release_ptpage(pfn, PT_PMD);
948}
949
950#if PAGETABLE_LEVELS == 4
951static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
952{
953 xen_alloc_ptpage(mm, pfn, PT_PUD);
954}
955
956static void xen_release_pud(unsigned long pfn)
957{
958 xen_release_ptpage(pfn, PT_PUD);
959}
960#endif
961
962#ifdef CONFIG_HIGHPTE
963static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
964{
965 pgprot_t prot = PAGE_KERNEL;
966
967 if (PagePinned(page))
968 prot = PAGE_KERNEL_RO;
969
970 if (0 && PageHighMem(page))
971 printk("mapping highpte %lx type %d prot %s\n",
972 page_to_pfn(page), type,
973 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
974
975 return kmap_atomic_prot(page, type, prot);
976}
977#endif
978
979#ifdef CONFIG_X86_32
980static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
981{
982 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
983 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
984 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
985 pte_val_ma(pte));
986
987 return pte;
988}
989
990/* Init-time set_pte while constructing initial pagetables, which
991 doesn't allow RO pagetable pages to be remapped RW */
992static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
993{
994 pte = mask_rw_pte(ptep, pte);
995
996 xen_set_pte(ptep, pte);
997}
998#endif
999
1000static __init void xen_pagetable_setup_start(pgd_t *base)
1001{
1002}
1003
1004void xen_setup_shared_info(void) 641void xen_setup_shared_info(void)
1005{ 642{
1006 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 643 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1021,37 +658,6 @@ void xen_setup_shared_info(void)
1021 xen_setup_mfn_list_list(); 658 xen_setup_mfn_list_list();
1022} 659}
1023 660
1024static __init void xen_pagetable_setup_done(pgd_t *base)
1025{
1026 xen_setup_shared_info();
1027}
1028
1029static __init void xen_post_allocator_init(void)
1030{
1031 pv_mmu_ops.set_pte = xen_set_pte;
1032 pv_mmu_ops.set_pmd = xen_set_pmd;
1033 pv_mmu_ops.set_pud = xen_set_pud;
1034#if PAGETABLE_LEVELS == 4
1035 pv_mmu_ops.set_pgd = xen_set_pgd;
1036#endif
1037
1038 /* This will work as long as patching hasn't happened yet
1039 (which it hasn't) */
1040 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1041 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1042 pv_mmu_ops.release_pte = xen_release_pte;
1043 pv_mmu_ops.release_pmd = xen_release_pmd;
1044#if PAGETABLE_LEVELS == 4
1045 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1046 pv_mmu_ops.release_pud = xen_release_pud;
1047#endif
1048
1049#ifdef CONFIG_X86_64
1050 SetPagePinned(virt_to_page(level3_user_vsyscall));
1051#endif
1052 xen_mark_init_mm_pinned();
1053}
1054
1055/* This is called once we have the cpu_possible_map */ 661/* This is called once we have the cpu_possible_map */
1056void xen_setup_vcpu_info_placement(void) 662void xen_setup_vcpu_info_placement(void)
1057{ 663{
@@ -1065,10 +671,10 @@ void xen_setup_vcpu_info_placement(void)
1065 if (have_vcpu_info_placement) { 671 if (have_vcpu_info_placement) {
1066 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 672 printk(KERN_INFO "Xen: using vcpu_info placement\n");
1067 673
1068 pv_irq_ops.save_fl = xen_save_fl_direct; 674 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
1069 pv_irq_ops.restore_fl = xen_restore_fl_direct; 675 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
1070 pv_irq_ops.irq_disable = xen_irq_disable_direct; 676 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
1071 pv_irq_ops.irq_enable = xen_irq_enable_direct; 677 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
1072 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 678 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
1073 } 679 }
1074} 680}
@@ -1126,49 +732,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
1126 return ret; 732 return ret;
1127} 733}
1128 734
1129static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1130{
1131 pte_t pte;
1132
1133 phys >>= PAGE_SHIFT;
1134
1135 switch (idx) {
1136 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1137#ifdef CONFIG_X86_F00F_BUG
1138 case FIX_F00F_IDT:
1139#endif
1140#ifdef CONFIG_X86_32
1141 case FIX_WP_TEST:
1142 case FIX_VDSO:
1143# ifdef CONFIG_HIGHMEM
1144 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1145# endif
1146#else
1147 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1148#endif
1149#ifdef CONFIG_X86_LOCAL_APIC
1150 case FIX_APIC_BASE: /* maps dummy local APIC */
1151#endif
1152 pte = pfn_pte(phys, prot);
1153 break;
1154
1155 default:
1156 pte = mfn_pte(phys, prot);
1157 break;
1158 }
1159
1160 __native_set_fixmap(idx, pte);
1161
1162#ifdef CONFIG_X86_64
1163 /* Replicate changes to map the vsyscall page into the user
1164 pagetable vsyscall mapping. */
1165 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1166 unsigned long vaddr = __fix_to_virt(idx);
1167 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1168 }
1169#endif
1170}
1171
1172static const struct pv_info xen_info __initdata = { 735static const struct pv_info xen_info __initdata = {
1173 .paravirt_enabled = 1, 736 .paravirt_enabled = 1,
1174 .shared_kernel_pmd = 0, 737 .shared_kernel_pmd = 0,
@@ -1264,87 +827,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = {
1264#endif 827#endif
1265}; 828};
1266 829
1267static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1268 .pagetable_setup_start = xen_pagetable_setup_start,
1269 .pagetable_setup_done = xen_pagetable_setup_done,
1270
1271 .read_cr2 = xen_read_cr2,
1272 .write_cr2 = xen_write_cr2,
1273
1274 .read_cr3 = xen_read_cr3,
1275 .write_cr3 = xen_write_cr3,
1276
1277 .flush_tlb_user = xen_flush_tlb,
1278 .flush_tlb_kernel = xen_flush_tlb,
1279 .flush_tlb_single = xen_flush_tlb_single,
1280 .flush_tlb_others = xen_flush_tlb_others,
1281
1282 .pte_update = paravirt_nop,
1283 .pte_update_defer = paravirt_nop,
1284
1285 .pgd_alloc = xen_pgd_alloc,
1286 .pgd_free = xen_pgd_free,
1287
1288 .alloc_pte = xen_alloc_pte_init,
1289 .release_pte = xen_release_pte_init,
1290 .alloc_pmd = xen_alloc_pte_init,
1291 .alloc_pmd_clone = paravirt_nop,
1292 .release_pmd = xen_release_pte_init,
1293
1294#ifdef CONFIG_HIGHPTE
1295 .kmap_atomic_pte = xen_kmap_atomic_pte,
1296#endif
1297
1298#ifdef CONFIG_X86_64
1299 .set_pte = xen_set_pte,
1300#else
1301 .set_pte = xen_set_pte_init,
1302#endif
1303 .set_pte_at = xen_set_pte_at,
1304 .set_pmd = xen_set_pmd_hyper,
1305
1306 .ptep_modify_prot_start = __ptep_modify_prot_start,
1307 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1308
1309 .pte_val = xen_pte_val,
1310 .pte_flags = native_pte_flags,
1311 .pgd_val = xen_pgd_val,
1312
1313 .make_pte = xen_make_pte,
1314 .make_pgd = xen_make_pgd,
1315
1316#ifdef CONFIG_X86_PAE
1317 .set_pte_atomic = xen_set_pte_atomic,
1318 .set_pte_present = xen_set_pte_at,
1319 .pte_clear = xen_pte_clear,
1320 .pmd_clear = xen_pmd_clear,
1321#endif /* CONFIG_X86_PAE */
1322 .set_pud = xen_set_pud_hyper,
1323
1324 .make_pmd = xen_make_pmd,
1325 .pmd_val = xen_pmd_val,
1326
1327#if PAGETABLE_LEVELS == 4
1328 .pud_val = xen_pud_val,
1329 .make_pud = xen_make_pud,
1330 .set_pgd = xen_set_pgd_hyper,
1331
1332 .alloc_pud = xen_alloc_pte_init,
1333 .release_pud = xen_release_pte_init,
1334#endif /* PAGETABLE_LEVELS == 4 */
1335
1336 .activate_mm = xen_activate_mm,
1337 .dup_mmap = xen_dup_mmap,
1338 .exit_mmap = xen_exit_mmap,
1339
1340 .lazy_mode = {
1341 .enter = paravirt_enter_lazy_mmu,
1342 .leave = xen_leave_lazy,
1343 },
1344
1345 .set_fixmap = xen_set_fixmap,
1346};
1347
1348static void xen_reboot(int reason) 830static void xen_reboot(int reason)
1349{ 831{
1350 struct sched_shutdown r = { .reason = reason }; 832 struct sched_shutdown r = { .reason = reason };
@@ -1387,223 +869,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
1387}; 869};
1388 870
1389 871
1390static void __init xen_reserve_top(void)
1391{
1392#ifdef CONFIG_X86_32
1393 unsigned long top = HYPERVISOR_VIRT_START;
1394 struct xen_platform_parameters pp;
1395
1396 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1397 top = pp.virt_start;
1398
1399 reserve_top_address(-top);
1400#endif /* CONFIG_X86_32 */
1401}
1402
1403/*
1404 * Like __va(), but returns address in the kernel mapping (which is
1405 * all we have until the physical memory mapping has been set up.
1406 */
1407static void *__ka(phys_addr_t paddr)
1408{
1409#ifdef CONFIG_X86_64
1410 return (void *)(paddr + __START_KERNEL_map);
1411#else
1412 return __va(paddr);
1413#endif
1414}
1415
1416/* Convert a machine address to physical address */
1417static unsigned long m2p(phys_addr_t maddr)
1418{
1419 phys_addr_t paddr;
1420
1421 maddr &= PTE_PFN_MASK;
1422 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1423
1424 return paddr;
1425}
1426
1427/* Convert a machine address to kernel virtual */
1428static void *m2v(phys_addr_t maddr)
1429{
1430 return __ka(m2p(maddr));
1431}
1432
1433static void set_page_prot(void *addr, pgprot_t prot)
1434{
1435 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1436 pte_t pte = pfn_pte(pfn, prot);
1437
1438 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1439 BUG();
1440}
1441
1442static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1443{
1444 unsigned pmdidx, pteidx;
1445 unsigned ident_pte;
1446 unsigned long pfn;
1447
1448 ident_pte = 0;
1449 pfn = 0;
1450 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1451 pte_t *pte_page;
1452
1453 /* Reuse or allocate a page of ptes */
1454 if (pmd_present(pmd[pmdidx]))
1455 pte_page = m2v(pmd[pmdidx].pmd);
1456 else {
1457 /* Check for free pte pages */
1458 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1459 break;
1460
1461 pte_page = &level1_ident_pgt[ident_pte];
1462 ident_pte += PTRS_PER_PTE;
1463
1464 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1465 }
1466
1467 /* Install mappings */
1468 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1469 pte_t pte;
1470
1471 if (pfn > max_pfn_mapped)
1472 max_pfn_mapped = pfn;
1473
1474 if (!pte_none(pte_page[pteidx]))
1475 continue;
1476
1477 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1478 pte_page[pteidx] = pte;
1479 }
1480 }
1481
1482 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1483 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1484
1485 set_page_prot(pmd, PAGE_KERNEL_RO);
1486}
1487
1488#ifdef CONFIG_X86_64
1489static void convert_pfn_mfn(void *v)
1490{
1491 pte_t *pte = v;
1492 int i;
1493
1494 /* All levels are converted the same way, so just treat them
1495 as ptes. */
1496 for (i = 0; i < PTRS_PER_PTE; i++)
1497 pte[i] = xen_make_pte(pte[i].pte);
1498}
1499
1500/*
1501 * Set up the inital kernel pagetable.
1502 *
1503 * We can construct this by grafting the Xen provided pagetable into
1504 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1505 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1506 * means that only the kernel has a physical mapping to start with -
1507 * but that's enough to get __va working. We need to fill in the rest
1508 * of the physical mapping once some sort of allocator has been set
1509 * up.
1510 */
1511static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1512 unsigned long max_pfn)
1513{
1514 pud_t *l3;
1515 pmd_t *l2;
1516
1517 /* Zap identity mapping */
1518 init_level4_pgt[0] = __pgd(0);
1519
1520 /* Pre-constructed entries are in pfn, so convert to mfn */
1521 convert_pfn_mfn(init_level4_pgt);
1522 convert_pfn_mfn(level3_ident_pgt);
1523 convert_pfn_mfn(level3_kernel_pgt);
1524
1525 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1526 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1527
1528 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1529 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1530
1531 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1532 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1533 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1534
1535 /* Set up identity map */
1536 xen_map_identity_early(level2_ident_pgt, max_pfn);
1537
1538 /* Make pagetable pieces RO */
1539 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1540 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1541 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1542 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1543 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1544 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1545
1546 /* Pin down new L4 */
1547 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1548 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1549
1550 /* Unpin Xen-provided one */
1551 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1552
1553 /* Switch over */
1554 pgd = init_level4_pgt;
1555
1556 /*
1557 * At this stage there can be no user pgd, and no page
1558 * structure to attach it to, so make sure we just set kernel
1559 * pgd.
1560 */
1561 xen_mc_batch();
1562 __xen_write_cr3(true, __pa(pgd));
1563 xen_mc_issue(PARAVIRT_LAZY_CPU);
1564
1565 reserve_early(__pa(xen_start_info->pt_base),
1566 __pa(xen_start_info->pt_base +
1567 xen_start_info->nr_pt_frames * PAGE_SIZE),
1568 "XEN PAGETABLES");
1569
1570 return pgd;
1571}
1572#else /* !CONFIG_X86_64 */
1573static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1574
1575static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1576 unsigned long max_pfn)
1577{
1578 pmd_t *kernel_pmd;
1579
1580 init_pg_tables_start = __pa(pgd);
1581 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1582 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1583
1584 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1585 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1586
1587 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1588
1589 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1590 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1591 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1592
1593 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1594 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1595 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1596
1597 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1598
1599 xen_write_cr3(__pa(swapper_pg_dir));
1600
1601 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1602
1603 return swapper_pg_dir;
1604}
1605#endif /* CONFIG_X86_64 */
1606
1607/* First C function to be called on Xen boot */ 872/* First C function to be called on Xen boot */
1608asmlinkage void __init xen_start_kernel(void) 873asmlinkage void __init xen_start_kernel(void)
1609{ 874{
@@ -1643,12 +908,18 @@ asmlinkage void __init xen_start_kernel(void)
1643 machine_ops = xen_machine_ops; 908 machine_ops = xen_machine_ops;
1644 909
1645#ifdef CONFIG_X86_64 910#ifdef CONFIG_X86_64
1646 /* Disable until direct per-cpu data access. */ 911 /*
1647 have_vcpu_info_placement = 0; 912 * Setup percpu state. We only need to do this for 64-bit
1648#endif 913 * because 32-bit already has %fs set properly.
1649 914 */
1650 /* setup percpu state */
1651 load_percpu_segment(0); 915 load_percpu_segment(0);
916#endif
917 /*
918 * The only reliable way to retain the initial address of the
919 * percpu gdt_page is to remember it here, so we can go and
920 * mark it RW later, when the initial percpu area is freed.
921 */
922 xen_initial_gdt = &per_cpu(gdt_page, 0);
1652 923
1653 xen_smp_init(); 924 xen_smp_init();
1654 925
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 2e8271431e1a..5a070900ad35 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -50,6 +50,7 @@ static unsigned long xen_save_fl(void)
50 */ 50 */
51 return (-flags) & X86_EFLAGS_IF; 51 return (-flags) & X86_EFLAGS_IF;
52} 52}
53PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
53 54
54static void xen_restore_fl(unsigned long flags) 55static void xen_restore_fl(unsigned long flags)
55{ 56{
@@ -76,6 +77,7 @@ static void xen_restore_fl(unsigned long flags)
76 xen_force_evtchn_callback(); 77 xen_force_evtchn_callback();
77 } 78 }
78} 79}
80PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
79 81
80static void xen_irq_disable(void) 82static void xen_irq_disable(void)
81{ 83{
@@ -86,6 +88,7 @@ static void xen_irq_disable(void)
86 percpu_read(xen_vcpu)->evtchn_upcall_mask = 1; 88 percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
87 preempt_enable_no_resched(); 89 preempt_enable_no_resched();
88} 90}
91PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
89 92
90static void xen_irq_enable(void) 93static void xen_irq_enable(void)
91{ 94{
@@ -106,6 +109,7 @@ static void xen_irq_enable(void)
106 if (unlikely(vcpu->evtchn_upcall_pending)) 109 if (unlikely(vcpu->evtchn_upcall_pending))
107 xen_force_evtchn_callback(); 110 xen_force_evtchn_callback();
108} 111}
112PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
109 113
110static void xen_safe_halt(void) 114static void xen_safe_halt(void)
111{ 115{
@@ -124,10 +128,12 @@ static void xen_halt(void)
124 128
125static const struct pv_irq_ops xen_irq_ops __initdata = { 129static const struct pv_irq_ops xen_irq_ops __initdata = {
126 .init_IRQ = __xen_init_IRQ, 130 .init_IRQ = __xen_init_IRQ,
127 .save_fl = xen_save_fl, 131
128 .restore_fl = xen_restore_fl, 132 .save_fl = PV_CALLEE_SAVE(xen_save_fl),
129 .irq_disable = xen_irq_disable, 133 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
130 .irq_enable = xen_irq_enable, 134 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
135 .irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
136
131 .safe_halt = xen_safe_halt, 137 .safe_halt = xen_safe_halt,
132 .halt = xen_halt, 138 .halt = xen_halt,
133#ifdef CONFIG_X86_64 139#ifdef CONFIG_X86_64
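Editor's aside on the hunk above: the pv_irq_ops entries switch from plain C function pointers to PV_CALLEE_SAVE() wrappers, and each C implementation gains a PV_CALLEE_SAVE_REGS_THUNK() so patched-in callers only need to assume the return register is clobbered. Below is a minimal, compilable sketch of the wrapping idea only - the struct, macro and function names are hypothetical, not the kernel's, and the actual register saving lives in generated assembly, not C:

#include <stdio.h>

/* illustrative stand-in for the kernel's callee-save wrapper */
struct pv_callee_save_sketch {
	void *func;				/* would point at the asm thunk */
};

#define PV_CALLEE_SAVE_SKETCH(f)	((struct pv_callee_save_sketch){ .func = (void *)(f) })

static unsigned long demo_save_fl(void)
{
	return 0x200;				/* pretend X86_EFLAGS_IF is set */
}

int main(void)
{
	struct pv_callee_save_sketch op = PV_CALLEE_SAVE_SKETCH(demo_save_fl);
	unsigned long (*fn)(void) = (unsigned long (*)(void))op.func;

	printf("save_fl -> %#lx\n", fn());	/* prints 0x200 */
	return 0;
}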
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 98cb9869eb24..d2e8ed1aff3d 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -47,6 +47,7 @@
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/fixmap.h> 48#include <asm/fixmap.h>
49#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
50#include <asm/setup.h>
50#include <asm/paravirt.h> 51#include <asm/paravirt.h>
51#include <asm/linkage.h> 52#include <asm/linkage.h>
52 53
@@ -55,6 +56,8 @@
55 56
56#include <xen/page.h> 57#include <xen/page.h>
57#include <xen/interface/xen.h> 58#include <xen/interface/xen.h>
59#include <xen/interface/version.h>
60#include <xen/hvc-console.h>
58 61
59#include "multicalls.h" 62#include "multicalls.h"
60#include "mmu.h" 63#include "mmu.h"
@@ -114,6 +117,37 @@ static inline void check_zero(void)
114 117
115#endif /* CONFIG_XEN_DEBUG_FS */ 118#endif /* CONFIG_XEN_DEBUG_FS */
116 119
120
121/*
122 * Identity map, in addition to plain kernel map. This needs to be
123 * large enough to allocate page table pages to allocate the rest.
124 * Each page can map 2MB.
125 */
126static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
127
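Editor's sizing note, not part of the patch: assuming 4 KiB pages and 512 PTEs per page-table page (the x86-64/PAE case the "2MB" comment has in mind), each of the four pages reserved in level1_ident_pgt maps 2 MiB, so the whole array covers 8 MiB of identity mapping. The arithmetic as a stand-alone check:

#include <stdio.h>

int main(void)
{
	unsigned long ptes_per_page = 512;			/* PTRS_PER_PTE on x86-64/PAE */
	unsigned long page_size = 4096;				/* 4 KiB */
	unsigned long per_pt_page = ptes_per_page * page_size;	/* bytes mapped by one PTE page */
	unsigned long total = 4 * per_pt_page;			/* the array holds four such pages */

	printf("one PTE page maps %lu MiB, level1_ident_pgt covers %lu MiB\n",
	       per_pt_page >> 20, total >> 20);			/* 2 MiB and 8 MiB */
	return 0;
}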
128#ifdef CONFIG_X86_64
129/* l3 pud for userspace vsyscall mapping */
130static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
131#endif /* CONFIG_X86_64 */
132
133/*
134 * Note about cr3 (pagetable base) values:
135 *
136 * xen_cr3 contains the current logical cr3 value; it contains the
137 * last set cr3. This may not be the current effective cr3, because
138 * its update may still be lazily deferred.  However, a vcpu looking
139 * at its own cr3 can use this value knowing that everything will
140 * be self-consistent.
141 *
142 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
143 * hypercall to set the vcpu cr3 is complete (so it may be a little
144 * out of date, but it will never be set early). If one vcpu is
145 * looking at another vcpu's cr3 value, it should use this variable.
146 */
147DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
148DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
149
150
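To make the rule in the comment above concrete, here is an editor's sketch (not part of the patch) of how a reader would choose between the two per-cpu variables; percpu_read(), per_cpu() and smp_processor_id() are the accessors already used in this patch, while the helper name is made up:

static unsigned long sketch_vcpu_cr3(int cpu)
{
	if (cpu == smp_processor_id())
		/* own vcpu: the logical value is always self-consistent */
		return percpu_read(xen_cr3);

	/* another vcpu: only trust what the hypervisor has acknowledged */
	return per_cpu(xen_current_cr3, cpu);
}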
117/* 151/*
118 * Just beyond the highest usermode address. STACK_TOP_MAX has a 152 * Just beyond the highest usermode address. STACK_TOP_MAX has a
119 * redzone above it, so round it up to a PGD boundary. 153 * redzone above it, so round it up to a PGD boundary.
@@ -458,28 +492,33 @@ pteval_t xen_pte_val(pte_t pte)
458{ 492{
459 return pte_mfn_to_pfn(pte.pte); 493 return pte_mfn_to_pfn(pte.pte);
460} 494}
495PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
461 496
462pgdval_t xen_pgd_val(pgd_t pgd) 497pgdval_t xen_pgd_val(pgd_t pgd)
463{ 498{
464 return pte_mfn_to_pfn(pgd.pgd); 499 return pte_mfn_to_pfn(pgd.pgd);
465} 500}
501PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
466 502
467pte_t xen_make_pte(pteval_t pte) 503pte_t xen_make_pte(pteval_t pte)
468{ 504{
469 pte = pte_pfn_to_mfn(pte); 505 pte = pte_pfn_to_mfn(pte);
470 return native_make_pte(pte); 506 return native_make_pte(pte);
471} 507}
508PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
472 509
473pgd_t xen_make_pgd(pgdval_t pgd) 510pgd_t xen_make_pgd(pgdval_t pgd)
474{ 511{
475 pgd = pte_pfn_to_mfn(pgd); 512 pgd = pte_pfn_to_mfn(pgd);
476 return native_make_pgd(pgd); 513 return native_make_pgd(pgd);
477} 514}
515PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
478 516
479pmdval_t xen_pmd_val(pmd_t pmd) 517pmdval_t xen_pmd_val(pmd_t pmd)
480{ 518{
481 return pte_mfn_to_pfn(pmd.pmd); 519 return pte_mfn_to_pfn(pmd.pmd);
482} 520}
521PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
483 522
484void xen_set_pud_hyper(pud_t *ptr, pud_t val) 523void xen_set_pud_hyper(pud_t *ptr, pud_t val)
485{ 524{
@@ -556,12 +595,14 @@ pmd_t xen_make_pmd(pmdval_t pmd)
556 pmd = pte_pfn_to_mfn(pmd); 595 pmd = pte_pfn_to_mfn(pmd);
557 return native_make_pmd(pmd); 596 return native_make_pmd(pmd);
558} 597}
598PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
559 599
560#if PAGETABLE_LEVELS == 4 600#if PAGETABLE_LEVELS == 4
561pudval_t xen_pud_val(pud_t pud) 601pudval_t xen_pud_val(pud_t pud)
562{ 602{
563 return pte_mfn_to_pfn(pud.pud); 603 return pte_mfn_to_pfn(pud.pud);
564} 604}
605PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
565 606
566pud_t xen_make_pud(pudval_t pud) 607pud_t xen_make_pud(pudval_t pud)
567{ 608{
@@ -569,6 +610,7 @@ pud_t xen_make_pud(pudval_t pud)
569 610
570 return native_make_pud(pud); 611 return native_make_pud(pud);
571} 612}
613PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
572 614
573pgd_t *xen_get_user_pgd(pgd_t *pgd) 615pgd_t *xen_get_user_pgd(pgd_t *pgd)
574{ 616{
@@ -1152,6 +1194,709 @@ void xen_exit_mmap(struct mm_struct *mm)
1152 spin_unlock(&mm->page_table_lock); 1194 spin_unlock(&mm->page_table_lock);
1153} 1195}
1154 1196
1197static __init void xen_pagetable_setup_start(pgd_t *base)
1198{
1199}
1200
1201static __init void xen_pagetable_setup_done(pgd_t *base)
1202{
1203 xen_setup_shared_info();
1204}
1205
1206static void xen_write_cr2(unsigned long cr2)
1207{
1208 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1209}
1210
1211static unsigned long xen_read_cr2(void)
1212{
1213 return percpu_read(xen_vcpu)->arch.cr2;
1214}
1215
1216unsigned long xen_read_cr2_direct(void)
1217{
1218 return percpu_read(xen_vcpu_info.arch.cr2);
1219}
1220
1221static void xen_flush_tlb(void)
1222{
1223 struct mmuext_op *op;
1224 struct multicall_space mcs;
1225
1226 preempt_disable();
1227
1228 mcs = xen_mc_entry(sizeof(*op));
1229
1230 op = mcs.args;
1231 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1232 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1233
1234 xen_mc_issue(PARAVIRT_LAZY_MMU);
1235
1236 preempt_enable();
1237}
1238
1239static void xen_flush_tlb_single(unsigned long addr)
1240{
1241 struct mmuext_op *op;
1242 struct multicall_space mcs;
1243
1244 preempt_disable();
1245
1246 mcs = xen_mc_entry(sizeof(*op));
1247 op = mcs.args;
1248 op->cmd = MMUEXT_INVLPG_LOCAL;
1249 op->arg1.linear_addr = addr & PAGE_MASK;
1250 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1251
1252 xen_mc_issue(PARAVIRT_LAZY_MMU);
1253
1254 preempt_enable();
1255}
1256
1257static void xen_flush_tlb_others(const struct cpumask *cpus,
1258 struct mm_struct *mm, unsigned long va)
1259{
1260 struct {
1261 struct mmuext_op op;
1262 DECLARE_BITMAP(mask, NR_CPUS);
1263 } *args;
1264 struct multicall_space mcs;
1265
1266 BUG_ON(cpumask_empty(cpus));
1267 BUG_ON(!mm);
1268
1269 mcs = xen_mc_entry(sizeof(*args));
1270 args = mcs.args;
1271 args->op.arg2.vcpumask = to_cpumask(args->mask);
1272
1273 /* Remove us, and any offline CPUS. */
1274 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1275 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1276 if (unlikely(cpumask_empty(to_cpumask(args->mask))))
1277 goto issue;
1278
1279 if (va == TLB_FLUSH_ALL) {
1280 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1281 } else {
1282 args->op.cmd = MMUEXT_INVLPG_MULTI;
1283 args->op.arg1.linear_addr = va;
1284 }
1285
1286 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1287
1288issue:
1289 xen_mc_issue(PARAVIRT_LAZY_MMU);
1290}
1291
1292static unsigned long xen_read_cr3(void)
1293{
1294 return percpu_read(xen_cr3);
1295}
1296
1297static void set_current_cr3(void *v)
1298{
1299 percpu_write(xen_current_cr3, (unsigned long)v);
1300}
1301
1302static void __xen_write_cr3(bool kernel, unsigned long cr3)
1303{
1304 struct mmuext_op *op;
1305 struct multicall_space mcs;
1306 unsigned long mfn;
1307
1308 if (cr3)
1309 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1310 else
1311 mfn = 0;
1312
1313 WARN_ON(mfn == 0 && kernel);
1314
1315 mcs = __xen_mc_entry(sizeof(*op));
1316
1317 op = mcs.args;
1318 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1319 op->arg1.mfn = mfn;
1320
1321 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1322
1323 if (kernel) {
1324 percpu_write(xen_cr3, cr3);
1325
1326 /* Update xen_current_cr3 once the batch has actually
1327 been submitted. */
1328 xen_mc_callback(set_current_cr3, (void *)cr3);
1329 }
1330}
1331
1332static void xen_write_cr3(unsigned long cr3)
1333{
1334 BUG_ON(preemptible());
1335
1336 xen_mc_batch(); /* disables interrupts */
1337
1338	/* Update while interrupts are disabled, so it's atomic with
1339	   respect to IPIs */
1340 percpu_write(xen_cr3, cr3);
1341
1342 __xen_write_cr3(true, cr3);
1343
1344#ifdef CONFIG_X86_64
1345 {
1346 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1347 if (user_pgd)
1348 __xen_write_cr3(false, __pa(user_pgd));
1349 else
1350 __xen_write_cr3(false, 0);
1351 }
1352#endif
1353
1354 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1355}
1356
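The TLB-flush and cr3 helpers above all follow one skeleton: reserve space in the per-cpu multicall buffer, fill in a struct mmuext_op, and let xen_mc_issue() either leave it queued (inside a lazy section) or flush it to the hypervisor immediately. An editor's condensed restatement of that skeleton, built only from helpers already used in this file and not a function the patch adds:

static void sketch_one_mmuext_op(unsigned int cmd, unsigned long linear_addr)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*op));	/* room in the current batch */
	op = mcs.args;
	op->cmd = cmd;				/* e.g. MMUEXT_INVLPG_LOCAL */
	op->arg1.linear_addr = linear_addr & PAGE_MASK;
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	/* flushed immediately unless we are inside a lazy-MMU batch */
	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}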
1357static int xen_pgd_alloc(struct mm_struct *mm)
1358{
1359 pgd_t *pgd = mm->pgd;
1360 int ret = 0;
1361
1362 BUG_ON(PagePinned(virt_to_page(pgd)));
1363
1364#ifdef CONFIG_X86_64
1365 {
1366 struct page *page = virt_to_page(pgd);
1367 pgd_t *user_pgd;
1368
1369 BUG_ON(page->private != 0);
1370
1371 ret = -ENOMEM;
1372
1373 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1374 page->private = (unsigned long)user_pgd;
1375
1376 if (user_pgd != NULL) {
1377 user_pgd[pgd_index(VSYSCALL_START)] =
1378 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1379 ret = 0;
1380 }
1381
1382 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1383 }
1384#endif
1385
1386 return ret;
1387}
1388
1389static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1390{
1391#ifdef CONFIG_X86_64
1392 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1393
1394 if (user_pgd)
1395 free_page((unsigned long)user_pgd);
1396#endif
1397}
1398
1399#ifdef CONFIG_HIGHPTE
1400static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
1401{
1402 pgprot_t prot = PAGE_KERNEL;
1403
1404 if (PagePinned(page))
1405 prot = PAGE_KERNEL_RO;
1406
1407 if (0 && PageHighMem(page))
1408 printk("mapping highpte %lx type %d prot %s\n",
1409 page_to_pfn(page), type,
1410 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
1411
1412 return kmap_atomic_prot(page, type, prot);
1413}
1414#endif
1415
1416#ifdef CONFIG_X86_32
1417static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1418{
1419 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1420 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1421 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1422 pte_val_ma(pte));
1423
1424 return pte;
1425}
1426
1427/* Init-time set_pte while constructing initial pagetables, which
1428 doesn't allow RO pagetable pages to be remapped RW */
1429static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1430{
1431 pte = mask_rw_pte(ptep, pte);
1432
1433 xen_set_pte(ptep, pte);
1434}
1435#endif
1436
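The masking expression in mask_rw_pte() above is dense: ((old & _PAGE_RW) | ~_PAGE_RW) evaluates to all-ones when the existing pte is already writable and to ~_PAGE_RW when it is not, so the incoming pte keeps _PAGE_RW only if the old one had it (and the mask is applied only when the old pte is present). An editor's stand-alone rerun of that bit logic on plain integers, not code from the patch:

#include <stdio.h>
#include <stdint.h>

#define PAGE_RW_SKETCH 0x2ULL			/* stands in for _PAGE_RW */

static uint64_t mask_rw_sketch(uint64_t old_pte, uint64_t new_pte)
{
	/* all-ones if the old pte was writable, ~PAGE_RW otherwise */
	return ((old_pte & PAGE_RW_SKETCH) | ~PAGE_RW_SKETCH) & new_pte;
}

int main(void)
{
	/* a read-only pagetable page stays read-only even if RW is requested */
	printf("%#llx\n", (unsigned long long)mask_rw_sketch(0x1, 0x3));	/* 0x1 */
	/* an already-writable mapping keeps RW */
	printf("%#llx\n", (unsigned long long)mask_rw_sketch(0x3, 0x3));	/* 0x3 */
	return 0;
}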
1437/* Early in boot, while setting up the initial pagetable, assume
1438 everything is pinned. */
1439static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1440{
1441#ifdef CONFIG_FLATMEM
1442 BUG_ON(mem_map); /* should only be used early */
1443#endif
1444 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1445}
1446
1447/* Early release_pte assumes that all pts are pinned, since there's
1448 only init_mm and anything attached to that is pinned. */
1449static void xen_release_pte_init(unsigned long pfn)
1450{
1451 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1452}
1453
1454static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1455{
1456 struct mmuext_op op;
1457 op.cmd = cmd;
1458 op.arg1.mfn = pfn_to_mfn(pfn);
1459 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1460 BUG();
1461}
1462
1463/* This needs to make sure the new pte page is pinned iff it's being
1464 attached to a pinned pagetable. */
1465static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1466{
1467 struct page *page = pfn_to_page(pfn);
1468
1469 if (PagePinned(virt_to_page(mm->pgd))) {
1470 SetPagePinned(page);
1471
1472 vm_unmap_aliases();
1473 if (!PageHighMem(page)) {
1474 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1475 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1476 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1477 } else {
1478 /* make sure there are no stray mappings of
1479 this page */
1480 kmap_flush_unused();
1481 }
1482 }
1483}
1484
1485static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1486{
1487 xen_alloc_ptpage(mm, pfn, PT_PTE);
1488}
1489
1490static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1491{
1492 xen_alloc_ptpage(mm, pfn, PT_PMD);
1493}
1494
1495/* This should never happen until we're OK to use struct page */
1496static void xen_release_ptpage(unsigned long pfn, unsigned level)
1497{
1498 struct page *page = pfn_to_page(pfn);
1499
1500 if (PagePinned(page)) {
1501 if (!PageHighMem(page)) {
1502 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1503 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1504 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1505 }
1506 ClearPagePinned(page);
1507 }
1508}
1509
1510static void xen_release_pte(unsigned long pfn)
1511{
1512 xen_release_ptpage(pfn, PT_PTE);
1513}
1514
1515static void xen_release_pmd(unsigned long pfn)
1516{
1517 xen_release_ptpage(pfn, PT_PMD);
1518}
1519
1520#if PAGETABLE_LEVELS == 4
1521static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1522{
1523 xen_alloc_ptpage(mm, pfn, PT_PUD);
1524}
1525
1526static void xen_release_pud(unsigned long pfn)
1527{
1528 xen_release_ptpage(pfn, PT_PUD);
1529}
1530#endif
1531
1532void __init xen_reserve_top(void)
1533{
1534#ifdef CONFIG_X86_32
1535 unsigned long top = HYPERVISOR_VIRT_START;
1536 struct xen_platform_parameters pp;
1537
1538 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1539 top = pp.virt_start;
1540
1541 reserve_top_address(-top);
1542#endif /* CONFIG_X86_32 */
1543}
1544
1545/*
1546 * Like __va(), but returns address in the kernel mapping (which is
1547 * all we have until the physical memory mapping has been set up).
1548 */
1549static void *__ka(phys_addr_t paddr)
1550{
1551#ifdef CONFIG_X86_64
1552 return (void *)(paddr + __START_KERNEL_map);
1553#else
1554 return __va(paddr);
1555#endif
1556}
1557
1558/* Convert a machine address to physical address */
1559static unsigned long m2p(phys_addr_t maddr)
1560{
1561 phys_addr_t paddr;
1562
1563 maddr &= PTE_PFN_MASK;
1564 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1565
1566 return paddr;
1567}
1568
1569/* Convert a machine address to kernel virtual */
1570static void *m2v(phys_addr_t maddr)
1571{
1572 return __ka(m2p(maddr));
1573}
1574
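__ka(), m2p() and m2v() above are the glue for walking the pagetable Xen hands the guest: each entry holds a machine address, so every step is machine -> pseudo-physical -> kernel virtual. An editor's one-step sketch of the descent xen_setup_kernel_pagetable() performs below (the helper name is made up; pgd_index() and __START_KERNEL_map are the symbols used there):

static pud_t *sketch_follow_pgd(pgd_t *pgd)
{
	/* the entry holds a machine address; m2v() makes it dereferenceable */
	return m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
}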
1575static void set_page_prot(void *addr, pgprot_t prot)
1576{
1577 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1578 pte_t pte = pfn_pte(pfn, prot);
1579
1580 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1581 BUG();
1582}
1583
1584static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1585{
1586 unsigned pmdidx, pteidx;
1587 unsigned ident_pte;
1588 unsigned long pfn;
1589
1590 ident_pte = 0;
1591 pfn = 0;
1592 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1593 pte_t *pte_page;
1594
1595 /* Reuse or allocate a page of ptes */
1596 if (pmd_present(pmd[pmdidx]))
1597 pte_page = m2v(pmd[pmdidx].pmd);
1598 else {
1599 /* Check for free pte pages */
1600 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1601 break;
1602
1603 pte_page = &level1_ident_pgt[ident_pte];
1604 ident_pte += PTRS_PER_PTE;
1605
1606 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1607 }
1608
1609 /* Install mappings */
1610 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1611 pte_t pte;
1612
1613 if (pfn > max_pfn_mapped)
1614 max_pfn_mapped = pfn;
1615
1616 if (!pte_none(pte_page[pteidx]))
1617 continue;
1618
1619 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1620 pte_page[pteidx] = pte;
1621 }
1622 }
1623
1624 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1625 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1626
1627 set_page_prot(pmd, PAGE_KERNEL_RO);
1628}
1629
1630#ifdef CONFIG_X86_64
1631static void convert_pfn_mfn(void *v)
1632{
1633 pte_t *pte = v;
1634 int i;
1635
1636 /* All levels are converted the same way, so just treat them
1637 as ptes. */
1638 for (i = 0; i < PTRS_PER_PTE; i++)
1639 pte[i] = xen_make_pte(pte[i].pte);
1640}
1641
1642/*
1643 * Set up the initial kernel pagetable.
1644 *
1645 * We can construct this by grafting the Xen provided pagetable into
1646 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1647 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1648 * means that only the kernel has a physical mapping to start with -
1649 * but that's enough to get __va working. We need to fill in the rest
1650 * of the physical mapping once some sort of allocator has been set
1651 * up.
1652 */
1653__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1654 unsigned long max_pfn)
1655{
1656 pud_t *l3;
1657 pmd_t *l2;
1658
1659 /* Zap identity mapping */
1660 init_level4_pgt[0] = __pgd(0);
1661
1662 /* Pre-constructed entries are in pfn, so convert to mfn */
1663 convert_pfn_mfn(init_level4_pgt);
1664 convert_pfn_mfn(level3_ident_pgt);
1665 convert_pfn_mfn(level3_kernel_pgt);
1666
1667 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1668 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1669
1670 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1671 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1672
1673 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1674 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1675 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1676
1677 /* Set up identity map */
1678 xen_map_identity_early(level2_ident_pgt, max_pfn);
1679
1680 /* Make pagetable pieces RO */
1681 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1682 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1683 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1684 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1685 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1686 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1687
1688 /* Pin down new L4 */
1689 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1690 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1691
1692 /* Unpin Xen-provided one */
1693 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1694
1695 /* Switch over */
1696 pgd = init_level4_pgt;
1697
1698 /*
1699 * At this stage there can be no user pgd, and no page
1700 * structure to attach it to, so make sure we just set kernel
1701 * pgd.
1702 */
1703 xen_mc_batch();
1704 __xen_write_cr3(true, __pa(pgd));
1705 xen_mc_issue(PARAVIRT_LAZY_CPU);
1706
1707 reserve_early(__pa(xen_start_info->pt_base),
1708 __pa(xen_start_info->pt_base +
1709 xen_start_info->nr_pt_frames * PAGE_SIZE),
1710 "XEN PAGETABLES");
1711
1712 return pgd;
1713}
1714#else /* !CONFIG_X86_64 */
1715static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1716
1717__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1718 unsigned long max_pfn)
1719{
1720 pmd_t *kernel_pmd;
1721
1722 init_pg_tables_start = __pa(pgd);
1723 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1724 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1725
1726 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1727 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1728
1729 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1730
1731 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1732 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1733 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1734
1735 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1736 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1737 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1738
1739 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1740
1741 xen_write_cr3(__pa(swapper_pg_dir));
1742
1743 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1744
1745 return swapper_pg_dir;
1746}
1747#endif /* CONFIG_X86_64 */
1748
1749static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1750{
1751 pte_t pte;
1752
1753 phys >>= PAGE_SHIFT;
1754
1755 switch (idx) {
1756 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1757#ifdef CONFIG_X86_F00F_BUG
1758 case FIX_F00F_IDT:
1759#endif
1760#ifdef CONFIG_X86_32
1761 case FIX_WP_TEST:
1762 case FIX_VDSO:
1763# ifdef CONFIG_HIGHMEM
1764 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1765# endif
1766#else
1767 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1768#endif
1769#ifdef CONFIG_X86_LOCAL_APIC
1770 case FIX_APIC_BASE: /* maps dummy local APIC */
1771#endif
1772 pte = pfn_pte(phys, prot);
1773 break;
1774
1775 default:
1776 pte = mfn_pte(phys, prot);
1777 break;
1778 }
1779
1780 __native_set_fixmap(idx, pte);
1781
1782#ifdef CONFIG_X86_64
1783 /* Replicate changes to map the vsyscall page into the user
1784 pagetable vsyscall mapping. */
1785 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1786 unsigned long vaddr = __fix_to_virt(idx);
1787 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1788 }
1789#endif
1790}
1791
1792__init void xen_post_allocator_init(void)
1793{
1794 pv_mmu_ops.set_pte = xen_set_pte;
1795 pv_mmu_ops.set_pmd = xen_set_pmd;
1796 pv_mmu_ops.set_pud = xen_set_pud;
1797#if PAGETABLE_LEVELS == 4
1798 pv_mmu_ops.set_pgd = xen_set_pgd;
1799#endif
1800
1801 /* This will work as long as patching hasn't happened yet
1802 (which it hasn't) */
1803 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1804 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1805 pv_mmu_ops.release_pte = xen_release_pte;
1806 pv_mmu_ops.release_pmd = xen_release_pmd;
1807#if PAGETABLE_LEVELS == 4
1808 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1809 pv_mmu_ops.release_pud = xen_release_pud;
1810#endif
1811
1812#ifdef CONFIG_X86_64
1813 SetPagePinned(virt_to_page(level3_user_vsyscall));
1814#endif
1815 xen_mark_init_mm_pinned();
1816}
1817
1818
1819const struct pv_mmu_ops xen_mmu_ops __initdata = {
1820 .pagetable_setup_start = xen_pagetable_setup_start,
1821 .pagetable_setup_done = xen_pagetable_setup_done,
1822
1823 .read_cr2 = xen_read_cr2,
1824 .write_cr2 = xen_write_cr2,
1825
1826 .read_cr3 = xen_read_cr3,
1827 .write_cr3 = xen_write_cr3,
1828
1829 .flush_tlb_user = xen_flush_tlb,
1830 .flush_tlb_kernel = xen_flush_tlb,
1831 .flush_tlb_single = xen_flush_tlb_single,
1832 .flush_tlb_others = xen_flush_tlb_others,
1833
1834 .pte_update = paravirt_nop,
1835 .pte_update_defer = paravirt_nop,
1836
1837 .pgd_alloc = xen_pgd_alloc,
1838 .pgd_free = xen_pgd_free,
1839
1840 .alloc_pte = xen_alloc_pte_init,
1841 .release_pte = xen_release_pte_init,
1842 .alloc_pmd = xen_alloc_pte_init,
1843 .alloc_pmd_clone = paravirt_nop,
1844 .release_pmd = xen_release_pte_init,
1845
1846#ifdef CONFIG_HIGHPTE
1847 .kmap_atomic_pte = xen_kmap_atomic_pte,
1848#endif
1849
1850#ifdef CONFIG_X86_64
1851 .set_pte = xen_set_pte,
1852#else
1853 .set_pte = xen_set_pte_init,
1854#endif
1855 .set_pte_at = xen_set_pte_at,
1856 .set_pmd = xen_set_pmd_hyper,
1857
1858 .ptep_modify_prot_start = __ptep_modify_prot_start,
1859 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1860
1861 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
1862 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
1863
1864 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
1865 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
1866
1867#ifdef CONFIG_X86_PAE
1868 .set_pte_atomic = xen_set_pte_atomic,
1869 .set_pte_present = xen_set_pte_at,
1870 .pte_clear = xen_pte_clear,
1871 .pmd_clear = xen_pmd_clear,
1872#endif /* CONFIG_X86_PAE */
1873 .set_pud = xen_set_pud_hyper,
1874
1875 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
1876 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
1877
1878#if PAGETABLE_LEVELS == 4
1879 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
1880 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
1881 .set_pgd = xen_set_pgd_hyper,
1882
1883 .alloc_pud = xen_alloc_pte_init,
1884 .release_pud = xen_release_pte_init,
1885#endif /* PAGETABLE_LEVELS == 4 */
1886
1887 .activate_mm = xen_activate_mm,
1888 .dup_mmap = xen_dup_mmap,
1889 .exit_mmap = xen_exit_mmap,
1890
1891 .lazy_mode = {
1892 .enter = paravirt_enter_lazy_mmu,
1893 .leave = xen_leave_lazy,
1894 },
1895
1896 .set_fixmap = xen_set_fixmap,
1897};
1898
1899
1155#ifdef CONFIG_XEN_DEBUG_FS 1900#ifdef CONFIG_XEN_DEBUG_FS
1156 1901
1157static struct dentry *d_mmu_debug; 1902static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 98d71659da5a..24d1b44a337d 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t
54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
55 pte_t *ptep, pte_t pte); 55 pte_t *ptep, pte_t pte);
56 56
57unsigned long xen_read_cr2_direct(void);
58
59extern const struct pv_mmu_ops xen_mmu_ops;
57#endif /* _XEN_MMU_H */ 60#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 88d5d5ec6beb..035582ae815d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -170,8 +170,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
170 170
171 /* We've switched to the "real" per-cpu gdt, so make sure the 171 /* We've switched to the "real" per-cpu gdt, so make sure the
172 old memory can be recycled */ 172 old memory can be recycled */
173 make_lowmem_page_readwrite(__per_cpu_load + 173 make_lowmem_page_readwrite(xen_initial_gdt);
174 (unsigned long)&per_cpu_var(gdt_page));
175 174
176 xen_setup_vcpu_info_placement(); 175 xen_setup_vcpu_info_placement();
177} 176}
@@ -287,6 +286,9 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
287 irq_ctx_init(cpu); 286 irq_ctx_init(cpu);
288#else 287#else
289 clear_tsk_thread_flag(idle, TIF_FORK); 288 clear_tsk_thread_flag(idle, TIF_FORK);
289 per_cpu(kernel_stack, cpu) =
290 (unsigned long)task_stack_page(idle) -
291 KERNEL_STACK_OFFSET + THREAD_SIZE;
290#endif 292#endif
291 xen_setup_timer(cpu); 293 xen_setup_timer(cpu);
292 xen_init_lock_cpu(cpu); 294 xen_init_lock_cpu(cpu);
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644
index 000000000000..4c6f96799131
--- /dev/null
+++ b/arch/x86/xen/xen-asm.S
@@ -0,0 +1,140 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in percpu data) of
10 the operations here; the indirect forms are better handled in
11 C, since they're generally too large to inline anyway.
12 */
13
14#include <asm/asm-offsets.h>
15#include <asm/percpu.h>
16#include <asm/processor-flags.h>
17
18#include "xen-asm.h"
19
20/*
21 Enable events. This clears the event mask and tests the pending
22 event status with one 'and' operation.  If there are pending
23 events, then enter the hypervisor to get them handled.
24 */
25ENTRY(xen_irq_enable_direct)
26 /* Unmask events */
27 movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
28
29 /* Preempt here doesn't matter because that will deal with
30 any pending interrupts. The pending check may end up being
31 run on the wrong CPU, but that doesn't hurt. */
32
33 /* Test for pending */
34 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
35 jz 1f
36
372: call check_events
381:
39ENDPATCH(xen_irq_enable_direct)
40 ret
41 ENDPROC(xen_irq_enable_direct)
42 RELOC(xen_irq_enable_direct, 2b+1)
43
44
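For readers who prefer C, an editor's sketch (not part of the patch) of what the assembly above does, in the shape of xen_irq_enable() from irq.c but operating on the fixed percpu xen_vcpu_info copy the _direct variants rely on:

static void sketch_irq_enable_direct(void)
{
	percpu_write(xen_vcpu_info.evtchn_upcall_mask, 0);	/* unmask events */

	/* anything that became pending while masked still needs delivering */
	if (percpu_read(xen_vcpu_info.evtchn_upcall_pending))
		xen_force_evtchn_callback();
}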
45/*
46 Disabling events is simply a matter of making the event mask
47 non-zero.
48 */
49ENTRY(xen_irq_disable_direct)
50 movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
51ENDPATCH(xen_irq_disable_direct)
52 ret
53 ENDPROC(xen_irq_disable_direct)
54 RELOC(xen_irq_disable_direct, 0)
55
56/*
57 (xen_)save_fl is used to get the current interrupt enable status.
58 Callers expect the status to be in X86_EFLAGS_IF, and other bits
59 may be set in the return value. We take advantage of this by
60 making sure that X86_EFLAGS_IF has the right value (and other bits
61 in that byte are 0), but other bits in the return value are
62 undefined. We need to toggle the state of the bit, because
63 Xen and x86 use opposite senses (mask vs enable).
64 */
65ENTRY(xen_save_fl_direct)
66 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
67 setz %ah
68 addb %ah,%ah
69ENDPATCH(xen_save_fl_direct)
70 ret
71 ENDPROC(xen_save_fl_direct)
72 RELOC(xen_save_fl_direct, 0)
73
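Editor's note on the setz/addb pair above: it synthesizes X86_EFLAGS_IF (0x200) without a branch. When the mask byte is zero (events enabled), setz writes 1 into %ah and addb %ah,%ah doubles it, leaving 0x0200 in %ax; when the mask is non-zero, %ah ends up 0. A C restatement, not code from the patch:

static unsigned long sketch_save_fl_direct(void)
{
	/* only X86_EFLAGS_IF is meaningful in the return value */
	return percpu_read(xen_vcpu_info.evtchn_upcall_mask) ? 0 : X86_EFLAGS_IF;
}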
74
75/*
76 In principle the caller should be passing us a value returned
77 from xen_save_fl_direct, but for robustness' sake we test only
78 the X86_EFLAGS_IF flag rather than the whole byte. After
79 setting the interrupt mask state, it checks for unmasked
80 pending events and enters the hypervisor to get them delivered
81 if so.
82 */
83ENTRY(xen_restore_fl_direct)
84#ifdef CONFIG_X86_64
85 testw $X86_EFLAGS_IF, %di
86#else
87 testb $X86_EFLAGS_IF>>8, %ah
88#endif
89 setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
90 /* Preempt here doesn't matter because that will deal with
91 any pending interrupts. The pending check may end up being
92 run on the wrong CPU, but that doesn't hurt. */
93
94 /* check for unmasked and pending */
95 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
96 jz 1f
972: call check_events
981:
99ENDPATCH(xen_restore_fl_direct)
100 ret
101 ENDPROC(xen_restore_fl_direct)
102 RELOC(xen_restore_fl_direct, 2b+1)
103
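And the restore side, per the comment above: only X86_EFLAGS_IF is honoured, Xen's mask has the opposite sense to EFLAGS.IF, and an unmasked pending event sends us into the hypervisor. An editor's sketch in the shape of xen_restore_fl() from irq.c, not code from the patch:

static void sketch_restore_fl_direct(unsigned long flags)
{
	percpu_write(xen_vcpu_info.evtchn_upcall_mask, !(flags & X86_EFLAGS_IF));

	if ((flags & X86_EFLAGS_IF) &&
	    percpu_read(xen_vcpu_info.evtchn_upcall_pending))
		xen_force_evtchn_callback();
}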
104
105/*
106 Force an event check by making a hypercall,
107 but preserve regs before making the call.
108 */
109check_events:
110#ifdef CONFIG_X86_32
111 push %eax
112 push %ecx
113 push %edx
114 call xen_force_evtchn_callback
115 pop %edx
116 pop %ecx
117 pop %eax
118#else
119 push %rax
120 push %rcx
121 push %rdx
122 push %rsi
123 push %rdi
124 push %r8
125 push %r9
126 push %r10
127 push %r11
128 call xen_force_evtchn_callback
129 pop %r11
130 pop %r10
131 pop %r9
132 pop %r8
133 pop %rdi
134 pop %rsi
135 pop %rdx
136 pop %rcx
137 pop %rax
138#endif
139 ret
140
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
new file mode 100644
index 000000000000..465276467a47
--- /dev/null
+++ b/arch/x86/xen/xen-asm.h
@@ -0,0 +1,12 @@
1#ifndef _XEN_XEN_ASM_H
2#define _XEN_XEN_ASM_H
3
4#include <linux/linkage.h>
5
6#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
7#define ENDPATCH(x) .globl x##_end; x##_end=.
8
9/* Pseudo-flag used for virtual NMI, which we don't implement yet */
10#define XEN_EFLAGS_NMI 0x80000000
11
12#endif
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 42786f59d9c0..082d173caaf3 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -11,101 +11,28 @@
11 generally too large to inline anyway. 11 generally too large to inline anyway.
12 */ 12 */
13 13
14#include <linux/linkage.h> 14//#include <asm/asm-offsets.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/thread_info.h> 15#include <asm/thread_info.h>
18#include <asm/percpu.h>
19#include <asm/processor-flags.h> 16#include <asm/processor-flags.h>
20#include <asm/segment.h> 17#include <asm/segment.h>
21 18
22#include <xen/interface/xen.h> 19#include <xen/interface/xen.h>
23 20
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v 21#include "xen-asm.h"
25#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
30/*
31 Enable events. This clears the event mask and tests the pending
32 event status with one and operation. If there are pending
33 events, then enter the hypervisor to get them handled.
34 */
35ENTRY(xen_irq_enable_direct)
36 /* Unmask events */
37 movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
38
39 /* Preempt here doesn't matter because that will deal with
40 any pending interrupts. The pending check may end up being
41 run on the wrong CPU, but that doesn't hurt. */
42
43 /* Test for pending */
44 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
45 jz 1f
46
472: call check_events
481:
49ENDPATCH(xen_irq_enable_direct)
50 ret
51 ENDPROC(xen_irq_enable_direct)
52 RELOC(xen_irq_enable_direct, 2b+1)
53
54 22
55/* 23/*
56 Disabling events is simply a matter of making the event mask 24 Force an event check by making a hypercall,
57 non-zero. 25 but preserve regs before making the call.
58 */
59ENTRY(xen_irq_disable_direct)
60 movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
61ENDPATCH(xen_irq_disable_direct)
62 ret
63 ENDPROC(xen_irq_disable_direct)
64 RELOC(xen_irq_disable_direct, 0)
65
66/*
67 (xen_)save_fl is used to get the current interrupt enable status.
68 Callers expect the status to be in X86_EFLAGS_IF, and other bits
69 may be set in the return value. We take advantage of this by
70 making sure that X86_EFLAGS_IF has the right value (and other bits
71 in that byte are 0), but other bits in the return value are
72 undefined. We need to toggle the state of the bit, because
73 Xen and x86 use opposite senses (mask vs enable).
74 */
75ENTRY(xen_save_fl_direct)
76 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
77 setz %ah
78 addb %ah,%ah
79ENDPATCH(xen_save_fl_direct)
80 ret
81 ENDPROC(xen_save_fl_direct)
82 RELOC(xen_save_fl_direct, 0)
83
84
85/*
86 In principle the caller should be passing us a value return
87 from xen_save_fl_direct, but for robustness sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */ 26 */
93ENTRY(xen_restore_fl_direct) 27check_events:
94 testb $X86_EFLAGS_IF>>8, %ah 28 push %eax
95 setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask 29 push %ecx
96 /* Preempt here doesn't matter because that will deal with 30 push %edx
97 any pending interrupts. The pending check may end up being 31 call xen_force_evtchn_callback
98 run on the wrong CPU, but that doesn't hurt. */ 32 pop %edx
99 33 pop %ecx
100 /* check for unmasked and pending */ 34 pop %eax
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret 35 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109 36
110/* 37/*
111 We can't use sysexit directly, because we're not running in ring0. 38 We can't use sysexit directly, because we're not running in ring0.
@@ -289,17 +216,3 @@ ENTRY(xen_iret_crit_fixup)
289 lea 4(%edi),%esp /* point esp to new frame */ 216 lea 4(%edi),%esp /* point esp to new frame */
2902: jmp xen_do_upcall 2172: jmp xen_do_upcall
291 218
292
293/*
294 Force an event check by making a hypercall,
295 but preserve regs before making the call.
296 */
297check_events:
298 push %eax
299 push %ecx
300 push %edx
301 call xen_force_evtchn_callback
302 pop %edx
303 pop %ecx
304 pop %eax
305 ret
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index d6fc51f4ce85..d205a283efe0 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -11,142 +11,14 @@
11 generally too large to inline anyway. 11 generally too large to inline anyway.
12 */ 12 */
13 13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h> 14#include <asm/errno.h>
19#include <asm/segment.h>
20#include <asm/percpu.h> 15#include <asm/percpu.h>
16#include <asm/processor-flags.h>
17#include <asm/segment.h>
21 18
22#include <xen/interface/xen.h> 19#include <xen/interface/xen.h>
23 20
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v 21#include "xen-asm.h"
25#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
30#if 1
31/*
 32 FIXME: x86_64 can now support direct access to percpu variables
 33 via a segment override. Update xen accordingly.
34 */
35#define BUG ud2a
36#endif
37
38/*
 39 Enable events. This clears the event mask and tests the pending
 40 event status with a single and operation. If there are pending
 41 events, then enter the hypervisor to get them handled.
42 */
43ENTRY(xen_irq_enable_direct)
44 BUG
45
46 /* Unmask events */
47 movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
48
49 /* Preempt here doesn't matter because that will deal with
50 any pending interrupts. The pending check may end up being
51 run on the wrong CPU, but that doesn't hurt. */
52
53 /* Test for pending */
54 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
55 jz 1f
56
572: call check_events
581:
59ENDPATCH(xen_irq_enable_direct)
60 ret
61 ENDPROC(xen_irq_enable_direct)
62 RELOC(xen_irq_enable_direct, 2b+1)
63
64/*
65 Disabling events is simply a matter of making the event mask
66 non-zero.
67 */
68ENTRY(xen_irq_disable_direct)
69 BUG
70
71 movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
72ENDPATCH(xen_irq_disable_direct)
73 ret
74 ENDPROC(xen_irq_disable_direct)
75 RELOC(xen_irq_disable_direct, 0)
76
77/*
78 (xen_)save_fl is used to get the current interrupt enable status.
79 Callers expect the status to be in X86_EFLAGS_IF, and other bits
80 may be set in the return value. We take advantage of this by
81 making sure that X86_EFLAGS_IF has the right value (and other bits
82 in that byte are 0), but other bits in the return value are
83 undefined. We need to toggle the state of the bit, because
84 Xen and x86 use opposite senses (mask vs enable).
85 */
86ENTRY(xen_save_fl_direct)
87 BUG
88
89 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
90 setz %ah
91 addb %ah,%ah
92ENDPATCH(xen_save_fl_direct)
93 ret
94 ENDPROC(xen_save_fl_direct)
95 RELOC(xen_save_fl_direct, 0)
96
97/*
 98 In principle the caller should be passing us a value returned
 99 from xen_save_fl_direct, but for robustness' sake we test only
100 the X86_EFLAGS_IF flag rather than the whole byte. After
101 setting the interrupt mask state, it checks for unmasked
102 pending events and enters the hypervisor to get them delivered
103 if there are any.
104 */
105ENTRY(xen_restore_fl_direct)
106 BUG
107
108 testb $X86_EFLAGS_IF>>8, %ah
109 setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
110 /* Preempt here doesn't matter because that will deal with
111 any pending interrupts. The pending check may end up being
112 run on the wrong CPU, but that doesn't hurt. */
113
114 /* check for unmasked and pending */
115 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
116 jz 1f
1172: call check_events
1181:
119ENDPATCH(xen_restore_fl_direct)
120 ret
121 ENDPROC(xen_restore_fl_direct)
122 RELOC(xen_restore_fl_direct, 2b+1)
123
124
125/*
126 Force an event check by making a hypercall,
127 but preserve regs before making the call.
128 */
129check_events:
130 push %rax
131 push %rcx
132 push %rdx
133 push %rsi
134 push %rdi
135 push %r8
136 push %r9
137 push %r10
138 push %r11
139 call xen_force_evtchn_callback
140 pop %r11
141 pop %r10
142 pop %r9
143 pop %r8
144 pop %rdi
145 pop %rsi
146 pop %rdx
147 pop %rcx
148 pop %rax
149 ret
150 22
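
check_events must look like a no-op to its callers: it is reached from the patchable *_direct stubs, which get inlined into code that does not expect the usual C calling convention to apply. The 32-bit version therefore saves eax, ecx and edx, and this 64-bit version saves every register the x86-64 SysV ABI allows a called C function to clobber (rax, rcx, rdx, rsi, rdi, r8-r11) around the call to xen_force_evtchn_callback. The upshot, sketched below with a purely hypothetical wrapper, is that a call site needs no general-purpose register clobbers of its own:

    /* Illustrative only: check_events preserves all call-clobbered GPRs
     * itself, so an inline-asm call site would not need to list any
     * register clobbers.  This ignores the x86-64 red zone and the fact
     * that check_events is a file-local symbol, so treat it purely as a
     * sketch of the clobber contract, not as usable code. */
    static inline void call_check_events(void)
    {
    	asm volatile("call check_events" ::: "memory");
    }
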
151ENTRY(xen_adjust_exception_frame) 23ENTRY(xen_adjust_exception_frame)
152 mov 8+0(%rsp),%rcx 24 mov 8+0(%rsp),%rcx
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index c1f8faf0a2c5..2f5ef2632ea2 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -10,9 +10,12 @@
10extern const char xen_hypervisor_callback[]; 10extern const char xen_hypervisor_callback[];
11extern const char xen_failsafe_callback[]; 11extern const char xen_failsafe_callback[];
12 12
13extern void *xen_initial_gdt;
14
13struct trap_info; 15struct trap_info;
14void xen_copy_trap_info(struct trap_info *traps); 16void xen_copy_trap_info(struct trap_info *traps);
15 17
18DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
16DECLARE_PER_CPU(unsigned long, xen_cr3); 19DECLARE_PER_CPU(unsigned long, xen_cr3);
17DECLARE_PER_CPU(unsigned long, xen_current_cr3); 20DECLARE_PER_CPU(unsigned long, xen_current_cr3);
18 21
@@ -22,6 +25,13 @@ extern struct shared_info *HYPERVISOR_shared_info;
22 25
23void xen_setup_mfn_list_list(void); 26void xen_setup_mfn_list_list(void);
24void xen_setup_shared_info(void); 27void xen_setup_shared_info(void);
28void xen_setup_machphys_mapping(void);
29pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void);
31void xen_reserve_top(void);
32
33void xen_leave_lazy(void);
34void xen_post_allocator_init(void);
25 35
26char * __init xen_memory_setup(void); 36char * __init xen_memory_setup(void);
27void __init xen_arch_setup(void); 37void __init xen_arch_setup(void);
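
The DECLARE_PER_CPU() added here is what lets C and assembly name the same per-cpu object: a matching DEFINE_PER_CPU() in a .c file provides the storage, and the assembly above reaches its fields via PER_CPU_VAR() plus constants generated by the asm-offsets machinery. A minimal sketch of that pairing, with the OFFSET() line shown only as an assumed example of how XEN_vcpu_info_mask would be produced:

    #include <linux/percpu.h>
    #include <xen/interface/xen.h>	/* struct vcpu_info */

    /* C side: storage matching the DECLARE_PER_CPU() in xen-ops.h. */
    DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);

    /*
     * asm-offsets side (assumed, for orientation only):
     *
     *	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
     *
     * After that, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask in the
     * assembly addresses this CPU's evtchn_upcall_mask byte.
     */
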