diff options
author | Ingo Molnar <mingo@elte.hu> | 2008-10-19 13:04:47 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-10-19 13:04:47 -0400 |
commit | 3e10e879a8c334a5927d800a3663a24d562cfa31 (patch) | |
tree | 5d18bc7e38c986a044e99aa0d0a4aff4931ec7d0 /arch/x86 | |
parent | 98d9c66ab07471006fd7910cb16453581c41a3e7 (diff) | |
parent | 0cfd81031a26717fe14380d18275f8e217571615 (diff) |
Merge branch 'linus' into tracing-v28-for-linus-v3
Conflicts:
init/main.c
kernel/module.c
scripts/bootgraph.pl
Diffstat (limited to 'arch/x86')
37 files changed, 1996 insertions, 1121 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5a34c5427a07..40ee80809562 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -18,6 +18,7 @@ config X86_64 | |||
18 | ### Arch settings | 18 | ### Arch settings |
19 | config X86 | 19 | config X86 |
20 | def_bool y | 20 | def_bool y |
21 | select HAVE_AOUT if X86_32 | ||
21 | select HAVE_UNSTABLE_SCHED_CLOCK | 22 | select HAVE_UNSTABLE_SCHED_CLOCK |
22 | select HAVE_IDE | 23 | select HAVE_IDE |
23 | select HAVE_OPROFILE | 24 | select HAVE_OPROFILE |
@@ -39,10 +40,6 @@ config ARCH_DEFCONFIG | |||
39 | default "arch/x86/configs/i386_defconfig" if X86_32 | 40 | default "arch/x86/configs/i386_defconfig" if X86_32 |
40 | default "arch/x86/configs/x86_64_defconfig" if X86_64 | 41 | default "arch/x86/configs/x86_64_defconfig" if X86_64 |
41 | 42 | ||
42 | |||
43 | config GENERIC_LOCKBREAK | ||
44 | def_bool n | ||
45 | |||
46 | config GENERIC_TIME | 43 | config GENERIC_TIME |
47 | def_bool y | 44 | def_bool y |
48 | 45 | ||
@@ -95,7 +92,7 @@ config GENERIC_HWEIGHT | |||
95 | def_bool y | 92 | def_bool y |
96 | 93 | ||
97 | config GENERIC_GPIO | 94 | config GENERIC_GPIO |
98 | def_bool n | 95 | bool |
99 | 96 | ||
100 | config ARCH_MAY_HAVE_PC_FDC | 97 | config ARCH_MAY_HAVE_PC_FDC |
101 | def_bool y | 98 | def_bool y |
@@ -106,12 +103,6 @@ config RWSEM_GENERIC_SPINLOCK | |||
106 | config RWSEM_XCHGADD_ALGORITHM | 103 | config RWSEM_XCHGADD_ALGORITHM |
107 | def_bool X86_XADD | 104 | def_bool X86_XADD |
108 | 105 | ||
109 | config ARCH_HAS_ILOG2_U32 | ||
110 | def_bool n | ||
111 | |||
112 | config ARCH_HAS_ILOG2_U64 | ||
113 | def_bool n | ||
114 | |||
115 | config ARCH_HAS_CPU_IDLE_WAIT | 106 | config ARCH_HAS_CPU_IDLE_WAIT |
116 | def_bool y | 107 | def_bool y |
117 | 108 | ||
@@ -153,9 +144,6 @@ config AUDIT_ARCH | |||
153 | bool | 144 | bool |
154 | default X86_64 | 145 | default X86_64 |
155 | 146 | ||
156 | config ARCH_SUPPORTS_AOUT | ||
157 | def_bool y | ||
158 | |||
159 | config ARCH_SUPPORTS_OPTIMIZED_INLINING | 147 | config ARCH_SUPPORTS_OPTIMIZED_INLINING |
160 | def_bool y | 148 | def_bool y |
161 | 149 | ||
@@ -761,9 +749,8 @@ config I8K | |||
761 | Say N otherwise. | 749 | Say N otherwise. |
762 | 750 | ||
763 | config X86_REBOOTFIXUPS | 751 | config X86_REBOOTFIXUPS |
764 | def_bool n | 752 | bool "Enable X86 board specific fixups for reboot" |
765 | prompt "Enable X86 board specific fixups for reboot" | 753 | depends on X86_32 |
766 | depends on X86_32 && X86 | ||
767 | ---help--- | 754 | ---help--- |
768 | This enables chipset and/or board specific fixups to be done | 755 | This enables chipset and/or board specific fixups to be done |
769 | in order to get reboot to work correctly. This is only needed on | 756 | in order to get reboot to work correctly. This is only needed on |
@@ -947,16 +934,17 @@ config HIGHMEM | |||
947 | depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) | 934 | depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) |
948 | 935 | ||
949 | config X86_PAE | 936 | config X86_PAE |
950 | def_bool n | 937 | bool "PAE (Physical Address Extension) Support" |
951 | prompt "PAE (Physical Address Extension) Support" | ||
952 | depends on X86_32 && !HIGHMEM4G | 938 | depends on X86_32 && !HIGHMEM4G |
953 | select RESOURCES_64BIT | ||
954 | help | 939 | help |
955 | PAE is required for NX support, and furthermore enables | 940 | PAE is required for NX support, and furthermore enables |
956 | larger swapspace support for non-overcommit purposes. It | 941 | larger swapspace support for non-overcommit purposes. It |
957 | has the cost of more pagetable lookup overhead, and also | 942 | has the cost of more pagetable lookup overhead, and also |
958 | consumes more pagetable space per process. | 943 | consumes more pagetable space per process. |
959 | 944 | ||
945 | config ARCH_PHYS_ADDR_T_64BIT | ||
946 | def_bool X86_64 || X86_PAE | ||
947 | |||
960 | # Common NUMA Features | 948 | # Common NUMA Features |
961 | config NUMA | 949 | config NUMA |
962 | bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" | 950 | bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" |
@@ -1241,8 +1229,7 @@ config X86_PAT | |||
1241 | If unsure, say Y. | 1229 | If unsure, say Y. |
1242 | 1230 | ||
1243 | config EFI | 1231 | config EFI |
1244 | def_bool n | 1232 | bool "EFI runtime service support" |
1245 | prompt "EFI runtime service support" | ||
1246 | depends on ACPI | 1233 | depends on ACPI |
1247 | ---help--- | 1234 | ---help--- |
1248 | This enables the kernel to use EFI runtime services that are | 1235 | This enables the kernel to use EFI runtime services that are |
@@ -1886,7 +1873,7 @@ config IA32_EMULATION | |||
1886 | 1873 | ||
1887 | config IA32_AOUT | 1874 | config IA32_AOUT |
1888 | tristate "IA32 a.out support" | 1875 | tristate "IA32 a.out support" |
1889 | depends on IA32_EMULATION && ARCH_SUPPORTS_AOUT | 1876 | depends on IA32_EMULATION |
1890 | help | 1877 | help |
1891 | Support old a.out binaries in the 32bit emulation. | 1878 | Support old a.out binaries in the 32bit emulation. |
1892 | 1879 | ||
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 1e6fe0214c85..99b3079dc6ab 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c | |||
@@ -88,14 +88,11 @@ static int vesa_probe(void) | |||
88 | (vminfo.memory_layout == 4 || | 88 | (vminfo.memory_layout == 4 || |
89 | vminfo.memory_layout == 6) && | 89 | vminfo.memory_layout == 6) && |
90 | vminfo.memory_planes == 1) { | 90 | vminfo.memory_planes == 1) { |
91 | #ifdef CONFIG_FB | 91 | #ifdef CONFIG_FB_BOOT_VESA_SUPPORT |
92 | /* Graphics mode, color, linear frame buffer | 92 | /* Graphics mode, color, linear frame buffer |
93 | supported. Only register the mode if | 93 | supported. Only register the mode if |
94 | if framebuffer is configured, however, | 94 | if framebuffer is configured, however, |
95 | otherwise the user will be left without a screen. | 95 | otherwise the user will be left without a screen. */ |
96 | We don't require CONFIG_FB_VESA, however, since | ||
97 | some of the other framebuffer drivers can use | ||
98 | this mode-setting, too. */ | ||
99 | mi = GET_HEAP(struct mode_info, 1); | 96 | mi = GET_HEAP(struct mode_info, 1); |
100 | mi->mode = mode + VIDEO_FIRST_VESA; | 97 | mi->mode = mode + VIDEO_FIRST_VESA; |
101 | mi->depth = vminfo.bpp; | 98 | mi->depth = vminfo.bpp; |
@@ -133,10 +130,12 @@ static int vesa_set_mode(struct mode_info *mode) | |||
133 | if ((vminfo.mode_attr & 0x15) == 0x05) { | 130 | if ((vminfo.mode_attr & 0x15) == 0x05) { |
134 | /* It's a supported text mode */ | 131 | /* It's a supported text mode */ |
135 | is_graphic = 0; | 132 | is_graphic = 0; |
133 | #ifdef CONFIG_FB_BOOT_VESA_SUPPORT | ||
136 | } else if ((vminfo.mode_attr & 0x99) == 0x99) { | 134 | } else if ((vminfo.mode_attr & 0x99) == 0x99) { |
137 | /* It's a graphics mode with linear frame buffer */ | 135 | /* It's a graphics mode with linear frame buffer */ |
138 | is_graphic = 1; | 136 | is_graphic = 1; |
139 | vesa_mode |= 0x4000; /* Request linear frame buffer */ | 137 | vesa_mode |= 0x4000; /* Request linear frame buffer */ |
138 | #endif | ||
140 | } else { | 139 | } else { |
141 | return -1; /* Invalid mode */ | 140 | return -1; /* Invalid mode */ |
142 | } | 141 | } |
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index eb4314768bf7..256b00b61892 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S | |||
@@ -571,8 +571,8 @@ ia32_sys_call_table: | |||
571 | .quad compat_sys_setrlimit /* 75 */ | 571 | .quad compat_sys_setrlimit /* 75 */ |
572 | .quad compat_sys_old_getrlimit /* old_getrlimit */ | 572 | .quad compat_sys_old_getrlimit /* old_getrlimit */ |
573 | .quad compat_sys_getrusage | 573 | .quad compat_sys_getrusage |
574 | .quad sys32_gettimeofday | 574 | .quad compat_sys_gettimeofday |
575 | .quad sys32_settimeofday | 575 | .quad compat_sys_settimeofday |
576 | .quad sys_getgroups16 /* 80 */ | 576 | .quad sys_getgroups16 /* 80 */ |
577 | .quad sys_setgroups16 | 577 | .quad sys_setgroups16 |
578 | .quad sys32_old_select | 578 | .quad sys32_old_select |
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index beda4232ce69..2e09dcd3c0a6 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c | |||
@@ -49,41 +49,6 @@ | |||
49 | 49 | ||
50 | #define AA(__x) ((unsigned long)(__x)) | 50 | #define AA(__x) ((unsigned long)(__x)) |
51 | 51 | ||
52 | int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf) | ||
53 | { | ||
54 | compat_ino_t ino; | ||
55 | |||
56 | typeof(ubuf->st_uid) uid = 0; | ||
57 | typeof(ubuf->st_gid) gid = 0; | ||
58 | SET_UID(uid, kbuf->uid); | ||
59 | SET_GID(gid, kbuf->gid); | ||
60 | if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev)) | ||
61 | return -EOVERFLOW; | ||
62 | if (kbuf->size >= 0x7fffffff) | ||
63 | return -EOVERFLOW; | ||
64 | ino = kbuf->ino; | ||
65 | if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino) | ||
66 | return -EOVERFLOW; | ||
67 | if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) || | ||
68 | __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) || | ||
69 | __put_user(ino, &ubuf->st_ino) || | ||
70 | __put_user(kbuf->mode, &ubuf->st_mode) || | ||
71 | __put_user(kbuf->nlink, &ubuf->st_nlink) || | ||
72 | __put_user(uid, &ubuf->st_uid) || | ||
73 | __put_user(gid, &ubuf->st_gid) || | ||
74 | __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || | ||
75 | __put_user(kbuf->size, &ubuf->st_size) || | ||
76 | __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) || | ||
77 | __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || | ||
78 | __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) || | ||
79 | __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || | ||
80 | __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) || | ||
81 | __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || | ||
82 | __put_user(kbuf->blksize, &ubuf->st_blksize) || | ||
83 | __put_user(kbuf->blocks, &ubuf->st_blocks)) | ||
84 | return -EFAULT; | ||
85 | return 0; | ||
86 | } | ||
87 | 52 | ||
88 | asmlinkage long sys32_truncate64(char __user *filename, | 53 | asmlinkage long sys32_truncate64(char __user *filename, |
89 | unsigned long offset_low, | 54 | unsigned long offset_low, |
@@ -402,75 +367,11 @@ asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, | |||
402 | return 0; | 367 | return 0; |
403 | } | 368 | } |
404 | 369 | ||
405 | static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i) | ||
406 | { | ||
407 | int err = -EFAULT; | ||
408 | |||
409 | if (access_ok(VERIFY_READ, i, sizeof(*i))) { | ||
410 | err = __get_user(o->tv_sec, &i->tv_sec); | ||
411 | err |= __get_user(o->tv_usec, &i->tv_usec); | ||
412 | } | ||
413 | return err; | ||
414 | } | ||
415 | |||
416 | static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i) | ||
417 | { | ||
418 | int err = -EFAULT; | ||
419 | |||
420 | if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { | ||
421 | err = __put_user(i->tv_sec, &o->tv_sec); | ||
422 | err |= __put_user(i->tv_usec, &o->tv_usec); | ||
423 | } | ||
424 | return err; | ||
425 | } | ||
426 | |||
427 | asmlinkage long sys32_alarm(unsigned int seconds) | 370 | asmlinkage long sys32_alarm(unsigned int seconds) |
428 | { | 371 | { |
429 | return alarm_setitimer(seconds); | 372 | return alarm_setitimer(seconds); |
430 | } | 373 | } |
431 | 374 | ||
432 | /* | ||
433 | * Translations due to time_t size differences. Which affects all | ||
434 | * sorts of things, like timeval and itimerval. | ||
435 | */ | ||
436 | asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv, | ||
437 | struct timezone __user *tz) | ||
438 | { | ||
439 | if (tv) { | ||
440 | struct timeval ktv; | ||
441 | |||
442 | do_gettimeofday(&ktv); | ||
443 | if (put_tv32(tv, &ktv)) | ||
444 | return -EFAULT; | ||
445 | } | ||
446 | if (tz) { | ||
447 | if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) | ||
448 | return -EFAULT; | ||
449 | } | ||
450 | return 0; | ||
451 | } | ||
452 | |||
453 | asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv, | ||
454 | struct timezone __user *tz) | ||
455 | { | ||
456 | struct timeval ktv; | ||
457 | struct timespec kts; | ||
458 | struct timezone ktz; | ||
459 | |||
460 | if (tv) { | ||
461 | if (get_tv32(&ktv, tv)) | ||
462 | return -EFAULT; | ||
463 | kts.tv_sec = ktv.tv_sec; | ||
464 | kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC; | ||
465 | } | ||
466 | if (tz) { | ||
467 | if (copy_from_user(&ktz, tz, sizeof(ktz))) | ||
468 | return -EFAULT; | ||
469 | } | ||
470 | |||
471 | return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); | ||
472 | } | ||
473 | |||
474 | struct sel_arg_struct { | 375 | struct sel_arg_struct { |
475 | unsigned int n; | 376 | unsigned int n; |
476 | unsigned int inp; | 377 | unsigned int inp; |
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 34e4d112b1ef..a8fd9ebdc8e2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c | |||
@@ -295,7 +295,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, | |||
295 | u64 address, size_t size) | 295 | u64 address, size_t size) |
296 | { | 296 | { |
297 | int s = 0; | 297 | int s = 0; |
298 | unsigned pages = iommu_num_pages(address, size); | 298 | unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); |
299 | 299 | ||
300 | address &= PAGE_MASK; | 300 | address &= PAGE_MASK; |
301 | 301 | ||
@@ -680,7 +680,8 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, | |||
680 | iommu->exclusion_start < dma_dom->aperture_size) { | 680 | iommu->exclusion_start < dma_dom->aperture_size) { |
681 | unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; | 681 | unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; |
682 | int pages = iommu_num_pages(iommu->exclusion_start, | 682 | int pages = iommu_num_pages(iommu->exclusion_start, |
683 | iommu->exclusion_length); | 683 | iommu->exclusion_length, |
684 | PAGE_SIZE); | ||
684 | dma_ops_reserve_addresses(dma_dom, startpage, pages); | 685 | dma_ops_reserve_addresses(dma_dom, startpage, pages); |
685 | } | 686 | } |
686 | 687 | ||
@@ -935,7 +936,7 @@ static dma_addr_t __map_single(struct device *dev, | |||
935 | unsigned long align_mask = 0; | 936 | unsigned long align_mask = 0; |
936 | int i; | 937 | int i; |
937 | 938 | ||
938 | pages = iommu_num_pages(paddr, size); | 939 | pages = iommu_num_pages(paddr, size, PAGE_SIZE); |
939 | paddr &= PAGE_MASK; | 940 | paddr &= PAGE_MASK; |
940 | 941 | ||
941 | if (align) | 942 | if (align) |
@@ -980,7 +981,7 @@ static void __unmap_single(struct amd_iommu *iommu, | |||
980 | if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) | 981 | if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) |
981 | return; | 982 | return; |
982 | 983 | ||
983 | pages = iommu_num_pages(dma_addr, size); | 984 | pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); |
984 | dma_addr &= PAGE_MASK; | 985 | dma_addr &= PAGE_MASK; |
985 | start = dma_addr; | 986 | start = dma_addr; |
986 | 987 | ||
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 6a44d6465991..72cefd1e649b 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c | |||
@@ -147,8 +147,8 @@ static __cpuinit int cpuid_device_create(int cpu) | |||
147 | { | 147 | { |
148 | struct device *dev; | 148 | struct device *dev; |
149 | 149 | ||
150 | dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), | 150 | dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, |
151 | NULL, "cpu%d", cpu); | 151 | "cpu%d", cpu); |
152 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; | 152 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; |
153 | } | 153 | } |
154 | 154 | ||
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 201ee359a1a9..1a78180f08d3 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/kexec.h> | 13 | #include <linux/kexec.h> |
14 | #include <linux/bug.h> | 14 | #include <linux/bug.h> |
15 | #include <linux/nmi.h> | 15 | #include <linux/nmi.h> |
16 | #include <linux/sysfs.h> | ||
16 | 17 | ||
17 | #include <asm/stacktrace.h> | 18 | #include <asm/stacktrace.h> |
18 | 19 | ||
@@ -343,6 +344,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) | |||
343 | printk("DEBUG_PAGEALLOC"); | 344 | printk("DEBUG_PAGEALLOC"); |
344 | #endif | 345 | #endif |
345 | printk("\n"); | 346 | printk("\n"); |
347 | sysfs_printk_last_file(); | ||
346 | if (notify_die(DIE_OOPS, str, regs, err, | 348 | if (notify_die(DIE_OOPS, str, regs, err, |
347 | current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) | 349 | current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) |
348 | return 1; | 350 | return 1; |
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 086cc8118e39..96a5db7da8a7 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/kexec.h> | 13 | #include <linux/kexec.h> |
14 | #include <linux/bug.h> | 14 | #include <linux/bug.h> |
15 | #include <linux/nmi.h> | 15 | #include <linux/nmi.h> |
16 | #include <linux/sysfs.h> | ||
16 | 17 | ||
17 | #include <asm/stacktrace.h> | 18 | #include <asm/stacktrace.h> |
18 | 19 | ||
@@ -489,6 +490,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) | |||
489 | printk("DEBUG_PAGEALLOC"); | 490 | printk("DEBUG_PAGEALLOC"); |
490 | #endif | 491 | #endif |
491 | printk("\n"); | 492 | printk("\n"); |
493 | sysfs_printk_last_file(); | ||
492 | if (notify_die(DIE_OOPS, str, regs, err, | 494 | if (notify_die(DIE_OOPS, str, regs, err, |
493 | current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) | 495 | current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) |
494 | return 1; | 496 | return 1; |
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 78e642feac30..ce97bf3bed12 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -1282,12 +1282,10 @@ void __init e820_reserve_resources(void) | |||
1282 | e820_res = res; | 1282 | e820_res = res; |
1283 | for (i = 0; i < e820.nr_map; i++) { | 1283 | for (i = 0; i < e820.nr_map; i++) { |
1284 | end = e820.map[i].addr + e820.map[i].size - 1; | 1284 | end = e820.map[i].addr + e820.map[i].size - 1; |
1285 | #ifndef CONFIG_RESOURCES_64BIT | 1285 | if (end != (resource_size_t)end) { |
1286 | if (end > 0x100000000ULL) { | ||
1287 | res++; | 1286 | res++; |
1288 | continue; | 1287 | continue; |
1289 | } | 1288 | } |
1290 | #endif | ||
1291 | res->name = e820_type_to_string(e820.map[i].type); | 1289 | res->name = e820_type_to_string(e820.map[i].type); |
1292 | res->start = e820.map[i].addr; | 1290 | res->start = e820.map[i].addr; |
1293 | res->end = end; | 1291 | res->end = end; |
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d02def06ca91..774ac4991568 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void) | |||
78 | return ret; | 78 | return ret; |
79 | } | 79 | } |
80 | 80 | ||
81 | /* | ||
82 | * If we don't do that, there is the possibility that the guest | ||
83 | * will calibrate under heavy load - thus, getting a lower lpj - | ||
84 | * and execute the delays themselves without load. This is wrong, | ||
85 | * because no delay loop can finish beforehand. | ||
86 | * Any heuristics is subject to fail, because ultimately, a large | ||
87 | * poll of guests can be running and trouble each other. So we preset | ||
88 | * lpj here | ||
89 | */ | ||
90 | static unsigned long kvm_get_tsc_khz(void) | ||
91 | { | ||
92 | return preset_lpj; | ||
93 | } | ||
94 | |||
95 | static void kvm_get_preset_lpj(void) | ||
96 | { | ||
97 | struct pvclock_vcpu_time_info *src; | ||
98 | unsigned long khz; | ||
99 | u64 lpj; | ||
100 | |||
101 | src = &per_cpu(hv_clock, 0); | ||
102 | khz = pvclock_tsc_khz(src); | ||
103 | |||
104 | lpj = ((u64)khz * 1000); | ||
105 | do_div(lpj, HZ); | ||
106 | preset_lpj = lpj; | ||
107 | } | ||
108 | |||
81 | static struct clocksource kvm_clock = { | 109 | static struct clocksource kvm_clock = { |
82 | .name = "kvm-clock", | 110 | .name = "kvm-clock", |
83 | .read = kvm_clock_read, | 111 | .read = kvm_clock_read, |
@@ -153,6 +181,7 @@ void __init kvmclock_init(void) | |||
153 | pv_time_ops.get_wallclock = kvm_get_wallclock; | 181 | pv_time_ops.get_wallclock = kvm_get_wallclock; |
154 | pv_time_ops.set_wallclock = kvm_set_wallclock; | 182 | pv_time_ops.set_wallclock = kvm_set_wallclock; |
155 | pv_time_ops.sched_clock = kvm_clock_read; | 183 | pv_time_ops.sched_clock = kvm_clock_read; |
184 | pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; | ||
156 | #ifdef CONFIG_X86_LOCAL_APIC | 185 | #ifdef CONFIG_X86_LOCAL_APIC |
157 | pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; | 186 | pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; |
158 | #endif | 187 | #endif |
@@ -163,6 +192,7 @@ void __init kvmclock_init(void) | |||
163 | #ifdef CONFIG_KEXEC | 192 | #ifdef CONFIG_KEXEC |
164 | machine_ops.crash_shutdown = kvm_crash_shutdown; | 193 | machine_ops.crash_shutdown = kvm_crash_shutdown; |
165 | #endif | 194 | #endif |
195 | kvm_get_preset_lpj(); | ||
166 | clocksource_register(&kvm_clock); | 196 | clocksource_register(&kvm_clock); |
167 | } | 197 | } |
168 | } | 198 | } |
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 2e2af5d18191..82a7c7ed6d45 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c | |||
@@ -163,8 +163,8 @@ static int __cpuinit msr_device_create(int cpu) | |||
163 | { | 163 | { |
164 | struct device *dev; | 164 | struct device *dev; |
165 | 165 | ||
166 | dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), | 166 | dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, |
167 | NULL, "msr%d", cpu); | 167 | "msr%d", cpu); |
168 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; | 168 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; |
169 | } | 169 | } |
170 | 170 | ||
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 080d1d27f37a..e1e731d78f38 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c | |||
@@ -217,16 +217,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, | |||
217 | 217 | ||
218 | #endif /* CONFIG_IOMMU_DEBUG */ | 218 | #endif /* CONFIG_IOMMU_DEBUG */ |
219 | 219 | ||
220 | static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) | ||
221 | { | ||
222 | unsigned int npages; | ||
223 | |||
224 | npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); | ||
225 | npages >>= PAGE_SHIFT; | ||
226 | |||
227 | return npages; | ||
228 | } | ||
229 | |||
230 | static inline int translation_enabled(struct iommu_table *tbl) | 220 | static inline int translation_enabled(struct iommu_table *tbl) |
231 | { | 221 | { |
232 | /* only PHBs with translation enabled have an IOMMU table */ | 222 | /* only PHBs with translation enabled have an IOMMU table */ |
@@ -408,7 +398,7 @@ static void calgary_unmap_sg(struct device *dev, | |||
408 | if (dmalen == 0) | 398 | if (dmalen == 0) |
409 | break; | 399 | break; |
410 | 400 | ||
411 | npages = num_dma_pages(dma, dmalen); | 401 | npages = iommu_num_pages(dma, dmalen, PAGE_SIZE); |
412 | iommu_free(tbl, dma, npages); | 402 | iommu_free(tbl, dma, npages); |
413 | } | 403 | } |
414 | } | 404 | } |
@@ -427,7 +417,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, | |||
427 | BUG_ON(!sg_page(s)); | 417 | BUG_ON(!sg_page(s)); |
428 | 418 | ||
429 | vaddr = (unsigned long) sg_virt(s); | 419 | vaddr = (unsigned long) sg_virt(s); |
430 | npages = num_dma_pages(vaddr, s->length); | 420 | npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); |
431 | 421 | ||
432 | entry = iommu_range_alloc(dev, tbl, npages); | 422 | entry = iommu_range_alloc(dev, tbl, npages); |
433 | if (entry == bad_dma_address) { | 423 | if (entry == bad_dma_address) { |
@@ -464,7 +454,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, | |||
464 | struct iommu_table *tbl = find_iommu_table(dev); | 454 | struct iommu_table *tbl = find_iommu_table(dev); |
465 | 455 | ||
466 | uaddr = (unsigned long)vaddr; | 456 | uaddr = (unsigned long)vaddr; |
467 | npages = num_dma_pages(uaddr, size); | 457 | npages = iommu_num_pages(uaddr, size, PAGE_SIZE); |
468 | 458 | ||
469 | return iommu_alloc(dev, tbl, vaddr, npages, direction); | 459 | return iommu_alloc(dev, tbl, vaddr, npages, direction); |
470 | } | 460 | } |
@@ -475,7 +465,7 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, | |||
475 | struct iommu_table *tbl = find_iommu_table(dev); | 465 | struct iommu_table *tbl = find_iommu_table(dev); |
476 | unsigned int npages; | 466 | unsigned int npages; |
477 | 467 | ||
478 | npages = num_dma_pages(dma_handle, size); | 468 | npages = iommu_num_pages(dma_handle, size, PAGE_SIZE); |
479 | iommu_free(tbl, dma_handle, npages); | 469 | iommu_free(tbl, dma_handle, npages); |
480 | } | 470 | } |
481 | 471 | ||
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 0a3824e837b4..192624820217 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c | |||
@@ -125,13 +125,13 @@ void __init pci_iommu_alloc(void) | |||
125 | pci_swiotlb_init(); | 125 | pci_swiotlb_init(); |
126 | } | 126 | } |
127 | 127 | ||
128 | unsigned long iommu_num_pages(unsigned long addr, unsigned long len) | 128 | unsigned long iommu_nr_pages(unsigned long addr, unsigned long len) |
129 | { | 129 | { |
130 | unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); | 130 | unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); |
131 | 131 | ||
132 | return size >> PAGE_SHIFT; | 132 | return size >> PAGE_SHIFT; |
133 | } | 133 | } |
134 | EXPORT_SYMBOL(iommu_num_pages); | 134 | EXPORT_SYMBOL(iommu_nr_pages); |
135 | #endif | 135 | #endif |
136 | 136 | ||
137 | void *dma_generic_alloc_coherent(struct device *dev, size_t size, | 137 | void *dma_generic_alloc_coherent(struct device *dev, size_t size, |
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 145f1c83369f..e3f75bbcedea 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c | |||
@@ -231,7 +231,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size) | |||
231 | static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, | 231 | static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, |
232 | size_t size, int dir, unsigned long align_mask) | 232 | size_t size, int dir, unsigned long align_mask) |
233 | { | 233 | { |
234 | unsigned long npages = iommu_num_pages(phys_mem, size); | 234 | unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); |
235 | unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); | 235 | unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); |
236 | int i; | 236 | int i; |
237 | 237 | ||
@@ -285,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, | |||
285 | return; | 285 | return; |
286 | 286 | ||
287 | iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; | 287 | iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; |
288 | npages = iommu_num_pages(dma_addr, size); | 288 | npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); |
289 | for (i = 0; i < npages; i++) { | 289 | for (i = 0; i < npages; i++) { |
290 | iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; | 290 | iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; |
291 | CLEAR_LEAK(iommu_page + i); | 291 | CLEAR_LEAK(iommu_page + i); |
@@ -368,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, | |||
368 | } | 368 | } |
369 | 369 | ||
370 | addr = phys_addr; | 370 | addr = phys_addr; |
371 | pages = iommu_num_pages(s->offset, s->length); | 371 | pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); |
372 | while (pages--) { | 372 | while (pages--) { |
373 | iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); | 373 | iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); |
374 | SET_LEAK(iommu_page); | 374 | SET_LEAK(iommu_page); |
@@ -451,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) | |||
451 | 451 | ||
452 | seg_size += s->length; | 452 | seg_size += s->length; |
453 | need = nextneed; | 453 | need = nextneed; |
454 | pages += iommu_num_pages(s->offset, s->length); | 454 | pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); |
455 | ps = s; | 455 | ps = s; |
456 | } | 456 | } |
457 | if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) | 457 | if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) |
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 05fbe9a0325a..4f9c55f3a7c0 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c | |||
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, | |||
97 | return dst->version; | 97 | return dst->version; |
98 | } | 98 | } |
99 | 99 | ||
100 | unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) | ||
101 | { | ||
102 | u64 pv_tsc_khz = 1000000ULL << 32; | ||
103 | |||
104 | do_div(pv_tsc_khz, src->tsc_to_system_mul); | ||
105 | if (src->tsc_shift < 0) | ||
106 | pv_tsc_khz <<= -src->tsc_shift; | ||
107 | else | ||
108 | pv_tsc_khz >>= src->tsc_shift; | ||
109 | return pv_tsc_khz; | ||
110 | } | ||
111 | |||
100 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) | 112 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) |
101 | { | 113 | { |
102 | struct pvclock_shadow_time shadow; | 114 | struct pvclock_shadow_time shadow; |
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 05191bbc68b8..0a23b5795b25 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c | |||
@@ -223,11 +223,25 @@ static struct platform_device rtc_device = { | |||
223 | static __init int add_rtc_cmos(void) | 223 | static __init int add_rtc_cmos(void) |
224 | { | 224 | { |
225 | #ifdef CONFIG_PNP | 225 | #ifdef CONFIG_PNP |
226 | if (!pnp_platform_devices) | 226 | static const char *ids[] __initconst = |
227 | platform_device_register(&rtc_device); | 227 | { "PNP0b00", "PNP0b01", "PNP0b02", }; |
228 | #else | 228 | struct pnp_dev *dev; |
229 | struct pnp_id *id; | ||
230 | int i; | ||
231 | |||
232 | pnp_for_each_dev(dev) { | ||
233 | for (id = dev->id; id; id = id->next) { | ||
234 | for (i = 0; i < ARRAY_SIZE(ids); i++) { | ||
235 | if (compare_pnp_id(id, ids[i]) != 0) | ||
236 | return 0; | ||
237 | } | ||
238 | } | ||
239 | } | ||
240 | #endif | ||
241 | |||
229 | platform_device_register(&rtc_device); | 242 | platform_device_register(&rtc_device); |
230 | #endif /* CONFIG_PNP */ | 243 | dev_info(&rtc_device.dev, |
244 | "registered platform RTC device (no PNP device found)\n"); | ||
231 | return 0; | 245 | return 0; |
232 | } | 246 | } |
233 | device_initcall(add_rtc_cmos); | 247 | device_initcall(add_rtc_cmos); |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8c3aca7cb343..7ed9e070a6e9 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -282,6 +282,8 @@ static void __cpuinit smp_callin(void) | |||
282 | cpu_set(cpuid, cpu_callin_map); | 282 | cpu_set(cpuid, cpu_callin_map); |
283 | } | 283 | } |
284 | 284 | ||
285 | static int __cpuinitdata unsafe_smp; | ||
286 | |||
285 | /* | 287 | /* |
286 | * Activate a secondary processor. | 288 | * Activate a secondary processor. |
287 | */ | 289 | */ |
@@ -397,7 +399,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) | |||
397 | goto valid_k7; | 399 | goto valid_k7; |
398 | 400 | ||
399 | /* If we get here, not a certified SMP capable AMD system. */ | 401 | /* If we get here, not a certified SMP capable AMD system. */ |
400 | add_taint(TAINT_UNSAFE_SMP); | 402 | unsafe_smp = 1; |
401 | } | 403 | } |
402 | 404 | ||
403 | valid_k7: | 405 | valid_k7: |
@@ -414,12 +416,10 @@ static void __cpuinit smp_checks(void) | |||
414 | * Don't taint if we are running SMP kernel on a single non-MP | 416 | * Don't taint if we are running SMP kernel on a single non-MP |
415 | * approved Athlon | 417 | * approved Athlon |
416 | */ | 418 | */ |
417 | if (tainted & TAINT_UNSAFE_SMP) { | 419 | if (unsafe_smp && num_online_cpus() > 1) { |
418 | if (num_online_cpus()) | 420 | printk(KERN_INFO "WARNING: This combination of AMD" |
419 | printk(KERN_INFO "WARNING: This combination of AMD" | 421 | "processors is not suitable for SMP.\n"); |
420 | "processors is not suitable for SMP.\n"); | 422 | add_taint(TAINT_UNSAFE_SMP); |
421 | else | ||
422 | tainted &= ~TAINT_UNSAFE_SMP; | ||
423 | } | 423 | } |
424 | } | 424 | } |
425 | 425 | ||
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index d0e940bb6f40..c02343594b4d 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile | |||
@@ -3,10 +3,13 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ | 5 | common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ |
6 | coalesced_mmio.o) | 6 | coalesced_mmio.o irq_comm.o) |
7 | ifeq ($(CONFIG_KVM_TRACE),y) | 7 | ifeq ($(CONFIG_KVM_TRACE),y) |
8 | common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) | 8 | common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) |
9 | endif | 9 | endif |
10 | ifeq ($(CONFIG_DMAR),y) | ||
11 | common-objs += $(addprefix ../../../virt/kvm/, vtd.o) | ||
12 | endif | ||
10 | 13 | ||
11 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm | 14 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm |
12 | 15 | ||
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c0f7872a9124..634132a9a512 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -200,13 +200,14 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) | |||
200 | 200 | ||
201 | if (!atomic_inc_and_test(&pt->pending)) | 201 | if (!atomic_inc_and_test(&pt->pending)) |
202 | set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); | 202 | set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); |
203 | if (vcpu0 && waitqueue_active(&vcpu0->wq)) { | 203 | |
204 | vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 204 | if (vcpu0 && waitqueue_active(&vcpu0->wq)) |
205 | wake_up_interruptible(&vcpu0->wq); | 205 | wake_up_interruptible(&vcpu0->wq); |
206 | } | ||
207 | 206 | ||
208 | pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); | 207 | pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); |
209 | pt->scheduled = ktime_to_ns(pt->timer.expires); | 208 | pt->scheduled = ktime_to_ns(pt->timer.expires); |
209 | if (pt->period) | ||
210 | ps->channels[0].count_load_time = pt->timer.expires; | ||
210 | 211 | ||
211 | return (pt->period == 0 ? 0 : 1); | 212 | return (pt->period == 0 ? 0 : 1); |
212 | } | 213 | } |
@@ -215,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) | |||
215 | { | 216 | { |
216 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; | 217 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; |
217 | 218 | ||
218 | if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending) | 219 | if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack) |
219 | return atomic_read(&pit->pit_state.pit_timer.pending); | 220 | return atomic_read(&pit->pit_state.pit_timer.pending); |
220 | |||
221 | return 0; | 221 | return 0; |
222 | } | 222 | } |
223 | 223 | ||
224 | static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) | ||
225 | { | ||
226 | struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, | ||
227 | irq_ack_notifier); | ||
228 | spin_lock(&ps->inject_lock); | ||
229 | if (atomic_dec_return(&ps->pit_timer.pending) < 0) | ||
230 | atomic_inc(&ps->pit_timer.pending); | ||
231 | ps->irq_ack = 1; | ||
232 | spin_unlock(&ps->inject_lock); | ||
233 | } | ||
234 | |||
224 | static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) | 235 | static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) |
225 | { | 236 | { |
226 | struct kvm_kpit_state *ps; | 237 | struct kvm_kpit_state *ps; |
@@ -255,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt) | |||
255 | hrtimer_cancel(&pt->timer); | 266 | hrtimer_cancel(&pt->timer); |
256 | } | 267 | } |
257 | 268 | ||
258 | static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) | 269 | static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) |
259 | { | 270 | { |
271 | struct kvm_kpit_timer *pt = &ps->pit_timer; | ||
260 | s64 interval; | 272 | s64 interval; |
261 | 273 | ||
262 | interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); | 274 | interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); |
@@ -268,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) | |||
268 | pt->period = (is_period == 0) ? 0 : interval; | 280 | pt->period = (is_period == 0) ? 0 : interval; |
269 | pt->timer.function = pit_timer_fn; | 281 | pt->timer.function = pit_timer_fn; |
270 | atomic_set(&pt->pending, 0); | 282 | atomic_set(&pt->pending, 0); |
283 | ps->irq_ack = 1; | ||
271 | 284 | ||
272 | hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), | 285 | hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), |
273 | HRTIMER_MODE_ABS); | 286 | HRTIMER_MODE_ABS); |
@@ -302,11 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) | |||
302 | case 1: | 315 | case 1: |
303 | /* FIXME: enhance mode 4 precision */ | 316 | /* FIXME: enhance mode 4 precision */ |
304 | case 4: | 317 | case 4: |
305 | create_pit_timer(&ps->pit_timer, val, 0); | 318 | create_pit_timer(ps, val, 0); |
306 | break; | 319 | break; |
307 | case 2: | 320 | case 2: |
308 | case 3: | 321 | case 3: |
309 | create_pit_timer(&ps->pit_timer, val, 1); | 322 | create_pit_timer(ps, val, 1); |
310 | break; | 323 | break; |
311 | default: | 324 | default: |
312 | destroy_pit_timer(&ps->pit_timer); | 325 | destroy_pit_timer(&ps->pit_timer); |
@@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit) | |||
520 | mutex_unlock(&pit->pit_state.lock); | 533 | mutex_unlock(&pit->pit_state.lock); |
521 | 534 | ||
522 | atomic_set(&pit->pit_state.pit_timer.pending, 0); | 535 | atomic_set(&pit->pit_state.pit_timer.pending, 0); |
523 | pit->pit_state.inject_pending = 1; | 536 | pit->pit_state.irq_ack = 1; |
524 | } | 537 | } |
525 | 538 | ||
526 | struct kvm_pit *kvm_create_pit(struct kvm *kvm) | 539 | struct kvm_pit *kvm_create_pit(struct kvm *kvm) |
@@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) | |||
534 | 547 | ||
535 | mutex_init(&pit->pit_state.lock); | 548 | mutex_init(&pit->pit_state.lock); |
536 | mutex_lock(&pit->pit_state.lock); | 549 | mutex_lock(&pit->pit_state.lock); |
550 | spin_lock_init(&pit->pit_state.inject_lock); | ||
537 | 551 | ||
538 | /* Initialize PIO device */ | 552 | /* Initialize PIO device */ |
539 | pit->dev.read = pit_ioport_read; | 553 | pit->dev.read = pit_ioport_read; |
@@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) | |||
555 | pit_state->pit = pit; | 569 | pit_state->pit = pit; |
556 | hrtimer_init(&pit_state->pit_timer.timer, | 570 | hrtimer_init(&pit_state->pit_timer.timer, |
557 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 571 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); |
572 | pit_state->irq_ack_notifier.gsi = 0; | ||
573 | pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; | ||
574 | kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); | ||
558 | mutex_unlock(&pit->pit_state.lock); | 575 | mutex_unlock(&pit->pit_state.lock); |
559 | 576 | ||
560 | kvm_pit_reset(pit); | 577 | kvm_pit_reset(pit); |
@@ -578,10 +595,8 @@ void kvm_free_pit(struct kvm *kvm) | |||
578 | static void __inject_pit_timer_intr(struct kvm *kvm) | 595 | static void __inject_pit_timer_intr(struct kvm *kvm) |
579 | { | 596 | { |
580 | mutex_lock(&kvm->lock); | 597 | mutex_lock(&kvm->lock); |
581 | kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); | 598 | kvm_set_irq(kvm, 0, 1); |
582 | kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0); | 599 | kvm_set_irq(kvm, 0, 0); |
583 | kvm_pic_set_irq(pic_irqchip(kvm), 0, 1); | ||
584 | kvm_pic_set_irq(pic_irqchip(kvm), 0, 0); | ||
585 | mutex_unlock(&kvm->lock); | 600 | mutex_unlock(&kvm->lock); |
586 | } | 601 | } |
587 | 602 | ||
@@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) | |||
592 | struct kvm_kpit_state *ps; | 607 | struct kvm_kpit_state *ps; |
593 | 608 | ||
594 | if (vcpu && pit) { | 609 | if (vcpu && pit) { |
610 | int inject = 0; | ||
595 | ps = &pit->pit_state; | 611 | ps = &pit->pit_state; |
596 | 612 | ||
597 | /* Try to inject pending interrupts when: | 613 | /* Try to inject pending interrupts when |
598 | * 1. Pending exists | 614 | * last one has been acked. |
599 | * 2. Last interrupt was accepted or waited for too long time*/ | 615 | */ |
600 | if (atomic_read(&ps->pit_timer.pending) && | 616 | spin_lock(&ps->inject_lock); |
601 | (ps->inject_pending || | 617 | if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { |
602 | (jiffies - ps->last_injected_time | 618 | ps->irq_ack = 0; |
603 | >= KVM_MAX_PIT_INTR_INTERVAL))) { | 619 | inject = 1; |
604 | ps->inject_pending = 0; | ||
605 | __inject_pit_timer_intr(kvm); | ||
606 | ps->last_injected_time = jiffies; | ||
607 | } | ||
608 | } | ||
609 | } | ||
610 | |||
611 | void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec) | ||
612 | { | ||
613 | struct kvm_arch *arch = &vcpu->kvm->arch; | ||
614 | struct kvm_kpit_state *ps; | ||
615 | |||
616 | if (vcpu && arch->vpit) { | ||
617 | ps = &arch->vpit->pit_state; | ||
618 | if (atomic_read(&ps->pit_timer.pending) && | ||
619 | (((arch->vpic->pics[0].imr & 1) == 0 && | ||
620 | arch->vpic->pics[0].irq_base == vec) || | ||
621 | (arch->vioapic->redirtbl[0].fields.vector == vec && | ||
622 | arch->vioapic->redirtbl[0].fields.mask != 1))) { | ||
623 | ps->inject_pending = 1; | ||
624 | atomic_dec(&ps->pit_timer.pending); | ||
625 | ps->channels[0].count_load_time = ktime_get(); | ||
626 | } | 620 | } |
621 | spin_unlock(&ps->inject_lock); | ||
622 | if (inject) | ||
623 | __inject_pit_timer_intr(kvm); | ||
627 | } | 624 | } |
628 | } | 625 | } |
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index db25c2a6c8c4..e436d4983aa1 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h | |||
@@ -8,7 +8,6 @@ struct kvm_kpit_timer { | |||
8 | int irq; | 8 | int irq; |
9 | s64 period; /* unit: ns */ | 9 | s64 period; /* unit: ns */ |
10 | s64 scheduled; | 10 | s64 scheduled; |
11 | ktime_t last_update; | ||
12 | atomic_t pending; | 11 | atomic_t pending; |
13 | }; | 12 | }; |
14 | 13 | ||
@@ -34,8 +33,9 @@ struct kvm_kpit_state { | |||
34 | u32 speaker_data_on; | 33 | u32 speaker_data_on; |
35 | struct mutex lock; | 34 | struct mutex lock; |
36 | struct kvm_pit *pit; | 35 | struct kvm_pit *pit; |
37 | bool inject_pending; /* if inject pending interrupts */ | 36 | spinlock_t inject_lock; |
38 | unsigned long last_injected_time; | 37 | unsigned long irq_ack; |
38 | struct kvm_irq_ack_notifier irq_ack_notifier; | ||
39 | }; | 39 | }; |
40 | 40 | ||
41 | struct kvm_pit { | 41 | struct kvm_pit { |
@@ -54,7 +54,6 @@ struct kvm_pit { | |||
54 | #define KVM_PIT_CHANNEL_MASK 0x3 | 54 | #define KVM_PIT_CHANNEL_MASK 0x3 |
55 | 55 | ||
56 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); | 56 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); |
57 | void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | ||
58 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); | 57 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); |
59 | struct kvm_pit *kvm_create_pit(struct kvm *kvm); | 58 | struct kvm_pit *kvm_create_pit(struct kvm *kvm); |
60 | void kvm_free_pit(struct kvm *kvm); | 59 | void kvm_free_pit(struct kvm *kvm); |
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index c31164e8aa46..17e41e165f1a 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
@@ -30,6 +30,19 @@ | |||
30 | 30 | ||
31 | #include <linux/kvm_host.h> | 31 | #include <linux/kvm_host.h> |
32 | 32 | ||
33 | static void pic_clear_isr(struct kvm_kpic_state *s, int irq) | ||
34 | { | ||
35 | s->isr &= ~(1 << irq); | ||
36 | s->isr_ack |= (1 << irq); | ||
37 | } | ||
38 | |||
39 | void kvm_pic_clear_isr_ack(struct kvm *kvm) | ||
40 | { | ||
41 | struct kvm_pic *s = pic_irqchip(kvm); | ||
42 | s->pics[0].isr_ack = 0xff; | ||
43 | s->pics[1].isr_ack = 0xff; | ||
44 | } | ||
45 | |||
33 | /* | 46 | /* |
34 | * set irq level. If an edge is detected, then the IRR is set to 1 | 47 | * set irq level. If an edge is detected, then the IRR is set to 1 |
35 | */ | 48 | */ |
@@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level) | |||
141 | */ | 154 | */ |
142 | static inline void pic_intack(struct kvm_kpic_state *s, int irq) | 155 | static inline void pic_intack(struct kvm_kpic_state *s, int irq) |
143 | { | 156 | { |
157 | s->isr |= 1 << irq; | ||
144 | if (s->auto_eoi) { | 158 | if (s->auto_eoi) { |
145 | if (s->rotate_on_auto_eoi) | 159 | if (s->rotate_on_auto_eoi) |
146 | s->priority_add = (irq + 1) & 7; | 160 | s->priority_add = (irq + 1) & 7; |
147 | } else | 161 | pic_clear_isr(s, irq); |
148 | s->isr |= (1 << irq); | 162 | } |
149 | /* | 163 | /* |
150 | * We don't clear a level sensitive interrupt here | 164 | * We don't clear a level sensitive interrupt here |
151 | */ | 165 | */ |
@@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq) | |||
153 | s->irr &= ~(1 << irq); | 167 | s->irr &= ~(1 << irq); |
154 | } | 168 | } |
155 | 169 | ||
156 | int kvm_pic_read_irq(struct kvm_pic *s) | 170 | int kvm_pic_read_irq(struct kvm *kvm) |
157 | { | 171 | { |
158 | int irq, irq2, intno; | 172 | int irq, irq2, intno; |
173 | struct kvm_pic *s = pic_irqchip(kvm); | ||
159 | 174 | ||
160 | irq = pic_get_irq(&s->pics[0]); | 175 | irq = pic_get_irq(&s->pics[0]); |
161 | if (irq >= 0) { | 176 | if (irq >= 0) { |
@@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s) | |||
181 | intno = s->pics[0].irq_base + irq; | 196 | intno = s->pics[0].irq_base + irq; |
182 | } | 197 | } |
183 | pic_update_irq(s); | 198 | pic_update_irq(s); |
199 | kvm_notify_acked_irq(kvm, irq); | ||
184 | 200 | ||
185 | return intno; | 201 | return intno; |
186 | } | 202 | } |
187 | 203 | ||
188 | void kvm_pic_reset(struct kvm_kpic_state *s) | 204 | void kvm_pic_reset(struct kvm_kpic_state *s) |
189 | { | 205 | { |
206 | int irq, irqbase; | ||
207 | struct kvm *kvm = s->pics_state->irq_request_opaque; | ||
208 | struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; | ||
209 | |||
210 | if (s == &s->pics_state->pics[0]) | ||
211 | irqbase = 0; | ||
212 | else | ||
213 | irqbase = 8; | ||
214 | |||
215 | for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { | ||
216 | if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) | ||
217 | if (s->irr & (1 << irq) || s->isr & (1 << irq)) | ||
218 | kvm_notify_acked_irq(kvm, irq+irqbase); | ||
219 | } | ||
190 | s->last_irr = 0; | 220 | s->last_irr = 0; |
191 | s->irr = 0; | 221 | s->irr = 0; |
192 | s->imr = 0; | 222 | s->imr = 0; |
193 | s->isr = 0; | 223 | s->isr = 0; |
224 | s->isr_ack = 0xff; | ||
194 | s->priority_add = 0; | 225 | s->priority_add = 0; |
195 | s->irq_base = 0; | 226 | s->irq_base = 0; |
196 | s->read_reg_select = 0; | 227 | s->read_reg_select = 0; |
@@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) | |||
243 | priority = get_priority(s, s->isr); | 274 | priority = get_priority(s, s->isr); |
244 | if (priority != 8) { | 275 | if (priority != 8) { |
245 | irq = (priority + s->priority_add) & 7; | 276 | irq = (priority + s->priority_add) & 7; |
246 | s->isr &= ~(1 << irq); | 277 | pic_clear_isr(s, irq); |
247 | if (cmd == 5) | 278 | if (cmd == 5) |
248 | s->priority_add = (irq + 1) & 7; | 279 | s->priority_add = (irq + 1) & 7; |
249 | pic_update_irq(s->pics_state); | 280 | pic_update_irq(s->pics_state); |
@@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) | |||
251 | break; | 282 | break; |
252 | case 3: | 283 | case 3: |
253 | irq = val & 7; | 284 | irq = val & 7; |
254 | s->isr &= ~(1 << irq); | 285 | pic_clear_isr(s, irq); |
255 | pic_update_irq(s->pics_state); | 286 | pic_update_irq(s->pics_state); |
256 | break; | 287 | break; |
257 | case 6: | 288 | case 6: |
@@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) | |||
260 | break; | 291 | break; |
261 | case 7: | 292 | case 7: |
262 | irq = val & 7; | 293 | irq = val & 7; |
263 | s->isr &= ~(1 << irq); | ||
264 | s->priority_add = (irq + 1) & 7; | 294 | s->priority_add = (irq + 1) & 7; |
295 | pic_clear_isr(s, irq); | ||
265 | pic_update_irq(s->pics_state); | 296 | pic_update_irq(s->pics_state); |
266 | break; | 297 | break; |
267 | default: | 298 | default: |
@@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) | |||
303 | s->pics_state->pics[0].irr &= ~(1 << 2); | 334 | s->pics_state->pics[0].irr &= ~(1 << 2); |
304 | } | 335 | } |
305 | s->irr &= ~(1 << ret); | 336 | s->irr &= ~(1 << ret); |
306 | s->isr &= ~(1 << ret); | 337 | pic_clear_isr(s, ret); |
307 | if (addr1 >> 7 || ret != 2) | 338 | if (addr1 >> 7 || ret != 2) |
308 | pic_update_irq(s->pics_state); | 339 | pic_update_irq(s->pics_state); |
309 | } else { | 340 | } else { |
@@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level) | |||
422 | { | 453 | { |
423 | struct kvm *kvm = opaque; | 454 | struct kvm *kvm = opaque; |
424 | struct kvm_vcpu *vcpu = kvm->vcpus[0]; | 455 | struct kvm_vcpu *vcpu = kvm->vcpus[0]; |
456 | struct kvm_pic *s = pic_irqchip(kvm); | ||
457 | int irq = pic_get_irq(&s->pics[0]); | ||
425 | 458 | ||
426 | pic_irqchip(kvm)->output = level; | 459 | s->output = level; |
427 | if (vcpu) | 460 | if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { |
461 | s->pics[0].isr_ack &= ~(1 << irq); | ||
428 | kvm_vcpu_kick(vcpu); | 462 | kvm_vcpu_kick(vcpu); |
463 | } | ||
429 | } | 464 | } |
430 | 465 | ||
431 | struct kvm_pic *kvm_create_pic(struct kvm *kvm) | 466 | struct kvm_pic *kvm_create_pic(struct kvm *kvm) |
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 76d736b5f664..c019b8edcdb7 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c | |||
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) | |||
72 | if (kvm_apic_accept_pic_intr(v)) { | 72 | if (kvm_apic_accept_pic_intr(v)) { |
73 | s = pic_irqchip(v->kvm); | 73 | s = pic_irqchip(v->kvm); |
74 | s->output = 0; /* PIC */ | 74 | s->output = 0; /* PIC */ |
75 | vector = kvm_pic_read_irq(s); | 75 | vector = kvm_pic_read_irq(v->kvm); |
76 | } | 76 | } |
77 | } | 77 | } |
78 | return vector; | 78 | return vector; |
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); | |||
90 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) | 90 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) |
91 | { | 91 | { |
92 | kvm_apic_timer_intr_post(vcpu, vec); | 92 | kvm_apic_timer_intr_post(vcpu, vec); |
93 | kvm_pit_timer_intr_post(vcpu, vec); | ||
94 | /* TODO: PIT, RTC etc. */ | 93 | /* TODO: PIT, RTC etc. */ |
95 | } | 94 | } |
96 | EXPORT_SYMBOL_GPL(kvm_timer_intr_post); | 95 | EXPORT_SYMBOL_GPL(kvm_timer_intr_post); |
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 7ca47cbb48bb..f17c8f5bbf31 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h | |||
@@ -42,6 +42,7 @@ struct kvm_kpic_state { | |||
42 | u8 irr; /* interrupt request register */ | 42 | u8 irr; /* interrupt request register */ |
43 | u8 imr; /* interrupt mask register */ | 43 | u8 imr; /* interrupt mask register */ |
44 | u8 isr; /* interrupt service register */ | 44 | u8 isr; /* interrupt service register */ |
45 | u8 isr_ack; /* interrupt ack detection */ | ||
45 | u8 priority_add; /* highest irq priority */ | 46 | u8 priority_add; /* highest irq priority */ |
46 | u8 irq_base; | 47 | u8 irq_base; |
47 | u8 read_reg_select; | 48 | u8 read_reg_select; |
@@ -63,12 +64,13 @@ struct kvm_pic { | |||
63 | void *irq_request_opaque; | 64 | void *irq_request_opaque; |
64 | int output; /* intr from master PIC */ | 65 | int output; /* intr from master PIC */ |
65 | struct kvm_io_device dev; | 66 | struct kvm_io_device dev; |
67 | void (*ack_notifier)(void *opaque, int irq); | ||
66 | }; | 68 | }; |
67 | 69 | ||
68 | struct kvm_pic *kvm_create_pic(struct kvm *kvm); | 70 | struct kvm_pic *kvm_create_pic(struct kvm *kvm); |
69 | void kvm_pic_set_irq(void *opaque, int irq, int level); | 71 | int kvm_pic_read_irq(struct kvm *kvm); |
70 | int kvm_pic_read_irq(struct kvm_pic *s); | ||
71 | void kvm_pic_update_irq(struct kvm_pic *s); | 72 | void kvm_pic_update_irq(struct kvm_pic *s); |
73 | void kvm_pic_clear_isr_ack(struct kvm *kvm); | ||
72 | 74 | ||
73 | static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) | 75 | static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) |
74 | { | 76 | { |
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h new file mode 100644 index 000000000000..1ff819dce7d3 --- /dev/null +++ b/arch/x86/kvm/kvm_cache_regs.h | |||
@@ -0,0 +1,32 @@ | |||
1 | #ifndef ASM_KVM_CACHE_REGS_H | ||
2 | #define ASM_KVM_CACHE_REGS_H | ||
3 | |||
4 | static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, | ||
5 | enum kvm_reg reg) | ||
6 | { | ||
7 | if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) | ||
8 | kvm_x86_ops->cache_reg(vcpu, reg); | ||
9 | |||
10 | return vcpu->arch.regs[reg]; | ||
11 | } | ||
12 | |||
13 | static inline void kvm_register_write(struct kvm_vcpu *vcpu, | ||
14 | enum kvm_reg reg, | ||
15 | unsigned long val) | ||
16 | { | ||
17 | vcpu->arch.regs[reg] = val; | ||
18 | __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); | ||
19 | __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); | ||
20 | } | ||
21 | |||
22 | static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) | ||
23 | { | ||
24 | return kvm_register_read(vcpu, VCPU_REGS_RIP); | ||
25 | } | ||
26 | |||
27 | static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) | ||
28 | { | ||
29 | kvm_register_write(vcpu, VCPU_REGS_RIP, val); | ||
30 | } | ||
31 | |||
32 | #endif | ||
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 73f43de69f67..6571926bfd33 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <asm/current.h> | 32 | #include <asm/current.h> |
33 | #include <asm/apicdef.h> | 33 | #include <asm/apicdef.h> |
34 | #include <asm/atomic.h> | 34 | #include <asm/atomic.h> |
35 | #include "kvm_cache_regs.h" | ||
35 | #include "irq.h" | 36 | #include "irq.h" |
36 | 37 | ||
37 | #define PRId64 "d" | 38 | #define PRId64 "d" |
@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
338 | } else | 339 | } else |
339 | apic_clear_vector(vector, apic->regs + APIC_TMR); | 340 | apic_clear_vector(vector, apic->regs + APIC_TMR); |
340 | 341 | ||
341 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) | 342 | kvm_vcpu_kick(vcpu); |
342 | kvm_vcpu_kick(vcpu); | ||
343 | else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) { | ||
344 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
345 | if (waitqueue_active(&vcpu->wq)) | ||
346 | wake_up_interruptible(&vcpu->wq); | ||
347 | } | ||
348 | 343 | ||
349 | result = (orig_irr == 0); | 344 | result = (orig_irr == 0); |
350 | break; | 345 | break; |
@@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
370 | vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; | 365 | vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; |
371 | kvm_vcpu_kick(vcpu); | 366 | kvm_vcpu_kick(vcpu); |
372 | } else { | 367 | } else { |
373 | printk(KERN_DEBUG | 368 | apic_debug("Ignoring de-assert INIT to vcpu %d\n", |
374 | "Ignoring de-assert INIT to vcpu %d\n", | 369 | vcpu->vcpu_id); |
375 | vcpu->vcpu_id); | ||
376 | } | 370 | } |
377 | |||
378 | break; | 371 | break; |
379 | 372 | ||
380 | case APIC_DM_STARTUP: | 373 | case APIC_DM_STARTUP: |
381 | printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", | 374 | apic_debug("SIPI to vcpu %d vector 0x%02x\n", |
382 | vcpu->vcpu_id, vector); | 375 | vcpu->vcpu_id, vector); |
383 | if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { | 376 | if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { |
384 | vcpu->arch.sipi_vector = vector; | 377 | vcpu->arch.sipi_vector = vector; |
385 | vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; | 378 | vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; |
386 | if (waitqueue_active(&vcpu->wq)) | 379 | kvm_vcpu_kick(vcpu); |
387 | wake_up_interruptible(&vcpu->wq); | ||
388 | } | 380 | } |
389 | break; | 381 | break; |
390 | 382 | ||
@@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, | |||
438 | static void apic_set_eoi(struct kvm_lapic *apic) | 430 | static void apic_set_eoi(struct kvm_lapic *apic) |
439 | { | 431 | { |
440 | int vector = apic_find_highest_isr(apic); | 432 | int vector = apic_find_highest_isr(apic); |
441 | 433 | int trigger_mode; | |
442 | /* | 434 | /* |
443 | * Not every write EOI will has corresponding ISR, | 435 | * Not every write EOI will has corresponding ISR, |
444 | * one example is when Kernel check timer on setup_IO_APIC | 436 | * one example is when Kernel check timer on setup_IO_APIC |
@@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic) | |||
450 | apic_update_ppr(apic); | 442 | apic_update_ppr(apic); |
451 | 443 | ||
452 | if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) | 444 | if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) |
453 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); | 445 | trigger_mode = IOAPIC_LEVEL_TRIG; |
446 | else | ||
447 | trigger_mode = IOAPIC_EDGE_TRIG; | ||
448 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); | ||
454 | } | 449 | } |
455 | 450 | ||
456 | static void apic_send_ipi(struct kvm_lapic *apic) | 451 | static void apic_send_ipi(struct kvm_lapic *apic) |
@@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) | |||
558 | struct kvm_run *run = vcpu->run; | 553 | struct kvm_run *run = vcpu->run; |
559 | 554 | ||
560 | set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); | 555 | set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); |
561 | kvm_x86_ops->cache_regs(vcpu); | 556 | run->tpr_access.rip = kvm_rip_read(vcpu); |
562 | run->tpr_access.rip = vcpu->arch.rip; | ||
563 | run->tpr_access.is_write = write; | 557 | run->tpr_access.is_write = write; |
564 | } | 558 | } |
565 | 559 | ||
@@ -683,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
683 | * Refer SDM 8.4.1 | 677 | * Refer SDM 8.4.1 |
684 | */ | 678 | */ |
685 | if (len != 4 || alignment) { | 679 | if (len != 4 || alignment) { |
686 | if (printk_ratelimit()) | 680 | /* Don't shout loud, $infamous_os would cause only noise. */ |
687 | printk(KERN_ERR "apic write: bad size=%d %lx\n", | 681 | apic_debug("apic write: bad size=%d %lx\n", |
688 | len, (long)address); | 682 | len, (long)address); |
689 | return; | 683 | return; |
690 | } | 684 | } |
691 | 685 | ||
@@ -947,10 +941,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic) | |||
947 | 941 | ||
948 | if(!atomic_inc_and_test(&apic->timer.pending)) | 942 | if(!atomic_inc_and_test(&apic->timer.pending)) |
949 | set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); | 943 | set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); |
950 | if (waitqueue_active(q)) { | 944 | if (waitqueue_active(q)) |
951 | apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
952 | wake_up_interruptible(q); | 945 | wake_up_interruptible(q); |
953 | } | 946 | |
954 | if (apic_lvtt_period(apic)) { | 947 | if (apic_lvtt_period(apic)) { |
955 | result = 1; | 948 | result = 1; |
956 | apic->timer.dev.expires = ktime_add_ns( | 949 | apic->timer.dev.expires = ktime_add_ns( |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3da2508eb22a..99c239c5c0ac 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -70,6 +70,9 @@ static int dbg = 0; | |||
70 | module_param(dbg, bool, 0644); | 70 | module_param(dbg, bool, 0644); |
71 | #endif | 71 | #endif |
72 | 72 | ||
73 | static int oos_shadow = 1; | ||
74 | module_param(oos_shadow, bool, 0644); | ||
75 | |||
73 | #ifndef MMU_DEBUG | 76 | #ifndef MMU_DEBUG |
74 | #define ASSERT(x) do { } while (0) | 77 | #define ASSERT(x) do { } while (0) |
75 | #else | 78 | #else |
@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644); | |||
135 | #define ACC_USER_MASK PT_USER_MASK | 138 | #define ACC_USER_MASK PT_USER_MASK |
136 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) | 139 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) |
137 | 140 | ||
138 | struct kvm_pv_mmu_op_buffer { | 141 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
139 | void *ptr; | ||
140 | unsigned len; | ||
141 | unsigned processed; | ||
142 | char buf[512] __aligned(sizeof(long)); | ||
143 | }; | ||
144 | 142 | ||
145 | struct kvm_rmap_desc { | 143 | struct kvm_rmap_desc { |
146 | u64 *shadow_ptes[RMAP_EXT]; | 144 | u64 *shadow_ptes[RMAP_EXT]; |
147 | struct kvm_rmap_desc *more; | 145 | struct kvm_rmap_desc *more; |
148 | }; | 146 | }; |
149 | 147 | ||
148 | struct kvm_shadow_walk { | ||
149 | int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, | ||
150 | u64 addr, u64 *spte, int level); | ||
151 | }; | ||
152 | |||
153 | struct kvm_unsync_walk { | ||
154 | int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); | ||
155 | }; | ||
156 | |||
157 | typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); | ||
158 | |||
150 | static struct kmem_cache *pte_chain_cache; | 159 | static struct kmem_cache *pte_chain_cache; |
151 | static struct kmem_cache *rmap_desc_cache; | 160 | static struct kmem_cache *rmap_desc_cache; |
152 | static struct kmem_cache *mmu_page_header_cache; | 161 | static struct kmem_cache *mmu_page_header_cache; |
@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) | |||
405 | { | 414 | { |
406 | struct vm_area_struct *vma; | 415 | struct vm_area_struct *vma; |
407 | unsigned long addr; | 416 | unsigned long addr; |
417 | int ret = 0; | ||
408 | 418 | ||
409 | addr = gfn_to_hva(kvm, gfn); | 419 | addr = gfn_to_hva(kvm, gfn); |
410 | if (kvm_is_error_hva(addr)) | 420 | if (kvm_is_error_hva(addr)) |
411 | return 0; | 421 | return ret; |
412 | 422 | ||
423 | down_read(&current->mm->mmap_sem); | ||
413 | vma = find_vma(current->mm, addr); | 424 | vma = find_vma(current->mm, addr); |
414 | if (vma && is_vm_hugetlb_page(vma)) | 425 | if (vma && is_vm_hugetlb_page(vma)) |
415 | return 1; | 426 | ret = 1; |
427 | up_read(&current->mm->mmap_sem); | ||
416 | 428 | ||
417 | return 0; | 429 | return ret; |
418 | } | 430 | } |
419 | 431 | ||
420 | static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) | 432 | static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) |
@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
649 | 661 | ||
650 | if (write_protected) | 662 | if (write_protected) |
651 | kvm_flush_remote_tlbs(kvm); | 663 | kvm_flush_remote_tlbs(kvm); |
652 | |||
653 | account_shadowed(kvm, gfn); | ||
654 | } | 664 | } |
655 | 665 | ||
656 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) | 666 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) |
@@ -859,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, | |||
859 | BUG(); | 869 | BUG(); |
860 | } | 870 | } |
861 | 871 | ||
872 | |||
873 | static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | ||
874 | mmu_parent_walk_fn fn) | ||
875 | { | ||
876 | struct kvm_pte_chain *pte_chain; | ||
877 | struct hlist_node *node; | ||
878 | struct kvm_mmu_page *parent_sp; | ||
879 | int i; | ||
880 | |||
881 | if (!sp->multimapped && sp->parent_pte) { | ||
882 | parent_sp = page_header(__pa(sp->parent_pte)); | ||
883 | fn(vcpu, parent_sp); | ||
884 | mmu_parent_walk(vcpu, parent_sp, fn); | ||
885 | return; | ||
886 | } | ||
887 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | ||
888 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
889 | if (!pte_chain->parent_ptes[i]) | ||
890 | break; | ||
891 | parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); | ||
892 | fn(vcpu, parent_sp); | ||
893 | mmu_parent_walk(vcpu, parent_sp, fn); | ||
894 | } | ||
895 | } | ||
896 | |||
897 | static void kvm_mmu_update_unsync_bitmap(u64 *spte) | ||
898 | { | ||
899 | unsigned int index; | ||
900 | struct kvm_mmu_page *sp = page_header(__pa(spte)); | ||
901 | |||
902 | index = spte - sp->spt; | ||
903 | __set_bit(index, sp->unsync_child_bitmap); | ||
904 | sp->unsync_children = 1; | ||
905 | } | ||
906 | |||
907 | static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) | ||
908 | { | ||
909 | struct kvm_pte_chain *pte_chain; | ||
910 | struct hlist_node *node; | ||
911 | int i; | ||
912 | |||
913 | if (!sp->parent_pte) | ||
914 | return; | ||
915 | |||
916 | if (!sp->multimapped) { | ||
917 | kvm_mmu_update_unsync_bitmap(sp->parent_pte); | ||
918 | return; | ||
919 | } | ||
920 | |||
921 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | ||
922 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
923 | if (!pte_chain->parent_ptes[i]) | ||
924 | break; | ||
925 | kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); | ||
926 | } | ||
927 | } | ||
928 | |||
929 | static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | ||
930 | { | ||
931 | sp->unsync_children = 1; | ||
932 | kvm_mmu_update_parents_unsync(sp); | ||
933 | return 1; | ||
934 | } | ||
935 | |||
936 | static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, | ||
937 | struct kvm_mmu_page *sp) | ||
938 | { | ||
939 | mmu_parent_walk(vcpu, sp, unsync_walk_fn); | ||
940 | kvm_mmu_update_parents_unsync(sp); | ||
941 | } | ||
942 | |||
862 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | 943 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, |
863 | struct kvm_mmu_page *sp) | 944 | struct kvm_mmu_page *sp) |
864 | { | 945 | { |
@@ -868,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | |||
868 | sp->spt[i] = shadow_trap_nonpresent_pte; | 949 | sp->spt[i] = shadow_trap_nonpresent_pte; |
869 | } | 950 | } |
870 | 951 | ||
952 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, | ||
953 | struct kvm_mmu_page *sp) | ||
954 | { | ||
955 | return 1; | ||
956 | } | ||
957 | |||
958 | static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) | ||
959 | { | ||
960 | } | ||
961 | |||
962 | #define for_each_unsync_children(bitmap, idx) \ | ||
963 | for (idx = find_first_bit(bitmap, 512); \ | ||
964 | idx < 512; \ | ||
965 | idx = find_next_bit(bitmap, 512, idx+1)) | ||
966 | |||
967 | static int mmu_unsync_walk(struct kvm_mmu_page *sp, | ||
968 | struct kvm_unsync_walk *walker) | ||
969 | { | ||
970 | int i, ret; | ||
971 | |||
972 | if (!sp->unsync_children) | ||
973 | return 0; | ||
974 | |||
975 | for_each_unsync_children(sp->unsync_child_bitmap, i) { | ||
976 | u64 ent = sp->spt[i]; | ||
977 | |||
978 | if (is_shadow_present_pte(ent)) { | ||
979 | struct kvm_mmu_page *child; | ||
980 | child = page_header(ent & PT64_BASE_ADDR_MASK); | ||
981 | |||
982 | if (child->unsync_children) { | ||
983 | ret = mmu_unsync_walk(child, walker); | ||
984 | if (ret) | ||
985 | return ret; | ||
986 | __clear_bit(i, sp->unsync_child_bitmap); | ||
987 | } | ||
988 | |||
989 | if (child->unsync) { | ||
990 | ret = walker->entry(child, walker); | ||
991 | __clear_bit(i, sp->unsync_child_bitmap); | ||
992 | if (ret) | ||
993 | return ret; | ||
994 | } | ||
995 | } | ||
996 | } | ||
997 | |||
998 | if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) | ||
999 | sp->unsync_children = 0; | ||
1000 | |||
1001 | return 0; | ||
1002 | } | ||
1003 | |||
871 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | 1004 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) |
872 | { | 1005 | { |
873 | unsigned index; | 1006 | unsigned index; |
@@ -888,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | |||
888 | return NULL; | 1021 | return NULL; |
889 | } | 1022 | } |
890 | 1023 | ||
1024 | static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1025 | { | ||
1026 | WARN_ON(!sp->unsync); | ||
1027 | sp->unsync = 0; | ||
1028 | --kvm->stat.mmu_unsync; | ||
1029 | } | ||
1030 | |||
1031 | static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); | ||
1032 | |||
1033 | static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | ||
1034 | { | ||
1035 | if (sp->role.glevels != vcpu->arch.mmu.root_level) { | ||
1036 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1037 | return 1; | ||
1038 | } | ||
1039 | |||
1040 | rmap_write_protect(vcpu->kvm, sp->gfn); | ||
1041 | if (vcpu->arch.mmu.sync_page(vcpu, sp)) { | ||
1042 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1043 | return 1; | ||
1044 | } | ||
1045 | |||
1046 | kvm_mmu_flush_tlb(vcpu); | ||
1047 | kvm_unlink_unsync_page(vcpu->kvm, sp); | ||
1048 | return 0; | ||
1049 | } | ||
1050 | |||
1051 | struct sync_walker { | ||
1052 | struct kvm_vcpu *vcpu; | ||
1053 | struct kvm_unsync_walk walker; | ||
1054 | }; | ||
1055 | |||
1056 | static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) | ||
1057 | { | ||
1058 | struct sync_walker *sync_walk = container_of(walk, struct sync_walker, | ||
1059 | walker); | ||
1060 | struct kvm_vcpu *vcpu = sync_walk->vcpu; | ||
1061 | |||
1062 | kvm_sync_page(vcpu, sp); | ||
1063 | return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)); | ||
1064 | } | ||
1065 | |||
1066 | static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | ||
1067 | { | ||
1068 | struct sync_walker walker = { | ||
1069 | .walker = { .entry = mmu_sync_fn, }, | ||
1070 | .vcpu = vcpu, | ||
1071 | }; | ||
1072 | |||
1073 | while (mmu_unsync_walk(sp, &walker.walker)) | ||
1074 | cond_resched_lock(&vcpu->kvm->mmu_lock); | ||
1075 | } | ||
1076 | |||
891 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | 1077 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, |
892 | gfn_t gfn, | 1078 | gfn_t gfn, |
893 | gva_t gaddr, | 1079 | gva_t gaddr, |
@@ -901,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
901 | unsigned quadrant; | 1087 | unsigned quadrant; |
902 | struct hlist_head *bucket; | 1088 | struct hlist_head *bucket; |
903 | struct kvm_mmu_page *sp; | 1089 | struct kvm_mmu_page *sp; |
904 | struct hlist_node *node; | 1090 | struct hlist_node *node, *tmp; |
905 | 1091 | ||
906 | role.word = 0; | 1092 | role.word = 0; |
907 | role.glevels = vcpu->arch.mmu.root_level; | 1093 | role.glevels = vcpu->arch.mmu.root_level; |
@@ -917,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
917 | gfn, role.word); | 1103 | gfn, role.word); |
918 | index = kvm_page_table_hashfn(gfn); | 1104 | index = kvm_page_table_hashfn(gfn); |
919 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | 1105 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
920 | hlist_for_each_entry(sp, node, bucket, hash_link) | 1106 | hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) |
921 | if (sp->gfn == gfn && sp->role.word == role.word) { | 1107 | if (sp->gfn == gfn) { |
1108 | if (sp->unsync) | ||
1109 | if (kvm_sync_page(vcpu, sp)) | ||
1110 | continue; | ||
1111 | |||
1112 | if (sp->role.word != role.word) | ||
1113 | continue; | ||
1114 | |||
922 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | 1115 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); |
1116 | if (sp->unsync_children) { | ||
1117 | set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); | ||
1118 | kvm_mmu_mark_parents_unsync(vcpu, sp); | ||
1119 | } | ||
923 | pgprintk("%s: found\n", __func__); | 1120 | pgprintk("%s: found\n", __func__); |
924 | return sp; | 1121 | return sp; |
925 | } | 1122 | } |
@@ -931,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
931 | sp->gfn = gfn; | 1128 | sp->gfn = gfn; |
932 | sp->role = role; | 1129 | sp->role = role; |
933 | hlist_add_head(&sp->hash_link, bucket); | 1130 | hlist_add_head(&sp->hash_link, bucket); |
934 | if (!metaphysical) | 1131 | if (!metaphysical) { |
935 | rmap_write_protect(vcpu->kvm, gfn); | 1132 | rmap_write_protect(vcpu->kvm, gfn); |
1133 | account_shadowed(vcpu->kvm, gfn); | ||
1134 | } | ||
936 | if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) | 1135 | if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) |
937 | vcpu->arch.mmu.prefetch_page(vcpu, sp); | 1136 | vcpu->arch.mmu.prefetch_page(vcpu, sp); |
938 | else | 1137 | else |
@@ -940,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
940 | return sp; | 1139 | return sp; |
941 | } | 1140 | } |
942 | 1141 | ||
1142 | static int walk_shadow(struct kvm_shadow_walk *walker, | ||
1143 | struct kvm_vcpu *vcpu, u64 addr) | ||
1144 | { | ||
1145 | hpa_t shadow_addr; | ||
1146 | int level; | ||
1147 | int r; | ||
1148 | u64 *sptep; | ||
1149 | unsigned index; | ||
1150 | |||
1151 | shadow_addr = vcpu->arch.mmu.root_hpa; | ||
1152 | level = vcpu->arch.mmu.shadow_root_level; | ||
1153 | if (level == PT32E_ROOT_LEVEL) { | ||
1154 | shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | ||
1155 | shadow_addr &= PT64_BASE_ADDR_MASK; | ||
1156 | --level; | ||
1157 | } | ||
1158 | |||
1159 | while (level >= PT_PAGE_TABLE_LEVEL) { | ||
1160 | index = SHADOW_PT_INDEX(addr, level); | ||
1161 | sptep = ((u64 *)__va(shadow_addr)) + index; | ||
1162 | r = walker->entry(walker, vcpu, addr, sptep, level); | ||
1163 | if (r) | ||
1164 | return r; | ||
1165 | shadow_addr = *sptep & PT64_BASE_ADDR_MASK; | ||
1166 | --level; | ||
1167 | } | ||
1168 | return 0; | ||
1169 | } | ||
1170 | |||
943 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, | 1171 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, |
944 | struct kvm_mmu_page *sp) | 1172 | struct kvm_mmu_page *sp) |
945 | { | 1173 | { |
@@ -955,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, | |||
955 | rmap_remove(kvm, &pt[i]); | 1183 | rmap_remove(kvm, &pt[i]); |
956 | pt[i] = shadow_trap_nonpresent_pte; | 1184 | pt[i] = shadow_trap_nonpresent_pte; |
957 | } | 1185 | } |
958 | kvm_flush_remote_tlbs(kvm); | ||
959 | return; | 1186 | return; |
960 | } | 1187 | } |
961 | 1188 | ||
@@ -974,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, | |||
974 | } | 1201 | } |
975 | pt[i] = shadow_trap_nonpresent_pte; | 1202 | pt[i] = shadow_trap_nonpresent_pte; |
976 | } | 1203 | } |
977 | kvm_flush_remote_tlbs(kvm); | ||
978 | } | 1204 | } |
979 | 1205 | ||
980 | static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) | 1206 | static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) |
@@ -991,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) | |||
991 | kvm->vcpus[i]->arch.last_pte_updated = NULL; | 1217 | kvm->vcpus[i]->arch.last_pte_updated = NULL; |
992 | } | 1218 | } |
993 | 1219 | ||
994 | static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1220 | static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) |
995 | { | 1221 | { |
996 | u64 *parent_pte; | 1222 | u64 *parent_pte; |
997 | 1223 | ||
998 | ++kvm->stat.mmu_shadow_zapped; | ||
999 | while (sp->multimapped || sp->parent_pte) { | 1224 | while (sp->multimapped || sp->parent_pte) { |
1000 | if (!sp->multimapped) | 1225 | if (!sp->multimapped) |
1001 | parent_pte = sp->parent_pte; | 1226 | parent_pte = sp->parent_pte; |
@@ -1010,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
1010 | kvm_mmu_put_page(sp, parent_pte); | 1235 | kvm_mmu_put_page(sp, parent_pte); |
1011 | set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); | 1236 | set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); |
1012 | } | 1237 | } |
1238 | } | ||
1239 | |||
1240 | struct zap_walker { | ||
1241 | struct kvm_unsync_walk walker; | ||
1242 | struct kvm *kvm; | ||
1243 | int zapped; | ||
1244 | }; | ||
1245 | |||
1246 | static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) | ||
1247 | { | ||
1248 | struct zap_walker *zap_walk = container_of(walk, struct zap_walker, | ||
1249 | walker); | ||
1250 | kvm_mmu_zap_page(zap_walk->kvm, sp); | ||
1251 | zap_walk->zapped = 1; | ||
1252 | return 0; | ||
1253 | } | ||
1254 | |||
1255 | static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1256 | { | ||
1257 | struct zap_walker walker = { | ||
1258 | .walker = { .entry = mmu_zap_fn, }, | ||
1259 | .kvm = kvm, | ||
1260 | .zapped = 0, | ||
1261 | }; | ||
1262 | |||
1263 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) | ||
1264 | return 0; | ||
1265 | mmu_unsync_walk(sp, &walker.walker); | ||
1266 | return walker.zapped; | ||
1267 | } | ||
1268 | |||
1269 | static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1270 | { | ||
1271 | int ret; | ||
1272 | ++kvm->stat.mmu_shadow_zapped; | ||
1273 | ret = mmu_zap_unsync_children(kvm, sp); | ||
1013 | kvm_mmu_page_unlink_children(kvm, sp); | 1274 | kvm_mmu_page_unlink_children(kvm, sp); |
1275 | kvm_mmu_unlink_parents(kvm, sp); | ||
1276 | kvm_flush_remote_tlbs(kvm); | ||
1277 | if (!sp->role.invalid && !sp->role.metaphysical) | ||
1278 | unaccount_shadowed(kvm, sp->gfn); | ||
1279 | if (sp->unsync) | ||
1280 | kvm_unlink_unsync_page(kvm, sp); | ||
1014 | if (!sp->root_count) { | 1281 | if (!sp->root_count) { |
1015 | if (!sp->role.metaphysical && !sp->role.invalid) | ||
1016 | unaccount_shadowed(kvm, sp->gfn); | ||
1017 | hlist_del(&sp->hash_link); | 1282 | hlist_del(&sp->hash_link); |
1018 | kvm_mmu_free_page(kvm, sp); | 1283 | kvm_mmu_free_page(kvm, sp); |
1019 | } else { | 1284 | } else { |
1020 | int invalid = sp->role.invalid; | ||
1021 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | ||
1022 | sp->role.invalid = 1; | 1285 | sp->role.invalid = 1; |
1286 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | ||
1023 | kvm_reload_remote_mmus(kvm); | 1287 | kvm_reload_remote_mmus(kvm); |
1024 | if (!sp->role.metaphysical && !invalid) | ||
1025 | unaccount_shadowed(kvm, sp->gfn); | ||
1026 | } | 1288 | } |
1027 | kvm_mmu_reset_last_pte_updated(kvm); | 1289 | kvm_mmu_reset_last_pte_updated(kvm); |
1290 | return ret; | ||
1028 | } | 1291 | } |
1029 | 1292 | ||
1030 | /* | 1293 | /* |
@@ -1077,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | |||
1077 | if (sp->gfn == gfn && !sp->role.metaphysical) { | 1340 | if (sp->gfn == gfn && !sp->role.metaphysical) { |
1078 | pgprintk("%s: gfn %lx role %x\n", __func__, gfn, | 1341 | pgprintk("%s: gfn %lx role %x\n", __func__, gfn, |
1079 | sp->role.word); | 1342 | sp->role.word); |
1080 | kvm_mmu_zap_page(kvm, sp); | ||
1081 | r = 1; | 1343 | r = 1; |
1344 | if (kvm_mmu_zap_page(kvm, sp)) | ||
1345 | n = bucket->first; | ||
1082 | } | 1346 | } |
1083 | return r; | 1347 | return r; |
1084 | } | 1348 | } |
@@ -1101,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) | |||
1101 | __set_bit(slot, &sp->slot_bitmap); | 1365 | __set_bit(slot, &sp->slot_bitmap); |
1102 | } | 1366 | } |
1103 | 1367 | ||
1368 | static void mmu_convert_notrap(struct kvm_mmu_page *sp) | ||
1369 | { | ||
1370 | int i; | ||
1371 | u64 *pt = sp->spt; | ||
1372 | |||
1373 | if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) | ||
1374 | return; | ||
1375 | |||
1376 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
1377 | if (pt[i] == shadow_notrap_nonpresent_pte) | ||
1378 | set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); | ||
1379 | } | ||
1380 | } | ||
1381 | |||
1104 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | 1382 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) |
1105 | { | 1383 | { |
1106 | struct page *page; | 1384 | struct page *page; |
@@ -1110,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | |||
1110 | if (gpa == UNMAPPED_GVA) | 1388 | if (gpa == UNMAPPED_GVA) |
1111 | return NULL; | 1389 | return NULL; |
1112 | 1390 | ||
1113 | down_read(&current->mm->mmap_sem); | ||
1114 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | 1391 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); |
1115 | up_read(&current->mm->mmap_sem); | ||
1116 | 1392 | ||
1117 | return page; | 1393 | return page; |
1118 | } | 1394 | } |
1119 | 1395 | ||
1120 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | 1396 | static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) |
1121 | unsigned pt_access, unsigned pte_access, | ||
1122 | int user_fault, int write_fault, int dirty, | ||
1123 | int *ptwrite, int largepage, gfn_t gfn, | ||
1124 | pfn_t pfn, bool speculative) | ||
1125 | { | 1397 | { |
1126 | u64 spte; | 1398 | unsigned index; |
1127 | int was_rmapped = 0; | 1399 | struct hlist_head *bucket; |
1128 | int was_writeble = is_writeble_pte(*shadow_pte); | 1400 | struct kvm_mmu_page *s; |
1401 | struct hlist_node *node, *n; | ||
1129 | 1402 | ||
1130 | pgprintk("%s: spte %llx access %x write_fault %d" | 1403 | index = kvm_page_table_hashfn(sp->gfn); |
1131 | " user_fault %d gfn %lx\n", | 1404 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
1132 | __func__, *shadow_pte, pt_access, | 1405 | /* don't unsync if pagetable is shadowed with multiple roles */ |
1133 | write_fault, user_fault, gfn); | 1406 | hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { |
1407 | if (s->gfn != sp->gfn || s->role.metaphysical) | ||
1408 | continue; | ||
1409 | if (s->role.word != sp->role.word) | ||
1410 | return 1; | ||
1411 | } | ||
1412 | kvm_mmu_mark_parents_unsync(vcpu, sp); | ||
1413 | ++vcpu->kvm->stat.mmu_unsync; | ||
1414 | sp->unsync = 1; | ||
1415 | mmu_convert_notrap(sp); | ||
1416 | return 0; | ||
1417 | } | ||
1134 | 1418 | ||
1135 | if (is_rmap_pte(*shadow_pte)) { | 1419 | static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, |
1136 | /* | 1420 | bool can_unsync) |
1137 | * If we overwrite a PTE page pointer with a 2MB PMD, unlink | 1421 | { |
1138 | * the parent of the now unreachable PTE. | 1422 | struct kvm_mmu_page *shadow; |
1139 | */ | ||
1140 | if (largepage && !is_large_pte(*shadow_pte)) { | ||
1141 | struct kvm_mmu_page *child; | ||
1142 | u64 pte = *shadow_pte; | ||
1143 | 1423 | ||
1144 | child = page_header(pte & PT64_BASE_ADDR_MASK); | 1424 | shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); |
1145 | mmu_page_remove_parent_pte(child, shadow_pte); | 1425 | if (shadow) { |
1146 | } else if (pfn != spte_to_pfn(*shadow_pte)) { | 1426 | if (shadow->role.level != PT_PAGE_TABLE_LEVEL) |
1147 | pgprintk("hfn old %lx new %lx\n", | 1427 | return 1; |
1148 | spte_to_pfn(*shadow_pte), pfn); | 1428 | if (shadow->unsync) |
1149 | rmap_remove(vcpu->kvm, shadow_pte); | 1429 | return 0; |
1150 | } else { | 1430 | if (can_unsync && oos_shadow) |
1151 | if (largepage) | 1431 | return kvm_unsync_page(vcpu, shadow); |
1152 | was_rmapped = is_large_pte(*shadow_pte); | 1432 | return 1; |
1153 | else | ||
1154 | was_rmapped = 1; | ||
1155 | } | ||
1156 | } | 1433 | } |
1434 | return 0; | ||
1435 | } | ||
1157 | 1436 | ||
1437 | static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | ||
1438 | unsigned pte_access, int user_fault, | ||
1439 | int write_fault, int dirty, int largepage, | ||
1440 | gfn_t gfn, pfn_t pfn, bool speculative, | ||
1441 | bool can_unsync) | ||
1442 | { | ||
1443 | u64 spte; | ||
1444 | int ret = 0; | ||
1158 | /* | 1445 | /* |
1159 | * We don't set the accessed bit, since we sometimes want to see | 1446 | * We don't set the accessed bit, since we sometimes want to see |
1160 | * whether the guest actually used the pte (in order to detect | 1447 | * whether the guest actually used the pte (in order to detect |
@@ -1162,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1162 | */ | 1449 | */ |
1163 | spte = shadow_base_present_pte | shadow_dirty_mask; | 1450 | spte = shadow_base_present_pte | shadow_dirty_mask; |
1164 | if (!speculative) | 1451 | if (!speculative) |
1165 | pte_access |= PT_ACCESSED_MASK; | 1452 | spte |= shadow_accessed_mask; |
1166 | if (!dirty) | 1453 | if (!dirty) |
1167 | pte_access &= ~ACC_WRITE_MASK; | 1454 | pte_access &= ~ACC_WRITE_MASK; |
1168 | if (pte_access & ACC_EXEC_MASK) | 1455 | if (pte_access & ACC_EXEC_MASK) |
@@ -1178,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1178 | 1465 | ||
1179 | if ((pte_access & ACC_WRITE_MASK) | 1466 | if ((pte_access & ACC_WRITE_MASK) |
1180 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { | 1467 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { |
1181 | struct kvm_mmu_page *shadow; | 1468 | |
1469 | if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { | ||
1470 | ret = 1; | ||
1471 | spte = shadow_trap_nonpresent_pte; | ||
1472 | goto set_pte; | ||
1473 | } | ||
1182 | 1474 | ||
1183 | spte |= PT_WRITABLE_MASK; | 1475 | spte |= PT_WRITABLE_MASK; |
1184 | 1476 | ||
1185 | shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); | 1477 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { |
1186 | if (shadow || | ||
1187 | (largepage && has_wrprotected_page(vcpu->kvm, gfn))) { | ||
1188 | pgprintk("%s: found shadow page for %lx, marking ro\n", | 1478 | pgprintk("%s: found shadow page for %lx, marking ro\n", |
1189 | __func__, gfn); | 1479 | __func__, gfn); |
1480 | ret = 1; | ||
1190 | pte_access &= ~ACC_WRITE_MASK; | 1481 | pte_access &= ~ACC_WRITE_MASK; |
1191 | if (is_writeble_pte(spte)) { | 1482 | if (is_writeble_pte(spte)) |
1192 | spte &= ~PT_WRITABLE_MASK; | 1483 | spte &= ~PT_WRITABLE_MASK; |
1193 | kvm_x86_ops->tlb_flush(vcpu); | ||
1194 | } | ||
1195 | if (write_fault) | ||
1196 | *ptwrite = 1; | ||
1197 | } | 1484 | } |
1198 | } | 1485 | } |
1199 | 1486 | ||
1200 | if (pte_access & ACC_WRITE_MASK) | 1487 | if (pte_access & ACC_WRITE_MASK) |
1201 | mark_page_dirty(vcpu->kvm, gfn); | 1488 | mark_page_dirty(vcpu->kvm, gfn); |
1202 | 1489 | ||
1203 | pgprintk("%s: setting spte %llx\n", __func__, spte); | 1490 | set_pte: |
1204 | pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", | ||
1205 | (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", | ||
1206 | (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); | ||
1207 | set_shadow_pte(shadow_pte, spte); | 1491 | set_shadow_pte(shadow_pte, spte); |
1208 | if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) | 1492 | return ret; |
1209 | && (spte & PT_PRESENT_MASK)) | 1493 | } |
1494 | |||
1495 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | ||
1496 | unsigned pt_access, unsigned pte_access, | ||
1497 | int user_fault, int write_fault, int dirty, | ||
1498 | int *ptwrite, int largepage, gfn_t gfn, | ||
1499 | pfn_t pfn, bool speculative) | ||
1500 | { | ||
1501 | int was_rmapped = 0; | ||
1502 | int was_writeble = is_writeble_pte(*shadow_pte); | ||
1503 | |||
1504 | pgprintk("%s: spte %llx access %x write_fault %d" | ||
1505 | " user_fault %d gfn %lx\n", | ||
1506 | __func__, *shadow_pte, pt_access, | ||
1507 | write_fault, user_fault, gfn); | ||
1508 | |||
1509 | if (is_rmap_pte(*shadow_pte)) { | ||
1510 | /* | ||
1511 | * If we overwrite a PTE page pointer with a 2MB PMD, unlink | ||
1512 | * the parent of the now unreachable PTE. | ||
1513 | */ | ||
1514 | if (largepage && !is_large_pte(*shadow_pte)) { | ||
1515 | struct kvm_mmu_page *child; | ||
1516 | u64 pte = *shadow_pte; | ||
1517 | |||
1518 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
1519 | mmu_page_remove_parent_pte(child, shadow_pte); | ||
1520 | } else if (pfn != spte_to_pfn(*shadow_pte)) { | ||
1521 | pgprintk("hfn old %lx new %lx\n", | ||
1522 | spte_to_pfn(*shadow_pte), pfn); | ||
1523 | rmap_remove(vcpu->kvm, shadow_pte); | ||
1524 | } else { | ||
1525 | if (largepage) | ||
1526 | was_rmapped = is_large_pte(*shadow_pte); | ||
1527 | else | ||
1528 | was_rmapped = 1; | ||
1529 | } | ||
1530 | } | ||
1531 | if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, | ||
1532 | dirty, largepage, gfn, pfn, speculative, true)) { | ||
1533 | if (write_fault) | ||
1534 | *ptwrite = 1; | ||
1535 | kvm_x86_ops->tlb_flush(vcpu); | ||
1536 | } | ||
1537 | |||
1538 | pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); | ||
1539 | pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", | ||
1540 | is_large_pte(*shadow_pte)? "2MB" : "4kB", | ||
1541 | is_present_pte(*shadow_pte)?"RW":"R", gfn, | ||
1542 | *shadow_pte, shadow_pte); | ||
1543 | if (!was_rmapped && is_large_pte(*shadow_pte)) | ||
1210 | ++vcpu->kvm->stat.lpages; | 1544 | ++vcpu->kvm->stat.lpages; |
1211 | 1545 | ||
1212 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); | 1546 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); |
@@ -1230,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | |||
1230 | { | 1564 | { |
1231 | } | 1565 | } |
1232 | 1566 | ||
1233 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | 1567 | struct direct_shadow_walk { |
1234 | int largepage, gfn_t gfn, pfn_t pfn, | 1568 | struct kvm_shadow_walk walker; |
1235 | int level) | 1569 | pfn_t pfn; |
1236 | { | 1570 | int write; |
1237 | hpa_t table_addr = vcpu->arch.mmu.root_hpa; | 1571 | int largepage; |
1238 | int pt_write = 0; | 1572 | int pt_write; |
1239 | 1573 | }; | |
1240 | for (; ; level--) { | ||
1241 | u32 index = PT64_INDEX(v, level); | ||
1242 | u64 *table; | ||
1243 | |||
1244 | ASSERT(VALID_PAGE(table_addr)); | ||
1245 | table = __va(table_addr); | ||
1246 | 1574 | ||
1247 | if (level == 1) { | 1575 | static int direct_map_entry(struct kvm_shadow_walk *_walk, |
1248 | mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, | 1576 | struct kvm_vcpu *vcpu, |
1249 | 0, write, 1, &pt_write, 0, gfn, pfn, false); | 1577 | u64 addr, u64 *sptep, int level) |
1250 | return pt_write; | 1578 | { |
1251 | } | 1579 | struct direct_shadow_walk *walk = |
1580 | container_of(_walk, struct direct_shadow_walk, walker); | ||
1581 | struct kvm_mmu_page *sp; | ||
1582 | gfn_t pseudo_gfn; | ||
1583 | gfn_t gfn = addr >> PAGE_SHIFT; | ||
1584 | |||
1585 | if (level == PT_PAGE_TABLE_LEVEL | ||
1586 | || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { | ||
1587 | mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, | ||
1588 | 0, walk->write, 1, &walk->pt_write, | ||
1589 | walk->largepage, gfn, walk->pfn, false); | ||
1590 | ++vcpu->stat.pf_fixed; | ||
1591 | return 1; | ||
1592 | } | ||
1252 | 1593 | ||
1253 | if (largepage && level == 2) { | 1594 | if (*sptep == shadow_trap_nonpresent_pte) { |
1254 | mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, | 1595 | pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; |
1255 | 0, write, 1, &pt_write, 1, gfn, pfn, false); | 1596 | sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1, |
1256 | return pt_write; | 1597 | 1, ACC_ALL, sptep); |
1598 | if (!sp) { | ||
1599 | pgprintk("nonpaging_map: ENOMEM\n"); | ||
1600 | kvm_release_pfn_clean(walk->pfn); | ||
1601 | return -ENOMEM; | ||
1257 | } | 1602 | } |
1258 | 1603 | ||
1259 | if (table[index] == shadow_trap_nonpresent_pte) { | 1604 | set_shadow_pte(sptep, |
1260 | struct kvm_mmu_page *new_table; | 1605 | __pa(sp->spt) |
1261 | gfn_t pseudo_gfn; | 1606 | | PT_PRESENT_MASK | PT_WRITABLE_MASK |
1262 | 1607 | | shadow_user_mask | shadow_x_mask); | |
1263 | pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) | ||
1264 | >> PAGE_SHIFT; | ||
1265 | new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, | ||
1266 | v, level - 1, | ||
1267 | 1, ACC_ALL, &table[index]); | ||
1268 | if (!new_table) { | ||
1269 | pgprintk("nonpaging_map: ENOMEM\n"); | ||
1270 | kvm_release_pfn_clean(pfn); | ||
1271 | return -ENOMEM; | ||
1272 | } | ||
1273 | |||
1274 | set_shadow_pte(&table[index], | ||
1275 | __pa(new_table->spt) | ||
1276 | | PT_PRESENT_MASK | PT_WRITABLE_MASK | ||
1277 | | shadow_user_mask | shadow_x_mask); | ||
1278 | } | ||
1279 | table_addr = table[index] & PT64_BASE_ADDR_MASK; | ||
1280 | } | 1608 | } |
1609 | return 0; | ||
1610 | } | ||
1611 | |||
1612 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | ||
1613 | int largepage, gfn_t gfn, pfn_t pfn) | ||
1614 | { | ||
1615 | int r; | ||
1616 | struct direct_shadow_walk walker = { | ||
1617 | .walker = { .entry = direct_map_entry, }, | ||
1618 | .pfn = pfn, | ||
1619 | .largepage = largepage, | ||
1620 | .write = write, | ||
1621 | .pt_write = 0, | ||
1622 | }; | ||
1623 | |||
1624 | r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT); | ||
1625 | if (r < 0) | ||
1626 | return r; | ||
1627 | return walker.pt_write; | ||
1281 | } | 1628 | } |
1282 | 1629 | ||
1283 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | 1630 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) |
@@ -1287,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
1287 | pfn_t pfn; | 1634 | pfn_t pfn; |
1288 | unsigned long mmu_seq; | 1635 | unsigned long mmu_seq; |
1289 | 1636 | ||
1290 | down_read(&current->mm->mmap_sem); | ||
1291 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { | 1637 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { |
1292 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 1638 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); |
1293 | largepage = 1; | 1639 | largepage = 1; |
1294 | } | 1640 | } |
1295 | 1641 | ||
1296 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 1642 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
1297 | /* implicit mb(), we'll read before PT lock is unlocked */ | 1643 | smp_rmb(); |
1298 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 1644 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
1299 | up_read(&current->mm->mmap_sem); | ||
1300 | 1645 | ||
1301 | /* mmio */ | 1646 | /* mmio */ |
1302 | if (is_error_pfn(pfn)) { | 1647 | if (is_error_pfn(pfn)) { |
@@ -1308,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
1308 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 1653 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
1309 | goto out_unlock; | 1654 | goto out_unlock; |
1310 | kvm_mmu_free_some_pages(vcpu); | 1655 | kvm_mmu_free_some_pages(vcpu); |
1311 | r = __direct_map(vcpu, v, write, largepage, gfn, pfn, | 1656 | r = __direct_map(vcpu, v, write, largepage, gfn, pfn); |
1312 | PT32E_ROOT_LEVEL); | ||
1313 | spin_unlock(&vcpu->kvm->mmu_lock); | 1657 | spin_unlock(&vcpu->kvm->mmu_lock); |
1314 | 1658 | ||
1315 | 1659 | ||
@@ -1405,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
1405 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | 1749 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); |
1406 | } | 1750 | } |
1407 | 1751 | ||
1752 | static void mmu_sync_roots(struct kvm_vcpu *vcpu) | ||
1753 | { | ||
1754 | int i; | ||
1755 | struct kvm_mmu_page *sp; | ||
1756 | |||
1757 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
1758 | return; | ||
1759 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
1760 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
1761 | sp = page_header(root); | ||
1762 | mmu_sync_children(vcpu, sp); | ||
1763 | return; | ||
1764 | } | ||
1765 | for (i = 0; i < 4; ++i) { | ||
1766 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
1767 | |||
1768 | if (root) { | ||
1769 | root &= PT64_BASE_ADDR_MASK; | ||
1770 | sp = page_header(root); | ||
1771 | mmu_sync_children(vcpu, sp); | ||
1772 | } | ||
1773 | } | ||
1774 | } | ||
1775 | |||
1776 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | ||
1777 | { | ||
1778 | spin_lock(&vcpu->kvm->mmu_lock); | ||
1779 | mmu_sync_roots(vcpu); | ||
1780 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
1781 | } | ||
1782 | |||
1408 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) | 1783 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) |
1409 | { | 1784 | { |
1410 | return vaddr; | 1785 | return vaddr; |
@@ -1446,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
1446 | if (r) | 1821 | if (r) |
1447 | return r; | 1822 | return r; |
1448 | 1823 | ||
1449 | down_read(&current->mm->mmap_sem); | ||
1450 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { | 1824 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { |
1451 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 1825 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); |
1452 | largepage = 1; | 1826 | largepage = 1; |
1453 | } | 1827 | } |
1454 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 1828 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
1455 | /* implicit mb(), we'll read before PT lock is unlocked */ | 1829 | smp_rmb(); |
1456 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 1830 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
1457 | up_read(&current->mm->mmap_sem); | ||
1458 | if (is_error_pfn(pfn)) { | 1831 | if (is_error_pfn(pfn)) { |
1459 | kvm_release_pfn_clean(pfn); | 1832 | kvm_release_pfn_clean(pfn); |
1460 | return 1; | 1833 | return 1; |
@@ -1464,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
1464 | goto out_unlock; | 1837 | goto out_unlock; |
1465 | kvm_mmu_free_some_pages(vcpu); | 1838 | kvm_mmu_free_some_pages(vcpu); |
1466 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, | 1839 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, |
1467 | largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); | 1840 | largepage, gfn, pfn); |
1468 | spin_unlock(&vcpu->kvm->mmu_lock); | 1841 | spin_unlock(&vcpu->kvm->mmu_lock); |
1469 | 1842 | ||
1470 | return r; | 1843 | return r; |
@@ -1489,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) | |||
1489 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 1862 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
1490 | context->free = nonpaging_free; | 1863 | context->free = nonpaging_free; |
1491 | context->prefetch_page = nonpaging_prefetch_page; | 1864 | context->prefetch_page = nonpaging_prefetch_page; |
1865 | context->sync_page = nonpaging_sync_page; | ||
1866 | context->invlpg = nonpaging_invlpg; | ||
1492 | context->root_level = 0; | 1867 | context->root_level = 0; |
1493 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 1868 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
1494 | context->root_hpa = INVALID_PAGE; | 1869 | context->root_hpa = INVALID_PAGE; |
@@ -1536,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | |||
1536 | context->page_fault = paging64_page_fault; | 1911 | context->page_fault = paging64_page_fault; |
1537 | context->gva_to_gpa = paging64_gva_to_gpa; | 1912 | context->gva_to_gpa = paging64_gva_to_gpa; |
1538 | context->prefetch_page = paging64_prefetch_page; | 1913 | context->prefetch_page = paging64_prefetch_page; |
1914 | context->sync_page = paging64_sync_page; | ||
1915 | context->invlpg = paging64_invlpg; | ||
1539 | context->free = paging_free; | 1916 | context->free = paging_free; |
1540 | context->root_level = level; | 1917 | context->root_level = level; |
1541 | context->shadow_root_level = level; | 1918 | context->shadow_root_level = level; |
@@ -1557,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) | |||
1557 | context->gva_to_gpa = paging32_gva_to_gpa; | 1934 | context->gva_to_gpa = paging32_gva_to_gpa; |
1558 | context->free = paging_free; | 1935 | context->free = paging_free; |
1559 | context->prefetch_page = paging32_prefetch_page; | 1936 | context->prefetch_page = paging32_prefetch_page; |
1937 | context->sync_page = paging32_sync_page; | ||
1938 | context->invlpg = paging32_invlpg; | ||
1560 | context->root_level = PT32_ROOT_LEVEL; | 1939 | context->root_level = PT32_ROOT_LEVEL; |
1561 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 1940 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
1562 | context->root_hpa = INVALID_PAGE; | 1941 | context->root_hpa = INVALID_PAGE; |
@@ -1576,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
1576 | context->page_fault = tdp_page_fault; | 1955 | context->page_fault = tdp_page_fault; |
1577 | context->free = nonpaging_free; | 1956 | context->free = nonpaging_free; |
1578 | context->prefetch_page = nonpaging_prefetch_page; | 1957 | context->prefetch_page = nonpaging_prefetch_page; |
1958 | context->sync_page = nonpaging_sync_page; | ||
1959 | context->invlpg = nonpaging_invlpg; | ||
1579 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); | 1960 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); |
1580 | context->root_hpa = INVALID_PAGE; | 1961 | context->root_hpa = INVALID_PAGE; |
1581 | 1962 | ||
@@ -1647,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) | |||
1647 | spin_lock(&vcpu->kvm->mmu_lock); | 2028 | spin_lock(&vcpu->kvm->mmu_lock); |
1648 | kvm_mmu_free_some_pages(vcpu); | 2029 | kvm_mmu_free_some_pages(vcpu); |
1649 | mmu_alloc_roots(vcpu); | 2030 | mmu_alloc_roots(vcpu); |
2031 | mmu_sync_roots(vcpu); | ||
1650 | spin_unlock(&vcpu->kvm->mmu_lock); | 2032 | spin_unlock(&vcpu->kvm->mmu_lock); |
1651 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); | 2033 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); |
1652 | kvm_mmu_flush_tlb(vcpu); | 2034 | kvm_mmu_flush_tlb(vcpu); |
@@ -1767,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1767 | return; | 2149 | return; |
1768 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | 2150 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
1769 | 2151 | ||
1770 | down_read(&current->mm->mmap_sem); | ||
1771 | if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { | 2152 | if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { |
1772 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 2153 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); |
1773 | vcpu->arch.update_pte.largepage = 1; | 2154 | vcpu->arch.update_pte.largepage = 1; |
1774 | } | 2155 | } |
1775 | vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2156 | vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; |
1776 | /* implicit mb(), we'll read before PT lock is unlocked */ | 2157 | smp_rmb(); |
1777 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2158 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
1778 | up_read(&current->mm->mmap_sem); | ||
1779 | 2159 | ||
1780 | if (is_error_pfn(pfn)) { | 2160 | if (is_error_pfn(pfn)) { |
1781 | kvm_release_pfn_clean(pfn); | 2161 | kvm_release_pfn_clean(pfn); |
@@ -1837,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1837 | index = kvm_page_table_hashfn(gfn); | 2217 | index = kvm_page_table_hashfn(gfn); |
1838 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | 2218 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
1839 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { | 2219 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { |
1840 | if (sp->gfn != gfn || sp->role.metaphysical) | 2220 | if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid) |
1841 | continue; | 2221 | continue; |
1842 | pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; | 2222 | pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; |
1843 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); | 2223 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); |
@@ -1855,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1855 | */ | 2235 | */ |
1856 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", | 2236 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", |
1857 | gpa, bytes, sp->role.word); | 2237 | gpa, bytes, sp->role.word); |
1858 | kvm_mmu_zap_page(vcpu->kvm, sp); | 2238 | if (kvm_mmu_zap_page(vcpu->kvm, sp)) |
2239 | n = bucket->first; | ||
1859 | ++vcpu->kvm->stat.mmu_flooded; | 2240 | ++vcpu->kvm->stat.mmu_flooded; |
1860 | continue; | 2241 | continue; |
1861 | } | 2242 | } |
@@ -1969,6 +2350,16 @@ out: | |||
1969 | } | 2350 | } |
1970 | EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); | 2351 | EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); |
1971 | 2352 | ||
2353 | void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) | ||
2354 | { | ||
2355 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2356 | vcpu->arch.mmu.invlpg(vcpu, gva); | ||
2357 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2358 | kvm_mmu_flush_tlb(vcpu); | ||
2359 | ++vcpu->stat.invlpg; | ||
2360 | } | ||
2361 | EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); | ||
2362 | |||
1972 | void kvm_enable_tdp(void) | 2363 | void kvm_enable_tdp(void) |
1973 | { | 2364 | { |
1974 | tdp_enabled = true; | 2365 | tdp_enabled = true; |
@@ -2055,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
2055 | { | 2446 | { |
2056 | struct kvm_mmu_page *sp; | 2447 | struct kvm_mmu_page *sp; |
2057 | 2448 | ||
2449 | spin_lock(&kvm->mmu_lock); | ||
2058 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { | 2450 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { |
2059 | int i; | 2451 | int i; |
2060 | u64 *pt; | 2452 | u64 *pt; |
@@ -2068,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
2068 | if (pt[i] & PT_WRITABLE_MASK) | 2460 | if (pt[i] & PT_WRITABLE_MASK) |
2069 | pt[i] &= ~PT_WRITABLE_MASK; | 2461 | pt[i] &= ~PT_WRITABLE_MASK; |
2070 | } | 2462 | } |
2463 | kvm_flush_remote_tlbs(kvm); | ||
2464 | spin_unlock(&kvm->mmu_lock); | ||
2071 | } | 2465 | } |
2072 | 2466 | ||
2073 | void kvm_mmu_zap_all(struct kvm *kvm) | 2467 | void kvm_mmu_zap_all(struct kvm *kvm) |
@@ -2076,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm) | |||
2076 | 2470 | ||
2077 | spin_lock(&kvm->mmu_lock); | 2471 | spin_lock(&kvm->mmu_lock); |
2078 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) | 2472 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) |
2079 | kvm_mmu_zap_page(kvm, sp); | 2473 | if (kvm_mmu_zap_page(kvm, sp)) |
2474 | node = container_of(kvm->arch.active_mmu_pages.next, | ||
2475 | struct kvm_mmu_page, link); | ||
2080 | spin_unlock(&kvm->mmu_lock); | 2476 | spin_unlock(&kvm->mmu_lock); |
2081 | 2477 | ||
2082 | kvm_flush_remote_tlbs(kvm); | 2478 | kvm_flush_remote_tlbs(kvm); |
@@ -2291,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, | |||
2291 | gpa_t addr, unsigned long *ret) | 2687 | gpa_t addr, unsigned long *ret) |
2292 | { | 2688 | { |
2293 | int r; | 2689 | int r; |
2294 | struct kvm_pv_mmu_op_buffer buffer; | 2690 | struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer; |
2295 | 2691 | ||
2296 | buffer.ptr = buffer.buf; | 2692 | buffer->ptr = buffer->buf; |
2297 | buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); | 2693 | buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf); |
2298 | buffer.processed = 0; | 2694 | buffer->processed = 0; |
2299 | 2695 | ||
2300 | r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); | 2696 | r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len); |
2301 | if (r) | 2697 | if (r) |
2302 | goto out; | 2698 | goto out; |
2303 | 2699 | ||
2304 | while (buffer.len) { | 2700 | while (buffer->len) { |
2305 | r = kvm_pv_mmu_op_one(vcpu, &buffer); | 2701 | r = kvm_pv_mmu_op_one(vcpu, buffer); |
2306 | if (r < 0) | 2702 | if (r < 0) |
2307 | goto out; | 2703 | goto out; |
2308 | if (r == 0) | 2704 | if (r == 0) |
@@ -2311,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, | |||
2311 | 2707 | ||
2312 | r = 1; | 2708 | r = 1; |
2313 | out: | 2709 | out: |
2314 | *ret = buffer.processed; | 2710 | *ret = buffer->processed; |
2315 | return r; | 2711 | return r; |
2316 | } | 2712 | } |
2317 | 2713 | ||
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 4a814bff21f2..613ec9aa674a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -25,11 +25,11 @@ | |||
25 | #if PTTYPE == 64 | 25 | #if PTTYPE == 64 |
26 | #define pt_element_t u64 | 26 | #define pt_element_t u64 |
27 | #define guest_walker guest_walker64 | 27 | #define guest_walker guest_walker64 |
28 | #define shadow_walker shadow_walker64 | ||
28 | #define FNAME(name) paging##64_##name | 29 | #define FNAME(name) paging##64_##name |
29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | 30 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK |
30 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK | 31 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK |
31 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | 32 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) |
32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | 33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) |
34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS | 34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS |
35 | #ifdef CONFIG_X86_64 | 35 | #ifdef CONFIG_X86_64 |
@@ -42,11 +42,11 @@ | |||
42 | #elif PTTYPE == 32 | 42 | #elif PTTYPE == 32 |
43 | #define pt_element_t u32 | 43 | #define pt_element_t u32 |
44 | #define guest_walker guest_walker32 | 44 | #define guest_walker guest_walker32 |
45 | #define shadow_walker shadow_walker32 | ||
45 | #define FNAME(name) paging##32_##name | 46 | #define FNAME(name) paging##32_##name |
46 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK | 47 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK |
47 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK | 48 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK |
48 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | 49 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) |
49 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
50 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | 50 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) |
51 | #define PT_LEVEL_BITS PT32_LEVEL_BITS | 51 | #define PT_LEVEL_BITS PT32_LEVEL_BITS |
52 | #define PT_MAX_FULL_LEVELS 2 | 52 | #define PT_MAX_FULL_LEVELS 2 |
@@ -73,6 +73,17 @@ struct guest_walker { | |||
73 | u32 error_code; | 73 | u32 error_code; |
74 | }; | 74 | }; |
75 | 75 | ||
76 | struct shadow_walker { | ||
77 | struct kvm_shadow_walk walker; | ||
78 | struct guest_walker *guest_walker; | ||
79 | int user_fault; | ||
80 | int write_fault; | ||
81 | int largepage; | ||
82 | int *ptwrite; | ||
83 | pfn_t pfn; | ||
84 | u64 *sptep; | ||
85 | }; | ||
86 | |||
76 | static gfn_t gpte_to_gfn(pt_element_t gpte) | 87 | static gfn_t gpte_to_gfn(pt_element_t gpte) |
77 | { | 88 | { |
78 | return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | 89 | return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; |
@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, | |||
91 | pt_element_t *table; | 102 | pt_element_t *table; |
92 | struct page *page; | 103 | struct page *page; |
93 | 104 | ||
94 | down_read(&current->mm->mmap_sem); | ||
95 | page = gfn_to_page(kvm, table_gfn); | 105 | page = gfn_to_page(kvm, table_gfn); |
96 | up_read(&current->mm->mmap_sem); | ||
97 | 106 | ||
98 | table = kmap_atomic(page, KM_USER0); | 107 | table = kmap_atomic(page, KM_USER0); |
99 | |||
100 | ret = CMPXCHG(&table[index], orig_pte, new_pte); | 108 | ret = CMPXCHG(&table[index], orig_pte, new_pte); |
101 | |||
102 | kunmap_atomic(table, KM_USER0); | 109 | kunmap_atomic(table, KM_USER0); |
103 | 110 | ||
104 | kvm_release_page_dirty(page); | 111 | kvm_release_page_dirty(page); |
@@ -274,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
274 | /* | 281 | /* |
275 | * Fetch a shadow pte for a specific level in the paging hierarchy. | 282 | * Fetch a shadow pte for a specific level in the paging hierarchy. |
276 | */ | 283 | */ |
277 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | 284 | static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, |
278 | struct guest_walker *walker, | 285 | struct kvm_vcpu *vcpu, u64 addr, |
279 | int user_fault, int write_fault, int largepage, | 286 | u64 *sptep, int level) |
280 | int *ptwrite, pfn_t pfn) | ||
281 | { | 287 | { |
282 | hpa_t shadow_addr; | 288 | struct shadow_walker *sw = |
283 | int level; | 289 | container_of(_sw, struct shadow_walker, walker); |
284 | u64 *shadow_ent; | 290 | struct guest_walker *gw = sw->guest_walker; |
285 | unsigned access = walker->pt_access; | 291 | unsigned access = gw->pt_access; |
286 | 292 | struct kvm_mmu_page *shadow_page; | |
287 | if (!is_present_pte(walker->ptes[walker->level - 1])) | 293 | u64 spte; |
288 | return NULL; | 294 | int metaphysical; |
289 | 295 | gfn_t table_gfn; | |
290 | shadow_addr = vcpu->arch.mmu.root_hpa; | 296 | int r; |
291 | level = vcpu->arch.mmu.shadow_root_level; | 297 | pt_element_t curr_pte; |
292 | if (level == PT32E_ROOT_LEVEL) { | 298 | |
293 | shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | 299 | if (level == PT_PAGE_TABLE_LEVEL |
294 | shadow_addr &= PT64_BASE_ADDR_MASK; | 300 | || (sw->largepage && level == PT_DIRECTORY_LEVEL)) { |
295 | --level; | 301 | mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, |
302 | sw->user_fault, sw->write_fault, | ||
303 | gw->ptes[gw->level-1] & PT_DIRTY_MASK, | ||
304 | sw->ptwrite, sw->largepage, gw->gfn, sw->pfn, | ||
305 | false); | ||
306 | sw->sptep = sptep; | ||
307 | return 1; | ||
296 | } | 308 | } |
297 | 309 | ||
298 | for (; ; level--) { | 310 | if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) |
299 | u32 index = SHADOW_PT_INDEX(addr, level); | 311 | return 0; |
300 | struct kvm_mmu_page *shadow_page; | ||
301 | u64 shadow_pte; | ||
302 | int metaphysical; | ||
303 | gfn_t table_gfn; | ||
304 | |||
305 | shadow_ent = ((u64 *)__va(shadow_addr)) + index; | ||
306 | if (level == PT_PAGE_TABLE_LEVEL) | ||
307 | break; | ||
308 | |||
309 | if (largepage && level == PT_DIRECTORY_LEVEL) | ||
310 | break; | ||
311 | 312 | ||
312 | if (is_shadow_present_pte(*shadow_ent) | 313 | if (is_large_pte(*sptep)) { |
313 | && !is_large_pte(*shadow_ent)) { | 314 | set_shadow_pte(sptep, shadow_trap_nonpresent_pte); |
314 | shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; | 315 | kvm_flush_remote_tlbs(vcpu->kvm); |
315 | continue; | 316 | rmap_remove(vcpu->kvm, sptep); |
316 | } | 317 | } |
317 | 318 | ||
318 | if (is_large_pte(*shadow_ent)) | 319 | if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { |
319 | rmap_remove(vcpu->kvm, shadow_ent); | 320 | metaphysical = 1; |
320 | 321 | if (!is_dirty_pte(gw->ptes[level - 1])) | |
321 | if (level - 1 == PT_PAGE_TABLE_LEVEL | 322 | access &= ~ACC_WRITE_MASK; |
322 | && walker->level == PT_DIRECTORY_LEVEL) { | 323 | table_gfn = gpte_to_gfn(gw->ptes[level - 1]); |
323 | metaphysical = 1; | 324 | } else { |
324 | if (!is_dirty_pte(walker->ptes[level - 1])) | 325 | metaphysical = 0; |
325 | access &= ~ACC_WRITE_MASK; | 326 | table_gfn = gw->table_gfn[level - 2]; |
326 | table_gfn = gpte_to_gfn(walker->ptes[level - 1]); | 327 | } |
327 | } else { | 328 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1, |
328 | metaphysical = 0; | 329 | metaphysical, access, sptep); |
329 | table_gfn = walker->table_gfn[level - 2]; | 330 | if (!metaphysical) { |
330 | } | 331 | r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], |
331 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, | 332 | &curr_pte, sizeof(curr_pte)); |
332 | metaphysical, access, | 333 | if (r || curr_pte != gw->ptes[level - 2]) { |
333 | shadow_ent); | 334 | kvm_release_pfn_clean(sw->pfn); |
334 | if (!metaphysical) { | 335 | sw->sptep = NULL; |
335 | int r; | 336 | return 1; |
336 | pt_element_t curr_pte; | ||
337 | r = kvm_read_guest_atomic(vcpu->kvm, | ||
338 | walker->pte_gpa[level - 2], | ||
339 | &curr_pte, sizeof(curr_pte)); | ||
340 | if (r || curr_pte != walker->ptes[level - 2]) { | ||
341 | kvm_release_pfn_clean(pfn); | ||
342 | return NULL; | ||
343 | } | ||
344 | } | 337 | } |
345 | shadow_addr = __pa(shadow_page->spt); | ||
346 | shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | ||
347 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
348 | set_shadow_pte(shadow_ent, shadow_pte); | ||
349 | } | 338 | } |
350 | 339 | ||
351 | mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, | 340 | spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK |
352 | user_fault, write_fault, | 341 | | PT_WRITABLE_MASK | PT_USER_MASK; |
353 | walker->ptes[walker->level-1] & PT_DIRTY_MASK, | 342 | *sptep = spte; |
354 | ptwrite, largepage, walker->gfn, pfn, false); | 343 | return 0; |
344 | } | ||
345 | |||
346 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | ||
347 | struct guest_walker *guest_walker, | ||
348 | int user_fault, int write_fault, int largepage, | ||
349 | int *ptwrite, pfn_t pfn) | ||
350 | { | ||
351 | struct shadow_walker walker = { | ||
352 | .walker = { .entry = FNAME(shadow_walk_entry), }, | ||
353 | .guest_walker = guest_walker, | ||
354 | .user_fault = user_fault, | ||
355 | .write_fault = write_fault, | ||
356 | .largepage = largepage, | ||
357 | .ptwrite = ptwrite, | ||
358 | .pfn = pfn, | ||
359 | }; | ||
360 | |||
361 | if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1])) | ||
362 | return NULL; | ||
363 | |||
364 | walk_shadow(&walker.walker, vcpu, addr); | ||
355 | 365 | ||
356 | return shadow_ent; | 366 | return walker.sptep; |
357 | } | 367 | } |
358 | 368 | ||
359 | /* | 369 | /* |
@@ -407,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
407 | return 0; | 417 | return 0; |
408 | } | 418 | } |
409 | 419 | ||
410 | down_read(&current->mm->mmap_sem); | ||
411 | if (walker.level == PT_DIRECTORY_LEVEL) { | 420 | if (walker.level == PT_DIRECTORY_LEVEL) { |
412 | gfn_t large_gfn; | 421 | gfn_t large_gfn; |
413 | large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); | 422 | large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); |
@@ -417,9 +426,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
417 | } | 426 | } |
418 | } | 427 | } |
419 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 428 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
420 | /* implicit mb(), we'll read before PT lock is unlocked */ | 429 | smp_rmb(); |
421 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); | 430 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); |
422 | up_read(&current->mm->mmap_sem); | ||
423 | 431 | ||
424 | /* mmio */ | 432 | /* mmio */ |
425 | if (is_error_pfn(pfn)) { | 433 | if (is_error_pfn(pfn)) { |
@@ -453,6 +461,31 @@ out_unlock: | |||
453 | return 0; | 461 | return 0; |
454 | } | 462 | } |
455 | 463 | ||
464 | static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, | ||
465 | struct kvm_vcpu *vcpu, u64 addr, | ||
466 | u64 *sptep, int level) | ||
467 | { | ||
468 | |||
469 | if (level == PT_PAGE_TABLE_LEVEL) { | ||
470 | if (is_shadow_present_pte(*sptep)) | ||
471 | rmap_remove(vcpu->kvm, sptep); | ||
472 | set_shadow_pte(sptep, shadow_trap_nonpresent_pte); | ||
473 | return 1; | ||
474 | } | ||
475 | if (!is_shadow_present_pte(*sptep)) | ||
476 | return 1; | ||
477 | return 0; | ||
478 | } | ||
479 | |||
480 | static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | ||
481 | { | ||
482 | struct shadow_walker walker = { | ||
483 | .walker = { .entry = FNAME(shadow_invlpg_entry), }, | ||
484 | }; | ||
485 | |||
486 | walk_shadow(&walker.walker, vcpu, gva); | ||
487 | } | ||
488 | |||
456 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | 489 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) |
457 | { | 490 | { |
458 | struct guest_walker walker; | 491 | struct guest_walker walker; |
@@ -499,12 +532,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | |||
499 | } | 532 | } |
500 | } | 533 | } |
501 | 534 | ||
535 | /* | ||
536 | * Using the cached information from sp->gfns is safe because: | ||
537 | * - The spte has a reference to the struct page, so the pfn for a given gfn | ||
538 | * can't change unless all sptes pointing to it are nuked first. | ||
539 | * - Alias changes zap the entire shadow cache. | ||
540 | */ | ||
541 | static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | ||
542 | { | ||
543 | int i, offset, nr_present; | ||
544 | |||
545 | offset = nr_present = 0; | ||
546 | |||
547 | if (PTTYPE == 32) | ||
548 | offset = sp->role.quadrant << PT64_LEVEL_BITS; | ||
549 | |||
550 | for (i = 0; i < PT64_ENT_PER_PAGE; i++) { | ||
551 | unsigned pte_access; | ||
552 | pt_element_t gpte; | ||
553 | gpa_t pte_gpa; | ||
554 | gfn_t gfn = sp->gfns[i]; | ||
555 | |||
556 | if (!is_shadow_present_pte(sp->spt[i])) | ||
557 | continue; | ||
558 | |||
559 | pte_gpa = gfn_to_gpa(sp->gfn); | ||
560 | pte_gpa += (i+offset) * sizeof(pt_element_t); | ||
561 | |||
562 | if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, | ||
563 | sizeof(pt_element_t))) | ||
564 | return -EINVAL; | ||
565 | |||
566 | if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || | ||
567 | !(gpte & PT_ACCESSED_MASK)) { | ||
568 | u64 nonpresent; | ||
569 | |||
570 | rmap_remove(vcpu->kvm, &sp->spt[i]); | ||
571 | if (is_present_pte(gpte)) | ||
572 | nonpresent = shadow_trap_nonpresent_pte; | ||
573 | else | ||
574 | nonpresent = shadow_notrap_nonpresent_pte; | ||
575 | set_shadow_pte(&sp->spt[i], nonpresent); | ||
576 | continue; | ||
577 | } | ||
578 | |||
579 | nr_present++; | ||
580 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | ||
581 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, | ||
582 | is_dirty_pte(gpte), 0, gfn, | ||
583 | spte_to_pfn(sp->spt[i]), true, false); | ||
584 | } | ||
585 | |||
586 | return !nr_present; | ||
587 | } | ||
588 | |||
502 | #undef pt_element_t | 589 | #undef pt_element_t |
503 | #undef guest_walker | 590 | #undef guest_walker |
591 | #undef shadow_walker | ||
504 | #undef FNAME | 592 | #undef FNAME |
505 | #undef PT_BASE_ADDR_MASK | 593 | #undef PT_BASE_ADDR_MASK |
506 | #undef PT_INDEX | 594 | #undef PT_INDEX |
507 | #undef SHADOW_PT_INDEX | ||
508 | #undef PT_LEVEL_MASK | 595 | #undef PT_LEVEL_MASK |
509 | #undef PT_DIR_BASE_ADDR_MASK | 596 | #undef PT_DIR_BASE_ADDR_MASK |
510 | #undef PT_LEVEL_BITS | 597 | #undef PT_LEVEL_BITS |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8233b86c778c..9c4ce657d963 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include "kvm_svm.h" | 18 | #include "kvm_svm.h" |
19 | #include "irq.h" | 19 | #include "irq.h" |
20 | #include "mmu.h" | 20 | #include "mmu.h" |
21 | #include "kvm_cache_regs.h" | ||
21 | 22 | ||
22 | #include <linux/module.h> | 23 | #include <linux/module.h> |
23 | #include <linux/kernel.h> | 24 | #include <linux/kernel.h> |
@@ -35,10 +36,6 @@ MODULE_LICENSE("GPL"); | |||
35 | #define IOPM_ALLOC_ORDER 2 | 36 | #define IOPM_ALLOC_ORDER 2 |
36 | #define MSRPM_ALLOC_ORDER 1 | 37 | #define MSRPM_ALLOC_ORDER 1 |
37 | 38 | ||
38 | #define DB_VECTOR 1 | ||
39 | #define UD_VECTOR 6 | ||
40 | #define GP_VECTOR 13 | ||
41 | |||
42 | #define DR7_GD_MASK (1 << 13) | 39 | #define DR7_GD_MASK (1 << 13) |
43 | #define DR6_BD_MASK (1 << 13) | 40 | #define DR6_BD_MASK (1 << 13) |
44 | 41 | ||
@@ -47,7 +44,7 @@ MODULE_LICENSE("GPL"); | |||
47 | 44 | ||
48 | #define SVM_FEATURE_NPT (1 << 0) | 45 | #define SVM_FEATURE_NPT (1 << 0) |
49 | #define SVM_FEATURE_LBRV (1 << 1) | 46 | #define SVM_FEATURE_LBRV (1 << 1) |
50 | #define SVM_DEATURE_SVML (1 << 2) | 47 | #define SVM_FEATURE_SVML (1 << 2) |
51 | 48 | ||
52 | #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) | 49 | #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) |
53 | 50 | ||
@@ -236,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
236 | printk(KERN_DEBUG "%s: NOP\n", __func__); | 233 | printk(KERN_DEBUG "%s: NOP\n", __func__); |
237 | return; | 234 | return; |
238 | } | 235 | } |
239 | if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) | 236 | if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) |
240 | printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", | 237 | printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", |
241 | __func__, | 238 | __func__, kvm_rip_read(vcpu), svm->next_rip); |
242 | svm->vmcb->save.rip, | ||
243 | svm->next_rip); | ||
244 | 239 | ||
245 | vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; | 240 | kvm_rip_write(vcpu, svm->next_rip); |
246 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; | 241 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; |
247 | 242 | ||
248 | vcpu->arch.interrupt_window_open = 1; | 243 | vcpu->arch.interrupt_window_open = 1; |
@@ -530,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
530 | (1ULL << INTERCEPT_CPUID) | | 525 | (1ULL << INTERCEPT_CPUID) | |
531 | (1ULL << INTERCEPT_INVD) | | 526 | (1ULL << INTERCEPT_INVD) | |
532 | (1ULL << INTERCEPT_HLT) | | 527 | (1ULL << INTERCEPT_HLT) | |
528 | (1ULL << INTERCEPT_INVLPG) | | ||
533 | (1ULL << INTERCEPT_INVLPGA) | | 529 | (1ULL << INTERCEPT_INVLPGA) | |
534 | (1ULL << INTERCEPT_IOIO_PROT) | | 530 | (1ULL << INTERCEPT_IOIO_PROT) | |
535 | (1ULL << INTERCEPT_MSR_PROT) | | 531 | (1ULL << INTERCEPT_MSR_PROT) | |
@@ -581,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
581 | save->dr7 = 0x400; | 577 | save->dr7 = 0x400; |
582 | save->rflags = 2; | 578 | save->rflags = 2; |
583 | save->rip = 0x0000fff0; | 579 | save->rip = 0x0000fff0; |
580 | svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; | ||
584 | 581 | ||
585 | /* | 582 | /* |
586 | * cr0 val on cpu init should be 0x60000010, we enable cpu | 583 | * cr0 val on cpu init should be 0x60000010, we enable cpu |
@@ -593,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
593 | if (npt_enabled) { | 590 | if (npt_enabled) { |
594 | /* Setup VMCB for Nested Paging */ | 591 | /* Setup VMCB for Nested Paging */ |
595 | control->nested_ctl = 1; | 592 | control->nested_ctl = 1; |
596 | control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); | 593 | control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | |
594 | (1ULL << INTERCEPT_INVLPG)); | ||
597 | control->intercept_exceptions &= ~(1 << PF_VECTOR); | 595 | control->intercept_exceptions &= ~(1 << PF_VECTOR); |
598 | control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| | 596 | control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| |
599 | INTERCEPT_CR3_MASK); | 597 | INTERCEPT_CR3_MASK); |
@@ -615,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) | |||
615 | init_vmcb(svm); | 613 | init_vmcb(svm); |
616 | 614 | ||
617 | if (vcpu->vcpu_id != 0) { | 615 | if (vcpu->vcpu_id != 0) { |
618 | svm->vmcb->save.rip = 0; | 616 | kvm_rip_write(vcpu, 0); |
619 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; | 617 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; |
620 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; | 618 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; |
621 | } | 619 | } |
620 | vcpu->arch.regs_avail = ~0; | ||
621 | vcpu->arch.regs_dirty = ~0; | ||
622 | 622 | ||
623 | return 0; | 623 | return 0; |
624 | } | 624 | } |
@@ -721,23 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) | |||
721 | rdtscll(vcpu->arch.host_tsc); | 721 | rdtscll(vcpu->arch.host_tsc); |
722 | } | 722 | } |
723 | 723 | ||
724 | static void svm_cache_regs(struct kvm_vcpu *vcpu) | ||
725 | { | ||
726 | struct vcpu_svm *svm = to_svm(vcpu); | ||
727 | |||
728 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
729 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
730 | vcpu->arch.rip = svm->vmcb->save.rip; | ||
731 | } | ||
732 | |||
733 | static void svm_decache_regs(struct kvm_vcpu *vcpu) | ||
734 | { | ||
735 | struct vcpu_svm *svm = to_svm(vcpu); | ||
736 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
737 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | ||
738 | svm->vmcb->save.rip = vcpu->arch.rip; | ||
739 | } | ||
740 | |||
741 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) | 724 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) |
742 | { | 725 | { |
743 | return to_svm(vcpu)->vmcb->save.rflags; | 726 | return to_svm(vcpu)->vmcb->save.rflags; |
@@ -1040,7 +1023,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1040 | if (npt_enabled) | 1023 | if (npt_enabled) |
1041 | svm_flush_tlb(&svm->vcpu); | 1024 | svm_flush_tlb(&svm->vcpu); |
1042 | 1025 | ||
1043 | if (event_injection) | 1026 | if (!npt_enabled && event_injection) |
1044 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); | 1027 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); |
1045 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); | 1028 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); |
1046 | } | 1029 | } |
@@ -1139,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1139 | 1122 | ||
1140 | static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1123 | static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1141 | { | 1124 | { |
1142 | svm->next_rip = svm->vmcb->save.rip + 1; | 1125 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; |
1143 | skip_emulated_instruction(&svm->vcpu); | 1126 | skip_emulated_instruction(&svm->vcpu); |
1144 | return kvm_emulate_halt(&svm->vcpu); | 1127 | return kvm_emulate_halt(&svm->vcpu); |
1145 | } | 1128 | } |
1146 | 1129 | ||
1147 | static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1130 | static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1148 | { | 1131 | { |
1149 | svm->next_rip = svm->vmcb->save.rip + 3; | 1132 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
1150 | skip_emulated_instruction(&svm->vcpu); | 1133 | skip_emulated_instruction(&svm->vcpu); |
1151 | kvm_emulate_hypercall(&svm->vcpu); | 1134 | kvm_emulate_hypercall(&svm->vcpu); |
1152 | return 1; | 1135 | return 1; |
@@ -1178,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm, | |||
1178 | 1161 | ||
1179 | static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1162 | static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1180 | { | 1163 | { |
1181 | svm->next_rip = svm->vmcb->save.rip + 2; | 1164 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; |
1182 | kvm_emulate_cpuid(&svm->vcpu); | 1165 | kvm_emulate_cpuid(&svm->vcpu); |
1183 | return 1; | 1166 | return 1; |
1184 | } | 1167 | } |
1185 | 1168 | ||
1169 | static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1170 | { | ||
1171 | if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) | ||
1172 | pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); | ||
1173 | return 1; | ||
1174 | } | ||
1175 | |||
1186 | static int emulate_on_interception(struct vcpu_svm *svm, | 1176 | static int emulate_on_interception(struct vcpu_svm *svm, |
1187 | struct kvm_run *kvm_run) | 1177 | struct kvm_run *kvm_run) |
1188 | { | 1178 | { |
@@ -1273,9 +1263,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1273 | KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, | 1263 | KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, |
1274 | (u32)(data >> 32), handler); | 1264 | (u32)(data >> 32), handler); |
1275 | 1265 | ||
1276 | svm->vmcb->save.rax = data & 0xffffffff; | 1266 | svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; |
1277 | svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; | 1267 | svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; |
1278 | svm->next_rip = svm->vmcb->save.rip + 2; | 1268 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; |
1279 | skip_emulated_instruction(&svm->vcpu); | 1269 | skip_emulated_instruction(&svm->vcpu); |
1280 | } | 1270 | } |
1281 | return 1; | 1271 | return 1; |
@@ -1359,13 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
1359 | static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1349 | static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1360 | { | 1350 | { |
1361 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; | 1351 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; |
1362 | u64 data = (svm->vmcb->save.rax & -1u) | 1352 | u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) |
1363 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); | 1353 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); |
1364 | 1354 | ||
1365 | KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), | 1355 | KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), |
1366 | handler); | 1356 | handler); |
1367 | 1357 | ||
1368 | svm->next_rip = svm->vmcb->save.rip + 2; | 1358 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; |
1369 | if (svm_set_msr(&svm->vcpu, ecx, data)) | 1359 | if (svm_set_msr(&svm->vcpu, ecx, data)) |
1370 | kvm_inject_gp(&svm->vcpu, 0); | 1360 | kvm_inject_gp(&svm->vcpu, 0); |
1371 | else | 1361 | else |
@@ -1436,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, | |||
1436 | [SVM_EXIT_CPUID] = cpuid_interception, | 1426 | [SVM_EXIT_CPUID] = cpuid_interception, |
1437 | [SVM_EXIT_INVD] = emulate_on_interception, | 1427 | [SVM_EXIT_INVD] = emulate_on_interception, |
1438 | [SVM_EXIT_HLT] = halt_interception, | 1428 | [SVM_EXIT_HLT] = halt_interception, |
1439 | [SVM_EXIT_INVLPG] = emulate_on_interception, | 1429 | [SVM_EXIT_INVLPG] = invlpg_interception, |
1440 | [SVM_EXIT_INVLPGA] = invalid_op_interception, | 1430 | [SVM_EXIT_INVLPGA] = invalid_op_interception, |
1441 | [SVM_EXIT_IOIO] = io_interception, | 1431 | [SVM_EXIT_IOIO] = io_interception, |
1442 | [SVM_EXIT_MSR] = msr_interception, | 1432 | [SVM_EXIT_MSR] = msr_interception, |
@@ -1538,6 +1528,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | |||
1538 | 1528 | ||
1539 | KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); | 1529 | KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); |
1540 | 1530 | ||
1531 | ++svm->vcpu.stat.irq_injections; | ||
1541 | control = &svm->vmcb->control; | 1532 | control = &svm->vmcb->control; |
1542 | control->int_vector = irq; | 1533 | control->int_vector = irq; |
1543 | control->int_ctl &= ~V_INTR_PRIO_MASK; | 1534 | control->int_ctl &= ~V_INTR_PRIO_MASK; |
@@ -1716,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) | |||
1716 | svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; | 1707 | svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; |
1717 | } | 1708 | } |
1718 | 1709 | ||
1710 | #ifdef CONFIG_X86_64 | ||
1711 | #define R "r" | ||
1712 | #else | ||
1713 | #define R "e" | ||
1714 | #endif | ||
1715 | |||
1719 | static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 1716 | static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
1720 | { | 1717 | { |
1721 | struct vcpu_svm *svm = to_svm(vcpu); | 1718 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -1723,6 +1720,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1723 | u16 gs_selector; | 1720 | u16 gs_selector; |
1724 | u16 ldt_selector; | 1721 | u16 ldt_selector; |
1725 | 1722 | ||
1723 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
1724 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | ||
1725 | svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; | ||
1726 | |||
1726 | pre_svm_run(svm); | 1727 | pre_svm_run(svm); |
1727 | 1728 | ||
1728 | sync_lapic_to_cr8(vcpu); | 1729 | sync_lapic_to_cr8(vcpu); |
@@ -1750,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1750 | local_irq_enable(); | 1751 | local_irq_enable(); |
1751 | 1752 | ||
1752 | asm volatile ( | 1753 | asm volatile ( |
1754 | "push %%"R"bp; \n\t" | ||
1755 | "mov %c[rbx](%[svm]), %%"R"bx \n\t" | ||
1756 | "mov %c[rcx](%[svm]), %%"R"cx \n\t" | ||
1757 | "mov %c[rdx](%[svm]), %%"R"dx \n\t" | ||
1758 | "mov %c[rsi](%[svm]), %%"R"si \n\t" | ||
1759 | "mov %c[rdi](%[svm]), %%"R"di \n\t" | ||
1760 | "mov %c[rbp](%[svm]), %%"R"bp \n\t" | ||
1753 | #ifdef CONFIG_X86_64 | 1761 | #ifdef CONFIG_X86_64 |
1754 | "push %%rbp; \n\t" | ||
1755 | #else | ||
1756 | "push %%ebp; \n\t" | ||
1757 | #endif | ||
1758 | |||
1759 | #ifdef CONFIG_X86_64 | ||
1760 | "mov %c[rbx](%[svm]), %%rbx \n\t" | ||
1761 | "mov %c[rcx](%[svm]), %%rcx \n\t" | ||
1762 | "mov %c[rdx](%[svm]), %%rdx \n\t" | ||
1763 | "mov %c[rsi](%[svm]), %%rsi \n\t" | ||
1764 | "mov %c[rdi](%[svm]), %%rdi \n\t" | ||
1765 | "mov %c[rbp](%[svm]), %%rbp \n\t" | ||
1766 | "mov %c[r8](%[svm]), %%r8 \n\t" | 1762 | "mov %c[r8](%[svm]), %%r8 \n\t" |
1767 | "mov %c[r9](%[svm]), %%r9 \n\t" | 1763 | "mov %c[r9](%[svm]), %%r9 \n\t" |
1768 | "mov %c[r10](%[svm]), %%r10 \n\t" | 1764 | "mov %c[r10](%[svm]), %%r10 \n\t" |
@@ -1771,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1771 | "mov %c[r13](%[svm]), %%r13 \n\t" | 1767 | "mov %c[r13](%[svm]), %%r13 \n\t" |
1772 | "mov %c[r14](%[svm]), %%r14 \n\t" | 1768 | "mov %c[r14](%[svm]), %%r14 \n\t" |
1773 | "mov %c[r15](%[svm]), %%r15 \n\t" | 1769 | "mov %c[r15](%[svm]), %%r15 \n\t" |
1774 | #else | ||
1775 | "mov %c[rbx](%[svm]), %%ebx \n\t" | ||
1776 | "mov %c[rcx](%[svm]), %%ecx \n\t" | ||
1777 | "mov %c[rdx](%[svm]), %%edx \n\t" | ||
1778 | "mov %c[rsi](%[svm]), %%esi \n\t" | ||
1779 | "mov %c[rdi](%[svm]), %%edi \n\t" | ||
1780 | "mov %c[rbp](%[svm]), %%ebp \n\t" | ||
1781 | #endif | 1770 | #endif |
1782 | 1771 | ||
1783 | #ifdef CONFIG_X86_64 | ||
1784 | /* Enter guest mode */ | ||
1785 | "push %%rax \n\t" | ||
1786 | "mov %c[vmcb](%[svm]), %%rax \n\t" | ||
1787 | __ex(SVM_VMLOAD) "\n\t" | ||
1788 | __ex(SVM_VMRUN) "\n\t" | ||
1789 | __ex(SVM_VMSAVE) "\n\t" | ||
1790 | "pop %%rax \n\t" | ||
1791 | #else | ||
1792 | /* Enter guest mode */ | 1772 | /* Enter guest mode */ |
1793 | "push %%eax \n\t" | 1773 | "push %%"R"ax \n\t" |
1794 | "mov %c[vmcb](%[svm]), %%eax \n\t" | 1774 | "mov %c[vmcb](%[svm]), %%"R"ax \n\t" |
1795 | __ex(SVM_VMLOAD) "\n\t" | 1775 | __ex(SVM_VMLOAD) "\n\t" |
1796 | __ex(SVM_VMRUN) "\n\t" | 1776 | __ex(SVM_VMRUN) "\n\t" |
1797 | __ex(SVM_VMSAVE) "\n\t" | 1777 | __ex(SVM_VMSAVE) "\n\t" |
1798 | "pop %%eax \n\t" | 1778 | "pop %%"R"ax \n\t" |
1799 | #endif | ||
1800 | 1779 | ||
1801 | /* Save guest registers, load host registers */ | 1780 | /* Save guest registers, load host registers */ |
1781 | "mov %%"R"bx, %c[rbx](%[svm]) \n\t" | ||
1782 | "mov %%"R"cx, %c[rcx](%[svm]) \n\t" | ||
1783 | "mov %%"R"dx, %c[rdx](%[svm]) \n\t" | ||
1784 | "mov %%"R"si, %c[rsi](%[svm]) \n\t" | ||
1785 | "mov %%"R"di, %c[rdi](%[svm]) \n\t" | ||
1786 | "mov %%"R"bp, %c[rbp](%[svm]) \n\t" | ||
1802 | #ifdef CONFIG_X86_64 | 1787 | #ifdef CONFIG_X86_64 |
1803 | "mov %%rbx, %c[rbx](%[svm]) \n\t" | ||
1804 | "mov %%rcx, %c[rcx](%[svm]) \n\t" | ||
1805 | "mov %%rdx, %c[rdx](%[svm]) \n\t" | ||
1806 | "mov %%rsi, %c[rsi](%[svm]) \n\t" | ||
1807 | "mov %%rdi, %c[rdi](%[svm]) \n\t" | ||
1808 | "mov %%rbp, %c[rbp](%[svm]) \n\t" | ||
1809 | "mov %%r8, %c[r8](%[svm]) \n\t" | 1788 | "mov %%r8, %c[r8](%[svm]) \n\t" |
1810 | "mov %%r9, %c[r9](%[svm]) \n\t" | 1789 | "mov %%r9, %c[r9](%[svm]) \n\t" |
1811 | "mov %%r10, %c[r10](%[svm]) \n\t" | 1790 | "mov %%r10, %c[r10](%[svm]) \n\t" |
@@ -1814,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1814 | "mov %%r13, %c[r13](%[svm]) \n\t" | 1793 | "mov %%r13, %c[r13](%[svm]) \n\t" |
1815 | "mov %%r14, %c[r14](%[svm]) \n\t" | 1794 | "mov %%r14, %c[r14](%[svm]) \n\t" |
1816 | "mov %%r15, %c[r15](%[svm]) \n\t" | 1795 | "mov %%r15, %c[r15](%[svm]) \n\t" |
1817 | |||
1818 | "pop %%rbp; \n\t" | ||
1819 | #else | ||
1820 | "mov %%ebx, %c[rbx](%[svm]) \n\t" | ||
1821 | "mov %%ecx, %c[rcx](%[svm]) \n\t" | ||
1822 | "mov %%edx, %c[rdx](%[svm]) \n\t" | ||
1823 | "mov %%esi, %c[rsi](%[svm]) \n\t" | ||
1824 | "mov %%edi, %c[rdi](%[svm]) \n\t" | ||
1825 | "mov %%ebp, %c[rbp](%[svm]) \n\t" | ||
1826 | |||
1827 | "pop %%ebp; \n\t" | ||
1828 | #endif | 1796 | #endif |
1797 | "pop %%"R"bp" | ||
1829 | : | 1798 | : |
1830 | : [svm]"a"(svm), | 1799 | : [svm]"a"(svm), |
1831 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), | 1800 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), |
@@ -1846,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1846 | [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) | 1815 | [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) |
1847 | #endif | 1816 | #endif |
1848 | : "cc", "memory" | 1817 | : "cc", "memory" |
1818 | , R"bx", R"cx", R"dx", R"si", R"di" | ||
1849 | #ifdef CONFIG_X86_64 | 1819 | #ifdef CONFIG_X86_64 |
1850 | , "rbx", "rcx", "rdx", "rsi", "rdi" | ||
1851 | , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" | 1820 | , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" |
1852 | #else | ||
1853 | , "ebx", "ecx", "edx" , "esi", "edi" | ||
1854 | #endif | 1821 | #endif |
1855 | ); | 1822 | ); |
1856 | 1823 | ||
@@ -1858,6 +1825,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1858 | load_db_regs(svm->host_db_regs); | 1825 | load_db_regs(svm->host_db_regs); |
1859 | 1826 | ||
1860 | vcpu->arch.cr2 = svm->vmcb->save.cr2; | 1827 | vcpu->arch.cr2 = svm->vmcb->save.cr2; |
1828 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
1829 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
1830 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; | ||
1861 | 1831 | ||
1862 | write_dr6(svm->host_dr6); | 1832 | write_dr6(svm->host_dr6); |
1863 | write_dr7(svm->host_dr7); | 1833 | write_dr7(svm->host_dr7); |
@@ -1879,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1879 | svm->next_rip = 0; | 1849 | svm->next_rip = 0; |
1880 | } | 1850 | } |
1881 | 1851 | ||
1852 | #undef R | ||
1853 | |||
1882 | static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) | 1854 | static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) |
1883 | { | 1855 | { |
1884 | struct vcpu_svm *svm = to_svm(vcpu); | 1856 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -1977,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
1977 | .set_gdt = svm_set_gdt, | 1949 | .set_gdt = svm_set_gdt, |
1978 | .get_dr = svm_get_dr, | 1950 | .get_dr = svm_get_dr, |
1979 | .set_dr = svm_set_dr, | 1951 | .set_dr = svm_set_dr, |
1980 | .cache_regs = svm_cache_regs, | ||
1981 | .decache_regs = svm_decache_regs, | ||
1982 | .get_rflags = svm_get_rflags, | 1952 | .get_rflags = svm_get_rflags, |
1983 | .set_rflags = svm_set_rflags, | 1953 | .set_rflags = svm_set_rflags, |
1984 | 1954 | ||
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7041cc52b562..2643b430d83a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -26,6 +26,8 @@ | |||
26 | #include <linux/highmem.h> | 26 | #include <linux/highmem.h> |
27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
28 | #include <linux/moduleparam.h> | 28 | #include <linux/moduleparam.h> |
29 | #include "kvm_cache_regs.h" | ||
30 | #include "x86.h" | ||
29 | 31 | ||
30 | #include <asm/io.h> | 32 | #include <asm/io.h> |
31 | #include <asm/desc.h> | 33 | #include <asm/desc.h> |
@@ -47,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0); | |||
47 | static int enable_ept = 1; | 49 | static int enable_ept = 1; |
48 | module_param(enable_ept, bool, 0); | 50 | module_param(enable_ept, bool, 0); |
49 | 51 | ||
52 | static int emulate_invalid_guest_state = 0; | ||
53 | module_param(emulate_invalid_guest_state, bool, 0); | ||
54 | |||
50 | struct vmcs { | 55 | struct vmcs { |
51 | u32 revision_id; | 56 | u32 revision_id; |
52 | u32 abort; | 57 | u32 abort; |
@@ -56,6 +61,7 @@ struct vmcs { | |||
56 | struct vcpu_vmx { | 61 | struct vcpu_vmx { |
57 | struct kvm_vcpu vcpu; | 62 | struct kvm_vcpu vcpu; |
58 | struct list_head local_vcpus_link; | 63 | struct list_head local_vcpus_link; |
64 | unsigned long host_rsp; | ||
59 | int launched; | 65 | int launched; |
60 | u8 fail; | 66 | u8 fail; |
61 | u32 idt_vectoring_info; | 67 | u32 idt_vectoring_info; |
@@ -83,6 +89,7 @@ struct vcpu_vmx { | |||
83 | } irq; | 89 | } irq; |
84 | } rmode; | 90 | } rmode; |
85 | int vpid; | 91 | int vpid; |
92 | bool emulation_required; | ||
86 | }; | 93 | }; |
87 | 94 | ||
88 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | 95 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) |
@@ -468,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) | |||
468 | if (!vcpu->fpu_active) | 475 | if (!vcpu->fpu_active) |
469 | eb |= 1u << NM_VECTOR; | 476 | eb |= 1u << NM_VECTOR; |
470 | if (vcpu->guest_debug.enabled) | 477 | if (vcpu->guest_debug.enabled) |
471 | eb |= 1u << 1; | 478 | eb |= 1u << DB_VECTOR; |
472 | if (vcpu->arch.rmode.active) | 479 | if (vcpu->arch.rmode.active) |
473 | eb = ~0; | 480 | eb = ~0; |
474 | if (vm_need_ept()) | 481 | if (vm_need_ept()) |
@@ -715,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
715 | unsigned long rip; | 722 | unsigned long rip; |
716 | u32 interruptibility; | 723 | u32 interruptibility; |
717 | 724 | ||
718 | rip = vmcs_readl(GUEST_RIP); | 725 | rip = kvm_rip_read(vcpu); |
719 | rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | 726 | rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); |
720 | vmcs_writel(GUEST_RIP, rip); | 727 | kvm_rip_write(vcpu, rip); |
721 | 728 | ||
722 | /* | 729 | /* |
723 | * We emulated an instruction, so temporary interrupt blocking | 730 | * We emulated an instruction, so temporary interrupt blocking |
@@ -733,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
733 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | 740 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, |
734 | bool has_error_code, u32 error_code) | 741 | bool has_error_code, u32 error_code) |
735 | { | 742 | { |
743 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
744 | |||
745 | if (has_error_code) | ||
746 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
747 | |||
748 | if (vcpu->arch.rmode.active) { | ||
749 | vmx->rmode.irq.pending = true; | ||
750 | vmx->rmode.irq.vector = nr; | ||
751 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | ||
752 | if (nr == BP_VECTOR) | ||
753 | vmx->rmode.irq.rip++; | ||
754 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
755 | nr | INTR_TYPE_SOFT_INTR | ||
756 | | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | ||
757 | | INTR_INFO_VALID_MASK); | ||
758 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | ||
759 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); | ||
760 | return; | ||
761 | } | ||
762 | |||
736 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 763 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
737 | nr | INTR_TYPE_EXCEPTION | 764 | nr | INTR_TYPE_EXCEPTION |
738 | | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | 765 | | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) |
739 | | INTR_INFO_VALID_MASK); | 766 | | INTR_INFO_VALID_MASK); |
740 | if (has_error_code) | ||
741 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
742 | } | 767 | } |
743 | 768 | ||
744 | static bool vmx_exception_injected(struct kvm_vcpu *vcpu) | 769 | static bool vmx_exception_injected(struct kvm_vcpu *vcpu) |
745 | { | 770 | { |
746 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 771 | return false; |
747 | |||
748 | return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); | ||
749 | } | 772 | } |
750 | 773 | ||
751 | /* | 774 | /* |
@@ -947,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
947 | return ret; | 970 | return ret; |
948 | } | 971 | } |
949 | 972 | ||
950 | /* | 973 | static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) |
951 | * Sync the rsp and rip registers into the vcpu structure. This allows | ||
952 | * registers to be accessed by indexing vcpu->arch.regs. | ||
953 | */ | ||
954 | static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) | ||
955 | { | ||
956 | vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); | ||
957 | vcpu->arch.rip = vmcs_readl(GUEST_RIP); | ||
958 | } | ||
959 | |||
960 | /* | ||
961 | * Syncs rsp and rip back into the vmcs. Should be called after possible | ||
962 | * modification. | ||
963 | */ | ||
964 | static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) | ||
965 | { | 974 | { |
966 | vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); | 975 | __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); |
967 | vmcs_writel(GUEST_RIP, vcpu->arch.rip); | 976 | switch (reg) { |
977 | case VCPU_REGS_RSP: | ||
978 | vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); | ||
979 | break; | ||
980 | case VCPU_REGS_RIP: | ||
981 | vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); | ||
982 | break; | ||
983 | default: | ||
984 | break; | ||
985 | } | ||
968 | } | 986 | } |
969 | 987 | ||
970 | static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | 988 | static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) |
@@ -1007,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | |||
1007 | 1025 | ||
1008 | static int vmx_get_irq(struct kvm_vcpu *vcpu) | 1026 | static int vmx_get_irq(struct kvm_vcpu *vcpu) |
1009 | { | 1027 | { |
1010 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1028 | if (!vcpu->arch.interrupt.pending) |
1011 | u32 idtv_info_field; | 1029 | return -1; |
1012 | 1030 | return vcpu->arch.interrupt.nr; | |
1013 | idtv_info_field = vmx->idt_vectoring_info; | ||
1014 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | ||
1015 | if (is_external_interrupt(idtv_info_field)) | ||
1016 | return idtv_info_field & VECTORING_INFO_VECTOR_MASK; | ||
1017 | else | ||
1018 | printk(KERN_DEBUG "pending exception: not handled yet\n"); | ||
1019 | } | ||
1020 | return -1; | ||
1021 | } | 1031 | } |
1022 | 1032 | ||
1023 | static __init int cpu_has_kvm_support(void) | 1033 | static __init int cpu_has_kvm_support(void) |
@@ -1031,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void) | |||
1031 | u64 msr; | 1041 | u64 msr; |
1032 | 1042 | ||
1033 | rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); | 1043 | rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); |
1034 | return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | | 1044 | return (msr & (FEATURE_CONTROL_LOCKED | |
1035 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | 1045 | FEATURE_CONTROL_VMXON_ENABLED)) |
1036 | == MSR_IA32_FEATURE_CONTROL_LOCKED; | 1046 | == FEATURE_CONTROL_LOCKED; |
1037 | /* locked but not enabled */ | 1047 | /* locked but not enabled */ |
1038 | } | 1048 | } |
1039 | 1049 | ||
@@ -1045,14 +1055,14 @@ static void hardware_enable(void *garbage) | |||
1045 | 1055 | ||
1046 | INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); | 1056 | INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); |
1047 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); | 1057 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); |
1048 | if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | | 1058 | if ((old & (FEATURE_CONTROL_LOCKED | |
1049 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | 1059 | FEATURE_CONTROL_VMXON_ENABLED)) |
1050 | != (MSR_IA32_FEATURE_CONTROL_LOCKED | | 1060 | != (FEATURE_CONTROL_LOCKED | |
1051 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | 1061 | FEATURE_CONTROL_VMXON_ENABLED)) |
1052 | /* enable and lock */ | 1062 | /* enable and lock */ |
1053 | wrmsrl(MSR_IA32_FEATURE_CONTROL, old | | 1063 | wrmsrl(MSR_IA32_FEATURE_CONTROL, old | |
1054 | MSR_IA32_FEATURE_CONTROL_LOCKED | | 1064 | FEATURE_CONTROL_LOCKED | |
1055 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); | 1065 | FEATURE_CONTROL_VMXON_ENABLED); |
1056 | write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ | 1066 | write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ |
1057 | asm volatile (ASM_VMX_VMXON_RAX | 1067 | asm volatile (ASM_VMX_VMXON_RAX |
1058 | : : "a"(&phys_addr), "m"(phys_addr) | 1068 | : : "a"(&phys_addr), "m"(phys_addr) |
@@ -1120,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1120 | CPU_BASED_CR3_STORE_EXITING | | 1130 | CPU_BASED_CR3_STORE_EXITING | |
1121 | CPU_BASED_USE_IO_BITMAPS | | 1131 | CPU_BASED_USE_IO_BITMAPS | |
1122 | CPU_BASED_MOV_DR_EXITING | | 1132 | CPU_BASED_MOV_DR_EXITING | |
1123 | CPU_BASED_USE_TSC_OFFSETING; | 1133 | CPU_BASED_USE_TSC_OFFSETING | |
1134 | CPU_BASED_INVLPG_EXITING; | ||
1124 | opt = CPU_BASED_TPR_SHADOW | | 1135 | opt = CPU_BASED_TPR_SHADOW | |
1125 | CPU_BASED_USE_MSR_BITMAPS | | 1136 | CPU_BASED_USE_MSR_BITMAPS | |
1126 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | 1137 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; |
@@ -1149,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1149 | _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; | 1160 | _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; |
1150 | #endif | 1161 | #endif |
1151 | if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { | 1162 | if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { |
1152 | /* CR3 accesses don't need to cause VM Exits when EPT enabled */ | 1163 | /* CR3 accesses and invlpg don't need to cause VM Exits when EPT |
1164 | enabled */ | ||
1153 | min &= ~(CPU_BASED_CR3_LOAD_EXITING | | 1165 | min &= ~(CPU_BASED_CR3_LOAD_EXITING | |
1154 | CPU_BASED_CR3_STORE_EXITING); | 1166 | CPU_BASED_CR3_STORE_EXITING | |
1167 | CPU_BASED_INVLPG_EXITING); | ||
1155 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, | 1168 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, |
1156 | &_cpu_based_exec_control) < 0) | 1169 | &_cpu_based_exec_control) < 0) |
1157 | return -EIO; | 1170 | return -EIO; |
@@ -1288,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) | |||
1288 | static void enter_pmode(struct kvm_vcpu *vcpu) | 1301 | static void enter_pmode(struct kvm_vcpu *vcpu) |
1289 | { | 1302 | { |
1290 | unsigned long flags; | 1303 | unsigned long flags; |
1304 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
1291 | 1305 | ||
1306 | vmx->emulation_required = 1; | ||
1292 | vcpu->arch.rmode.active = 0; | 1307 | vcpu->arch.rmode.active = 0; |
1293 | 1308 | ||
1294 | vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); | 1309 | vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); |
@@ -1305,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
1305 | 1320 | ||
1306 | update_exception_bitmap(vcpu); | 1321 | update_exception_bitmap(vcpu); |
1307 | 1322 | ||
1323 | if (emulate_invalid_guest_state) | ||
1324 | return; | ||
1325 | |||
1308 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); | 1326 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); |
1309 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); | 1327 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); |
1310 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); | 1328 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); |
@@ -1345,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) | |||
1345 | static void enter_rmode(struct kvm_vcpu *vcpu) | 1363 | static void enter_rmode(struct kvm_vcpu *vcpu) |
1346 | { | 1364 | { |
1347 | unsigned long flags; | 1365 | unsigned long flags; |
1366 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
1348 | 1367 | ||
1368 | vmx->emulation_required = 1; | ||
1349 | vcpu->arch.rmode.active = 1; | 1369 | vcpu->arch.rmode.active = 1; |
1350 | 1370 | ||
1351 | vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); | 1371 | vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); |
@@ -1367,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
1367 | vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); | 1387 | vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); |
1368 | update_exception_bitmap(vcpu); | 1388 | update_exception_bitmap(vcpu); |
1369 | 1389 | ||
1390 | if (emulate_invalid_guest_state) | ||
1391 | goto continue_rmode; | ||
1392 | |||
1370 | vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); | 1393 | vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); |
1371 | vmcs_write32(GUEST_SS_LIMIT, 0xffff); | 1394 | vmcs_write32(GUEST_SS_LIMIT, 0xffff); |
1372 | vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); | 1395 | vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); |
@@ -1382,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
1382 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); | 1405 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); |
1383 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); | 1406 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); |
1384 | 1407 | ||
1408 | continue_rmode: | ||
1385 | kvm_mmu_reset_context(vcpu); | 1409 | kvm_mmu_reset_context(vcpu); |
1386 | init_rmode(vcpu->kvm); | 1410 | init_rmode(vcpu->kvm); |
1387 | } | 1411 | } |
@@ -1715,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | |||
1715 | vmcs_writel(GUEST_GDTR_BASE, dt->base); | 1739 | vmcs_writel(GUEST_GDTR_BASE, dt->base); |
1716 | } | 1740 | } |
1717 | 1741 | ||
1742 | static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) | ||
1743 | { | ||
1744 | struct kvm_segment var; | ||
1745 | u32 ar; | ||
1746 | |||
1747 | vmx_get_segment(vcpu, &var, seg); | ||
1748 | ar = vmx_segment_access_rights(&var); | ||
1749 | |||
1750 | if (var.base != (var.selector << 4)) | ||
1751 | return false; | ||
1752 | if (var.limit != 0xffff) | ||
1753 | return false; | ||
1754 | if (ar != 0xf3) | ||
1755 | return false; | ||
1756 | |||
1757 | return true; | ||
1758 | } | ||
1759 | |||
1760 | static bool code_segment_valid(struct kvm_vcpu *vcpu) | ||
1761 | { | ||
1762 | struct kvm_segment cs; | ||
1763 | unsigned int cs_rpl; | ||
1764 | |||
1765 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
1766 | cs_rpl = cs.selector & SELECTOR_RPL_MASK; | ||
1767 | |||
1768 | if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) | ||
1769 | return false; | ||
1770 | if (!cs.s) | ||
1771 | return false; | ||
1772 | if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) { | ||
1773 | if (cs.dpl > cs_rpl) | ||
1774 | return false; | ||
1775 | } else if (cs.type & AR_TYPE_CODE_MASK) { | ||
1776 | if (cs.dpl != cs_rpl) | ||
1777 | return false; | ||
1778 | } | ||
1779 | if (!cs.present) | ||
1780 | return false; | ||
1781 | |||
1782 | /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ | ||
1783 | return true; | ||
1784 | } | ||
1785 | |||
1786 | static bool stack_segment_valid(struct kvm_vcpu *vcpu) | ||
1787 | { | ||
1788 | struct kvm_segment ss; | ||
1789 | unsigned int ss_rpl; | ||
1790 | |||
1791 | vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); | ||
1792 | ss_rpl = ss.selector & SELECTOR_RPL_MASK; | ||
1793 | |||
1794 | if ((ss.type != 3) || (ss.type != 7)) | ||
1795 | return false; | ||
1796 | if (!ss.s) | ||
1797 | return false; | ||
1798 | if (ss.dpl != ss_rpl) /* DPL != RPL */ | ||
1799 | return false; | ||
1800 | if (!ss.present) | ||
1801 | return false; | ||
1802 | |||
1803 | return true; | ||
1804 | } | ||
1805 | |||
1806 | static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) | ||
1807 | { | ||
1808 | struct kvm_segment var; | ||
1809 | unsigned int rpl; | ||
1810 | |||
1811 | vmx_get_segment(vcpu, &var, seg); | ||
1812 | rpl = var.selector & SELECTOR_RPL_MASK; | ||
1813 | |||
1814 | if (!var.s) | ||
1815 | return false; | ||
1816 | if (!var.present) | ||
1817 | return false; | ||
1818 | if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { | ||
1819 | if (var.dpl < rpl) /* DPL < RPL */ | ||
1820 | return false; | ||
1821 | } | ||
1822 | |||
1823 | /* TODO: Add other members to kvm_segment_field to allow checking for other access | ||
1824 | * rights flags | ||
1825 | */ | ||
1826 | return true; | ||
1827 | } | ||
1828 | |||
1829 | static bool tr_valid(struct kvm_vcpu *vcpu) | ||
1830 | { | ||
1831 | struct kvm_segment tr; | ||
1832 | |||
1833 | vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); | ||
1834 | |||
1835 | if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ | ||
1836 | return false; | ||
1837 | if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */ | ||
1838 | return false; | ||
1839 | if (!tr.present) | ||
1840 | return false; | ||
1841 | |||
1842 | return true; | ||
1843 | } | ||
1844 | |||
1845 | static bool ldtr_valid(struct kvm_vcpu *vcpu) | ||
1846 | { | ||
1847 | struct kvm_segment ldtr; | ||
1848 | |||
1849 | vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); | ||
1850 | |||
1851 | if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ | ||
1852 | return false; | ||
1853 | if (ldtr.type != 2) | ||
1854 | return false; | ||
1855 | if (!ldtr.present) | ||
1856 | return false; | ||
1857 | |||
1858 | return true; | ||
1859 | } | ||
1860 | |||
1861 | static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) | ||
1862 | { | ||
1863 | struct kvm_segment cs, ss; | ||
1864 | |||
1865 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
1866 | vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); | ||
1867 | |||
1868 | return ((cs.selector & SELECTOR_RPL_MASK) == | ||
1869 | (ss.selector & SELECTOR_RPL_MASK)); | ||
1870 | } | ||
1871 | |||
1872 | /* | ||
1873 | * Check if guest state is valid. Returns true if valid, false if | ||
1874 | * not. | ||
1875 | * We assume that registers are always usable | ||
1876 | */ | ||
1877 | static bool guest_state_valid(struct kvm_vcpu *vcpu) | ||
1878 | { | ||
1879 | /* real mode guest state checks */ | ||
1880 | if (!(vcpu->arch.cr0 & X86_CR0_PE)) { | ||
1881 | if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) | ||
1882 | return false; | ||
1883 | if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) | ||
1884 | return false; | ||
1885 | if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) | ||
1886 | return false; | ||
1887 | if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) | ||
1888 | return false; | ||
1889 | if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) | ||
1890 | return false; | ||
1891 | if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) | ||
1892 | return false; | ||
1893 | } else { | ||
1894 | /* protected mode guest state checks */ | ||
1895 | if (!cs_ss_rpl_check(vcpu)) | ||
1896 | return false; | ||
1897 | if (!code_segment_valid(vcpu)) | ||
1898 | return false; | ||
1899 | if (!stack_segment_valid(vcpu)) | ||
1900 | return false; | ||
1901 | if (!data_segment_valid(vcpu, VCPU_SREG_DS)) | ||
1902 | return false; | ||
1903 | if (!data_segment_valid(vcpu, VCPU_SREG_ES)) | ||
1904 | return false; | ||
1905 | if (!data_segment_valid(vcpu, VCPU_SREG_FS)) | ||
1906 | return false; | ||
1907 | if (!data_segment_valid(vcpu, VCPU_SREG_GS)) | ||
1908 | return false; | ||
1909 | if (!tr_valid(vcpu)) | ||
1910 | return false; | ||
1911 | if (!ldtr_valid(vcpu)) | ||
1912 | return false; | ||
1913 | } | ||
1914 | /* TODO: | ||
1915 | * - Add checks on RIP | ||
1916 | * - Add checks on RFLAGS | ||
1917 | */ | ||
1918 | |||
1919 | return true; | ||
1920 | } | ||
1921 | |||
1718 | static int init_rmode_tss(struct kvm *kvm) | 1922 | static int init_rmode_tss(struct kvm *kvm) |
1719 | { | 1923 | { |
1720 | gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; | 1924 | gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; |
@@ -1726,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm) | |||
1726 | if (r < 0) | 1930 | if (r < 0) |
1727 | goto out; | 1931 | goto out; |
1728 | data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; | 1932 | data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; |
1729 | r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); | 1933 | r = kvm_write_guest_page(kvm, fn++, &data, |
1934 | TSS_IOPB_BASE_OFFSET, sizeof(u16)); | ||
1730 | if (r < 0) | 1935 | if (r < 0) |
1731 | goto out; | 1936 | goto out; |
1732 | r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); | 1937 | r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); |
@@ -1789,7 +1994,7 @@ static void seg_setup(int seg) | |||
1789 | vmcs_write16(sf->selector, 0); | 1994 | vmcs_write16(sf->selector, 0); |
1790 | vmcs_writel(sf->base, 0); | 1995 | vmcs_writel(sf->base, 0); |
1791 | vmcs_write32(sf->limit, 0xffff); | 1996 | vmcs_write32(sf->limit, 0xffff); |
1792 | vmcs_write32(sf->ar_bytes, 0x93); | 1997 | vmcs_write32(sf->ar_bytes, 0xf3); |
1793 | } | 1998 | } |
1794 | 1999 | ||
1795 | static int alloc_apic_access_page(struct kvm *kvm) | 2000 | static int alloc_apic_access_page(struct kvm *kvm) |
@@ -1808,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm) | |||
1808 | if (r) | 2013 | if (r) |
1809 | goto out; | 2014 | goto out; |
1810 | 2015 | ||
1811 | down_read(&current->mm->mmap_sem); | ||
1812 | kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); | 2016 | kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); |
1813 | up_read(&current->mm->mmap_sem); | ||
1814 | out: | 2017 | out: |
1815 | up_write(&kvm->slots_lock); | 2018 | up_write(&kvm->slots_lock); |
1816 | return r; | 2019 | return r; |
@@ -1832,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm) | |||
1832 | if (r) | 2035 | if (r) |
1833 | goto out; | 2036 | goto out; |
1834 | 2037 | ||
1835 | down_read(&current->mm->mmap_sem); | ||
1836 | kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, | 2038 | kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, |
1837 | VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); | 2039 | VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); |
1838 | up_read(&current->mm->mmap_sem); | ||
1839 | out: | 2040 | out: |
1840 | up_write(&kvm->slots_lock); | 2041 | up_write(&kvm->slots_lock); |
1841 | return r; | 2042 | return r; |
@@ -1917,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
1917 | } | 2118 | } |
1918 | if (!vm_need_ept()) | 2119 | if (!vm_need_ept()) |
1919 | exec_control |= CPU_BASED_CR3_STORE_EXITING | | 2120 | exec_control |= CPU_BASED_CR3_STORE_EXITING | |
1920 | CPU_BASED_CR3_LOAD_EXITING; | 2121 | CPU_BASED_CR3_LOAD_EXITING | |
2122 | CPU_BASED_INVLPG_EXITING; | ||
1921 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | 2123 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); |
1922 | 2124 | ||
1923 | if (cpu_has_secondary_exec_ctrls()) { | 2125 | if (cpu_has_secondary_exec_ctrls()) { |
@@ -2019,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2019 | u64 msr; | 2221 | u64 msr; |
2020 | int ret; | 2222 | int ret; |
2021 | 2223 | ||
2224 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); | ||
2022 | down_read(&vcpu->kvm->slots_lock); | 2225 | down_read(&vcpu->kvm->slots_lock); |
2023 | if (!init_rmode(vmx->vcpu.kvm)) { | 2226 | if (!init_rmode(vmx->vcpu.kvm)) { |
2024 | ret = -ENOMEM; | 2227 | ret = -ENOMEM; |
@@ -2036,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2036 | 2239 | ||
2037 | fx_init(&vmx->vcpu); | 2240 | fx_init(&vmx->vcpu); |
2038 | 2241 | ||
2242 | seg_setup(VCPU_SREG_CS); | ||
2039 | /* | 2243 | /* |
2040 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode | 2244 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode |
2041 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. | 2245 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. |
@@ -2047,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2047 | vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); | 2251 | vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); |
2048 | vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); | 2252 | vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); |
2049 | } | 2253 | } |
2050 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
2051 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | ||
2052 | 2254 | ||
2053 | seg_setup(VCPU_SREG_DS); | 2255 | seg_setup(VCPU_SREG_DS); |
2054 | seg_setup(VCPU_SREG_ES); | 2256 | seg_setup(VCPU_SREG_ES); |
@@ -2072,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2072 | 2274 | ||
2073 | vmcs_writel(GUEST_RFLAGS, 0x02); | 2275 | vmcs_writel(GUEST_RFLAGS, 0x02); |
2074 | if (vmx->vcpu.vcpu_id == 0) | 2276 | if (vmx->vcpu.vcpu_id == 0) |
2075 | vmcs_writel(GUEST_RIP, 0xfff0); | 2277 | kvm_rip_write(vcpu, 0xfff0); |
2076 | else | 2278 | else |
2077 | vmcs_writel(GUEST_RIP, 0); | 2279 | kvm_rip_write(vcpu, 0); |
2078 | vmcs_writel(GUEST_RSP, 0); | 2280 | kvm_register_write(vcpu, VCPU_REGS_RSP, 0); |
2079 | 2281 | ||
2080 | /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ | 2282 | /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ |
2081 | vmcs_writel(GUEST_DR7, 0x400); | 2283 | vmcs_writel(GUEST_DR7, 0x400); |
@@ -2125,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2125 | 2327 | ||
2126 | ret = 0; | 2328 | ret = 0; |
2127 | 2329 | ||
2330 | /* HACK: Don't enable emulation on guest boot/reset */ | ||
2331 | vmx->emulation_required = 0; | ||
2332 | |||
2128 | out: | 2333 | out: |
2129 | up_read(&vcpu->kvm->slots_lock); | 2334 | up_read(&vcpu->kvm->slots_lock); |
2130 | return ret; | 2335 | return ret; |
@@ -2136,14 +2341,15 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) | |||
2136 | 2341 | ||
2137 | KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); | 2342 | KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); |
2138 | 2343 | ||
2344 | ++vcpu->stat.irq_injections; | ||
2139 | if (vcpu->arch.rmode.active) { | 2345 | if (vcpu->arch.rmode.active) { |
2140 | vmx->rmode.irq.pending = true; | 2346 | vmx->rmode.irq.pending = true; |
2141 | vmx->rmode.irq.vector = irq; | 2347 | vmx->rmode.irq.vector = irq; |
2142 | vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); | 2348 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); |
2143 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2349 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
2144 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); | 2350 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); |
2145 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | 2351 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); |
2146 | vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); | 2352 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); |
2147 | return; | 2353 | return; |
2148 | } | 2354 | } |
2149 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2355 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
@@ -2154,7 +2360,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | |||
2154 | { | 2360 | { |
2155 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2361 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
2156 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); | 2362 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); |
2157 | vcpu->arch.nmi_pending = 0; | ||
2158 | } | 2363 | } |
2159 | 2364 | ||
2160 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | 2365 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) |
@@ -2166,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | |||
2166 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); | 2371 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); |
2167 | if (!vcpu->arch.irq_pending[word_index]) | 2372 | if (!vcpu->arch.irq_pending[word_index]) |
2168 | clear_bit(word_index, &vcpu->arch.irq_summary); | 2373 | clear_bit(word_index, &vcpu->arch.irq_summary); |
2169 | vmx_inject_irq(vcpu, irq); | 2374 | kvm_queue_interrupt(vcpu, irq); |
2170 | } | 2375 | } |
2171 | 2376 | ||
2172 | 2377 | ||
@@ -2180,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, | |||
2180 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); | 2385 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); |
2181 | 2386 | ||
2182 | if (vcpu->arch.interrupt_window_open && | 2387 | if (vcpu->arch.interrupt_window_open && |
2183 | vcpu->arch.irq_summary && | 2388 | vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) |
2184 | !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) | ||
2185 | /* | ||
2186 | * If interrupts enabled, and not blocked by sti or mov ss. Good. | ||
2187 | */ | ||
2188 | kvm_do_inject_irq(vcpu); | 2389 | kvm_do_inject_irq(vcpu); |
2189 | 2390 | ||
2391 | if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending) | ||
2392 | vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); | ||
2393 | |||
2190 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 2394 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
2191 | if (!vcpu->arch.interrupt_window_open && | 2395 | if (!vcpu->arch.interrupt_window_open && |
2192 | (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) | 2396 | (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) |
@@ -2237,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) | |||
2237 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, | 2441 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, |
2238 | int vec, u32 err_code) | 2442 | int vec, u32 err_code) |
2239 | { | 2443 | { |
2240 | if (!vcpu->arch.rmode.active) | ||
2241 | return 0; | ||
2242 | |||
2243 | /* | 2444 | /* |
2244 | * Instruction with address size override prefix opcode 0x67 | 2445 | * Instruction with address size override prefix opcode 0x67 |
2245 | * Cause the #SS fault with 0 error code in VM86 mode. | 2446 | * Cause the #SS fault with 0 error code in VM86 mode. |
@@ -2247,6 +2448,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, | |||
2247 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) | 2448 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) |
2248 | if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) | 2449 | if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) |
2249 | return 1; | 2450 | return 1; |
2451 | /* | ||
2452 | * Forward all other exceptions that are valid in real mode. | ||
2453 | * FIXME: Breaks guest debugging in real mode, needs to be fixed with | ||
2454 | * the required debugging infrastructure rework. | ||
2455 | */ | ||
2456 | switch (vec) { | ||
2457 | case DE_VECTOR: | ||
2458 | case DB_VECTOR: | ||
2459 | case BP_VECTOR: | ||
2460 | case OF_VECTOR: | ||
2461 | case BR_VECTOR: | ||
2462 | case UD_VECTOR: | ||
2463 | case DF_VECTOR: | ||
2464 | case SS_VECTOR: | ||
2465 | case GP_VECTOR: | ||
2466 | case MF_VECTOR: | ||
2467 | kvm_queue_exception(vcpu, vec); | ||
2468 | return 1; | ||
2469 | } | ||
2250 | return 0; | 2470 | return 0; |
2251 | } | 2471 | } |
2252 | 2472 | ||
@@ -2288,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2288 | } | 2508 | } |
2289 | 2509 | ||
2290 | error_code = 0; | 2510 | error_code = 0; |
2291 | rip = vmcs_readl(GUEST_RIP); | 2511 | rip = kvm_rip_read(vcpu); |
2292 | if (intr_info & INTR_INFO_DELIVER_CODE_MASK) | 2512 | if (intr_info & INTR_INFO_DELIVER_CODE_MASK) |
2293 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | 2513 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); |
2294 | if (is_page_fault(intr_info)) { | 2514 | if (is_page_fault(intr_info)) { |
@@ -2298,7 +2518,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2298 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | 2518 | cr2 = vmcs_readl(EXIT_QUALIFICATION); |
2299 | KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, | 2519 | KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, |
2300 | (u32)((u64)cr2 >> 32), handler); | 2520 | (u32)((u64)cr2 >> 32), handler); |
2301 | if (vect_info & VECTORING_INFO_VALID_MASK) | 2521 | if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) |
2302 | kvm_mmu_unprotect_page_virt(vcpu, cr2); | 2522 | kvm_mmu_unprotect_page_virt(vcpu, cr2); |
2303 | return kvm_mmu_page_fault(vcpu, cr2, error_code); | 2523 | return kvm_mmu_page_fault(vcpu, cr2, error_code); |
2304 | } | 2524 | } |
@@ -2386,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2386 | reg = (exit_qualification >> 8) & 15; | 2606 | reg = (exit_qualification >> 8) & 15; |
2387 | switch ((exit_qualification >> 4) & 3) { | 2607 | switch ((exit_qualification >> 4) & 3) { |
2388 | case 0: /* mov to cr */ | 2608 | case 0: /* mov to cr */ |
2389 | KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], | 2609 | KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, |
2390 | (u32)((u64)vcpu->arch.regs[reg] >> 32), handler); | 2610 | (u32)kvm_register_read(vcpu, reg), |
2611 | (u32)((u64)kvm_register_read(vcpu, reg) >> 32), | ||
2612 | handler); | ||
2391 | switch (cr) { | 2613 | switch (cr) { |
2392 | case 0: | 2614 | case 0: |
2393 | vcpu_load_rsp_rip(vcpu); | 2615 | kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); |
2394 | kvm_set_cr0(vcpu, vcpu->arch.regs[reg]); | ||
2395 | skip_emulated_instruction(vcpu); | 2616 | skip_emulated_instruction(vcpu); |
2396 | return 1; | 2617 | return 1; |
2397 | case 3: | 2618 | case 3: |
2398 | vcpu_load_rsp_rip(vcpu); | 2619 | kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); |
2399 | kvm_set_cr3(vcpu, vcpu->arch.regs[reg]); | ||
2400 | skip_emulated_instruction(vcpu); | 2620 | skip_emulated_instruction(vcpu); |
2401 | return 1; | 2621 | return 1; |
2402 | case 4: | 2622 | case 4: |
2403 | vcpu_load_rsp_rip(vcpu); | 2623 | kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); |
2404 | kvm_set_cr4(vcpu, vcpu->arch.regs[reg]); | ||
2405 | skip_emulated_instruction(vcpu); | 2624 | skip_emulated_instruction(vcpu); |
2406 | return 1; | 2625 | return 1; |
2407 | case 8: | 2626 | case 8: |
2408 | vcpu_load_rsp_rip(vcpu); | 2627 | kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); |
2409 | kvm_set_cr8(vcpu, vcpu->arch.regs[reg]); | ||
2410 | skip_emulated_instruction(vcpu); | 2628 | skip_emulated_instruction(vcpu); |
2411 | if (irqchip_in_kernel(vcpu->kvm)) | 2629 | if (irqchip_in_kernel(vcpu->kvm)) |
2412 | return 1; | 2630 | return 1; |
@@ -2415,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2415 | }; | 2633 | }; |
2416 | break; | 2634 | break; |
2417 | case 2: /* clts */ | 2635 | case 2: /* clts */ |
2418 | vcpu_load_rsp_rip(vcpu); | ||
2419 | vmx_fpu_deactivate(vcpu); | 2636 | vmx_fpu_deactivate(vcpu); |
2420 | vcpu->arch.cr0 &= ~X86_CR0_TS; | 2637 | vcpu->arch.cr0 &= ~X86_CR0_TS; |
2421 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); | 2638 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); |
@@ -2426,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2426 | case 1: /*mov from cr*/ | 2643 | case 1: /*mov from cr*/ |
2427 | switch (cr) { | 2644 | switch (cr) { |
2428 | case 3: | 2645 | case 3: |
2429 | vcpu_load_rsp_rip(vcpu); | 2646 | kvm_register_write(vcpu, reg, vcpu->arch.cr3); |
2430 | vcpu->arch.regs[reg] = vcpu->arch.cr3; | ||
2431 | vcpu_put_rsp_rip(vcpu); | ||
2432 | KVMTRACE_3D(CR_READ, vcpu, (u32)cr, | 2647 | KVMTRACE_3D(CR_READ, vcpu, (u32)cr, |
2433 | (u32)vcpu->arch.regs[reg], | 2648 | (u32)kvm_register_read(vcpu, reg), |
2434 | (u32)((u64)vcpu->arch.regs[reg] >> 32), | 2649 | (u32)((u64)kvm_register_read(vcpu, reg) >> 32), |
2435 | handler); | 2650 | handler); |
2436 | skip_emulated_instruction(vcpu); | 2651 | skip_emulated_instruction(vcpu); |
2437 | return 1; | 2652 | return 1; |
2438 | case 8: | 2653 | case 8: |
2439 | vcpu_load_rsp_rip(vcpu); | 2654 | kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); |
2440 | vcpu->arch.regs[reg] = kvm_get_cr8(vcpu); | ||
2441 | vcpu_put_rsp_rip(vcpu); | ||
2442 | KVMTRACE_2D(CR_READ, vcpu, (u32)cr, | 2655 | KVMTRACE_2D(CR_READ, vcpu, (u32)cr, |
2443 | (u32)vcpu->arch.regs[reg], handler); | 2656 | (u32)kvm_register_read(vcpu, reg), handler); |
2444 | skip_emulated_instruction(vcpu); | 2657 | skip_emulated_instruction(vcpu); |
2445 | return 1; | 2658 | return 1; |
2446 | } | 2659 | } |
@@ -2472,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2472 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 2685 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
2473 | dr = exit_qualification & 7; | 2686 | dr = exit_qualification & 7; |
2474 | reg = (exit_qualification >> 8) & 15; | 2687 | reg = (exit_qualification >> 8) & 15; |
2475 | vcpu_load_rsp_rip(vcpu); | ||
2476 | if (exit_qualification & 16) { | 2688 | if (exit_qualification & 16) { |
2477 | /* mov from dr */ | 2689 | /* mov from dr */ |
2478 | switch (dr) { | 2690 | switch (dr) { |
@@ -2485,12 +2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2485 | default: | 2697 | default: |
2486 | val = 0; | 2698 | val = 0; |
2487 | } | 2699 | } |
2488 | vcpu->arch.regs[reg] = val; | 2700 | kvm_register_write(vcpu, reg, val); |
2489 | KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); | 2701 | KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); |
2490 | } else { | 2702 | } else { |
2491 | /* mov to dr */ | 2703 | /* mov to dr */ |
2492 | } | 2704 | } |
2493 | vcpu_put_rsp_rip(vcpu); | ||
2494 | skip_emulated_instruction(vcpu); | 2705 | skip_emulated_instruction(vcpu); |
2495 | return 1; | 2706 | return 1; |
2496 | } | 2707 | } |
@@ -2583,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2583 | return 1; | 2794 | return 1; |
2584 | } | 2795 | } |
2585 | 2796 | ||
2797 | static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2798 | { | ||
2799 | u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); | ||
2800 | |||
2801 | kvm_mmu_invlpg(vcpu, exit_qualification); | ||
2802 | skip_emulated_instruction(vcpu); | ||
2803 | return 1; | ||
2804 | } | ||
2805 | |||
2586 | static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2806 | static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2587 | { | 2807 | { |
2588 | skip_emulated_instruction(vcpu); | 2808 | skip_emulated_instruction(vcpu); |
@@ -2695,6 +2915,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2695 | return 1; | 2915 | return 1; |
2696 | } | 2916 | } |
2697 | 2917 | ||
2918 | static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, | ||
2919 | struct kvm_run *kvm_run) | ||
2920 | { | ||
2921 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2922 | int err; | ||
2923 | |||
2924 | preempt_enable(); | ||
2925 | local_irq_enable(); | ||
2926 | |||
2927 | while (!guest_state_valid(vcpu)) { | ||
2928 | err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); | ||
2929 | |||
2930 | switch (err) { | ||
2931 | case EMULATE_DONE: | ||
2932 | break; | ||
2933 | case EMULATE_DO_MMIO: | ||
2934 | kvm_report_emulation_failure(vcpu, "mmio"); | ||
2935 | /* TODO: Handle MMIO */ | ||
2936 | return; | ||
2937 | default: | ||
2938 | kvm_report_emulation_failure(vcpu, "emulation failure"); | ||
2939 | return; | ||
2940 | } | ||
2941 | |||
2942 | if (signal_pending(current)) | ||
2943 | break; | ||
2944 | if (need_resched()) | ||
2945 | schedule(); | ||
2946 | } | ||
2947 | |||
2948 | local_irq_disable(); | ||
2949 | preempt_disable(); | ||
2950 | |||
2951 | /* Guest state should be valid now, no more emulation should be needed */ | ||
2952 | vmx->emulation_required = 0; | ||
2953 | } | ||
2954 | |||
2698 | /* | 2955 | /* |
2699 | * The exit handlers return 1 if the exit was handled fully and guest execution | 2956 | * The exit handlers return 1 if the exit was handled fully and guest execution |
2700 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | 2957 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs |
@@ -2714,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, | |||
2714 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, | 2971 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, |
2715 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, | 2972 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, |
2716 | [EXIT_REASON_HLT] = handle_halt, | 2973 | [EXIT_REASON_HLT] = handle_halt, |
2974 | [EXIT_REASON_INVLPG] = handle_invlpg, | ||
2717 | [EXIT_REASON_VMCALL] = handle_vmcall, | 2975 | [EXIT_REASON_VMCALL] = handle_vmcall, |
2718 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, | 2976 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, |
2719 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | 2977 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, |
@@ -2735,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
2735 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2993 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2736 | u32 vectoring_info = vmx->idt_vectoring_info; | 2994 | u32 vectoring_info = vmx->idt_vectoring_info; |
2737 | 2995 | ||
2738 | KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), | 2996 | KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), |
2739 | (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); | 2997 | (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); |
2740 | 2998 | ||
2741 | /* Access CR3 don't cause VMExit in paging mode, so we need | 2999 | /* Access CR3 don't cause VMExit in paging mode, so we need |
2742 | * to sync with guest real CR3. */ | 3000 | * to sync with guest real CR3. */ |
@@ -2829,88 +3087,92 @@ static void enable_intr_window(struct kvm_vcpu *vcpu) | |||
2829 | enable_irq_window(vcpu); | 3087 | enable_irq_window(vcpu); |
2830 | } | 3088 | } |
2831 | 3089 | ||
2832 | static void vmx_intr_assist(struct kvm_vcpu *vcpu) | 3090 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) |
2833 | { | 3091 | { |
2834 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3092 | u32 exit_intr_info; |
2835 | u32 idtv_info_field, intr_info_field, exit_intr_info_field; | 3093 | u32 idt_vectoring_info; |
2836 | int vector; | 3094 | bool unblock_nmi; |
3095 | u8 vector; | ||
3096 | int type; | ||
3097 | bool idtv_info_valid; | ||
3098 | u32 error; | ||
2837 | 3099 | ||
2838 | update_tpr_threshold(vcpu); | 3100 | exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); |
2839 | 3101 | if (cpu_has_virtual_nmis()) { | |
2840 | intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); | 3102 | unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; |
2841 | exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO); | 3103 | vector = exit_intr_info & INTR_INFO_VECTOR_MASK; |
2842 | idtv_info_field = vmx->idt_vectoring_info; | 3104 | /* |
2843 | if (intr_info_field & INTR_INFO_VALID_MASK) { | 3105 | * SDM 3: 25.7.1.2 |
2844 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | 3106 | * Re-set bit "block by NMI" before VM entry if vmexit caused by |
2845 | /* TODO: fault when IDT_Vectoring */ | 3107 | * a guest IRET fault. |
2846 | if (printk_ratelimit()) | 3108 | */ |
2847 | printk(KERN_ERR "Fault when IDT_Vectoring\n"); | 3109 | if (unblock_nmi && vector != DF_VECTOR) |
2848 | } | 3110 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, |
2849 | enable_intr_window(vcpu); | 3111 | GUEST_INTR_STATE_NMI); |
2850 | return; | ||
2851 | } | 3112 | } |
2852 | if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { | ||
2853 | if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) | ||
2854 | == INTR_TYPE_EXT_INTR | ||
2855 | && vcpu->arch.rmode.active) { | ||
2856 | u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; | ||
2857 | |||
2858 | vmx_inject_irq(vcpu, vect); | ||
2859 | enable_intr_window(vcpu); | ||
2860 | return; | ||
2861 | } | ||
2862 | |||
2863 | KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); | ||
2864 | 3113 | ||
3114 | idt_vectoring_info = vmx->idt_vectoring_info; | ||
3115 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; | ||
3116 | vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; | ||
3117 | type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; | ||
3118 | if (vmx->vcpu.arch.nmi_injected) { | ||
2865 | /* | 3119 | /* |
2866 | * SDM 3: 25.7.1.2 | 3120 | * SDM 3: 25.7.1.2 |
2867 | * Clear bit "block by NMI" before VM entry if a NMI delivery | 3121 | * Clear bit "block by NMI" before VM entry if a NMI delivery |
2868 | * faulted. | 3122 | * faulted. |
2869 | */ | 3123 | */ |
2870 | if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) | 3124 | if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) |
2871 | == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis()) | 3125 | vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, |
2872 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | 3126 | GUEST_INTR_STATE_NMI); |
2873 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | 3127 | else |
2874 | ~GUEST_INTR_STATE_NMI); | 3128 | vmx->vcpu.arch.nmi_injected = false; |
2875 | 3129 | } | |
2876 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field | 3130 | kvm_clear_exception_queue(&vmx->vcpu); |
2877 | & ~INTR_INFO_RESVD_BITS_MASK); | 3131 | if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) { |
2878 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | 3132 | if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { |
2879 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); | 3133 | error = vmcs_read32(IDT_VECTORING_ERROR_CODE); |
2880 | 3134 | kvm_queue_exception_e(&vmx->vcpu, vector, error); | |
2881 | if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) | 3135 | } else |
2882 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | 3136 | kvm_queue_exception(&vmx->vcpu, vector); |
2883 | vmcs_read32(IDT_VECTORING_ERROR_CODE)); | 3137 | vmx->idt_vectoring_info = 0; |
2884 | enable_intr_window(vcpu); | ||
2885 | return; | ||
2886 | } | 3138 | } |
3139 | kvm_clear_interrupt_queue(&vmx->vcpu); | ||
3140 | if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { | ||
3141 | kvm_queue_interrupt(&vmx->vcpu, vector); | ||
3142 | vmx->idt_vectoring_info = 0; | ||
3143 | } | ||
3144 | } | ||
3145 | |||
3146 | static void vmx_intr_assist(struct kvm_vcpu *vcpu) | ||
3147 | { | ||
3148 | update_tpr_threshold(vcpu); | ||
3149 | |||
2887 | if (cpu_has_virtual_nmis()) { | 3150 | if (cpu_has_virtual_nmis()) { |
2888 | /* | 3151 | if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { |
2889 | * SDM 3: 25.7.1.2 | 3152 | if (vmx_nmi_enabled(vcpu)) { |
2890 | * Re-set bit "block by NMI" before VM entry if vmexit caused by | 3153 | vcpu->arch.nmi_pending = false; |
2891 | * a guest IRET fault. | 3154 | vcpu->arch.nmi_injected = true; |
2892 | */ | 3155 | } else { |
2893 | if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) && | 3156 | enable_intr_window(vcpu); |
2894 | (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8) | 3157 | return; |
2895 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | 3158 | } |
2896 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) | | 3159 | } |
2897 | GUEST_INTR_STATE_NMI); | 3160 | if (vcpu->arch.nmi_injected) { |
2898 | else if (vcpu->arch.nmi_pending) { | 3161 | vmx_inject_nmi(vcpu); |
2899 | if (vmx_nmi_enabled(vcpu)) | ||
2900 | vmx_inject_nmi(vcpu); | ||
2901 | enable_intr_window(vcpu); | 3162 | enable_intr_window(vcpu); |
2902 | return; | 3163 | return; |
2903 | } | 3164 | } |
2904 | |||
2905 | } | 3165 | } |
2906 | if (!kvm_cpu_has_interrupt(vcpu)) | 3166 | if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { |
2907 | return; | 3167 | if (vmx_irq_enabled(vcpu)) |
2908 | if (vmx_irq_enabled(vcpu)) { | 3168 | kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); |
2909 | vector = kvm_cpu_get_interrupt(vcpu); | 3169 | else |
2910 | vmx_inject_irq(vcpu, vector); | 3170 | enable_irq_window(vcpu); |
2911 | kvm_timer_intr_post(vcpu, vector); | 3171 | } |
2912 | } else | 3172 | if (vcpu->arch.interrupt.pending) { |
2913 | enable_irq_window(vcpu); | 3173 | vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); |
3174 | kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); | ||
3175 | } | ||
2914 | } | 3176 | } |
2915 | 3177 | ||
2916 | /* | 3178 | /* |
@@ -2922,9 +3184,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) | |||
2922 | static void fixup_rmode_irq(struct vcpu_vmx *vmx) | 3184 | static void fixup_rmode_irq(struct vcpu_vmx *vmx) |
2923 | { | 3185 | { |
2924 | vmx->rmode.irq.pending = 0; | 3186 | vmx->rmode.irq.pending = 0; |
2925 | if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) | 3187 | if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) |
2926 | return; | 3188 | return; |
2927 | vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); | 3189 | kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); |
2928 | if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { | 3190 | if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { |
2929 | vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; | 3191 | vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; |
2930 | vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; | 3192 | vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; |
@@ -2936,11 +3198,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) | |||
2936 | | vmx->rmode.irq.vector; | 3198 | | vmx->rmode.irq.vector; |
2937 | } | 3199 | } |
2938 | 3200 | ||
3201 | #ifdef CONFIG_X86_64 | ||
3202 | #define R "r" | ||
3203 | #define Q "q" | ||
3204 | #else | ||
3205 | #define R "e" | ||
3206 | #define Q "l" | ||
3207 | #endif | ||
3208 | |||
2939 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 3209 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2940 | { | 3210 | { |
2941 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3211 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2942 | u32 intr_info; | 3212 | u32 intr_info; |
2943 | 3213 | ||
3214 | /* Handle invalid guest state instead of entering VMX */ | ||
3215 | if (vmx->emulation_required && emulate_invalid_guest_state) { | ||
3216 | handle_invalid_guest_state(vcpu, kvm_run); | ||
3217 | return; | ||
3218 | } | ||
3219 | |||
3220 | if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) | ||
3221 | vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); | ||
3222 | if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) | ||
3223 | vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); | ||
3224 | |||
2944 | /* | 3225 | /* |
2945 | * Loading guest fpu may have cleared host cr0.ts | 3226 | * Loading guest fpu may have cleared host cr0.ts |
2946 | */ | 3227 | */ |
@@ -2948,26 +3229,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2948 | 3229 | ||
2949 | asm( | 3230 | asm( |
2950 | /* Store host registers */ | 3231 | /* Store host registers */ |
2951 | #ifdef CONFIG_X86_64 | 3232 | "push %%"R"dx; push %%"R"bp;" |
2952 | "push %%rdx; push %%rbp;" | 3233 | "push %%"R"cx \n\t" |
2953 | "push %%rcx \n\t" | 3234 | "cmp %%"R"sp, %c[host_rsp](%0) \n\t" |
2954 | #else | 3235 | "je 1f \n\t" |
2955 | "push %%edx; push %%ebp;" | 3236 | "mov %%"R"sp, %c[host_rsp](%0) \n\t" |
2956 | "push %%ecx \n\t" | ||
2957 | #endif | ||
2958 | __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" | 3237 | __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" |
3238 | "1: \n\t" | ||
2959 | /* Check if vmlaunch of vmresume is needed */ | 3239 | /* Check if vmlaunch of vmresume is needed */ |
2960 | "cmpl $0, %c[launched](%0) \n\t" | 3240 | "cmpl $0, %c[launched](%0) \n\t" |
2961 | /* Load guest registers. Don't clobber flags. */ | 3241 | /* Load guest registers. Don't clobber flags. */ |
3242 | "mov %c[cr2](%0), %%"R"ax \n\t" | ||
3243 | "mov %%"R"ax, %%cr2 \n\t" | ||
3244 | "mov %c[rax](%0), %%"R"ax \n\t" | ||
3245 | "mov %c[rbx](%0), %%"R"bx \n\t" | ||
3246 | "mov %c[rdx](%0), %%"R"dx \n\t" | ||
3247 | "mov %c[rsi](%0), %%"R"si \n\t" | ||
3248 | "mov %c[rdi](%0), %%"R"di \n\t" | ||
3249 | "mov %c[rbp](%0), %%"R"bp \n\t" | ||
2962 | #ifdef CONFIG_X86_64 | 3250 | #ifdef CONFIG_X86_64 |
2963 | "mov %c[cr2](%0), %%rax \n\t" | ||
2964 | "mov %%rax, %%cr2 \n\t" | ||
2965 | "mov %c[rax](%0), %%rax \n\t" | ||
2966 | "mov %c[rbx](%0), %%rbx \n\t" | ||
2967 | "mov %c[rdx](%0), %%rdx \n\t" | ||
2968 | "mov %c[rsi](%0), %%rsi \n\t" | ||
2969 | "mov %c[rdi](%0), %%rdi \n\t" | ||
2970 | "mov %c[rbp](%0), %%rbp \n\t" | ||
2971 | "mov %c[r8](%0), %%r8 \n\t" | 3251 | "mov %c[r8](%0), %%r8 \n\t" |
2972 | "mov %c[r9](%0), %%r9 \n\t" | 3252 | "mov %c[r9](%0), %%r9 \n\t" |
2973 | "mov %c[r10](%0), %%r10 \n\t" | 3253 | "mov %c[r10](%0), %%r10 \n\t" |
@@ -2976,18 +3256,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2976 | "mov %c[r13](%0), %%r13 \n\t" | 3256 | "mov %c[r13](%0), %%r13 \n\t" |
2977 | "mov %c[r14](%0), %%r14 \n\t" | 3257 | "mov %c[r14](%0), %%r14 \n\t" |
2978 | "mov %c[r15](%0), %%r15 \n\t" | 3258 | "mov %c[r15](%0), %%r15 \n\t" |
2979 | "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ | ||
2980 | #else | ||
2981 | "mov %c[cr2](%0), %%eax \n\t" | ||
2982 | "mov %%eax, %%cr2 \n\t" | ||
2983 | "mov %c[rax](%0), %%eax \n\t" | ||
2984 | "mov %c[rbx](%0), %%ebx \n\t" | ||
2985 | "mov %c[rdx](%0), %%edx \n\t" | ||
2986 | "mov %c[rsi](%0), %%esi \n\t" | ||
2987 | "mov %c[rdi](%0), %%edi \n\t" | ||
2988 | "mov %c[rbp](%0), %%ebp \n\t" | ||
2989 | "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ | ||
2990 | #endif | 3259 | #endif |
3260 | "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ | ||
3261 | |||
2991 | /* Enter guest mode */ | 3262 | /* Enter guest mode */ |
2992 | "jne .Llaunched \n\t" | 3263 | "jne .Llaunched \n\t" |
2993 | __ex(ASM_VMX_VMLAUNCH) "\n\t" | 3264 | __ex(ASM_VMX_VMLAUNCH) "\n\t" |
@@ -2995,15 +3266,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2995 | ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" | 3266 | ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" |
2996 | ".Lkvm_vmx_return: " | 3267 | ".Lkvm_vmx_return: " |
2997 | /* Save guest registers, load host registers, keep flags */ | 3268 | /* Save guest registers, load host registers, keep flags */ |
3269 | "xchg %0, (%%"R"sp) \n\t" | ||
3270 | "mov %%"R"ax, %c[rax](%0) \n\t" | ||
3271 | "mov %%"R"bx, %c[rbx](%0) \n\t" | ||
3272 | "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" | ||
3273 | "mov %%"R"dx, %c[rdx](%0) \n\t" | ||
3274 | "mov %%"R"si, %c[rsi](%0) \n\t" | ||
3275 | "mov %%"R"di, %c[rdi](%0) \n\t" | ||
3276 | "mov %%"R"bp, %c[rbp](%0) \n\t" | ||
2998 | #ifdef CONFIG_X86_64 | 3277 | #ifdef CONFIG_X86_64 |
2999 | "xchg %0, (%%rsp) \n\t" | ||
3000 | "mov %%rax, %c[rax](%0) \n\t" | ||
3001 | "mov %%rbx, %c[rbx](%0) \n\t" | ||
3002 | "pushq (%%rsp); popq %c[rcx](%0) \n\t" | ||
3003 | "mov %%rdx, %c[rdx](%0) \n\t" | ||
3004 | "mov %%rsi, %c[rsi](%0) \n\t" | ||
3005 | "mov %%rdi, %c[rdi](%0) \n\t" | ||
3006 | "mov %%rbp, %c[rbp](%0) \n\t" | ||
3007 | "mov %%r8, %c[r8](%0) \n\t" | 3278 | "mov %%r8, %c[r8](%0) \n\t" |
3008 | "mov %%r9, %c[r9](%0) \n\t" | 3279 | "mov %%r9, %c[r9](%0) \n\t" |
3009 | "mov %%r10, %c[r10](%0) \n\t" | 3280 | "mov %%r10, %c[r10](%0) \n\t" |
@@ -3012,28 +3283,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3012 | "mov %%r13, %c[r13](%0) \n\t" | 3283 | "mov %%r13, %c[r13](%0) \n\t" |
3013 | "mov %%r14, %c[r14](%0) \n\t" | 3284 | "mov %%r14, %c[r14](%0) \n\t" |
3014 | "mov %%r15, %c[r15](%0) \n\t" | 3285 | "mov %%r15, %c[r15](%0) \n\t" |
3015 | "mov %%cr2, %%rax \n\t" | ||
3016 | "mov %%rax, %c[cr2](%0) \n\t" | ||
3017 | |||
3018 | "pop %%rbp; pop %%rbp; pop %%rdx \n\t" | ||
3019 | #else | ||
3020 | "xchg %0, (%%esp) \n\t" | ||
3021 | "mov %%eax, %c[rax](%0) \n\t" | ||
3022 | "mov %%ebx, %c[rbx](%0) \n\t" | ||
3023 | "pushl (%%esp); popl %c[rcx](%0) \n\t" | ||
3024 | "mov %%edx, %c[rdx](%0) \n\t" | ||
3025 | "mov %%esi, %c[rsi](%0) \n\t" | ||
3026 | "mov %%edi, %c[rdi](%0) \n\t" | ||
3027 | "mov %%ebp, %c[rbp](%0) \n\t" | ||
3028 | "mov %%cr2, %%eax \n\t" | ||
3029 | "mov %%eax, %c[cr2](%0) \n\t" | ||
3030 | |||
3031 | "pop %%ebp; pop %%ebp; pop %%edx \n\t" | ||
3032 | #endif | 3286 | #endif |
3287 | "mov %%cr2, %%"R"ax \n\t" | ||
3288 | "mov %%"R"ax, %c[cr2](%0) \n\t" | ||
3289 | |||
3290 | "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" | ||
3033 | "setbe %c[fail](%0) \n\t" | 3291 | "setbe %c[fail](%0) \n\t" |
3034 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), | 3292 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), |
3035 | [launched]"i"(offsetof(struct vcpu_vmx, launched)), | 3293 | [launched]"i"(offsetof(struct vcpu_vmx, launched)), |
3036 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), | 3294 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), |
3295 | [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), | ||
3037 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), | 3296 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), |
3038 | [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), | 3297 | [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), |
3039 | [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), | 3298 | [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), |
@@ -3053,14 +3312,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3053 | #endif | 3312 | #endif |
3054 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) | 3313 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) |
3055 | : "cc", "memory" | 3314 | : "cc", "memory" |
3315 | , R"bx", R"di", R"si" | ||
3056 | #ifdef CONFIG_X86_64 | 3316 | #ifdef CONFIG_X86_64 |
3057 | , "rbx", "rdi", "rsi" | ||
3058 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" | 3317 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" |
3059 | #else | ||
3060 | , "ebx", "edi", "rsi" | ||
3061 | #endif | 3318 | #endif |
3062 | ); | 3319 | ); |
3063 | 3320 | ||
3321 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); | ||
3322 | vcpu->arch.regs_dirty = 0; | ||
3323 | |||
3064 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 3324 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
3065 | if (vmx->rmode.irq.pending) | 3325 | if (vmx->rmode.irq.pending) |
3066 | fixup_rmode_irq(vmx); | 3326 | fixup_rmode_irq(vmx); |
@@ -3080,8 +3340,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3080 | KVMTRACE_0D(NMI, vcpu, handler); | 3340 | KVMTRACE_0D(NMI, vcpu, handler); |
3081 | asm("int $2"); | 3341 | asm("int $2"); |
3082 | } | 3342 | } |
3343 | |||
3344 | vmx_complete_interrupts(vmx); | ||
3083 | } | 3345 | } |
3084 | 3346 | ||
3347 | #undef R | ||
3348 | #undef Q | ||
3349 | |||
3085 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) | 3350 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) |
3086 | { | 3351 | { |
3087 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3352 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
@@ -3224,8 +3489,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
3224 | .set_idt = vmx_set_idt, | 3489 | .set_idt = vmx_set_idt, |
3225 | .get_gdt = vmx_get_gdt, | 3490 | .get_gdt = vmx_get_gdt, |
3226 | .set_gdt = vmx_set_gdt, | 3491 | .set_gdt = vmx_set_gdt, |
3227 | .cache_regs = vcpu_load_rsp_rip, | 3492 | .cache_reg = vmx_cache_reg, |
3228 | .decache_regs = vcpu_put_rsp_rip, | ||
3229 | .get_rflags = vmx_get_rflags, | 3493 | .get_rflags = vmx_get_rflags, |
3230 | .set_rflags = vmx_set_rflags, | 3494 | .set_rflags = vmx_set_rflags, |
3231 | 3495 | ||
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 17e25995b65b..3e010d21fdd7 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h | |||
@@ -331,9 +331,6 @@ enum vmcs_field { | |||
331 | 331 | ||
332 | #define AR_RESERVD_MASK 0xfffe0f00 | 332 | #define AR_RESERVD_MASK 0xfffe0f00 |
333 | 333 | ||
334 | #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 | ||
335 | #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 | ||
336 | |||
337 | #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 | 334 | #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 |
338 | #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 | 335 | #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 |
339 | 336 | ||
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0d682fc6aeb3..4f0677d1eae8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -4,10 +4,14 @@ | |||
4 | * derived from drivers/kvm/kvm_main.c | 4 | * derived from drivers/kvm/kvm_main.c |
5 | * | 5 | * |
6 | * Copyright (C) 2006 Qumranet, Inc. | 6 | * Copyright (C) 2006 Qumranet, Inc. |
7 | * Copyright (C) 2008 Qumranet, Inc. | ||
8 | * Copyright IBM Corporation, 2008 | ||
7 | * | 9 | * |
8 | * Authors: | 10 | * Authors: |
9 | * Avi Kivity <avi@qumranet.com> | 11 | * Avi Kivity <avi@qumranet.com> |
10 | * Yaniv Kamay <yaniv@qumranet.com> | 12 | * Yaniv Kamay <yaniv@qumranet.com> |
13 | * Amit Shah <amit.shah@qumranet.com> | ||
14 | * Ben-Ami Yassour <benami@il.ibm.com> | ||
11 | * | 15 | * |
12 | * This work is licensed under the terms of the GNU GPL, version 2. See | 16 | * This work is licensed under the terms of the GNU GPL, version 2. See |
13 | * the COPYING file in the top-level directory. | 17 | * the COPYING file in the top-level directory. |
@@ -19,14 +23,18 @@ | |||
19 | #include "mmu.h" | 23 | #include "mmu.h" |
20 | #include "i8254.h" | 24 | #include "i8254.h" |
21 | #include "tss.h" | 25 | #include "tss.h" |
26 | #include "kvm_cache_regs.h" | ||
27 | #include "x86.h" | ||
22 | 28 | ||
23 | #include <linux/clocksource.h> | 29 | #include <linux/clocksource.h> |
30 | #include <linux/interrupt.h> | ||
24 | #include <linux/kvm.h> | 31 | #include <linux/kvm.h> |
25 | #include <linux/fs.h> | 32 | #include <linux/fs.h> |
26 | #include <linux/vmalloc.h> | 33 | #include <linux/vmalloc.h> |
27 | #include <linux/module.h> | 34 | #include <linux/module.h> |
28 | #include <linux/mman.h> | 35 | #include <linux/mman.h> |
29 | #include <linux/highmem.h> | 36 | #include <linux/highmem.h> |
37 | #include <linux/intel-iommu.h> | ||
30 | 38 | ||
31 | #include <asm/uaccess.h> | 39 | #include <asm/uaccess.h> |
32 | #include <asm/msr.h> | 40 | #include <asm/msr.h> |
@@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | |||
61 | struct kvm_cpuid_entry2 __user *entries); | 69 | struct kvm_cpuid_entry2 __user *entries); |
62 | 70 | ||
63 | struct kvm_x86_ops *kvm_x86_ops; | 71 | struct kvm_x86_ops *kvm_x86_ops; |
72 | EXPORT_SYMBOL_GPL(kvm_x86_ops); | ||
64 | 73 | ||
65 | struct kvm_stats_debugfs_item debugfs_entries[] = { | 74 | struct kvm_stats_debugfs_item debugfs_entries[] = { |
66 | { "pf_fixed", VCPU_STAT(pf_fixed) }, | 75 | { "pf_fixed", VCPU_STAT(pf_fixed) }, |
@@ -83,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
83 | { "fpu_reload", VCPU_STAT(fpu_reload) }, | 92 | { "fpu_reload", VCPU_STAT(fpu_reload) }, |
84 | { "insn_emulation", VCPU_STAT(insn_emulation) }, | 93 | { "insn_emulation", VCPU_STAT(insn_emulation) }, |
85 | { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, | 94 | { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, |
95 | { "irq_injections", VCPU_STAT(irq_injections) }, | ||
86 | { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, | 96 | { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, |
87 | { "mmu_pte_write", VM_STAT(mmu_pte_write) }, | 97 | { "mmu_pte_write", VM_STAT(mmu_pte_write) }, |
88 | { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, | 98 | { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, |
@@ -90,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
90 | { "mmu_flooded", VM_STAT(mmu_flooded) }, | 100 | { "mmu_flooded", VM_STAT(mmu_flooded) }, |
91 | { "mmu_recycled", VM_STAT(mmu_recycled) }, | 101 | { "mmu_recycled", VM_STAT(mmu_recycled) }, |
92 | { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, | 102 | { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, |
103 | { "mmu_unsync", VM_STAT(mmu_unsync) }, | ||
93 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, | 104 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, |
94 | { "largepages", VM_STAT(lpages) }, | 105 | { "largepages", VM_STAT(lpages) }, |
95 | { NULL } | 106 | { NULL } |
96 | }; | 107 | }; |
97 | 108 | ||
98 | |||
99 | unsigned long segment_base(u16 selector) | 109 | unsigned long segment_base(u16 selector) |
100 | { | 110 | { |
101 | struct descriptor_table gdt; | 111 | struct descriptor_table gdt; |
@@ -352,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); | |||
352 | void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | 362 | void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
353 | { | 363 | { |
354 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { | 364 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { |
365 | kvm_mmu_sync_roots(vcpu); | ||
355 | kvm_mmu_flush_tlb(vcpu); | 366 | kvm_mmu_flush_tlb(vcpu); |
356 | return; | 367 | return; |
357 | } | 368 | } |
@@ -564,7 +575,7 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * | |||
564 | hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); | 575 | hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); |
565 | 576 | ||
566 | pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", | 577 | pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", |
567 | __FUNCTION__, tsc_khz, hv_clock->tsc_shift, | 578 | __func__, tsc_khz, hv_clock->tsc_shift, |
568 | hv_clock->tsc_to_system_mul); | 579 | hv_clock->tsc_to_system_mul); |
569 | } | 580 | } |
570 | 581 | ||
@@ -662,6 +673,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
662 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", | 673 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", |
663 | __func__, data); | 674 | __func__, data); |
664 | break; | 675 | break; |
676 | case MSR_IA32_DEBUGCTLMSR: | ||
677 | if (!data) { | ||
678 | /* We support the non-activated case already */ | ||
679 | break; | ||
680 | } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { | ||
681 | /* Values other than LBR and BTF are vendor-specific, | ||
682 | thus reserved and should throw a #GP */ | ||
683 | return 1; | ||
684 | } | ||
685 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", | ||
686 | __func__, data); | ||
687 | break; | ||
665 | case MSR_IA32_UCODE_REV: | 688 | case MSR_IA32_UCODE_REV: |
666 | case MSR_IA32_UCODE_WRITE: | 689 | case MSR_IA32_UCODE_WRITE: |
667 | break; | 690 | break; |
@@ -692,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
692 | /* ...but clean it before doing the actual write */ | 715 | /* ...but clean it before doing the actual write */ |
693 | vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); | 716 | vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); |
694 | 717 | ||
695 | down_read(&current->mm->mmap_sem); | ||
696 | vcpu->arch.time_page = | 718 | vcpu->arch.time_page = |
697 | gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); | 719 | gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); |
698 | up_read(&current->mm->mmap_sem); | ||
699 | 720 | ||
700 | if (is_error_page(vcpu->arch.time_page)) { | 721 | if (is_error_page(vcpu->arch.time_page)) { |
701 | kvm_release_page_clean(vcpu->arch.time_page); | 722 | kvm_release_page_clean(vcpu->arch.time_page); |
@@ -752,8 +773,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
752 | case MSR_IA32_MC0_MISC+8: | 773 | case MSR_IA32_MC0_MISC+8: |
753 | case MSR_IA32_MC0_MISC+12: | 774 | case MSR_IA32_MC0_MISC+12: |
754 | case MSR_IA32_MC0_MISC+16: | 775 | case MSR_IA32_MC0_MISC+16: |
776 | case MSR_IA32_MC0_MISC+20: | ||
755 | case MSR_IA32_UCODE_REV: | 777 | case MSR_IA32_UCODE_REV: |
756 | case MSR_IA32_EBL_CR_POWERON: | 778 | case MSR_IA32_EBL_CR_POWERON: |
779 | case MSR_IA32_DEBUGCTLMSR: | ||
780 | case MSR_IA32_LASTBRANCHFROMIP: | ||
781 | case MSR_IA32_LASTBRANCHTOIP: | ||
782 | case MSR_IA32_LASTINTFROMIP: | ||
783 | case MSR_IA32_LASTINTTOIP: | ||
757 | data = 0; | 784 | data = 0; |
758 | break; | 785 | break; |
759 | case MSR_MTRRcap: | 786 | case MSR_MTRRcap: |
@@ -901,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
901 | case KVM_CAP_PV_MMU: | 928 | case KVM_CAP_PV_MMU: |
902 | r = !tdp_enabled; | 929 | r = !tdp_enabled; |
903 | break; | 930 | break; |
931 | case KVM_CAP_IOMMU: | ||
932 | r = intel_iommu_found(); | ||
933 | break; | ||
904 | default: | 934 | default: |
905 | r = 0; | 935 | r = 0; |
906 | break; | 936 | break; |
@@ -1303,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
1303 | struct kvm_vcpu *vcpu = filp->private_data; | 1333 | struct kvm_vcpu *vcpu = filp->private_data; |
1304 | void __user *argp = (void __user *)arg; | 1334 | void __user *argp = (void __user *)arg; |
1305 | int r; | 1335 | int r; |
1336 | struct kvm_lapic_state *lapic = NULL; | ||
1306 | 1337 | ||
1307 | switch (ioctl) { | 1338 | switch (ioctl) { |
1308 | case KVM_GET_LAPIC: { | 1339 | case KVM_GET_LAPIC: { |
1309 | struct kvm_lapic_state lapic; | 1340 | lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); |
1310 | 1341 | ||
1311 | memset(&lapic, 0, sizeof lapic); | 1342 | r = -ENOMEM; |
1312 | r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); | 1343 | if (!lapic) |
1344 | goto out; | ||
1345 | r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); | ||
1313 | if (r) | 1346 | if (r) |
1314 | goto out; | 1347 | goto out; |
1315 | r = -EFAULT; | 1348 | r = -EFAULT; |
1316 | if (copy_to_user(argp, &lapic, sizeof lapic)) | 1349 | if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) |
1317 | goto out; | 1350 | goto out; |
1318 | r = 0; | 1351 | r = 0; |
1319 | break; | 1352 | break; |
1320 | } | 1353 | } |
1321 | case KVM_SET_LAPIC: { | 1354 | case KVM_SET_LAPIC: { |
1322 | struct kvm_lapic_state lapic; | 1355 | lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); |
1323 | 1356 | r = -ENOMEM; | |
1357 | if (!lapic) | ||
1358 | goto out; | ||
1324 | r = -EFAULT; | 1359 | r = -EFAULT; |
1325 | if (copy_from_user(&lapic, argp, sizeof lapic)) | 1360 | if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) |
1326 | goto out; | 1361 | goto out; |
1327 | r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; | 1362 | r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); |
1328 | if (r) | 1363 | if (r) |
1329 | goto out; | 1364 | goto out; |
1330 | r = 0; | 1365 | r = 0; |
@@ -1422,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
1422 | r = -EINVAL; | 1457 | r = -EINVAL; |
1423 | } | 1458 | } |
1424 | out: | 1459 | out: |
1460 | if (lapic) | ||
1461 | kfree(lapic); | ||
1425 | return r; | 1462 | return r; |
1426 | } | 1463 | } |
1427 | 1464 | ||
@@ -1630,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1630 | struct kvm *kvm = filp->private_data; | 1667 | struct kvm *kvm = filp->private_data; |
1631 | void __user *argp = (void __user *)arg; | 1668 | void __user *argp = (void __user *)arg; |
1632 | int r = -EINVAL; | 1669 | int r = -EINVAL; |
1670 | /* | ||
1671 | * This union makes it completely explicit to gcc-3.x | ||
1672 | * that these two variables' stack usage should be | ||
1673 | * combined, not added together. | ||
1674 | */ | ||
1675 | union { | ||
1676 | struct kvm_pit_state ps; | ||
1677 | struct kvm_memory_alias alias; | ||
1678 | } u; | ||
1633 | 1679 | ||
1634 | switch (ioctl) { | 1680 | switch (ioctl) { |
1635 | case KVM_SET_TSS_ADDR: | 1681 | case KVM_SET_TSS_ADDR: |
@@ -1661,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1661 | case KVM_GET_NR_MMU_PAGES: | 1707 | case KVM_GET_NR_MMU_PAGES: |
1662 | r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); | 1708 | r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); |
1663 | break; | 1709 | break; |
1664 | case KVM_SET_MEMORY_ALIAS: { | 1710 | case KVM_SET_MEMORY_ALIAS: |
1665 | struct kvm_memory_alias alias; | ||
1666 | |||
1667 | r = -EFAULT; | 1711 | r = -EFAULT; |
1668 | if (copy_from_user(&alias, argp, sizeof alias)) | 1712 | if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) |
1669 | goto out; | 1713 | goto out; |
1670 | r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); | 1714 | r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); |
1671 | if (r) | 1715 | if (r) |
1672 | goto out; | 1716 | goto out; |
1673 | break; | 1717 | break; |
1674 | } | ||
1675 | case KVM_CREATE_IRQCHIP: | 1718 | case KVM_CREATE_IRQCHIP: |
1676 | r = -ENOMEM; | 1719 | r = -ENOMEM; |
1677 | kvm->arch.vpic = kvm_create_pic(kvm); | 1720 | kvm->arch.vpic = kvm_create_pic(kvm); |
@@ -1699,13 +1742,7 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1699 | goto out; | 1742 | goto out; |
1700 | if (irqchip_in_kernel(kvm)) { | 1743 | if (irqchip_in_kernel(kvm)) { |
1701 | mutex_lock(&kvm->lock); | 1744 | mutex_lock(&kvm->lock); |
1702 | if (irq_event.irq < 16) | 1745 | kvm_set_irq(kvm, irq_event.irq, irq_event.level); |
1703 | kvm_pic_set_irq(pic_irqchip(kvm), | ||
1704 | irq_event.irq, | ||
1705 | irq_event.level); | ||
1706 | kvm_ioapic_set_irq(kvm->arch.vioapic, | ||
1707 | irq_event.irq, | ||
1708 | irq_event.level); | ||
1709 | mutex_unlock(&kvm->lock); | 1746 | mutex_unlock(&kvm->lock); |
1710 | r = 0; | 1747 | r = 0; |
1711 | } | 1748 | } |
@@ -1713,65 +1750,77 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1713 | } | 1750 | } |
1714 | case KVM_GET_IRQCHIP: { | 1751 | case KVM_GET_IRQCHIP: { |
1715 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | 1752 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ |
1716 | struct kvm_irqchip chip; | 1753 | struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); |
1717 | 1754 | ||
1718 | r = -EFAULT; | 1755 | r = -ENOMEM; |
1719 | if (copy_from_user(&chip, argp, sizeof chip)) | 1756 | if (!chip) |
1720 | goto out; | 1757 | goto out; |
1758 | r = -EFAULT; | ||
1759 | if (copy_from_user(chip, argp, sizeof *chip)) | ||
1760 | goto get_irqchip_out; | ||
1721 | r = -ENXIO; | 1761 | r = -ENXIO; |
1722 | if (!irqchip_in_kernel(kvm)) | 1762 | if (!irqchip_in_kernel(kvm)) |
1723 | goto out; | 1763 | goto get_irqchip_out; |
1724 | r = kvm_vm_ioctl_get_irqchip(kvm, &chip); | 1764 | r = kvm_vm_ioctl_get_irqchip(kvm, chip); |
1725 | if (r) | 1765 | if (r) |
1726 | goto out; | 1766 | goto get_irqchip_out; |
1727 | r = -EFAULT; | 1767 | r = -EFAULT; |
1728 | if (copy_to_user(argp, &chip, sizeof chip)) | 1768 | if (copy_to_user(argp, chip, sizeof *chip)) |
1729 | goto out; | 1769 | goto get_irqchip_out; |
1730 | r = 0; | 1770 | r = 0; |
1771 | get_irqchip_out: | ||
1772 | kfree(chip); | ||
1773 | if (r) | ||
1774 | goto out; | ||
1731 | break; | 1775 | break; |
1732 | } | 1776 | } |
1733 | case KVM_SET_IRQCHIP: { | 1777 | case KVM_SET_IRQCHIP: { |
1734 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | 1778 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ |
1735 | struct kvm_irqchip chip; | 1779 | struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); |
1736 | 1780 | ||
1737 | r = -EFAULT; | 1781 | r = -ENOMEM; |
1738 | if (copy_from_user(&chip, argp, sizeof chip)) | 1782 | if (!chip) |
1739 | goto out; | 1783 | goto out; |
1784 | r = -EFAULT; | ||
1785 | if (copy_from_user(chip, argp, sizeof *chip)) | ||
1786 | goto set_irqchip_out; | ||
1740 | r = -ENXIO; | 1787 | r = -ENXIO; |
1741 | if (!irqchip_in_kernel(kvm)) | 1788 | if (!irqchip_in_kernel(kvm)) |
1742 | goto out; | 1789 | goto set_irqchip_out; |
1743 | r = kvm_vm_ioctl_set_irqchip(kvm, &chip); | 1790 | r = kvm_vm_ioctl_set_irqchip(kvm, chip); |
1744 | if (r) | 1791 | if (r) |
1745 | goto out; | 1792 | goto set_irqchip_out; |
1746 | r = 0; | 1793 | r = 0; |
1794 | set_irqchip_out: | ||
1795 | kfree(chip); | ||
1796 | if (r) | ||
1797 | goto out; | ||
1747 | break; | 1798 | break; |
1748 | } | 1799 | } |
1749 | case KVM_GET_PIT: { | 1800 | case KVM_GET_PIT: { |
1750 | struct kvm_pit_state ps; | ||
1751 | r = -EFAULT; | 1801 | r = -EFAULT; |
1752 | if (copy_from_user(&ps, argp, sizeof ps)) | 1802 | if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) |
1753 | goto out; | 1803 | goto out; |
1754 | r = -ENXIO; | 1804 | r = -ENXIO; |
1755 | if (!kvm->arch.vpit) | 1805 | if (!kvm->arch.vpit) |
1756 | goto out; | 1806 | goto out; |
1757 | r = kvm_vm_ioctl_get_pit(kvm, &ps); | 1807 | r = kvm_vm_ioctl_get_pit(kvm, &u.ps); |
1758 | if (r) | 1808 | if (r) |
1759 | goto out; | 1809 | goto out; |
1760 | r = -EFAULT; | 1810 | r = -EFAULT; |
1761 | if (copy_to_user(argp, &ps, sizeof ps)) | 1811 | if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) |
1762 | goto out; | 1812 | goto out; |
1763 | r = 0; | 1813 | r = 0; |
1764 | break; | 1814 | break; |
1765 | } | 1815 | } |
1766 | case KVM_SET_PIT: { | 1816 | case KVM_SET_PIT: { |
1767 | struct kvm_pit_state ps; | ||
1768 | r = -EFAULT; | 1817 | r = -EFAULT; |
1769 | if (copy_from_user(&ps, argp, sizeof ps)) | 1818 | if (copy_from_user(&u.ps, argp, sizeof u.ps)) |
1770 | goto out; | 1819 | goto out; |
1771 | r = -ENXIO; | 1820 | r = -ENXIO; |
1772 | if (!kvm->arch.vpit) | 1821 | if (!kvm->arch.vpit) |
1773 | goto out; | 1822 | goto out; |
1774 | r = kvm_vm_ioctl_set_pit(kvm, &ps); | 1823 | r = kvm_vm_ioctl_set_pit(kvm, &u.ps); |
1775 | if (r) | 1824 | if (r) |
1776 | goto out; | 1825 | goto out; |
1777 | r = 0; | 1826 | r = 0; |
@@ -2018,9 +2067,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
2018 | 2067 | ||
2019 | val = *(u64 *)new; | 2068 | val = *(u64 *)new; |
2020 | 2069 | ||
2021 | down_read(&current->mm->mmap_sem); | ||
2022 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | 2070 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); |
2023 | up_read(&current->mm->mmap_sem); | ||
2024 | 2071 | ||
2025 | kaddr = kmap_atomic(page, KM_USER0); | 2072 | kaddr = kmap_atomic(page, KM_USER0); |
2026 | set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); | 2073 | set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); |
@@ -2040,6 +2087,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) | |||
2040 | 2087 | ||
2041 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) | 2088 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) |
2042 | { | 2089 | { |
2090 | kvm_mmu_invlpg(vcpu, address); | ||
2043 | return X86EMUL_CONTINUE; | 2091 | return X86EMUL_CONTINUE; |
2044 | } | 2092 | } |
2045 | 2093 | ||
@@ -2080,7 +2128,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) | |||
2080 | void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) | 2128 | void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) |
2081 | { | 2129 | { |
2082 | u8 opcodes[4]; | 2130 | u8 opcodes[4]; |
2083 | unsigned long rip = vcpu->arch.rip; | 2131 | unsigned long rip = kvm_rip_read(vcpu); |
2084 | unsigned long rip_linear; | 2132 | unsigned long rip_linear; |
2085 | 2133 | ||
2086 | if (!printk_ratelimit()) | 2134 | if (!printk_ratelimit()) |
@@ -2102,6 +2150,14 @@ static struct x86_emulate_ops emulate_ops = { | |||
2102 | .cmpxchg_emulated = emulator_cmpxchg_emulated, | 2150 | .cmpxchg_emulated = emulator_cmpxchg_emulated, |
2103 | }; | 2151 | }; |
2104 | 2152 | ||
2153 | static void cache_all_regs(struct kvm_vcpu *vcpu) | ||
2154 | { | ||
2155 | kvm_register_read(vcpu, VCPU_REGS_RAX); | ||
2156 | kvm_register_read(vcpu, VCPU_REGS_RSP); | ||
2157 | kvm_register_read(vcpu, VCPU_REGS_RIP); | ||
2158 | vcpu->arch.regs_dirty = ~0; | ||
2159 | } | ||
2160 | |||
2105 | int emulate_instruction(struct kvm_vcpu *vcpu, | 2161 | int emulate_instruction(struct kvm_vcpu *vcpu, |
2106 | struct kvm_run *run, | 2162 | struct kvm_run *run, |
2107 | unsigned long cr2, | 2163 | unsigned long cr2, |
@@ -2111,8 +2167,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
2111 | int r; | 2167 | int r; |
2112 | struct decode_cache *c; | 2168 | struct decode_cache *c; |
2113 | 2169 | ||
2170 | kvm_clear_exception_queue(vcpu); | ||
2114 | vcpu->arch.mmio_fault_cr2 = cr2; | 2171 | vcpu->arch.mmio_fault_cr2 = cr2; |
2115 | kvm_x86_ops->cache_regs(vcpu); | 2172 | /* |
2173 | * TODO: fix x86_emulate.c to use guest_read/write_register | ||
2174 | * instead of direct ->regs accesses, can save hundred cycles | ||
2175 | * on Intel for instructions that don't read/change RSP, for | ||
2176 | * for example. | ||
2177 | */ | ||
2178 | cache_all_regs(vcpu); | ||
2116 | 2179 | ||
2117 | vcpu->mmio_is_write = 0; | 2180 | vcpu->mmio_is_write = 0; |
2118 | vcpu->arch.pio.string = 0; | 2181 | vcpu->arch.pio.string = 0; |
@@ -2172,7 +2235,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
2172 | return EMULATE_DO_MMIO; | 2235 | return EMULATE_DO_MMIO; |
2173 | } | 2236 | } |
2174 | 2237 | ||
2175 | kvm_x86_ops->decache_regs(vcpu); | ||
2176 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | 2238 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); |
2177 | 2239 | ||
2178 | if (vcpu->mmio_is_write) { | 2240 | if (vcpu->mmio_is_write) { |
@@ -2225,20 +2287,19 @@ int complete_pio(struct kvm_vcpu *vcpu) | |||
2225 | struct kvm_pio_request *io = &vcpu->arch.pio; | 2287 | struct kvm_pio_request *io = &vcpu->arch.pio; |
2226 | long delta; | 2288 | long delta; |
2227 | int r; | 2289 | int r; |
2228 | 2290 | unsigned long val; | |
2229 | kvm_x86_ops->cache_regs(vcpu); | ||
2230 | 2291 | ||
2231 | if (!io->string) { | 2292 | if (!io->string) { |
2232 | if (io->in) | 2293 | if (io->in) { |
2233 | memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, | 2294 | val = kvm_register_read(vcpu, VCPU_REGS_RAX); |
2234 | io->size); | 2295 | memcpy(&val, vcpu->arch.pio_data, io->size); |
2296 | kvm_register_write(vcpu, VCPU_REGS_RAX, val); | ||
2297 | } | ||
2235 | } else { | 2298 | } else { |
2236 | if (io->in) { | 2299 | if (io->in) { |
2237 | r = pio_copy_data(vcpu); | 2300 | r = pio_copy_data(vcpu); |
2238 | if (r) { | 2301 | if (r) |
2239 | kvm_x86_ops->cache_regs(vcpu); | ||
2240 | return r; | 2302 | return r; |
2241 | } | ||
2242 | } | 2303 | } |
2243 | 2304 | ||
2244 | delta = 1; | 2305 | delta = 1; |
@@ -2248,19 +2309,24 @@ int complete_pio(struct kvm_vcpu *vcpu) | |||
2248 | * The size of the register should really depend on | 2309 | * The size of the register should really depend on |
2249 | * current address size. | 2310 | * current address size. |
2250 | */ | 2311 | */ |
2251 | vcpu->arch.regs[VCPU_REGS_RCX] -= delta; | 2312 | val = kvm_register_read(vcpu, VCPU_REGS_RCX); |
2313 | val -= delta; | ||
2314 | kvm_register_write(vcpu, VCPU_REGS_RCX, val); | ||
2252 | } | 2315 | } |
2253 | if (io->down) | 2316 | if (io->down) |
2254 | delta = -delta; | 2317 | delta = -delta; |
2255 | delta *= io->size; | 2318 | delta *= io->size; |
2256 | if (io->in) | 2319 | if (io->in) { |
2257 | vcpu->arch.regs[VCPU_REGS_RDI] += delta; | 2320 | val = kvm_register_read(vcpu, VCPU_REGS_RDI); |
2258 | else | 2321 | val += delta; |
2259 | vcpu->arch.regs[VCPU_REGS_RSI] += delta; | 2322 | kvm_register_write(vcpu, VCPU_REGS_RDI, val); |
2323 | } else { | ||
2324 | val = kvm_register_read(vcpu, VCPU_REGS_RSI); | ||
2325 | val += delta; | ||
2326 | kvm_register_write(vcpu, VCPU_REGS_RSI, val); | ||
2327 | } | ||
2260 | } | 2328 | } |
2261 | 2329 | ||
2262 | kvm_x86_ops->decache_regs(vcpu); | ||
2263 | |||
2264 | io->count -= io->cur_count; | 2330 | io->count -= io->cur_count; |
2265 | io->cur_count = 0; | 2331 | io->cur_count = 0; |
2266 | 2332 | ||
@@ -2313,6 +2379,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2313 | int size, unsigned port) | 2379 | int size, unsigned port) |
2314 | { | 2380 | { |
2315 | struct kvm_io_device *pio_dev; | 2381 | struct kvm_io_device *pio_dev; |
2382 | unsigned long val; | ||
2316 | 2383 | ||
2317 | vcpu->run->exit_reason = KVM_EXIT_IO; | 2384 | vcpu->run->exit_reason = KVM_EXIT_IO; |
2318 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; | 2385 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; |
@@ -2333,8 +2400,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2333 | KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, | 2400 | KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, |
2334 | handler); | 2401 | handler); |
2335 | 2402 | ||
2336 | kvm_x86_ops->cache_regs(vcpu); | 2403 | val = kvm_register_read(vcpu, VCPU_REGS_RAX); |
2337 | memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); | 2404 | memcpy(vcpu->arch.pio_data, &val, 4); |
2338 | 2405 | ||
2339 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2406 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
2340 | 2407 | ||
@@ -2492,11 +2559,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) | |||
2492 | KVMTRACE_0D(HLT, vcpu, handler); | 2559 | KVMTRACE_0D(HLT, vcpu, handler); |
2493 | if (irqchip_in_kernel(vcpu->kvm)) { | 2560 | if (irqchip_in_kernel(vcpu->kvm)) { |
2494 | vcpu->arch.mp_state = KVM_MP_STATE_HALTED; | 2561 | vcpu->arch.mp_state = KVM_MP_STATE_HALTED; |
2495 | up_read(&vcpu->kvm->slots_lock); | ||
2496 | kvm_vcpu_block(vcpu); | ||
2497 | down_read(&vcpu->kvm->slots_lock); | ||
2498 | if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) | ||
2499 | return -EINTR; | ||
2500 | return 1; | 2562 | return 1; |
2501 | } else { | 2563 | } else { |
2502 | vcpu->run->exit_reason = KVM_EXIT_HLT; | 2564 | vcpu->run->exit_reason = KVM_EXIT_HLT; |
@@ -2519,13 +2581,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
2519 | unsigned long nr, a0, a1, a2, a3, ret; | 2581 | unsigned long nr, a0, a1, a2, a3, ret; |
2520 | int r = 1; | 2582 | int r = 1; |
2521 | 2583 | ||
2522 | kvm_x86_ops->cache_regs(vcpu); | 2584 | nr = kvm_register_read(vcpu, VCPU_REGS_RAX); |
2523 | 2585 | a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); | |
2524 | nr = vcpu->arch.regs[VCPU_REGS_RAX]; | 2586 | a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); |
2525 | a0 = vcpu->arch.regs[VCPU_REGS_RBX]; | 2587 | a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); |
2526 | a1 = vcpu->arch.regs[VCPU_REGS_RCX]; | 2588 | a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); |
2527 | a2 = vcpu->arch.regs[VCPU_REGS_RDX]; | ||
2528 | a3 = vcpu->arch.regs[VCPU_REGS_RSI]; | ||
2529 | 2589 | ||
2530 | KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); | 2590 | KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); |
2531 | 2591 | ||
@@ -2548,8 +2608,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
2548 | ret = -KVM_ENOSYS; | 2608 | ret = -KVM_ENOSYS; |
2549 | break; | 2609 | break; |
2550 | } | 2610 | } |
2551 | vcpu->arch.regs[VCPU_REGS_RAX] = ret; | 2611 | kvm_register_write(vcpu, VCPU_REGS_RAX, ret); |
2552 | kvm_x86_ops->decache_regs(vcpu); | ||
2553 | ++vcpu->stat.hypercalls; | 2612 | ++vcpu->stat.hypercalls; |
2554 | return r; | 2613 | return r; |
2555 | } | 2614 | } |
@@ -2559,6 +2618,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) | |||
2559 | { | 2618 | { |
2560 | char instruction[3]; | 2619 | char instruction[3]; |
2561 | int ret = 0; | 2620 | int ret = 0; |
2621 | unsigned long rip = kvm_rip_read(vcpu); | ||
2562 | 2622 | ||
2563 | 2623 | ||
2564 | /* | 2624 | /* |
@@ -2568,9 +2628,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) | |||
2568 | */ | 2628 | */ |
2569 | kvm_mmu_zap_all(vcpu->kvm); | 2629 | kvm_mmu_zap_all(vcpu->kvm); |
2570 | 2630 | ||
2571 | kvm_x86_ops->cache_regs(vcpu); | ||
2572 | kvm_x86_ops->patch_hypercall(vcpu, instruction); | 2631 | kvm_x86_ops->patch_hypercall(vcpu, instruction); |
2573 | if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) | 2632 | if (emulator_write_emulated(rip, instruction, 3, vcpu) |
2574 | != X86EMUL_CONTINUE) | 2633 | != X86EMUL_CONTINUE) |
2575 | ret = -EFAULT; | 2634 | ret = -EFAULT; |
2576 | 2635 | ||
@@ -2700,13 +2759,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | |||
2700 | u32 function, index; | 2759 | u32 function, index; |
2701 | struct kvm_cpuid_entry2 *e, *best; | 2760 | struct kvm_cpuid_entry2 *e, *best; |
2702 | 2761 | ||
2703 | kvm_x86_ops->cache_regs(vcpu); | 2762 | function = kvm_register_read(vcpu, VCPU_REGS_RAX); |
2704 | function = vcpu->arch.regs[VCPU_REGS_RAX]; | 2763 | index = kvm_register_read(vcpu, VCPU_REGS_RCX); |
2705 | index = vcpu->arch.regs[VCPU_REGS_RCX]; | 2764 | kvm_register_write(vcpu, VCPU_REGS_RAX, 0); |
2706 | vcpu->arch.regs[VCPU_REGS_RAX] = 0; | 2765 | kvm_register_write(vcpu, VCPU_REGS_RBX, 0); |
2707 | vcpu->arch.regs[VCPU_REGS_RBX] = 0; | 2766 | kvm_register_write(vcpu, VCPU_REGS_RCX, 0); |
2708 | vcpu->arch.regs[VCPU_REGS_RCX] = 0; | 2767 | kvm_register_write(vcpu, VCPU_REGS_RDX, 0); |
2709 | vcpu->arch.regs[VCPU_REGS_RDX] = 0; | ||
2710 | best = NULL; | 2768 | best = NULL; |
2711 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { | 2769 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { |
2712 | e = &vcpu->arch.cpuid_entries[i]; | 2770 | e = &vcpu->arch.cpuid_entries[i]; |
@@ -2724,18 +2782,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | |||
2724 | best = e; | 2782 | best = e; |
2725 | } | 2783 | } |
2726 | if (best) { | 2784 | if (best) { |
2727 | vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; | 2785 | kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); |
2728 | vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; | 2786 | kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); |
2729 | vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; | 2787 | kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); |
2730 | vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; | 2788 | kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); |
2731 | } | 2789 | } |
2732 | kvm_x86_ops->decache_regs(vcpu); | ||
2733 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2790 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
2734 | KVMTRACE_5D(CPUID, vcpu, function, | 2791 | KVMTRACE_5D(CPUID, vcpu, function, |
2735 | (u32)vcpu->arch.regs[VCPU_REGS_RAX], | 2792 | (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), |
2736 | (u32)vcpu->arch.regs[VCPU_REGS_RBX], | 2793 | (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), |
2737 | (u32)vcpu->arch.regs[VCPU_REGS_RCX], | 2794 | (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), |
2738 | (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); | 2795 | (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); |
2739 | } | 2796 | } |
2740 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | 2797 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); |
2741 | 2798 | ||
@@ -2776,9 +2833,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu) | |||
2776 | if (!apic || !apic->vapic_addr) | 2833 | if (!apic || !apic->vapic_addr) |
2777 | return; | 2834 | return; |
2778 | 2835 | ||
2779 | down_read(&current->mm->mmap_sem); | ||
2780 | page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); | 2836 | page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); |
2781 | up_read(&current->mm->mmap_sem); | ||
2782 | 2837 | ||
2783 | vcpu->arch.apic->vapic_page = page; | 2838 | vcpu->arch.apic->vapic_page = page; |
2784 | } | 2839 | } |
@@ -2796,28 +2851,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu) | |||
2796 | up_read(&vcpu->kvm->slots_lock); | 2851 | up_read(&vcpu->kvm->slots_lock); |
2797 | } | 2852 | } |
2798 | 2853 | ||
2799 | static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2854 | static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2800 | { | 2855 | { |
2801 | int r; | 2856 | int r; |
2802 | 2857 | ||
2803 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { | ||
2804 | pr_debug("vcpu %d received sipi with vector # %x\n", | ||
2805 | vcpu->vcpu_id, vcpu->arch.sipi_vector); | ||
2806 | kvm_lapic_reset(vcpu); | ||
2807 | r = kvm_x86_ops->vcpu_reset(vcpu); | ||
2808 | if (r) | ||
2809 | return r; | ||
2810 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
2811 | } | ||
2812 | |||
2813 | down_read(&vcpu->kvm->slots_lock); | ||
2814 | vapic_enter(vcpu); | ||
2815 | |||
2816 | preempted: | ||
2817 | if (vcpu->guest_debug.enabled) | ||
2818 | kvm_x86_ops->guest_debug_pre(vcpu); | ||
2819 | |||
2820 | again: | ||
2821 | if (vcpu->requests) | 2858 | if (vcpu->requests) |
2822 | if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) | 2859 | if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) |
2823 | kvm_mmu_unload(vcpu); | 2860 | kvm_mmu_unload(vcpu); |
@@ -2829,6 +2866,8 @@ again: | |||
2829 | if (vcpu->requests) { | 2866 | if (vcpu->requests) { |
2830 | if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) | 2867 | if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) |
2831 | __kvm_migrate_timers(vcpu); | 2868 | __kvm_migrate_timers(vcpu); |
2869 | if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) | ||
2870 | kvm_mmu_sync_roots(vcpu); | ||
2832 | if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) | 2871 | if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) |
2833 | kvm_x86_ops->tlb_flush(vcpu); | 2872 | kvm_x86_ops->tlb_flush(vcpu); |
2834 | if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, | 2873 | if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, |
@@ -2854,21 +2893,15 @@ again: | |||
2854 | 2893 | ||
2855 | local_irq_disable(); | 2894 | local_irq_disable(); |
2856 | 2895 | ||
2857 | if (vcpu->requests || need_resched()) { | 2896 | if (vcpu->requests || need_resched() || signal_pending(current)) { |
2858 | local_irq_enable(); | 2897 | local_irq_enable(); |
2859 | preempt_enable(); | 2898 | preempt_enable(); |
2860 | r = 1; | 2899 | r = 1; |
2861 | goto out; | 2900 | goto out; |
2862 | } | 2901 | } |
2863 | 2902 | ||
2864 | if (signal_pending(current)) { | 2903 | if (vcpu->guest_debug.enabled) |
2865 | local_irq_enable(); | 2904 | kvm_x86_ops->guest_debug_pre(vcpu); |
2866 | preempt_enable(); | ||
2867 | r = -EINTR; | ||
2868 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
2869 | ++vcpu->stat.signal_exits; | ||
2870 | goto out; | ||
2871 | } | ||
2872 | 2905 | ||
2873 | vcpu->guest_mode = 1; | 2906 | vcpu->guest_mode = 1; |
2874 | /* | 2907 | /* |
@@ -2917,8 +2950,8 @@ again: | |||
2917 | * Profile KVM exit RIPs: | 2950 | * Profile KVM exit RIPs: |
2918 | */ | 2951 | */ |
2919 | if (unlikely(prof_on == KVM_PROFILING)) { | 2952 | if (unlikely(prof_on == KVM_PROFILING)) { |
2920 | kvm_x86_ops->cache_regs(vcpu); | 2953 | unsigned long rip = kvm_rip_read(vcpu); |
2921 | profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); | 2954 | profile_hit(KVM_PROFILING, (void *)rip); |
2922 | } | 2955 | } |
2923 | 2956 | ||
2924 | if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) | 2957 | if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) |
@@ -2927,26 +2960,63 @@ again: | |||
2927 | kvm_lapic_sync_from_vapic(vcpu); | 2960 | kvm_lapic_sync_from_vapic(vcpu); |
2928 | 2961 | ||
2929 | r = kvm_x86_ops->handle_exit(kvm_run, vcpu); | 2962 | r = kvm_x86_ops->handle_exit(kvm_run, vcpu); |
2963 | out: | ||
2964 | return r; | ||
2965 | } | ||
2930 | 2966 | ||
2931 | if (r > 0) { | 2967 | static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2932 | if (dm_request_for_irq_injection(vcpu, kvm_run)) { | 2968 | { |
2933 | r = -EINTR; | 2969 | int r; |
2934 | kvm_run->exit_reason = KVM_EXIT_INTR; | 2970 | |
2935 | ++vcpu->stat.request_irq_exits; | 2971 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { |
2936 | goto out; | 2972 | pr_debug("vcpu %d received sipi with vector # %x\n", |
2937 | } | 2973 | vcpu->vcpu_id, vcpu->arch.sipi_vector); |
2938 | if (!need_resched()) | 2974 | kvm_lapic_reset(vcpu); |
2939 | goto again; | 2975 | r = kvm_x86_ops->vcpu_reset(vcpu); |
2976 | if (r) | ||
2977 | return r; | ||
2978 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
2940 | } | 2979 | } |
2941 | 2980 | ||
2942 | out: | 2981 | down_read(&vcpu->kvm->slots_lock); |
2943 | up_read(&vcpu->kvm->slots_lock); | 2982 | vapic_enter(vcpu); |
2944 | if (r > 0) { | 2983 | |
2945 | kvm_resched(vcpu); | 2984 | r = 1; |
2946 | down_read(&vcpu->kvm->slots_lock); | 2985 | while (r > 0) { |
2947 | goto preempted; | 2986 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) |
2987 | r = vcpu_enter_guest(vcpu, kvm_run); | ||
2988 | else { | ||
2989 | up_read(&vcpu->kvm->slots_lock); | ||
2990 | kvm_vcpu_block(vcpu); | ||
2991 | down_read(&vcpu->kvm->slots_lock); | ||
2992 | if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) | ||
2993 | if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) | ||
2994 | vcpu->arch.mp_state = | ||
2995 | KVM_MP_STATE_RUNNABLE; | ||
2996 | if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) | ||
2997 | r = -EINTR; | ||
2998 | } | ||
2999 | |||
3000 | if (r > 0) { | ||
3001 | if (dm_request_for_irq_injection(vcpu, kvm_run)) { | ||
3002 | r = -EINTR; | ||
3003 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
3004 | ++vcpu->stat.request_irq_exits; | ||
3005 | } | ||
3006 | if (signal_pending(current)) { | ||
3007 | r = -EINTR; | ||
3008 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
3009 | ++vcpu->stat.signal_exits; | ||
3010 | } | ||
3011 | if (need_resched()) { | ||
3012 | up_read(&vcpu->kvm->slots_lock); | ||
3013 | kvm_resched(vcpu); | ||
3014 | down_read(&vcpu->kvm->slots_lock); | ||
3015 | } | ||
3016 | } | ||
2948 | } | 3017 | } |
2949 | 3018 | ||
3019 | up_read(&vcpu->kvm->slots_lock); | ||
2950 | post_kvm_run_save(vcpu, kvm_run); | 3020 | post_kvm_run_save(vcpu, kvm_run); |
2951 | 3021 | ||
2952 | vapic_exit(vcpu); | 3022 | vapic_exit(vcpu); |
@@ -2966,6 +3036,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2966 | 3036 | ||
2967 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { | 3037 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { |
2968 | kvm_vcpu_block(vcpu); | 3038 | kvm_vcpu_block(vcpu); |
3039 | clear_bit(KVM_REQ_UNHALT, &vcpu->requests); | ||
2969 | r = -EAGAIN; | 3040 | r = -EAGAIN; |
2970 | goto out; | 3041 | goto out; |
2971 | } | 3042 | } |
@@ -2999,11 +3070,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2999 | } | 3070 | } |
3000 | } | 3071 | } |
3001 | #endif | 3072 | #endif |
3002 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { | 3073 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) |
3003 | kvm_x86_ops->cache_regs(vcpu); | 3074 | kvm_register_write(vcpu, VCPU_REGS_RAX, |
3004 | vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; | 3075 | kvm_run->hypercall.ret); |
3005 | kvm_x86_ops->decache_regs(vcpu); | ||
3006 | } | ||
3007 | 3076 | ||
3008 | r = __vcpu_run(vcpu, kvm_run); | 3077 | r = __vcpu_run(vcpu, kvm_run); |
3009 | 3078 | ||
@@ -3019,28 +3088,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
3019 | { | 3088 | { |
3020 | vcpu_load(vcpu); | 3089 | vcpu_load(vcpu); |
3021 | 3090 | ||
3022 | kvm_x86_ops->cache_regs(vcpu); | 3091 | regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); |
3023 | 3092 | regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); | |
3024 | regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; | 3093 | regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); |
3025 | regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; | 3094 | regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); |
3026 | regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; | 3095 | regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); |
3027 | regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; | 3096 | regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); |
3028 | regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; | 3097 | regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); |
3029 | regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; | 3098 | regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); |
3030 | regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | ||
3031 | regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; | ||
3032 | #ifdef CONFIG_X86_64 | 3099 | #ifdef CONFIG_X86_64 |
3033 | regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; | 3100 | regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); |
3034 | regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; | 3101 | regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); |
3035 | regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; | 3102 | regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); |
3036 | regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; | 3103 | regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); |
3037 | regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; | 3104 | regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); |
3038 | regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; | 3105 | regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); |
3039 | regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; | 3106 | regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); |
3040 | regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; | 3107 | regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); |
3041 | #endif | 3108 | #endif |
3042 | 3109 | ||
3043 | regs->rip = vcpu->arch.rip; | 3110 | regs->rip = kvm_rip_read(vcpu); |
3044 | regs->rflags = kvm_x86_ops->get_rflags(vcpu); | 3111 | regs->rflags = kvm_x86_ops->get_rflags(vcpu); |
3045 | 3112 | ||
3046 | /* | 3113 | /* |
@@ -3058,29 +3125,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
3058 | { | 3125 | { |
3059 | vcpu_load(vcpu); | 3126 | vcpu_load(vcpu); |
3060 | 3127 | ||
3061 | vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; | 3128 | kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); |
3062 | vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; | 3129 | kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); |
3063 | vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; | 3130 | kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); |
3064 | vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; | 3131 | kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); |
3065 | vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; | 3132 | kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); |
3066 | vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; | 3133 | kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); |
3067 | vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; | 3134 | kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); |
3068 | vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; | 3135 | kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); |
3069 | #ifdef CONFIG_X86_64 | 3136 | #ifdef CONFIG_X86_64 |
3070 | vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; | 3137 | kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); |
3071 | vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; | 3138 | kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); |
3072 | vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; | 3139 | kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); |
3073 | vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; | 3140 | kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); |
3074 | vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; | 3141 | kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); |
3075 | vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; | 3142 | kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); |
3076 | vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; | 3143 | kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); |
3077 | vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; | 3144 | kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); |
3145 | |||
3078 | #endif | 3146 | #endif |
3079 | 3147 | ||
3080 | vcpu->arch.rip = regs->rip; | 3148 | kvm_rip_write(vcpu, regs->rip); |
3081 | kvm_x86_ops->set_rflags(vcpu, regs->rflags); | 3149 | kvm_x86_ops->set_rflags(vcpu, regs->rflags); |
3082 | 3150 | ||
3083 | kvm_x86_ops->decache_regs(vcpu); | ||
3084 | 3151 | ||
3085 | vcpu->arch.exception.pending = false; | 3152 | vcpu->arch.exception.pending = false; |
3086 | 3153 | ||
@@ -3294,11 +3361,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, | |||
3294 | return 0; | 3361 | return 0; |
3295 | } | 3362 | } |
3296 | 3363 | ||
3364 | static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) | ||
3365 | { | ||
3366 | struct kvm_segment segvar = { | ||
3367 | .base = selector << 4, | ||
3368 | .limit = 0xffff, | ||
3369 | .selector = selector, | ||
3370 | .type = 3, | ||
3371 | .present = 1, | ||
3372 | .dpl = 3, | ||
3373 | .db = 0, | ||
3374 | .s = 1, | ||
3375 | .l = 0, | ||
3376 | .g = 0, | ||
3377 | .avl = 0, | ||
3378 | .unusable = 0, | ||
3379 | }; | ||
3380 | kvm_x86_ops->set_segment(vcpu, &segvar, seg); | ||
3381 | return 0; | ||
3382 | } | ||
3383 | |||
3297 | int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | 3384 | int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, |
3298 | int type_bits, int seg) | 3385 | int type_bits, int seg) |
3299 | { | 3386 | { |
3300 | struct kvm_segment kvm_seg; | 3387 | struct kvm_segment kvm_seg; |
3301 | 3388 | ||
3389 | if (!(vcpu->arch.cr0 & X86_CR0_PE)) | ||
3390 | return kvm_load_realmode_segment(vcpu, selector, seg); | ||
3302 | if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) | 3391 | if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) |
3303 | return 1; | 3392 | return 1; |
3304 | kvm_seg.type |= type_bits; | 3393 | kvm_seg.type |= type_bits; |
@@ -3316,17 +3405,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, | |||
3316 | struct tss_segment_32 *tss) | 3405 | struct tss_segment_32 *tss) |
3317 | { | 3406 | { |
3318 | tss->cr3 = vcpu->arch.cr3; | 3407 | tss->cr3 = vcpu->arch.cr3; |
3319 | tss->eip = vcpu->arch.rip; | 3408 | tss->eip = kvm_rip_read(vcpu); |
3320 | tss->eflags = kvm_x86_ops->get_rflags(vcpu); | 3409 | tss->eflags = kvm_x86_ops->get_rflags(vcpu); |
3321 | tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; | 3410 | tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); |
3322 | tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; | 3411 | tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); |
3323 | tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; | 3412 | tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); |
3324 | tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; | 3413 | tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); |
3325 | tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; | 3414 | tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); |
3326 | tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; | 3415 | tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); |
3327 | tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; | 3416 | tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); |
3328 | tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; | 3417 | tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); |
3329 | |||
3330 | tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); | 3418 | tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); |
3331 | tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); | 3419 | tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); |
3332 | tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); | 3420 | tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); |
@@ -3342,17 +3430,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, | |||
3342 | { | 3430 | { |
3343 | kvm_set_cr3(vcpu, tss->cr3); | 3431 | kvm_set_cr3(vcpu, tss->cr3); |
3344 | 3432 | ||
3345 | vcpu->arch.rip = tss->eip; | 3433 | kvm_rip_write(vcpu, tss->eip); |
3346 | kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); | 3434 | kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); |
3347 | 3435 | ||
3348 | vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; | 3436 | kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); |
3349 | vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; | 3437 | kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); |
3350 | vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; | 3438 | kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); |
3351 | vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; | 3439 | kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); |
3352 | vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; | 3440 | kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); |
3353 | vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; | 3441 | kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); |
3354 | vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; | 3442 | kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); |
3355 | vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; | 3443 | kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); |
3356 | 3444 | ||
3357 | if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) | 3445 | if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) |
3358 | return 1; | 3446 | return 1; |
@@ -3380,16 +3468,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, | |||
3380 | static void save_state_to_tss16(struct kvm_vcpu *vcpu, | 3468 | static void save_state_to_tss16(struct kvm_vcpu *vcpu, |
3381 | struct tss_segment_16 *tss) | 3469 | struct tss_segment_16 *tss) |
3382 | { | 3470 | { |
3383 | tss->ip = vcpu->arch.rip; | 3471 | tss->ip = kvm_rip_read(vcpu); |
3384 | tss->flag = kvm_x86_ops->get_rflags(vcpu); | 3472 | tss->flag = kvm_x86_ops->get_rflags(vcpu); |
3385 | tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; | 3473 | tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); |
3386 | tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; | 3474 | tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); |
3387 | tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; | 3475 | tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); |
3388 | tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; | 3476 | tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); |
3389 | tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; | 3477 | tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); |
3390 | tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; | 3478 | tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); |
3391 | tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; | 3479 | tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); |
3392 | tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; | 3480 | tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); |
3393 | 3481 | ||
3394 | tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); | 3482 | tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); |
3395 | tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); | 3483 | tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); |
@@ -3402,16 +3490,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu, | |||
3402 | static int load_state_from_tss16(struct kvm_vcpu *vcpu, | 3490 | static int load_state_from_tss16(struct kvm_vcpu *vcpu, |
3403 | struct tss_segment_16 *tss) | 3491 | struct tss_segment_16 *tss) |
3404 | { | 3492 | { |
3405 | vcpu->arch.rip = tss->ip; | 3493 | kvm_rip_write(vcpu, tss->ip); |
3406 | kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); | 3494 | kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); |
3407 | vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; | 3495 | kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); |
3408 | vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; | 3496 | kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); |
3409 | vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; | 3497 | kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); |
3410 | vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; | 3498 | kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); |
3411 | vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; | 3499 | kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); |
3412 | vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; | 3500 | kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); |
3413 | vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; | 3501 | kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); |
3414 | vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; | 3502 | kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); |
3415 | 3503 | ||
3416 | if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) | 3504 | if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) |
3417 | return 1; | 3505 | return 1; |
@@ -3534,7 +3622,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) | |||
3534 | } | 3622 | } |
3535 | 3623 | ||
3536 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 3624 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
3537 | kvm_x86_ops->cache_regs(vcpu); | ||
3538 | 3625 | ||
3539 | if (nseg_desc.type & 8) | 3626 | if (nseg_desc.type & 8) |
3540 | ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, | 3627 | ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, |
@@ -3559,7 +3646,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) | |||
3559 | tr_seg.type = 11; | 3646 | tr_seg.type = 11; |
3560 | kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); | 3647 | kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); |
3561 | out: | 3648 | out: |
3562 | kvm_x86_ops->decache_regs(vcpu); | ||
3563 | return ret; | 3649 | return ret; |
3564 | } | 3650 | } |
3565 | EXPORT_SYMBOL_GPL(kvm_task_switch); | 3651 | EXPORT_SYMBOL_GPL(kvm_task_switch); |
@@ -3622,6 +3708,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
3622 | pr_debug("Set back pending irq %d\n", | 3708 | pr_debug("Set back pending irq %d\n", |
3623 | pending_vec); | 3709 | pending_vec); |
3624 | } | 3710 | } |
3711 | kvm_pic_clear_isr_ack(vcpu->kvm); | ||
3625 | } | 3712 | } |
3626 | 3713 | ||
3627 | kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); | 3714 | kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); |
@@ -3634,6 +3721,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
3634 | kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); | 3721 | kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); |
3635 | kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); | 3722 | kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); |
3636 | 3723 | ||
3724 | /* Older userspace won't unhalt the vcpu on reset. */ | ||
3725 | if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && | ||
3726 | sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && | ||
3727 | !(vcpu->arch.cr0 & X86_CR0_PE)) | ||
3728 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
3729 | |||
3637 | vcpu_put(vcpu); | 3730 | vcpu_put(vcpu); |
3638 | 3731 | ||
3639 | return 0; | 3732 | return 0; |
@@ -3918,6 +4011,7 @@ struct kvm *kvm_arch_create_vm(void) | |||
3918 | return ERR_PTR(-ENOMEM); | 4011 | return ERR_PTR(-ENOMEM); |
3919 | 4012 | ||
3920 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | 4013 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
4014 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); | ||
3921 | 4015 | ||
3922 | return kvm; | 4016 | return kvm; |
3923 | } | 4017 | } |
@@ -3950,6 +4044,8 @@ static void kvm_free_vcpus(struct kvm *kvm) | |||
3950 | 4044 | ||
3951 | void kvm_arch_destroy_vm(struct kvm *kvm) | 4045 | void kvm_arch_destroy_vm(struct kvm *kvm) |
3952 | { | 4046 | { |
4047 | kvm_iommu_unmap_guest(kvm); | ||
4048 | kvm_free_all_assigned_devices(kvm); | ||
3953 | kvm_free_pit(kvm); | 4049 | kvm_free_pit(kvm); |
3954 | kfree(kvm->arch.vpic); | 4050 | kfree(kvm->arch.vpic); |
3955 | kfree(kvm->arch.vioapic); | 4051 | kfree(kvm->arch.vioapic); |
@@ -3981,7 +4077,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, | |||
3981 | userspace_addr = do_mmap(NULL, 0, | 4077 | userspace_addr = do_mmap(NULL, 0, |
3982 | npages * PAGE_SIZE, | 4078 | npages * PAGE_SIZE, |
3983 | PROT_READ | PROT_WRITE, | 4079 | PROT_READ | PROT_WRITE, |
3984 | MAP_SHARED | MAP_ANONYMOUS, | 4080 | MAP_PRIVATE | MAP_ANONYMOUS, |
3985 | 0); | 4081 | 0); |
3986 | up_write(&current->mm->mmap_sem); | 4082 | up_write(&current->mm->mmap_sem); |
3987 | 4083 | ||
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h new file mode 100644 index 000000000000..6a4be78a7384 --- /dev/null +++ b/arch/x86/kvm/x86.h | |||
@@ -0,0 +1,22 @@ | |||
1 | #ifndef ARCH_X86_KVM_X86_H | ||
2 | #define ARCH_X86_KVM_X86_H | ||
3 | |||
4 | #include <linux/kvm_host.h> | ||
5 | |||
6 | static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) | ||
7 | { | ||
8 | vcpu->arch.exception.pending = false; | ||
9 | } | ||
10 | |||
11 | static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) | ||
12 | { | ||
13 | vcpu->arch.interrupt.pending = true; | ||
14 | vcpu->arch.interrupt.nr = vector; | ||
15 | } | ||
16 | |||
17 | static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) | ||
18 | { | ||
19 | vcpu->arch.interrupt.pending = false; | ||
20 | } | ||
21 | |||
22 | #endif | ||
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index f2f90468f8b1..ea051173b0da 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #define DPRINTF(_f, _a ...) printf(_f , ## _a) | 26 | #define DPRINTF(_f, _a ...) printf(_f , ## _a) |
27 | #else | 27 | #else |
28 | #include <linux/kvm_host.h> | 28 | #include <linux/kvm_host.h> |
29 | #include "kvm_cache_regs.h" | ||
29 | #define DPRINTF(x...) do {} while (0) | 30 | #define DPRINTF(x...) do {} while (0) |
30 | #endif | 31 | #endif |
31 | #include <linux/module.h> | 32 | #include <linux/module.h> |
@@ -46,25 +47,26 @@ | |||
46 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ | 47 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ |
47 | #define DstReg (2<<1) /* Register operand. */ | 48 | #define DstReg (2<<1) /* Register operand. */ |
48 | #define DstMem (3<<1) /* Memory operand. */ | 49 | #define DstMem (3<<1) /* Memory operand. */ |
49 | #define DstMask (3<<1) | 50 | #define DstAcc (4<<1) /* Destination Accumulator */ |
51 | #define DstMask (7<<1) | ||
50 | /* Source operand type. */ | 52 | /* Source operand type. */ |
51 | #define SrcNone (0<<3) /* No source operand. */ | 53 | #define SrcNone (0<<4) /* No source operand. */ |
52 | #define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ | 54 | #define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ |
53 | #define SrcReg (1<<3) /* Register operand. */ | 55 | #define SrcReg (1<<4) /* Register operand. */ |
54 | #define SrcMem (2<<3) /* Memory operand. */ | 56 | #define SrcMem (2<<4) /* Memory operand. */ |
55 | #define SrcMem16 (3<<3) /* Memory operand (16-bit). */ | 57 | #define SrcMem16 (3<<4) /* Memory operand (16-bit). */ |
56 | #define SrcMem32 (4<<3) /* Memory operand (32-bit). */ | 58 | #define SrcMem32 (4<<4) /* Memory operand (32-bit). */ |
57 | #define SrcImm (5<<3) /* Immediate operand. */ | 59 | #define SrcImm (5<<4) /* Immediate operand. */ |
58 | #define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ | 60 | #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ |
59 | #define SrcMask (7<<3) | 61 | #define SrcMask (7<<4) |
60 | /* Generic ModRM decode. */ | 62 | /* Generic ModRM decode. */ |
61 | #define ModRM (1<<6) | 63 | #define ModRM (1<<7) |
62 | /* Destination is only written; never read. */ | 64 | /* Destination is only written; never read. */ |
63 | #define Mov (1<<7) | 65 | #define Mov (1<<8) |
64 | #define BitOp (1<<8) | 66 | #define BitOp (1<<9) |
65 | #define MemAbs (1<<9) /* Memory operand is absolute displacement */ | 67 | #define MemAbs (1<<10) /* Memory operand is absolute displacement */ |
66 | #define String (1<<10) /* String instruction (rep capable) */ | 68 | #define String (1<<12) /* String instruction (rep capable) */ |
67 | #define Stack (1<<11) /* Stack instruction (push/pop) */ | 69 | #define Stack (1<<13) /* Stack instruction (push/pop) */ |
68 | #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ | 70 | #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ |
69 | #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ | 71 | #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ |
70 | #define GroupMask 0xff /* Group number stored in bits 0:7 */ | 72 | #define GroupMask 0xff /* Group number stored in bits 0:7 */ |
@@ -94,7 +96,7 @@ static u16 opcode_table[256] = { | |||
94 | /* 0x20 - 0x27 */ | 96 | /* 0x20 - 0x27 */ |
95 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 97 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
96 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 98 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
97 | SrcImmByte, SrcImm, 0, 0, | 99 | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, |
98 | /* 0x28 - 0x2F */ | 100 | /* 0x28 - 0x2F */ |
99 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 101 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
100 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 102 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
@@ -106,7 +108,8 @@ static u16 opcode_table[256] = { | |||
106 | /* 0x38 - 0x3F */ | 108 | /* 0x38 - 0x3F */ |
107 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 109 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
108 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 110 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
109 | 0, 0, 0, 0, | 111 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, |
112 | 0, 0, | ||
110 | /* 0x40 - 0x47 */ | 113 | /* 0x40 - 0x47 */ |
111 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | 114 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, |
112 | /* 0x48 - 0x4F */ | 115 | /* 0x48 - 0x4F */ |
@@ -153,9 +156,16 @@ static u16 opcode_table[256] = { | |||
153 | 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | 156 | 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, |
154 | ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | 157 | ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, |
155 | ByteOp | ImplicitOps | String, ImplicitOps | String, | 158 | ByteOp | ImplicitOps | String, ImplicitOps | String, |
156 | /* 0xB0 - 0xBF */ | 159 | /* 0xB0 - 0xB7 */ |
157 | 0, 0, 0, 0, 0, 0, 0, 0, | 160 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, |
158 | DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0, | 161 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, |
162 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, | ||
163 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, | ||
164 | /* 0xB8 - 0xBF */ | ||
165 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
166 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
167 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
168 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
159 | /* 0xC0 - 0xC7 */ | 169 | /* 0xC0 - 0xC7 */ |
160 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | 170 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, |
161 | 0, ImplicitOps | Stack, 0, 0, | 171 | 0, ImplicitOps | Stack, 0, 0, |
@@ -169,17 +179,20 @@ static u16 opcode_table[256] = { | |||
169 | /* 0xD8 - 0xDF */ | 179 | /* 0xD8 - 0xDF */ |
170 | 0, 0, 0, 0, 0, 0, 0, 0, | 180 | 0, 0, 0, 0, 0, 0, 0, 0, |
171 | /* 0xE0 - 0xE7 */ | 181 | /* 0xE0 - 0xE7 */ |
172 | 0, 0, 0, 0, 0, 0, 0, 0, | 182 | 0, 0, 0, 0, |
183 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | ||
184 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | ||
173 | /* 0xE8 - 0xEF */ | 185 | /* 0xE8 - 0xEF */ |
174 | ImplicitOps | Stack, SrcImm | ImplicitOps, | 186 | ImplicitOps | Stack, SrcImm | ImplicitOps, |
175 | ImplicitOps, SrcImmByte | ImplicitOps, | 187 | ImplicitOps, SrcImmByte | ImplicitOps, |
176 | 0, 0, 0, 0, | 188 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, |
189 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | ||
177 | /* 0xF0 - 0xF7 */ | 190 | /* 0xF0 - 0xF7 */ |
178 | 0, 0, 0, 0, | 191 | 0, 0, 0, 0, |
179 | ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, | 192 | ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, |
180 | /* 0xF8 - 0xFF */ | 193 | /* 0xF8 - 0xFF */ |
181 | ImplicitOps, 0, ImplicitOps, ImplicitOps, | 194 | ImplicitOps, 0, ImplicitOps, ImplicitOps, |
182 | 0, 0, Group | Group4, Group | Group5, | 195 | ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, |
183 | }; | 196 | }; |
184 | 197 | ||
185 | static u16 twobyte_table[256] = { | 198 | static u16 twobyte_table[256] = { |
@@ -268,15 +281,16 @@ static u16 group_table[] = { | |||
268 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | 281 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, |
269 | 0, 0, 0, 0, | 282 | 0, 0, 0, 0, |
270 | [Group3*8] = | 283 | [Group3*8] = |
271 | DstMem | SrcImm | ModRM | SrcImm, 0, | 284 | DstMem | SrcImm | ModRM, 0, |
272 | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | 285 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, |
273 | 0, 0, 0, 0, | 286 | 0, 0, 0, 0, |
274 | [Group4*8] = | 287 | [Group4*8] = |
275 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | 288 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, |
276 | 0, 0, 0, 0, 0, 0, | 289 | 0, 0, 0, 0, 0, 0, |
277 | [Group5*8] = | 290 | [Group5*8] = |
278 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, | 291 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, |
279 | SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, | 292 | SrcMem | ModRM | Stack, 0, |
293 | SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, | ||
280 | [Group7*8] = | 294 | [Group7*8] = |
281 | 0, 0, ModRM | SrcMem, ModRM | SrcMem, | 295 | 0, 0, ModRM | SrcMem, ModRM | SrcMem, |
282 | SrcNone | ModRM | DstMem | Mov, 0, | 296 | SrcNone | ModRM | DstMem | Mov, 0, |
@@ -839,7 +853,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
839 | /* Shadow copy of register state. Committed on successful emulation. */ | 853 | /* Shadow copy of register state. Committed on successful emulation. */ |
840 | 854 | ||
841 | memset(c, 0, sizeof(struct decode_cache)); | 855 | memset(c, 0, sizeof(struct decode_cache)); |
842 | c->eip = ctxt->vcpu->arch.rip; | 856 | c->eip = kvm_rip_read(ctxt->vcpu); |
843 | ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); | 857 | ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); |
844 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); | 858 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); |
845 | 859 | ||
@@ -1048,6 +1062,23 @@ done_prefixes: | |||
1048 | } | 1062 | } |
1049 | c->dst.type = OP_MEM; | 1063 | c->dst.type = OP_MEM; |
1050 | break; | 1064 | break; |
1065 | case DstAcc: | ||
1066 | c->dst.type = OP_REG; | ||
1067 | c->dst.bytes = c->op_bytes; | ||
1068 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1069 | switch (c->op_bytes) { | ||
1070 | case 1: | ||
1071 | c->dst.val = *(u8 *)c->dst.ptr; | ||
1072 | break; | ||
1073 | case 2: | ||
1074 | c->dst.val = *(u16 *)c->dst.ptr; | ||
1075 | break; | ||
1076 | case 4: | ||
1077 | c->dst.val = *(u32 *)c->dst.ptr; | ||
1078 | break; | ||
1079 | } | ||
1080 | c->dst.orig_val = c->dst.val; | ||
1081 | break; | ||
1051 | } | 1082 | } |
1052 | 1083 | ||
1053 | if (c->rip_relative) | 1084 | if (c->rip_relative) |
@@ -1151,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, | |||
1151 | case 1: /* dec */ | 1182 | case 1: /* dec */ |
1152 | emulate_1op("dec", c->dst, ctxt->eflags); | 1183 | emulate_1op("dec", c->dst, ctxt->eflags); |
1153 | break; | 1184 | break; |
1185 | case 2: /* call near abs */ { | ||
1186 | long int old_eip; | ||
1187 | old_eip = c->eip; | ||
1188 | c->eip = c->src.val; | ||
1189 | c->src.val = old_eip; | ||
1190 | emulate_push(ctxt); | ||
1191 | break; | ||
1192 | } | ||
1154 | case 4: /* jmp abs */ | 1193 | case 4: /* jmp abs */ |
1155 | c->eip = c->src.val; | 1194 | c->eip = c->src.val; |
1156 | break; | 1195 | break; |
@@ -1251,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1251 | u64 msr_data; | 1290 | u64 msr_data; |
1252 | unsigned long saved_eip = 0; | 1291 | unsigned long saved_eip = 0; |
1253 | struct decode_cache *c = &ctxt->decode; | 1292 | struct decode_cache *c = &ctxt->decode; |
1293 | unsigned int port; | ||
1294 | int io_dir_in; | ||
1254 | int rc = 0; | 1295 | int rc = 0; |
1255 | 1296 | ||
1256 | /* Shadow copy of register state. Committed on successful emulation. | 1297 | /* Shadow copy of register state. Committed on successful emulation. |
@@ -1267,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1267 | if (c->rep_prefix && (c->d & String)) { | 1308 | if (c->rep_prefix && (c->d & String)) { |
1268 | /* All REP prefixes have the same first termination condition */ | 1309 | /* All REP prefixes have the same first termination condition */ |
1269 | if (c->regs[VCPU_REGS_RCX] == 0) { | 1310 | if (c->regs[VCPU_REGS_RCX] == 0) { |
1270 | ctxt->vcpu->arch.rip = c->eip; | 1311 | kvm_rip_write(ctxt->vcpu, c->eip); |
1271 | goto done; | 1312 | goto done; |
1272 | } | 1313 | } |
1273 | /* The second termination condition only applies for REPE | 1314 | /* The second termination condition only applies for REPE |
@@ -1281,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1281 | (c->b == 0xae) || (c->b == 0xaf)) { | 1322 | (c->b == 0xae) || (c->b == 0xaf)) { |
1282 | if ((c->rep_prefix == REPE_PREFIX) && | 1323 | if ((c->rep_prefix == REPE_PREFIX) && |
1283 | ((ctxt->eflags & EFLG_ZF) == 0)) { | 1324 | ((ctxt->eflags & EFLG_ZF) == 0)) { |
1284 | ctxt->vcpu->arch.rip = c->eip; | 1325 | kvm_rip_write(ctxt->vcpu, c->eip); |
1285 | goto done; | 1326 | goto done; |
1286 | } | 1327 | } |
1287 | if ((c->rep_prefix == REPNE_PREFIX) && | 1328 | if ((c->rep_prefix == REPNE_PREFIX) && |
1288 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { | 1329 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { |
1289 | ctxt->vcpu->arch.rip = c->eip; | 1330 | kvm_rip_write(ctxt->vcpu, c->eip); |
1290 | goto done; | 1331 | goto done; |
1291 | } | 1332 | } |
1292 | } | 1333 | } |
1293 | c->regs[VCPU_REGS_RCX]--; | 1334 | c->regs[VCPU_REGS_RCX]--; |
1294 | c->eip = ctxt->vcpu->arch.rip; | 1335 | c->eip = kvm_rip_read(ctxt->vcpu); |
1295 | } | 1336 | } |
1296 | 1337 | ||
1297 | if (c->src.type == OP_MEM) { | 1338 | if (c->src.type == OP_MEM) { |
@@ -1351,27 +1392,10 @@ special_insn: | |||
1351 | sbb: /* sbb */ | 1392 | sbb: /* sbb */ |
1352 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); | 1393 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); |
1353 | break; | 1394 | break; |
1354 | case 0x20 ... 0x23: | 1395 | case 0x20 ... 0x25: |
1355 | and: /* and */ | 1396 | and: /* and */ |
1356 | emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); | 1397 | emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); |
1357 | break; | 1398 | break; |
1358 | case 0x24: /* and al imm8 */ | ||
1359 | c->dst.type = OP_REG; | ||
1360 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1361 | c->dst.val = *(u8 *)c->dst.ptr; | ||
1362 | c->dst.bytes = 1; | ||
1363 | c->dst.orig_val = c->dst.val; | ||
1364 | goto and; | ||
1365 | case 0x25: /* and ax imm16, or eax imm32 */ | ||
1366 | c->dst.type = OP_REG; | ||
1367 | c->dst.bytes = c->op_bytes; | ||
1368 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1369 | if (c->op_bytes == 2) | ||
1370 | c->dst.val = *(u16 *)c->dst.ptr; | ||
1371 | else | ||
1372 | c->dst.val = *(u32 *)c->dst.ptr; | ||
1373 | c->dst.orig_val = c->dst.val; | ||
1374 | goto and; | ||
1375 | case 0x28 ... 0x2d: | 1399 | case 0x28 ... 0x2d: |
1376 | sub: /* sub */ | 1400 | sub: /* sub */ |
1377 | emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); | 1401 | emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); |
@@ -1659,7 +1683,7 @@ special_insn: | |||
1659 | case 0xae ... 0xaf: /* scas */ | 1683 | case 0xae ... 0xaf: /* scas */ |
1660 | DPRINTF("Urk! I don't handle SCAS.\n"); | 1684 | DPRINTF("Urk! I don't handle SCAS.\n"); |
1661 | goto cannot_emulate; | 1685 | goto cannot_emulate; |
1662 | case 0xb8: /* mov r, imm */ | 1686 | case 0xb0 ... 0xbf: /* mov r, imm */ |
1663 | goto mov; | 1687 | goto mov; |
1664 | case 0xc0 ... 0xc1: | 1688 | case 0xc0 ... 0xc1: |
1665 | emulate_grp2(ctxt); | 1689 | emulate_grp2(ctxt); |
@@ -1679,6 +1703,16 @@ special_insn: | |||
1679 | c->src.val = c->regs[VCPU_REGS_RCX]; | 1703 | c->src.val = c->regs[VCPU_REGS_RCX]; |
1680 | emulate_grp2(ctxt); | 1704 | emulate_grp2(ctxt); |
1681 | break; | 1705 | break; |
1706 | case 0xe4: /* inb */ | ||
1707 | case 0xe5: /* in */ | ||
1708 | port = insn_fetch(u8, 1, c->eip); | ||
1709 | io_dir_in = 1; | ||
1710 | goto do_io; | ||
1711 | case 0xe6: /* outb */ | ||
1712 | case 0xe7: /* out */ | ||
1713 | port = insn_fetch(u8, 1, c->eip); | ||
1714 | io_dir_in = 0; | ||
1715 | goto do_io; | ||
1682 | case 0xe8: /* call (near) */ { | 1716 | case 0xe8: /* call (near) */ { |
1683 | long int rel; | 1717 | long int rel; |
1684 | switch (c->op_bytes) { | 1718 | switch (c->op_bytes) { |
@@ -1729,6 +1763,22 @@ special_insn: | |||
1729 | jmp_rel(c, c->src.val); | 1763 | jmp_rel(c, c->src.val); |
1730 | c->dst.type = OP_NONE; /* Disable writeback. */ | 1764 | c->dst.type = OP_NONE; /* Disable writeback. */ |
1731 | break; | 1765 | break; |
1766 | case 0xec: /* in al,dx */ | ||
1767 | case 0xed: /* in (e/r)ax,dx */ | ||
1768 | port = c->regs[VCPU_REGS_RDX]; | ||
1769 | io_dir_in = 1; | ||
1770 | goto do_io; | ||
1771 | case 0xee: /* out al,dx */ | ||
1772 | case 0xef: /* out (e/r)ax,dx */ | ||
1773 | port = c->regs[VCPU_REGS_RDX]; | ||
1774 | io_dir_in = 0; | ||
1775 | do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, | ||
1776 | (c->d & ByteOp) ? 1 : c->op_bytes, | ||
1777 | port) != 0) { | ||
1778 | c->eip = saved_eip; | ||
1779 | goto cannot_emulate; | ||
1780 | } | ||
1781 | return 0; | ||
1732 | case 0xf4: /* hlt */ | 1782 | case 0xf4: /* hlt */ |
1733 | ctxt->vcpu->arch.halt_request = 1; | 1783 | ctxt->vcpu->arch.halt_request = 1; |
1734 | break; | 1784 | break; |
@@ -1754,6 +1804,14 @@ special_insn: | |||
1754 | ctxt->eflags |= X86_EFLAGS_IF; | 1804 | ctxt->eflags |= X86_EFLAGS_IF; |
1755 | c->dst.type = OP_NONE; /* Disable writeback. */ | 1805 | c->dst.type = OP_NONE; /* Disable writeback. */ |
1756 | break; | 1806 | break; |
1807 | case 0xfc: /* cld */ | ||
1808 | ctxt->eflags &= ~EFLG_DF; | ||
1809 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1810 | break; | ||
1811 | case 0xfd: /* std */ | ||
1812 | ctxt->eflags |= EFLG_DF; | ||
1813 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1814 | break; | ||
1757 | case 0xfe ... 0xff: /* Grp4/Grp5 */ | 1815 | case 0xfe ... 0xff: /* Grp4/Grp5 */ |
1758 | rc = emulate_grp45(ctxt, ops); | 1816 | rc = emulate_grp45(ctxt, ops); |
1759 | if (rc != 0) | 1817 | if (rc != 0) |
@@ -1768,7 +1826,7 @@ writeback: | |||
1768 | 1826 | ||
1769 | /* Commit shadow register state. */ | 1827 | /* Commit shadow register state. */ |
1770 | memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); | 1828 | memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); |
1771 | ctxt->vcpu->arch.rip = c->eip; | 1829 | kvm_rip_write(ctxt->vcpu, c->eip); |
1772 | 1830 | ||
1773 | done: | 1831 | done: |
1774 | if (rc == X86EMUL_UNHANDLEABLE) { | 1832 | if (rc == X86EMUL_UNHANDLEABLE) { |
@@ -1793,7 +1851,7 @@ twobyte_insn: | |||
1793 | goto done; | 1851 | goto done; |
1794 | 1852 | ||
1795 | /* Let the processor re-execute the fixed hypercall */ | 1853 | /* Let the processor re-execute the fixed hypercall */ |
1796 | c->eip = ctxt->vcpu->arch.rip; | 1854 | c->eip = kvm_rip_read(ctxt->vcpu); |
1797 | /* Disable writeback. */ | 1855 | /* Disable writeback. */ |
1798 | c->dst.type = OP_NONE; | 1856 | c->dst.type = OP_NONE; |
1799 | break; | 1857 | break; |
@@ -1889,7 +1947,7 @@ twobyte_insn: | |||
1889 | rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); | 1947 | rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); |
1890 | if (rc) { | 1948 | if (rc) { |
1891 | kvm_inject_gp(ctxt->vcpu, 0); | 1949 | kvm_inject_gp(ctxt->vcpu, 0); |
1892 | c->eip = ctxt->vcpu->arch.rip; | 1950 | c->eip = kvm_rip_read(ctxt->vcpu); |
1893 | } | 1951 | } |
1894 | rc = X86EMUL_CONTINUE; | 1952 | rc = X86EMUL_CONTINUE; |
1895 | c->dst.type = OP_NONE; | 1953 | c->dst.type = OP_NONE; |
@@ -1899,7 +1957,7 @@ twobyte_insn: | |||
1899 | rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); | 1957 | rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); |
1900 | if (rc) { | 1958 | if (rc) { |
1901 | kvm_inject_gp(ctxt->vcpu, 0); | 1959 | kvm_inject_gp(ctxt->vcpu, 0); |
1902 | c->eip = ctxt->vcpu->arch.rip; | 1960 | c->eip = kvm_rip_read(ctxt->vcpu); |
1903 | } else { | 1961 | } else { |
1904 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; | 1962 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; |
1905 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; | 1963 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3f2b8962cbd0..31e8730fa246 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -640,24 +640,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
640 | } | 640 | } |
641 | 641 | ||
642 | 642 | ||
643 | #ifdef CONFIG_X86_32 | ||
644 | /* It's safe to allow irq's after cr2 has been saved and the vmalloc | ||
645 | fault has been handled. */ | ||
646 | if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK)) | ||
647 | local_irq_enable(); | ||
648 | |||
649 | /* | 643 | /* |
650 | * If we're in an interrupt, have no user context or are running in an | 644 | * It's safe to allow irq's after cr2 has been saved and the |
651 | * atomic region then we must not take the fault. | 645 | * vmalloc fault has been handled. |
646 | * | ||
647 | * User-mode registers count as a user access even for any | ||
648 | * potential system fault or CPU buglet. | ||
652 | */ | 649 | */ |
653 | if (in_atomic() || !mm) | 650 | if (user_mode_vm(regs)) { |
654 | goto bad_area_nosemaphore; | 651 | local_irq_enable(); |
655 | #else /* CONFIG_X86_64 */ | 652 | error_code |= PF_USER; |
656 | if (likely(regs->flags & X86_EFLAGS_IF)) | 653 | } else if (regs->flags & X86_EFLAGS_IF) |
657 | local_irq_enable(); | 654 | local_irq_enable(); |
658 | 655 | ||
656 | #ifdef CONFIG_X86_64 | ||
659 | if (unlikely(error_code & PF_RSVD)) | 657 | if (unlikely(error_code & PF_RSVD)) |
660 | pgtable_bad(address, regs, error_code); | 658 | pgtable_bad(address, regs, error_code); |
659 | #endif | ||
661 | 660 | ||
662 | /* | 661 | /* |
663 | * If we're in an interrupt, have no user context or are running in an | 662 | * If we're in an interrupt, have no user context or are running in an |
@@ -666,15 +665,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
666 | if (unlikely(in_atomic() || !mm)) | 665 | if (unlikely(in_atomic() || !mm)) |
667 | goto bad_area_nosemaphore; | 666 | goto bad_area_nosemaphore; |
668 | 667 | ||
669 | /* | ||
670 | * User-mode registers count as a user access even for any | ||
671 | * potential system fault or CPU buglet. | ||
672 | */ | ||
673 | if (user_mode_vm(regs)) | ||
674 | error_code |= PF_USER; | ||
675 | again: | 668 | again: |
676 | #endif | 669 | /* |
677 | /* When running in the kernel we expect faults to occur only to | 670 | * When running in the kernel we expect faults to occur only to |
678 | * addresses in user space. All other faults represent errors in the | 671 | * addresses in user space. All other faults represent errors in the |
679 | * kernel and should generate an OOPS. Unfortunately, in the case of an | 672 | * kernel and should generate an OOPS. Unfortunately, in the case of an |
680 | * erroneous fault occurring in a code path which already holds mmap_sem | 673 | * erroneous fault occurring in a code path which already holds mmap_sem |
@@ -737,9 +730,6 @@ good_area: | |||
737 | goto bad_area; | 730 | goto bad_area; |
738 | } | 731 | } |
739 | 732 | ||
740 | #ifdef CONFIG_X86_32 | ||
741 | survive: | ||
742 | #endif | ||
743 | /* | 733 | /* |
744 | * If for any reason at all we couldn't handle the fault, | 734 | * If for any reason at all we couldn't handle the fault, |
745 | * make sure we exit gracefully rather than endlessly redo | 735 | * make sure we exit gracefully rather than endlessly redo |
@@ -874,12 +864,11 @@ out_of_memory: | |||
874 | up_read(&mm->mmap_sem); | 864 | up_read(&mm->mmap_sem); |
875 | if (is_global_init(tsk)) { | 865 | if (is_global_init(tsk)) { |
876 | yield(); | 866 | yield(); |
877 | #ifdef CONFIG_X86_32 | 867 | /* |
878 | down_read(&mm->mmap_sem); | 868 | * Re-lookup the vma - in theory the vma tree might |
879 | goto survive; | 869 | * have changed: |
880 | #else | 870 | */ |
881 | goto again; | 871 | goto again; |
882 | #endif | ||
883 | } | 872 | } |
884 | 873 | ||
885 | printk("VM: killing process %s\n", tsk->comm); | 874 | printk("VM: killing process %s\n", tsk->comm); |
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 165c871ba9af..bcc079c282dd 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c | |||
@@ -137,6 +137,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) | |||
137 | 137 | ||
138 | return (void*) vaddr; | 138 | return (void*) vaddr; |
139 | } | 139 | } |
140 | EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ | ||
140 | 141 | ||
141 | struct page *kmap_atomic_to_page(void *ptr) | 142 | struct page *kmap_atomic_to_page(void *ptr) |
142 | { | 143 | { |
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index e4c43ec71b29..ae71e11eb3e5 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
@@ -220,6 +220,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
220 | return (__force void __iomem *)phys_to_virt(phys_addr); | 220 | return (__force void __iomem *)phys_to_virt(phys_addr); |
221 | 221 | ||
222 | /* | 222 | /* |
223 | * Check if the request spans more than any BAR in the iomem resource | ||
224 | * tree. | ||
225 | */ | ||
226 | WARN_ON(iomem_map_sanity_check(phys_addr, size)); | ||
227 | |||
228 | /* | ||
223 | * Don't allow anybody to remap normal RAM that we're using.. | 229 | * Don't allow anybody to remap normal RAM that we're using.. |
224 | */ | 230 | */ |
225 | for (pfn = phys_addr >> PAGE_SHIFT; | 231 | for (pfn = phys_addr >> PAGE_SHIFT; |
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 004ba86326ae..c9f7cda48ed7 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c | |||
@@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void) | |||
198 | /* Get the TSC speed from Xen */ | 198 | /* Get the TSC speed from Xen */ |
199 | unsigned long xen_tsc_khz(void) | 199 | unsigned long xen_tsc_khz(void) |
200 | { | 200 | { |
201 | u64 xen_khz = 1000000ULL << 32; | 201 | struct pvclock_vcpu_time_info *info = |
202 | const struct pvclock_vcpu_time_info *info = | ||
203 | &HYPERVISOR_shared_info->vcpu_info[0].time; | 202 | &HYPERVISOR_shared_info->vcpu_info[0].time; |
204 | 203 | ||
205 | do_div(xen_khz, info->tsc_to_system_mul); | 204 | return pvclock_tsc_khz(info); |
206 | if (info->tsc_shift < 0) | ||
207 | xen_khz <<= -info->tsc_shift; | ||
208 | else | ||
209 | xen_khz >>= info->tsc_shift; | ||
210 | |||
211 | return xen_khz; | ||
212 | } | 205 | } |
213 | 206 | ||
214 | cycle_t xen_clocksource_read(void) | 207 | cycle_t xen_clocksource_read(void) |