diff options
Diffstat (limited to 'arch/x86')
| -rw-r--r-- | arch/x86/include/asm/pvclock.h | 1 | ||||
| -rw-r--r-- | arch/x86/kernel/pvclock.c | 44 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.c | 33 | ||||
| -rw-r--r-- | arch/x86/vdso/vclock_gettime.c | 34 |
4 files changed, 43 insertions, 69 deletions
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 25b1cc07d496..d6b078e9fa28 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
| @@ -95,7 +95,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, | |||
| 95 | 95 | ||
| 96 | struct pvclock_vsyscall_time_info { | 96 | struct pvclock_vsyscall_time_info { |
| 97 | struct pvclock_vcpu_time_info pvti; | 97 | struct pvclock_vcpu_time_info pvti; |
| 98 | u32 migrate_count; | ||
| 99 | } __attribute__((__aligned__(SMP_CACHE_BYTES))); | 98 | } __attribute__((__aligned__(SMP_CACHE_BYTES))); |
| 100 | 99 | ||
| 101 | #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) | 100 | #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) |
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index e5ecd20e72dd..2f355d229a58 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
| @@ -141,46 +141,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, | |||
| 141 | set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); | 141 | set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); |
| 142 | } | 142 | } |
| 143 | 143 | ||
| 144 | static struct pvclock_vsyscall_time_info *pvclock_vdso_info; | ||
| 145 | |||
| 146 | static struct pvclock_vsyscall_time_info * | ||
| 147 | pvclock_get_vsyscall_user_time_info(int cpu) | ||
| 148 | { | ||
| 149 | if (!pvclock_vdso_info) { | ||
| 150 | BUG(); | ||
| 151 | return NULL; | ||
| 152 | } | ||
| 153 | |||
| 154 | return &pvclock_vdso_info[cpu]; | ||
| 155 | } | ||
| 156 | |||
| 157 | struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) | ||
| 158 | { | ||
| 159 | return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; | ||
| 160 | } | ||
| 161 | |||
| 162 | #ifdef CONFIG_X86_64 | 144 | #ifdef CONFIG_X86_64 |
| 163 | static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, | ||
| 164 | void *v) | ||
| 165 | { | ||
| 166 | struct task_migration_notifier *mn = v; | ||
| 167 | struct pvclock_vsyscall_time_info *pvti; | ||
| 168 | |||
| 169 | pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); | ||
| 170 | |||
| 171 | /* this is NULL when pvclock vsyscall is not initialized */ | ||
| 172 | if (unlikely(pvti == NULL)) | ||
| 173 | return NOTIFY_DONE; | ||
| 174 | |||
| 175 | pvti->migrate_count++; | ||
| 176 | |||
| 177 | return NOTIFY_DONE; | ||
| 178 | } | ||
| 179 | |||
| 180 | static struct notifier_block pvclock_migrate = { | ||
| 181 | .notifier_call = pvclock_task_migrate, | ||
| 182 | }; | ||
| 183 | |||
| 184 | /* | 145 | /* |
| 185 | * Initialize the generic pvclock vsyscall state. This will allocate | 146 | * Initialize the generic pvclock vsyscall state. This will allocate |
| 186 | * a/some page(s) for the per-vcpu pvclock information, set up a | 147 | * a/some page(s) for the per-vcpu pvclock information, set up a |
| @@ -194,17 +155,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, | |||
| 194 | 155 | ||
| 195 | WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); | 156 | WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); |
| 196 | 157 | ||
| 197 | pvclock_vdso_info = i; | ||
| 198 | |||
| 199 | for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { | 158 | for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { |
| 200 | __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, | 159 | __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, |
| 201 | __pa(i) + (idx*PAGE_SIZE), | 160 | __pa(i) + (idx*PAGE_SIZE), |
| 202 | PAGE_KERNEL_VVAR); | 161 | PAGE_KERNEL_VVAR); |
| 203 | } | 162 | } |
| 204 | 163 | ||
| 205 | |||
| 206 | register_task_migration_notifier(&pvclock_migrate); | ||
| 207 | |||
| 208 | return 0; | 164 | return 0; |
| 209 | } | 165 | } |
| 210 | #endif | 166 | #endif |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ed31c31b2485..c73efcd03e29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
| @@ -1669,12 +1669,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
| 1669 | &guest_hv_clock, sizeof(guest_hv_clock)))) | 1669 | &guest_hv_clock, sizeof(guest_hv_clock)))) |
| 1670 | return 0; | 1670 | return 0; |
| 1671 | 1671 | ||
| 1672 | /* | 1672 | /* This VCPU is paused, but it's legal for a guest to read another |
| 1673 | * The interface expects us to write an even number signaling that the | 1673 | * VCPU's kvmclock, so we really have to follow the specification where |
| 1674 | * update is finished. Since the guest won't see the intermediate | 1674 | * it says that version is odd if data is being modified, and even after |
| 1675 | * state, we just increase by 2 at the end. | 1675 | * it is consistent. |
| 1676 | * | ||
| 1677 | * Version field updates must be kept separate. This is because | ||
| 1678 | * kvm_write_guest_cached might use a "rep movs" instruction, and | ||
| 1679 | * writes within a string instruction are weakly ordered. So there | ||
| 1680 | * are three writes overall. | ||
| 1681 | * | ||
| 1682 | * As a small optimization, only write the version field in the first | ||
| 1683 | * and third write. The vcpu->pv_time cache is still valid, because the | ||
| 1684 | * version field is the first in the struct. | ||
| 1676 | */ | 1685 | */ |
| 1677 | vcpu->hv_clock.version = guest_hv_clock.version + 2; | 1686 | BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); |
| 1687 | |||
| 1688 | vcpu->hv_clock.version = guest_hv_clock.version + 1; | ||
| 1689 | kvm_write_guest_cached(v->kvm, &vcpu->pv_time, | ||
| 1690 | &vcpu->hv_clock, | ||
| 1691 | sizeof(vcpu->hv_clock.version)); | ||
| 1692 | |||
| 1693 | smp_wmb(); | ||
| 1678 | 1694 | ||
| 1679 | /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ | 1695 | /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ |
| 1680 | pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); | 1696 | pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); |
| @@ -1695,6 +1711,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
| 1695 | kvm_write_guest_cached(v->kvm, &vcpu->pv_time, | 1711 | kvm_write_guest_cached(v->kvm, &vcpu->pv_time, |
| 1696 | &vcpu->hv_clock, | 1712 | &vcpu->hv_clock, |
| 1697 | sizeof(vcpu->hv_clock)); | 1713 | sizeof(vcpu->hv_clock)); |
| 1714 | |||
| 1715 | smp_wmb(); | ||
| 1716 | |||
| 1717 | vcpu->hv_clock.version++; | ||
| 1718 | kvm_write_guest_cached(v->kvm, &vcpu->pv_time, | ||
| 1719 | &vcpu->hv_clock, | ||
| 1720 | sizeof(vcpu->hv_clock.version)); | ||
| 1698 | return 0; | 1721 | return 0; |
| 1699 | } | 1722 | } |
| 1700 | 1723 | ||
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 40d2473836c9..9793322751e0 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
| @@ -82,15 +82,18 @@ static notrace cycle_t vread_pvclock(int *mode) | |||
| 82 | cycle_t ret; | 82 | cycle_t ret; |
| 83 | u64 last; | 83 | u64 last; |
| 84 | u32 version; | 84 | u32 version; |
| 85 | u32 migrate_count; | ||
| 86 | u8 flags; | 85 | u8 flags; |
| 87 | unsigned cpu, cpu1; | 86 | unsigned cpu, cpu1; |
| 88 | 87 | ||
| 89 | 88 | ||
| 90 | /* | 89 | /* |
| 91 | * When looping to get a consistent (time-info, tsc) pair, we | 90 | * Note: hypervisor must guarantee that: |
| 92 | * also need to deal with the possibility we can switch vcpus, | 91 | * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. |
| 93 | * so make sure we always re-fetch time-info for the current vcpu. | 92 | * 2. that per-CPU pvclock time info is updated if the |
| 93 | * underlying CPU changes. | ||
| 94 | * 3. that version is increased whenever underlying CPU | ||
| 95 | * changes. | ||
| 96 | * | ||
| 94 | */ | 97 | */ |
| 95 | do { | 98 | do { |
| 96 | cpu = __getcpu() & VGETCPU_CPU_MASK; | 99 | cpu = __getcpu() & VGETCPU_CPU_MASK; |
| @@ -99,27 +102,20 @@ static notrace cycle_t vread_pvclock(int *mode) | |||
| 99 | * __getcpu() calls (Gleb). | 102 | * __getcpu() calls (Gleb). |
| 100 | */ | 103 | */ |
| 101 | 104 | ||
| 102 | /* Make sure migrate_count will change if we leave the VCPU. */ | 105 | pvti = get_pvti(cpu); |
| 103 | do { | ||
| 104 | pvti = get_pvti(cpu); | ||
| 105 | migrate_count = pvti->migrate_count; | ||
| 106 | |||
| 107 | cpu1 = cpu; | ||
| 108 | cpu = __getcpu() & VGETCPU_CPU_MASK; | ||
| 109 | } while (unlikely(cpu != cpu1)); | ||
| 110 | 106 | ||
| 111 | version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); | 107 | version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); |
| 112 | 108 | ||
| 113 | /* | 109 | /* |
| 114 | * Test we're still on the cpu as well as the version. | 110 | * Test we're still on the cpu as well as the version. |
| 115 | * - We must read TSC of pvti's VCPU. | 111 | * We could have been migrated just after the first |
| 116 | * - KVM doesn't follow the versioning protocol, so data could | 112 | * vgetcpu but before fetching the version, so we |
| 117 | * change before version if we left the VCPU. | 113 | * wouldn't notice a version change. |
| 118 | */ | 114 | */ |
| 119 | smp_rmb(); | 115 | cpu1 = __getcpu() & VGETCPU_CPU_MASK; |
| 120 | } while (unlikely((pvti->pvti.version & 1) || | 116 | } while (unlikely(cpu != cpu1 || |
| 121 | pvti->pvti.version != version || | 117 | (pvti->pvti.version & 1) || |
| 122 | pvti->migrate_count != migrate_count)); | 118 | pvti->pvti.version != version)); |
| 123 | 119 | ||
| 124 | if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) | 120 | if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) |
| 125 | *mode = VCLOCK_NONE; | 121 | *mode = VCLOCK_NONE; |
