author		Linus Torvalds <torvalds@linux-foundation.org>	2015-11-05 19:26:26 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-11-05 19:26:26 -0500
commit		933425fb0010bd02bd459b41e63082756818ffce (patch)
tree		1cbc6c2035b9dcff8cb265c9ac562cbee7c6bb82
parent		a3e7531535a0c6e5acbaa5436f37933bb471aa95 (diff)
parent		a3eaa8649e4c6a6afdafaa04b9114fb230617bb1 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:
 "First batch of KVM changes for 4.4.

  s390:
     A bunch of fixes and optimizations for interrupt and time handling.

  PPC:
     Mostly bug fixes.

  ARM:
     No big features, but many small fixes and prerequisites, including:

      - a number of fixes for the arch-timer

      - introducing proper level-triggered semantics for the arch-timers

      - a series of patches to synchronously halt a guest (prerequisite
        for IRQ forwarding)

      - some tracepoint improvements

      - a tweak for the EL2 panic handlers

      - some more VGIC cleanups getting rid of redundant state

  x86:
     Quite a few changes:

      - support for VT-d posted interrupts (i.e. PCI devices can inject
        interrupts directly into vCPUs).  This introduces a new
        component (in virt/lib/) that connects VFIO and KVM together.
        The same infrastructure will be used for ARM interrupt
        forwarding as well.

      - more Hyper-V features, though the main one (the Hyper-V
        synthetic interrupt controller) will have to wait for 4.5.
        These will let KVM expose Hyper-V devices.

      - nested virtualization now supports VPID (same as PCID but for
        vCPUs) which makes it quite a bit faster

      - for future hardware that supports NVDIMM, there is support for
        clflushopt, clwb, pcommit

      - support for "split irqchip", i.e. LAPIC in kernel +
        IOAPIC/PIC/PIT in userspace, which reduces the attack surface of
        the hypervisor

      - obligatory smattering of SMM fixes

      - on the guest side, stable scheduler clock support was rewritten
        to not require help from the hypervisor"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (123 commits)
  KVM: VMX: Fix commit which broke PML
  KVM: x86: obey KVM_X86_QUIRK_CD_NW_CLEARED in kvm_set_cr0()
  KVM: x86: allow RSM from 64-bit mode
  KVM: VMX: fix SMEP and SMAP without EPT
  KVM: x86: move kvm_set_irq_inatomic to legacy device assignment
  KVM: device assignment: remove pointless #ifdefs
  KVM: x86: merge kvm_arch_set_irq with kvm_set_msi_inatomic
  KVM: x86: zero apic_arb_prio on reset
  drivers/hv: share Hyper-V SynIC constants with userspace
  KVM: x86: handle SMBASE as physical address in RSM
  KVM: x86: add read_phys to x86_emulate_ops
  KVM: x86: removing unused variable
  KVM: don't pointlessly leave KVM_COMPAT=y in non-KVM configs
  KVM: arm/arm64: Merge vgic_set_lr() and vgic_sync_lr_elrsr()
  KVM: arm/arm64: Clean up vgic_retire_lr() and surroundings
  KVM: arm/arm64: Optimize away redundant LR tracking
  KVM: s390: use simple switch statement as multiplexer
  KVM: s390: drop useless newline in debugging data
  KVM: s390: SCA must not cross page boundaries
  KVM: arm: Do not indent the arguments of DECLARE_BITMAP
  ...
-rw-r--r--  Documentation/kernel-parameters.txt  1
-rw-r--r--  Documentation/virtual/kvm/api.txt  52
-rw-r--r--  Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt  187
-rw-r--r--  Documentation/virtual/kvm/devices/arm-vgic.txt  18
-rw-r--r--  Documentation/virtual/kvm/locking.txt  12
-rw-r--r--  MAINTAINERS  7
-rw-r--r--  Makefile  10
-rw-r--r--  arch/arm/include/asm/kvm_arm.h  20
-rw-r--r--  arch/arm/include/asm/kvm_host.h  5
-rw-r--r--  arch/arm/kvm/Kconfig  2
-rw-r--r--  arch/arm/kvm/arm.c  76
-rw-r--r--  arch/arm/kvm/psci.c  10
-rw-r--r--  arch/arm/kvm/trace.h  10
-rw-r--r--  arch/arm64/include/asm/kvm_arm.h  16
-rw-r--r--  arch/arm64/include/asm/kvm_host.h  5
-rw-r--r--  arch/arm64/kvm/Kconfig  2
-rw-r--r--  arch/arm64/kvm/hyp.S  8
-rw-r--r--  arch/mips/include/asm/kvm_host.h  2
-rw-r--r--  arch/powerpc/include/asm/disassemble.h  5
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h  2
-rw-r--r--  arch/powerpc/include/asm/reg_booke.h  6
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c  3
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c  2
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S  29
-rw-r--r--  arch/powerpc/kvm/e500.c  3
-rw-r--r--  arch/powerpc/kvm/e500_emulate.c  19
-rw-r--r--  arch/powerpc/kvm/e500_mmu_host.c  4
-rw-r--r--  arch/powerpc/kvm/powerpc.c  3
-rw-r--r--  arch/s390/include/asm/kvm_host.h  2
-rw-r--r--  arch/s390/kvm/intercept.c  42
-rw-r--r--  arch/s390/kvm/interrupt.c  116
-rw-r--r--  arch/s390/kvm/kvm-s390.c  58
-rw-r--r--  arch/s390/kvm/kvm-s390.h  35
-rw-r--r--  arch/s390/kvm/priv.c  19
-rw-r--r--  arch/x86/include/asm/irq_remapping.h  10
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h  10
-rw-r--r--  arch/x86/include/asm/kvm_host.h  38
-rw-r--r--  arch/x86/include/asm/vmx.h  3
-rw-r--r--  arch/x86/include/uapi/asm/hyperv.h  18
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h  4
-rw-r--r--  arch/x86/kernel/kvmclock.c  46
-rw-r--r--  arch/x86/kvm/Kconfig  2
-rw-r--r--  arch/x86/kvm/assigned-dev.c  62
-rw-r--r--  arch/x86/kvm/cpuid.c  2
-rw-r--r--  arch/x86/kvm/cpuid.h  37
-rw-r--r--  arch/x86/kvm/emulate.c  35
-rw-r--r--  arch/x86/kvm/hyperv.c  31
-rw-r--r--  arch/x86/kvm/i8254.c  4
-rw-r--r--  arch/x86/kvm/ioapic.c  29
-rw-r--r--  arch/x86/kvm/ioapic.h  15
-rw-r--r--  arch/x86/kvm/irq.c  40
-rw-r--r--  arch/x86/kvm/irq.h  27
-rw-r--r--  arch/x86/kvm/irq_comm.c  129
-rw-r--r--  arch/x86/kvm/lapic.c  127
-rw-r--r--  arch/x86/kvm/lapic.h  7
-rw-r--r--  arch/x86/kvm/mmu.c  91
-rw-r--r--  arch/x86/kvm/paging_tmpl.h  19
-rw-r--r--  arch/x86/kvm/svm.c  43
-rw-r--r--  arch/x86/kvm/trace.h  51
-rw-r--r--  arch/x86/kvm/vmx.c  750
-rw-r--r--  arch/x86/kvm/x86.c  256
-rw-r--r--  drivers/hv/hyperv_vmbus.h  5
-rw-r--r--  drivers/iommu/irq_remapping.c  12
-rw-r--r--  drivers/vfio/Kconfig  1
-rw-r--r--  drivers/vfio/pci/Kconfig  1
-rw-r--r--  drivers/vfio/pci/vfio_pci_intrs.c  9
-rw-r--r--  drivers/vfio/pci/vfio_pci_private.h  2
-rw-r--r--  include/kvm/arm_arch_timer.h  4
-rw-r--r--  include/kvm/arm_vgic.h  16
-rw-r--r--  include/linux/hyperv.h  1
-rw-r--r--  include/linux/irqbypass.h  90
-rw-r--r--  include/linux/kvm_host.h  42
-rw-r--r--  include/linux/kvm_irqfd.h  71
-rw-r--r--  include/uapi/linux/kvm.h  7
-rw-r--r--  kernel/sched/cputime.c  2
-rw-r--r--  virt/Makefile  1
-rw-r--r--  virt/kvm/Kconfig  5
-rw-r--r--  virt/kvm/arm/arch_timer.c  173
-rw-r--r--  virt/kvm/arm/trace.h  63
-rw-r--r--  virt/kvm/arm/vgic-v2.c  6
-rw-r--r--  virt/kvm/arm/vgic-v3.c  6
-rw-r--r--  virt/kvm/arm/vgic.c  308
-rw-r--r--  virt/kvm/async_pf.c  4
-rw-r--r--  virt/kvm/eventfd.c  190
-rw-r--r--  virt/kvm/irqchip.c  18
-rw-r--r--  virt/kvm/kvm_main.c  11
-rw-r--r--  virt/lib/Kconfig  2
-rw-r--r--  virt/lib/Makefile  1
-rw-r--r--  virt/lib/irqbypass.c  257
89 files changed, 2956 insertions, 1029 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 688760f790b1..816bf2fe55f5 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1585,6 +1585,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 		nosid	disable Source ID checking
 		no_x2apic_optout
 			BIOS x2APIC opt-out request will be ignored
+		nopost	disable Interrupt Posting
 
 	iomem=		Disable strict checking of access to MMIO memory
 			strict	regions from userspace.
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 29ece601008e..092ee9fbaf2b 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -401,10 +401,9 @@ Capability: basic
 Architectures: x86, ppc, mips
 Type: vcpu ioctl
 Parameters: struct kvm_interrupt (in)
-Returns: 0 on success, -1 on error
+Returns: 0 on success, negative on failure.
 
-Queues a hardware interrupt vector to be injected. This is only
-useful if in-kernel local APIC or equivalent is not used.
+Queues a hardware interrupt vector to be injected.
 
 /* for KVM_INTERRUPT */
 struct kvm_interrupt {
@@ -414,7 +413,14 @@ struct kvm_interrupt {
 
 X86:
 
-Note 'irq' is an interrupt vector, not an interrupt pin or line.
+Returns: 0 on success,
+	 -EEXIST if an interrupt is already enqueued
+	 -EINVAL if the irq number is invalid
+	 -ENXIO if the PIC is in the kernel
+	 -EFAULT if the pointer is invalid
+
+Note 'irq' is an interrupt vector, not an interrupt pin or line. This
+ioctl is useful if the in-kernel PIC is not used.
 
 PPC:
 
@@ -1598,7 +1604,7 @@ provided event instead of triggering an exit.
 struct kvm_ioeventfd {
 	__u64 datamatch;
 	__u64 addr;        /* legal pio/mmio address */
-	__u32 len;         /* 1, 2, 4, or 8 bytes */
+	__u32 len;         /* 0, 1, 2, 4, or 8 bytes */
 	__s32 fd;
 	__u32 flags;
 	__u8  pad[36];
@@ -1621,6 +1627,10 @@ to the registered address is equal to datamatch in struct kvm_ioeventfd.
 For virtio-ccw devices, addr contains the subchannel id and datamatch the
 virtqueue index.
 
+With KVM_CAP_IOEVENTFD_ANY_LENGTH, a zero length ioeventfd is allowed, and
+the kernel will ignore the length of the guest write and may get a faster
+vmexit.  The speedup may only apply to specific architectures, but the
+ioeventfd will work anyway.
 
 4.60 KVM_DIRTY_TLB
 
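As a rough userspace sketch of the zero-length case described above (assuming
KVM_CAP_IOEVENTFD_ANY_LENGTH is reported by KVM_CHECK_EXTENSION; the vm_fd,
guest-physical address and eventfd handling here are illustrative, not part of
the API text):

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Notify 'efd' on a guest MMIO write of any length to 'gpa'. */
    static int register_any_length_ioeventfd(int vm_fd, uint64_t gpa, int efd)
    {
            struct kvm_ioeventfd ioev = {
                    .addr  = gpa,   /* illustrative guest-physical doorbell address */
                    .len   = 0,     /* 0 = ignore the length of the guest write */
                    .fd    = efd,
                    .flags = 0,     /* MMIO, no datamatch */
            };

            return ioctl(vm_fd, KVM_IOEVENTFD, &ioev);
    }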
@@ -3309,6 +3319,18 @@ Valid values for 'type' are:
 	  to ignore the request, or to gather VM memory core dump and/or
 	  reset/shutdown of the VM.
 
+		/* KVM_EXIT_IOAPIC_EOI */
+		struct {
+			__u8 vector;
+		} eoi;
+
+Indicates that the VCPU's in-kernel local APIC received an EOI for a
+level-triggered IOAPIC interrupt.  This exit only triggers when the
+IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled);
+the userspace IOAPIC should process the EOI and retrigger the interrupt if
+it is still asserted.  Vector is the LAPIC interrupt vector for which the
+EOI was received.
+
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -3627,6 +3649,26 @@ struct {
 
 KVM handlers should exit to userspace with rc = -EREMOTE.
 
+7.5 KVM_CAP_SPLIT_IRQCHIP
+
+Architectures: x86
+Parameters: args[0] - number of routes reserved for userspace IOAPICs
+Returns: 0 on success, -1 on error
+
+Create a local apic for each processor in the kernel. This can be used
+instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the
+IOAPIC and PIC (and also the PIT, even though this has to be enabled
+separately).
+
+This capability also enables in-kernel routing of interrupt requests;
+when KVM_CAP_SPLIT_IRQCHIP is used, only routes of KVM_IRQ_ROUTING_MSI
+type are used in the IRQ routing table.  The first args[0] MSI routes are
+reserved for the IOAPIC pins.  Whenever the LAPIC receives an EOI for
+these routes, a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace.
+
+Fails if VCPU has already been created, or if the irqchip is already in the
+kernel (i.e. KVM_CREATE_IRQCHIP has already been called).
+
 
 8. Other capabilities.
 ----------------------
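A minimal, hypothetical sketch of how a userspace VMM would enable the
capability described above (the route count of 24 and the vm_fd variable are
illustrative; as stated, the call must precede both VCPU creation and
KVM_CREATE_IRQCHIP):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Keep the LAPIC in the kernel; emulate IOAPIC/PIC/PIT in userspace. */
    static int enable_split_irqchip(int vm_fd)
    {
            struct kvm_enable_cap cap = {
                    .cap     = KVM_CAP_SPLIT_IRQCHIP,
                    .args[0] = 24,  /* MSI routes reserved for userspace IOAPIC pins */
            };

            /* Fails once a VCPU exists or an in-kernel irqchip was created. */
            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }

After this, EOIs for the reserved routes surface as KVM_EXIT_IOAPIC_EOI exits,
which the userspace IOAPIC handles as described above.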
diff --git a/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
new file mode 100644
index 000000000000..38bca2835278
--- /dev/null
+++ b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
@@ -0,0 +1,187 @@
+KVM/ARM VGIC Forwarded Physical Interrupts
+==========================================
+
+The KVM/ARM code implements software support for the ARM Generic
+Interrupt Controller's (GIC's) hardware support for virtualization by
+allowing software to inject virtual interrupts to a VM, which the guest
+OS sees as regular interrupts.  The code is famously known as the VGIC.
+
+Some of these virtual interrupts, however, correspond to physical
+interrupts from real physical devices.  One example could be the
+architected timer, which itself supports virtualization, and therefore
+lets a guest OS program the hardware device directly to raise an
+interrupt at some point in time.  When such an interrupt is raised, the
+host OS initially handles the interrupt and must somehow signal this
+event as a virtual interrupt to the guest.  Another example could be a
+passthrough device, where the physical interrupts are initially handled
+by the host, but the device driver for the device lives in the guest OS
+and KVM must therefore somehow inject a virtual interrupt on behalf of
+the physical one to the guest OS.
+
+These virtual interrupts corresponding to a physical interrupt on the
+host are called forwarded physical interrupts, but are also sometimes
+referred to as 'virtualized physical interrupts' and 'mapped interrupts'.
+
+Forwarded physical interrupts are handled slightly differently compared
+to virtual interrupts generated purely by a software emulated device.
+
+
+The HW bit
+----------
+Virtual interrupts are signalled to the guest by programming the List
+Registers (LRs) on the GIC before running a VCPU.  The LR is programmed
+with the virtual IRQ number and the state of the interrupt (Pending,
+Active, or Pending+Active).  When the guest ACKs and EOIs a virtual
+interrupt, the LR state moves from Pending to Active, and finally to
+inactive.
+
+The LRs include an extra bit, called the HW bit.  When this bit is set,
+KVM must also program an additional field in the LR, the physical IRQ
+number, to link the virtual with the physical IRQ.
+
+When the HW bit is set, KVM must EITHER set the Pending OR the Active
+bit, never both at the same time.
+
+Setting the HW bit causes the hardware to deactivate the physical
+interrupt on the physical distributor when the guest deactivates the
+corresponding virtual interrupt.
+
+
+Forwarded Physical Interrupts Life Cycle
+----------------------------------------
+
+The state of forwarded physical interrupts is managed in the following way:
+
+  - The physical interrupt is acked by the host, and becomes active on
+    the physical distributor (*).
+  - KVM sets the LR.Pending bit, because this is the only way the GICV
+    interface is going to present it to the guest.
+  - LR.Pending will stay set as long as the guest has not acked the interrupt.
+  - LR.Pending transitions to LR.Active on the guest read of the IAR, as
+    expected.
+  - On guest EOI, the *physical distributor* active bit gets cleared,
+    but the LR.Active is left untouched (set).
+  - KVM clears the LR on VM exits when the physical distributor
+    active state has been cleared.
+
+(*): The host handling is slightly more complicated.  For some forwarded
+interrupts (shared), KVM directly sets the active state on the physical
+distributor before entering the guest, because the interrupt is never actually
+handled on the host (see details on the timer as an example below).  For other
+forwarded interrupts (non-shared) the host does not deactivate the interrupt
+when the host ISR completes, but leaves the interrupt active until the guest
+deactivates it.  Leaving the interrupt active is allowed, because Linux
+configures the physical GIC with EOIMode=1, which causes EOI operations to
+perform a priority drop allowing the GIC to receive other interrupts of the
+default priority.
+
+
+Forwarded Edge and Level Triggered PPIs and SPIs
+------------------------------------------------
+Forwarded physical interrupts should always be active on the
+physical distributor when injected to a guest.
+
+Level-triggered interrupts will keep the interrupt line to the GIC
+asserted, typically until the guest programs the device to deassert the
+line.  This means that the interrupt will remain pending on the physical
+distributor until the guest has reprogrammed the device.  Since we
+always run the VM with interrupts enabled on the CPU, a pending
+interrupt will exit the guest as soon as we switch into the guest,
+preventing the guest from ever making progress as the process repeats
+over and over.  Therefore, the active state on the physical distributor
+must be set when entering the guest, preventing the GIC from forwarding
+the pending interrupt to the CPU.  As soon as the guest deactivates the
+interrupt, the physical line is sampled by the hardware again and the host
+takes a new interrupt if and only if the physical line is still asserted.
+
+Edge-triggered interrupts do not exhibit the same problem with
+preventing guest execution that level-triggered interrupts do.  One
+option is to not use the HW bit at all, and inject edge-triggered interrupts
+from a physical device as pure virtual interrupts.  But that would
+potentially slow down handling of the interrupt in the guest, because a
+physical interrupt occurring in the middle of the guest ISR would
+preempt the guest for the host to handle the interrupt.  Additionally,
+if you configure the system to handle interrupts on a separate physical
+core from that running your VCPU, you still have to interrupt the VCPU
+to queue the pending state onto the LR, even though the guest won't use
+this information until the guest ISR completes.  Therefore, the HW
+bit should always be set for forwarded edge-triggered interrupts.  With
+the HW bit set, the virtual interrupt is injected and additional
+physical interrupts occurring before the guest deactivates the interrupt
+simply mark the state on the physical distributor as Pending+Active.  As
+soon as the guest deactivates the interrupt, the host takes another
+interrupt if and only if there was a physical interrupt between injecting
+the forwarded interrupt to the guest and the guest deactivating the
+interrupt.
+
+Consequently, whenever we schedule a VCPU with one or more LRs with the
+HW bit set, the interrupt must also be active on the physical
+distributor.
+
+
+Forwarded LPIs
+--------------
+LPIs, introduced in GICv3, are always edge-triggered and do not have an
+active state.  They become pending when a device signals them, and as
+soon as they are acked by the CPU, they are inactive again.
+
+It therefore doesn't make sense, and is not supported, to set the HW bit
+for physical LPIs that are forwarded to a VM as virtual interrupts,
+typically virtual SPIs.
+
+For LPIs, there is no other choice than to preempt the VCPU thread if
+necessary, and queue the pending state onto the LR.
+
+
+Putting It Together: The Architected Timer
+------------------------------------------
+The architected timer is a device that signals interrupts with level
+triggered semantics.  The timer hardware is directly accessed by VCPUs
+which program the timer to fire at some point in time.  Each VCPU on a
+system programs the timer to fire at different times, and therefore the
+hardware is multiplexed between multiple VCPUs.  This is implemented by
+context-switching the timer state along with each VCPU thread.
+
+However, this means that a scenario like the following is entirely
+possible, and in fact, typical:
+
+1.  KVM runs the VCPU
+2.  The guest programs the timer to fire in T+100
+3.  The guest is idle and calls WFI (wait-for-interrupts)
+4.  The hardware traps to the host
+5.  KVM stores the timer state to memory and disables the hardware timer
+6.  KVM schedules a soft timer to fire in T+(100 - time since step 2)
+7.  KVM puts the VCPU thread to sleep (on a waitqueue)
+8.  The soft timer fires, waking up the VCPU thread
+9.  KVM reprograms the timer hardware with the VCPU's values
+10. KVM marks the timer interrupt as active on the physical distributor
+11. KVM injects a forwarded physical interrupt to the guest
+12. KVM runs the VCPU
+
+Notice that KVM injects a forwarded physical interrupt in step 11 without
+the corresponding interrupt having actually fired on the host.  That is
+exactly why we mark the timer interrupt as active in step 10, because
+the active state on the physical distributor is part of the state
+belonging to the timer hardware, which is context-switched along with
+the VCPU thread.
+
+If the guest does not idle because it is busy, the flow looks like this
+instead:
+
+1.  KVM runs the VCPU
+2.  The guest programs the timer to fire in T+100
+3.  At T+100 the timer fires and a physical IRQ causes the VM to exit
+    (note that this initially only traps to EL2 and does not run the host ISR
+    until KVM has returned to the host).
+4.  With interrupts still disabled on the CPU coming back from the guest, KVM
+    stores the virtual timer state to memory and disables the virtual hw timer.
+5.  KVM looks at the timer state (in memory) and injects a forwarded physical
+    interrupt because it concludes the timer has expired.
+6.  KVM marks the timer interrupt as active on the physical distributor
+7.  KVM enables the timer, enables interrupts, and runs the VCPU
+
+Notice that again the forwarded physical interrupt is injected to the
+guest without having actually been handled on the host.  In this case it
+is because the physical interrupt is never actually seen by the host because the
+timer is disabled upon guest return, and the virtual forwarded interrupt is
+injected on the KVM guest entry path.
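To make the list-register programming described in this document slightly more
concrete, the following purely illustrative helper composes a GICv2 list
register value for a forwarded interrupt; the field positions follow the GICv2
architecture specification, but the constant and function names are invented
for the example and are not the KVM implementation:

    #include <stdint.h>

    #define LR_VIRT_ID_SHIFT   0          /* bits [9:0]  : virtual interrupt ID  */
    #define LR_PHYS_ID_SHIFT   10         /* bits [19:10]: physical interrupt ID */
    #define LR_STATE_PENDING   (1u << 28)
    #define LR_STATE_ACTIVE    (1u << 29)
    #define LR_HW              (1u << 31) /* the HW bit discussed above */

    /*
     * Build an LR for a forwarded physical interrupt.  With the HW bit set,
     * exactly one of Pending or Active may be set, never both.
     */
    static uint32_t make_forwarded_lr(uint32_t virt_irq, uint32_t phys_irq,
                                      int pending)
    {
            uint32_t lr = LR_HW;

            lr |= virt_irq << LR_VIRT_ID_SHIFT;
            lr |= phys_irq << LR_PHYS_ID_SHIFT;
            lr |= pending ? LR_STATE_PENDING : LR_STATE_ACTIVE;
            return lr;
    }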
diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt
index 3fb905429e8a..59541d49e15c 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic.txt
@@ -44,28 +44,29 @@ Groups:
   Attributes:
     The attr field of kvm_device_attr encodes two values:
     bits:     | 63   ....  40 | 39 ..  32  |  31   ....    0 |
-    values:   |    reserved   |   cpu id   |      offset     |
+    values:   |    reserved   | vcpu_index |      offset     |
 
     All distributor regs are (rw, 32-bit)
 
     The offset is relative to the "Distributor base address" as defined in the
     GICv2 specs.  Getting or setting such a register has the same effect as
-    reading or writing the register on the actual hardware from the cpu
-    specified with cpu id field.  Note that most distributor fields are not
-    banked, but return the same value regardless of the cpu id used to access
-    the register.
+    reading or writing the register on the actual hardware from the cpu whose
+    index is specified with the vcpu_index field.  Note that most distributor
+    fields are not banked, but return the same value regardless of the
+    vcpu_index used to access the register.
   Limitations:
     - Priorities are not implemented, and registers are RAZ/WI
     - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2.
   Errors:
-    -ENODEV: Getting or setting this register is not yet supported
+    -ENXIO: Getting or setting this register is not yet supported
     -EBUSY: One or more VCPUs are running
+    -EINVAL: Invalid vcpu_index supplied
 
   KVM_DEV_ARM_VGIC_GRP_CPU_REGS
   Attributes:
     The attr field of kvm_device_attr encodes two values:
     bits:     | 63   ....  40 | 39 ..  32  |  31   ....    0 |
-    values:   |    reserved   |   cpu id   |      offset     |
+    values:   |    reserved   | vcpu_index |      offset     |
 
     All CPU interface regs are (rw, 32-bit)
 
@@ -91,8 +92,9 @@ Groups:
     - Priorities are not implemented, and registers are RAZ/WI
     - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2.
   Errors:
-    -ENODEV: Getting or setting this register is not yet supported
+    -ENXIO: Getting or setting this register is not yet supported
     -EBUSY: One or more VCPUs are running
+    -EINVAL: Invalid vcpu_index supplied
 
   KVM_DEV_ARM_VGIC_GRP_NR_IRQS
   Attributes:
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index d68af4dc3006..19f94a6b9bb0 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -166,3 +166,15 @@ Comment: The srcu read lock must be held while accessing memslots (e.g.
 	     MMIO/PIO address->device structure mapping (kvm->buses).
 	     The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu
 	     if it is needed by multiple functions.
 
+Name:		blocked_vcpu_on_cpu_lock
+Type:		spinlock_t
+Arch:		x86
+Protects:	blocked_vcpu_on_cpu
+Comment:	This is a per-CPU lock and it is used for VT-d posted-interrupts.
+		When VT-d posted-interrupts are supported and the VM has assigned
+		devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu,
+		protected by blocked_vcpu_on_cpu_lock.  When the VT-d hardware
+		issues a wakeup notification event (because an external interrupt
+		from an assigned device has arrived), we find the vCPU on the
+		list and wake it up.
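The comment above compresses the whole wakeup path into one sentence; the
following sketch shows the per-CPU pattern it describes.  The structure and
helper names are illustrative stand-ins, not the actual x86 KVM code:

    #include <linux/list.h>
    #include <linux/percpu.h>
    #include <linux/spinlock.h>
    #include <linux/wait.h>

    /* Illustrative stand-in for the blocked-vCPU bookkeeping. */
    struct blocked_vcpu {
            struct list_head blocked_node;
            wait_queue_head_t *wq;
    };

    /* Per-CPU list of vCPUs blocked on this CPU, plus the lock documented above. */
    static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
    static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);

    /* Sketch of the handler run when the VT-d wakeup notification vector fires. */
    static void wakeup_handler_sketch(void)
    {
            struct blocked_vcpu *v;

            spin_lock(this_cpu_ptr(&blocked_vcpu_on_cpu_lock));
            list_for_each_entry(v, this_cpu_ptr(&blocked_vcpu_on_cpu), blocked_node)
                    wake_up_interruptible(v->wq);  /* let the vCPU thread resume */
            spin_unlock(this_cpu_ptr(&blocked_vcpu_on_cpu_lock));
    }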
diff --git a/MAINTAINERS b/MAINTAINERS
index 1ef6adc990a6..7301ae17ec63 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11348,6 +11348,13 @@ L: netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/ethernet/via/via-velocity.*
 
+VIRT LIB
+M:	Alex Williamson <alex.williamson@redhat.com>
+M:	Paolo Bonzini <pbonzini@redhat.com>
+L:	kvm@vger.kernel.org
+S:	Supported
+F:	virt/lib/
+
 VIVID VIRTUAL VIDEO DRIVER
 M:	Hans Verkuil <hverkuil@xs4all.nl>
 L:	linux-media@vger.kernel.org
diff --git a/Makefile b/Makefile
index f71b378005f0..69be581e7c7a 100644
--- a/Makefile
+++ b/Makefile
@@ -550,6 +550,7 @@ drivers-y := drivers/ sound/ firmware/
 net-y		:= net/
 libs-y		:= lib/
 core-y		:= usr/
+virt-y		:= virt/
 endif # KBUILD_EXTMOD
 
 ifeq ($(dot-config),1)
@@ -882,10 +883,10 @@ core-y		+= kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
 
 vmlinux-dirs	:= $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
 		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
-		     $(net-y) $(net-m) $(libs-y) $(libs-m)))
+		     $(net-y) $(net-m) $(libs-y) $(libs-m) $(virt-y)))
 
 vmlinux-alldirs	:= $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \
 		     $(init-) $(core-) $(drivers-) $(net-) $(libs-))))
+		     $(init-) $(core-) $(drivers-) $(net-) $(libs-) $(virt-))))
 
 init-y		:= $(patsubst %/, %/built-in.o, $(init-y))
 core-y		:= $(patsubst %/, %/built-in.o, $(core-y))
@@ -894,14 +895,15 @@ net-y		:= $(patsubst %/, %/built-in.o, $(net-y))
 libs-y1		:= $(patsubst %/, %/lib.a, $(libs-y))
 libs-y2		:= $(patsubst %/, %/built-in.o, $(libs-y))
 libs-y		:= $(libs-y1) $(libs-y2)
+virt-y		:= $(patsubst %/, %/built-in.o, $(virt-y))
 
 # Externally visible symbols (used by link-vmlinux.sh)
 export KBUILD_VMLINUX_INIT := $(head-y) $(init-y)
-export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y)
+export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y) $(virt-y)
 export KBUILD_LDS          := arch/$(SRCARCH)/kernel/vmlinux.lds
 export LDFLAGS_vmlinux
 # used by scripts/package/Makefile
-export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools virt)
+export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools)
 
 vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN)
 
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index d995821f1698..dc641ddf0784 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -218,4 +218,24 @@
218#define HSR_DABT_CM (1U << 8) 218#define HSR_DABT_CM (1U << 8)
219#define HSR_DABT_EA (1U << 9) 219#define HSR_DABT_EA (1U << 9)
220 220
221#define kvm_arm_exception_type \
222 {0, "RESET" }, \
223 {1, "UNDEFINED" }, \
224 {2, "SOFTWARE" }, \
225 {3, "PREF_ABORT" }, \
226 {4, "DATA_ABORT" }, \
227 {5, "IRQ" }, \
228 {6, "FIQ" }, \
229 {7, "HVC" }
230
231#define HSRECN(x) { HSR_EC_##x, #x }
232
233#define kvm_arm_exception_class \
234 HSRECN(UNKNOWN), HSRECN(WFI), HSRECN(CP15_32), HSRECN(CP15_64), \
235 HSRECN(CP14_MR), HSRECN(CP14_LS), HSRECN(CP_0_13), HSRECN(CP10_ID), \
236 HSRECN(JAZELLE), HSRECN(BXJ), HSRECN(CP14_64), HSRECN(SVC_HYP), \
237 HSRECN(HVC), HSRECN(SMC), HSRECN(IABT), HSRECN(IABT_HYP), \
238 HSRECN(DABT), HSRECN(DABT_HYP)
239
240
221#endif /* __ARM_KVM_ARM_H__ */ 241#endif /* __ARM_KVM_ARM_H__ */
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index c4072d9f32c7..6692982c9b57 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -126,7 +126,10 @@ struct kvm_vcpu_arch {
126 * here. 126 * here.
127 */ 127 */
128 128
129 /* Don't run the guest on this vcpu */ 129 /* vcpu power-off state */
130 bool power_off;
131
132 /* Don't run the guest (internal implementation need) */
130 bool pause; 133 bool pause;
131 134
132 /* IO related fields */ 135 /* IO related fields */
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 356970f3b25e..95a000515e43 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -46,4 +46,6 @@ config KVM_ARM_HOST
46 ---help--- 46 ---help---
47 Provides host support for ARM processors. 47 Provides host support for ARM processors.
48 48
49source drivers/vhost/Kconfig
50
49endif # VIRTUALIZATION 51endif # VIRTUALIZATION
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 78b286994577..eab83b2435b8 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -271,6 +271,16 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
271 return kvm_timer_should_fire(vcpu); 271 return kvm_timer_should_fire(vcpu);
272} 272}
273 273
274void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
275{
276 kvm_timer_schedule(vcpu);
277}
278
279void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
280{
281 kvm_timer_unschedule(vcpu);
282}
283
274int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 284int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
275{ 285{
276 /* Force users to call KVM_ARM_VCPU_INIT */ 286 /* Force users to call KVM_ARM_VCPU_INIT */
@@ -308,7 +318,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
308int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 318int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
309 struct kvm_mp_state *mp_state) 319 struct kvm_mp_state *mp_state)
310{ 320{
311 if (vcpu->arch.pause) 321 if (vcpu->arch.power_off)
312 mp_state->mp_state = KVM_MP_STATE_STOPPED; 322 mp_state->mp_state = KVM_MP_STATE_STOPPED;
313 else 323 else
314 mp_state->mp_state = KVM_MP_STATE_RUNNABLE; 324 mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
@@ -321,10 +331,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
321{ 331{
322 switch (mp_state->mp_state) { 332 switch (mp_state->mp_state) {
323 case KVM_MP_STATE_RUNNABLE: 333 case KVM_MP_STATE_RUNNABLE:
324 vcpu->arch.pause = false; 334 vcpu->arch.power_off = false;
325 break; 335 break;
326 case KVM_MP_STATE_STOPPED: 336 case KVM_MP_STATE_STOPPED:
327 vcpu->arch.pause = true; 337 vcpu->arch.power_off = true;
328 break; 338 break;
329 default: 339 default:
330 return -EINVAL; 340 return -EINVAL;
@@ -342,7 +352,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
342 */ 352 */
343int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 353int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
344{ 354{
345 return !!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v); 355 return ((!!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v))
356 && !v->arch.power_off && !v->arch.pause);
346} 357}
347 358
348/* Just ensure a guest exit from a particular CPU */ 359/* Just ensure a guest exit from a particular CPU */
@@ -468,11 +479,38 @@ bool kvm_arch_intc_initialized(struct kvm *kvm)
468 return vgic_initialized(kvm); 479 return vgic_initialized(kvm);
469} 480}
470 481
471static void vcpu_pause(struct kvm_vcpu *vcpu) 482static void kvm_arm_halt_guest(struct kvm *kvm) __maybe_unused;
483static void kvm_arm_resume_guest(struct kvm *kvm) __maybe_unused;
484
485static void kvm_arm_halt_guest(struct kvm *kvm)
486{
487 int i;
488 struct kvm_vcpu *vcpu;
489
490 kvm_for_each_vcpu(i, vcpu, kvm)
491 vcpu->arch.pause = true;
492 force_vm_exit(cpu_all_mask);
493}
494
495static void kvm_arm_resume_guest(struct kvm *kvm)
496{
497 int i;
498 struct kvm_vcpu *vcpu;
499
500 kvm_for_each_vcpu(i, vcpu, kvm) {
501 wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
502
503 vcpu->arch.pause = false;
504 wake_up_interruptible(wq);
505 }
506}
507
508static void vcpu_sleep(struct kvm_vcpu *vcpu)
472{ 509{
473 wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); 510 wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
474 511
475 wait_event_interruptible(*wq, !vcpu->arch.pause); 512 wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
513 (!vcpu->arch.pause)));
476} 514}
477 515
478static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) 516static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
@@ -522,8 +560,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
522 560
523 update_vttbr(vcpu->kvm); 561 update_vttbr(vcpu->kvm);
524 562
525 if (vcpu->arch.pause) 563 if (vcpu->arch.power_off || vcpu->arch.pause)
526 vcpu_pause(vcpu); 564 vcpu_sleep(vcpu);
527 565
528 /* 566 /*
529 * Disarming the background timer must be done in a 567 * Disarming the background timer must be done in a
@@ -549,11 +587,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
549 run->exit_reason = KVM_EXIT_INTR; 587 run->exit_reason = KVM_EXIT_INTR;
550 } 588 }
551 589
552 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) { 590 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
591 vcpu->arch.power_off || vcpu->arch.pause) {
553 local_irq_enable(); 592 local_irq_enable();
593 kvm_timer_sync_hwstate(vcpu);
554 kvm_vgic_sync_hwstate(vcpu); 594 kvm_vgic_sync_hwstate(vcpu);
555 preempt_enable(); 595 preempt_enable();
556 kvm_timer_sync_hwstate(vcpu);
557 continue; 596 continue;
558 } 597 }
559 598
@@ -596,14 +635,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
596 * guest time. 635 * guest time.
597 */ 636 */
598 kvm_guest_exit(); 637 kvm_guest_exit();
599 trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); 638 trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
639
640 /*
641 * We must sync the timer state before the vgic state so that
642 * the vgic can properly sample the updated state of the
643 * interrupt line.
644 */
645 kvm_timer_sync_hwstate(vcpu);
600 646
601 kvm_vgic_sync_hwstate(vcpu); 647 kvm_vgic_sync_hwstate(vcpu);
602 648
603 preempt_enable(); 649 preempt_enable();
604 650
605 kvm_timer_sync_hwstate(vcpu);
606
607 ret = handle_exit(vcpu, run, ret); 651 ret = handle_exit(vcpu, run, ret);
608 } 652 }
609 653
@@ -765,12 +809,12 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
765 vcpu_reset_hcr(vcpu); 809 vcpu_reset_hcr(vcpu);
766 810
767 /* 811 /*
768 * Handle the "start in power-off" case by marking the VCPU as paused. 812 * Handle the "start in power-off" case.
769 */ 813 */
770 if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) 814 if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
771 vcpu->arch.pause = true; 815 vcpu->arch.power_off = true;
772 else 816 else
773 vcpu->arch.pause = false; 817 vcpu->arch.power_off = false;
774 818
775 return 0; 819 return 0;
776} 820}
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index ad6f6424f1d1..0b556968a6da 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -63,7 +63,7 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
63 63
64static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) 64static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
65{ 65{
66 vcpu->arch.pause = true; 66 vcpu->arch.power_off = true;
67} 67}
68 68
69static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) 69static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
@@ -87,7 +87,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
87 */ 87 */
88 if (!vcpu) 88 if (!vcpu)
89 return PSCI_RET_INVALID_PARAMS; 89 return PSCI_RET_INVALID_PARAMS;
90 if (!vcpu->arch.pause) { 90 if (!vcpu->arch.power_off) {
91 if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1) 91 if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
92 return PSCI_RET_ALREADY_ON; 92 return PSCI_RET_ALREADY_ON;
93 else 93 else
@@ -115,7 +115,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
115 * the general puspose registers are undefined upon CPU_ON. 115 * the general puspose registers are undefined upon CPU_ON.
116 */ 116 */
117 *vcpu_reg(vcpu, 0) = context_id; 117 *vcpu_reg(vcpu, 0) = context_id;
118 vcpu->arch.pause = false; 118 vcpu->arch.power_off = false;
119 smp_mb(); /* Make sure the above is visible */ 119 smp_mb(); /* Make sure the above is visible */
120 120
121 wq = kvm_arch_vcpu_wq(vcpu); 121 wq = kvm_arch_vcpu_wq(vcpu);
@@ -153,7 +153,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
153 mpidr = kvm_vcpu_get_mpidr_aff(tmp); 153 mpidr = kvm_vcpu_get_mpidr_aff(tmp);
154 if ((mpidr & target_affinity_mask) == target_affinity) { 154 if ((mpidr & target_affinity_mask) == target_affinity) {
155 matching_cpus++; 155 matching_cpus++;
156 if (!tmp->arch.pause) 156 if (!tmp->arch.power_off)
157 return PSCI_0_2_AFFINITY_LEVEL_ON; 157 return PSCI_0_2_AFFINITY_LEVEL_ON;
158 } 158 }
159 } 159 }
@@ -179,7 +179,7 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
179 * re-initialized. 179 * re-initialized.
180 */ 180 */
181 kvm_for_each_vcpu(i, tmp, vcpu->kvm) { 181 kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
182 tmp->arch.pause = true; 182 tmp->arch.power_off = true;
183 kvm_vcpu_kick(tmp); 183 kvm_vcpu_kick(tmp);
184 } 184 }
185 185
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
index 0ec35392d208..c25a88598eb0 100644
--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@@ -25,21 +25,25 @@ TRACE_EVENT(kvm_entry,
25); 25);
26 26
27TRACE_EVENT(kvm_exit, 27TRACE_EVENT(kvm_exit,
28 TP_PROTO(unsigned int exit_reason, unsigned long vcpu_pc), 28 TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc),
29 TP_ARGS(exit_reason, vcpu_pc), 29 TP_ARGS(idx, exit_reason, vcpu_pc),
30 30
31 TP_STRUCT__entry( 31 TP_STRUCT__entry(
32 __field( int, idx )
32 __field( unsigned int, exit_reason ) 33 __field( unsigned int, exit_reason )
33 __field( unsigned long, vcpu_pc ) 34 __field( unsigned long, vcpu_pc )
34 ), 35 ),
35 36
36 TP_fast_assign( 37 TP_fast_assign(
38 __entry->idx = idx;
37 __entry->exit_reason = exit_reason; 39 __entry->exit_reason = exit_reason;
38 __entry->vcpu_pc = vcpu_pc; 40 __entry->vcpu_pc = vcpu_pc;
39 ), 41 ),
40 42
41 TP_printk("HSR_EC: 0x%04x, PC: 0x%08lx", 43 TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
44 __print_symbolic(__entry->idx, kvm_arm_exception_type),
42 __entry->exit_reason, 45 __entry->exit_reason,
46 __print_symbolic(__entry->exit_reason, kvm_arm_exception_class),
43 __entry->vcpu_pc) 47 __entry->vcpu_pc)
44); 48);
45 49
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 9694f2654593..5e6857b6bdc4 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -200,4 +200,20 @@
200/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ 200/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
201#define HPFAR_MASK (~UL(0xf)) 201#define HPFAR_MASK (~UL(0xf))
202 202
203#define kvm_arm_exception_type \
204 {0, "IRQ" }, \
205 {1, "TRAP" }
206
207#define ECN(x) { ESR_ELx_EC_##x, #x }
208
209#define kvm_arm_exception_class \
210 ECN(UNKNOWN), ECN(WFx), ECN(CP15_32), ECN(CP15_64), ECN(CP14_MR), \
211 ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(CP14_64), ECN(SVC64), \
212 ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(IMP_DEF), ECN(IABT_LOW), \
213 ECN(IABT_CUR), ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \
214 ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \
215 ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \
216 ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \
217 ECN(BKPT32), ECN(VECTOR32), ECN(BRK64)
218
203#endif /* __ARM64_KVM_ARM_H__ */ 219#endif /* __ARM64_KVM_ARM_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index ed039688c221..a35ce7266aac 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -149,7 +149,10 @@ struct kvm_vcpu_arch {
149 u32 mdscr_el1; 149 u32 mdscr_el1;
150 } guest_debug_preserved; 150 } guest_debug_preserved;
151 151
152 /* Don't run the guest */ 152 /* vcpu power-off state */
153 bool power_off;
154
155 /* Don't run the guest (internal implementation need) */
153 bool pause; 156 bool pause;
154 157
155 /* IO related fields */ 158 /* IO related fields */
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index c9d1f34daab1..a5272c07d1cb 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -48,4 +48,6 @@ config KVM_ARM_HOST
48 ---help--- 48 ---help---
49 Provides host support for ARM processors. 49 Provides host support for ARM processors.
50 50
51source drivers/vhost/Kconfig
52
51endif # VIRTUALIZATION 53endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index e5836138ec42..1599701ef044 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -880,6 +880,14 @@ __kvm_hyp_panic:
880 880
881 bl __restore_sysregs 881 bl __restore_sysregs
882 882
883 /*
884 * Make sure we have a valid host stack, and don't leave junk in the
885 * frame pointer that will give us a misleading host stack unwinding.
886 */
887 ldr x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)]
888 msr sp_el1, x22
889 mov x29, xzr
890
8831: adr x0, __hyp_panic_str 8911: adr x0, __hyp_panic_str
884 adr x1, 2f 892 adr x1, 2f
885 ldp x2, x3, [x1] 893 ldp x2, x3, [x1]
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 5a1a882e0a75..6ded8d347af9 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -847,5 +847,7 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
847 struct kvm_memory_slot *slot) {} 847 struct kvm_memory_slot *slot) {}
848static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} 848static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
849static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} 849static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
850static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
851static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
850 852
851#endif /* __MIPS_KVM_HOST_H__ */ 853#endif /* __MIPS_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h
index 6330a61b875a..4852e849128b 100644
--- a/arch/powerpc/include/asm/disassemble.h
+++ b/arch/powerpc/include/asm/disassemble.h
@@ -42,6 +42,11 @@ static inline unsigned int get_dcrn(u32 inst)
42 return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0); 42 return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
43} 43}
44 44
45static inline unsigned int get_tmrn(u32 inst)
46{
47 return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
48}
49
45static inline unsigned int get_rt(u32 inst) 50static inline unsigned int get_rt(u32 inst)
46{ 51{
47 return (inst >> 21) & 0x1f; 52 return (inst >> 21) & 0x1f;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 887c259556df..cfa758c6b4f6 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -716,5 +716,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot
716static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} 716static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
717static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} 717static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
718static inline void kvm_arch_exit(void) {} 718static inline void kvm_arch_exit(void) {}
719static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
720static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
719 721
720#endif /* __POWERPC_KVM_HOST_H__ */ 722#endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 16547efa2d5a..2fef74b474f0 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -742,6 +742,12 @@
742#define MMUBE1_VBE4 0x00000002 742#define MMUBE1_VBE4 0x00000002
743#define MMUBE1_VBE5 0x00000001 743#define MMUBE1_VBE5 0x00000001
744 744
745#define TMRN_TMCFG0 16 /* Thread Management Configuration Register 0 */
746#define TMRN_TMCFG0_NPRIBITS 0x003f0000 /* Bits of thread priority */
747#define TMRN_TMCFG0_NPRIBITS_SHIFT 16
748#define TMRN_TMCFG0_NATHRD 0x00003f00 /* Number of active threads */
749#define TMRN_TMCFG0_NATHRD_SHIFT 8
750#define TMRN_TMCFG0_NTHRD 0x0000003f /* Number of threads */
745#define TMRN_IMSR0 0x120 /* Initial MSR Register 0 (e6500) */ 751#define TMRN_IMSR0 0x120 /* Initial MSR Register 0 (e6500) */
746#define TMRN_IMSR1 0x121 /* Initial MSR Register 1 (e6500) */ 752#define TMRN_IMSR1 0x121 /* Initial MSR Register 1 (e6500) */
747#define TMRN_INIA0 0x140 /* Next Instruction Address Register 0 */ 753#define TMRN_INIA0 0x140 /* Next Instruction Address Register 0 */
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 1f9c0a17f445..10722b1e38b5 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -70,7 +70,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
70 } 70 }
71 71
72 /* Lastly try successively smaller sizes from the page allocator */ 72 /* Lastly try successively smaller sizes from the page allocator */
73 while (!hpt && order > PPC_MIN_HPT_ORDER) { 73 /* Only do this if userspace didn't specify a size via ioctl */
74 while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) {
74 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| 75 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
75 __GFP_NOWARN, order - PAGE_SHIFT); 76 __GFP_NOWARN, order - PAGE_SHIFT);
76 if (!hpt) 77 if (!hpt)
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index c1df9bb1e413..97e7f8c853d8 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -470,6 +470,8 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
470 note_hpte_modification(kvm, rev); 470 note_hpte_modification(kvm, rev);
471 unlock_hpte(hpte, 0); 471 unlock_hpte(hpte, 0);
472 472
473 if (v & HPTE_V_ABSENT)
474 v = (v & ~HPTE_V_ABSENT) | HPTE_V_VALID;
473 hpret[0] = v; 475 hpret[0] = v;
474 hpret[1] = r; 476 hpret[1] = r;
475 return H_SUCCESS; 477 return H_SUCCESS;
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b98889e9851d..b1dab8d1d885 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -150,6 +150,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
150 cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK 150 cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
151 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 151 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
152 beq 11f 152 beq 11f
153 cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
154 beq 15f /* Invoke the H_DOORBELL handler */
153 cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI 155 cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI
154 beq cr2, 14f /* HMI check */ 156 beq cr2, 14f /* HMI check */
155 157
@@ -174,6 +176,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
174 mtspr SPRN_HSRR1, r7 176 mtspr SPRN_HSRR1, r7
175 b hmi_exception_after_realmode 177 b hmi_exception_after_realmode
176 178
17915: mtspr SPRN_HSRR0, r8
180 mtspr SPRN_HSRR1, r7
181 ba 0xe80
182
177kvmppc_primary_no_guest: 183kvmppc_primary_no_guest:
178 /* We handle this much like a ceded vcpu */ 184 /* We handle this much like a ceded vcpu */
179 /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ 185 /* put the HDEC into the DEC, since HDEC interrupts don't wake us */
@@ -2377,7 +2383,6 @@ machine_check_realmode:
2377 mr r3, r9 /* get vcpu pointer */ 2383 mr r3, r9 /* get vcpu pointer */
2378 bl kvmppc_realmode_machine_check 2384 bl kvmppc_realmode_machine_check
2379 nop 2385 nop
2380 cmpdi r3, 0 /* Did we handle MCE ? */
2381 ld r9, HSTATE_KVM_VCPU(r13) 2386 ld r9, HSTATE_KVM_VCPU(r13)
2382 li r12, BOOK3S_INTERRUPT_MACHINE_CHECK 2387 li r12, BOOK3S_INTERRUPT_MACHINE_CHECK
2383 /* 2388 /*
@@ -2390,13 +2395,18 @@ machine_check_realmode:
2390 * The old code used to return to host for unhandled errors which 2395 * The old code used to return to host for unhandled errors which
2391 * was causing guest to hang with soft lockups inside guest and 2396 * was causing guest to hang with soft lockups inside guest and
2392 * makes it difficult to recover guest instance. 2397 * makes it difficult to recover guest instance.
2398 *
2399 * if we receive machine check with MSR(RI=0) then deliver it to
2400 * guest as machine check causing guest to crash.
2393 */ 2401 */
2394 ld r10, VCPU_PC(r9)
2395 ld r11, VCPU_MSR(r9) 2402 ld r11, VCPU_MSR(r9)
2403 andi. r10, r11, MSR_RI /* check for unrecoverable exception */
2404 beq 1f /* Deliver a machine check to guest */
2405 ld r10, VCPU_PC(r9)
2406 cmpdi r3, 0 /* Did we handle MCE ? */
2396 bne 2f /* Continue guest execution. */ 2407 bne 2f /* Continue guest execution. */
2397 /* If not, deliver a machine check. SRR0/1 are already set */ 2408 /* If not, deliver a machine check. SRR0/1 are already set */
2398 li r10, BOOK3S_INTERRUPT_MACHINE_CHECK 24091: li r10, BOOK3S_INTERRUPT_MACHINE_CHECK
2399 ld r11, VCPU_MSR(r9)
2400 bl kvmppc_msr_interrupt 2410 bl kvmppc_msr_interrupt
24012: b fast_interrupt_c_return 24112: b fast_interrupt_c_return
2402 2412
@@ -2436,14 +2446,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
2436 2446
2437 /* hypervisor doorbell */ 2447 /* hypervisor doorbell */
24383: li r12, BOOK3S_INTERRUPT_H_DOORBELL 24483: li r12, BOOK3S_INTERRUPT_H_DOORBELL
2449
2450 /*
2451 * Clear the doorbell as we will invoke the handler
2452 * explicitly in the guest exit path.
2453 */
2454 lis r6, (PPC_DBELL_SERVER << (63-36))@h
2455 PPC_MSGCLR(6)
2439 /* see if it's a host IPI */ 2456 /* see if it's a host IPI */
2440 li r3, 1 2457 li r3, 1
2441 lbz r0, HSTATE_HOST_IPI(r13) 2458 lbz r0, HSTATE_HOST_IPI(r13)
2442 cmpwi r0, 0 2459 cmpwi r0, 0
2443 bnelr 2460 bnelr
2444 /* if not, clear it and return -1 */ 2461 /* if not, return -1 */
2445 lis r6, (PPC_DBELL_SERVER << (63-36))@h
2446 PPC_MSGCLR(6)
2447 li r3, -1 2462 li r3, -1
2448 blr 2463 blr
2449 2464
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index b29ce752c7d6..32fdab57d604 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -237,7 +237,8 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
237 struct kvm_book3e_206_tlb_entry *gtlbe) 237 struct kvm_book3e_206_tlb_entry *gtlbe)
238{ 238{
239 struct vcpu_id_table *idt = vcpu_e500->idt; 239 struct vcpu_id_table *idt = vcpu_e500->idt;
240 unsigned int pr, tid, ts, pid; 240 unsigned int pr, tid, ts;
241 int pid;
241 u32 val, eaddr; 242 u32 val, eaddr;
242 unsigned long flags; 243 unsigned long flags;
243 244
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index ce7291c79f6c..990db69a1d0b 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -15,6 +15,7 @@
15#include <asm/kvm_ppc.h> 15#include <asm/kvm_ppc.h>
16#include <asm/disassemble.h> 16#include <asm/disassemble.h>
17#include <asm/dbell.h> 17#include <asm/dbell.h>
18#include <asm/reg_booke.h>
18 19
19#include "booke.h" 20#include "booke.h"
20#include "e500.h" 21#include "e500.h"
@@ -22,6 +23,7 @@
22#define XOP_DCBTLS 166 23#define XOP_DCBTLS 166
23#define XOP_MSGSND 206 24#define XOP_MSGSND 206
24#define XOP_MSGCLR 238 25#define XOP_MSGCLR 238
26#define XOP_MFTMR 366
25#define XOP_TLBIVAX 786 27#define XOP_TLBIVAX 786
26#define XOP_TLBSX 914 28#define XOP_TLBSX 914
27#define XOP_TLBRE 946 29#define XOP_TLBRE 946
@@ -113,6 +115,19 @@ static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu)
113 return EMULATE_DONE; 115 return EMULATE_DONE;
114} 116}
115 117
118static int kvmppc_e500_emul_mftmr(struct kvm_vcpu *vcpu, unsigned int inst,
119 int rt)
120{
121 /* Expose one thread per vcpu */
122 if (get_tmrn(inst) == TMRN_TMCFG0) {
123 kvmppc_set_gpr(vcpu, rt,
124 1 | (1 << TMRN_TMCFG0_NATHRD_SHIFT));
125 return EMULATE_DONE;
126 }
127
128 return EMULATE_FAIL;
129}
130
116int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, 131int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu,
117 unsigned int inst, int *advance) 132 unsigned int inst, int *advance)
118{ 133{
@@ -165,6 +180,10 @@ int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu,
165 emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); 180 emulated = kvmppc_e500_emul_tlbivax(vcpu, ea);
166 break; 181 break;
167 182
183 case XOP_MFTMR:
184 emulated = kvmppc_e500_emul_mftmr(vcpu, inst, rt);
185 break;
186
168 case XOP_EHPRIV: 187 case XOP_EHPRIV:
169 emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst, 188 emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst,
170 advance); 189 advance);
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 4d33e199edcc..5e2102c19586 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -406,7 +406,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
406 406
407 for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { 407 for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
408 unsigned long gfn_start, gfn_end; 408 unsigned long gfn_start, gfn_end;
409 tsize_pages = 1 << (tsize - 2); 409 tsize_pages = 1UL << (tsize - 2);
410 410
411 gfn_start = gfn & ~(tsize_pages - 1); 411 gfn_start = gfn & ~(tsize_pages - 1);
412 gfn_end = gfn_start + tsize_pages; 412 gfn_end = gfn_start + tsize_pages;
@@ -447,7 +447,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
447 } 447 }
448 448
449 if (likely(!pfnmap)) { 449 if (likely(!pfnmap)) {
450 tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); 450 tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT);
451 pfn = gfn_to_pfn_memslot(slot, gfn); 451 pfn = gfn_to_pfn_memslot(slot, gfn);
452 if (is_error_noslot_pfn(pfn)) { 452 if (is_error_noslot_pfn(pfn)) {
453 if (printk_ratelimit()) 453 if (printk_ratelimit())
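
Both hunks in this file switch the shift to 1UL because tsize_pages is an unsigned long: with a plain int literal the shift is evaluated in 32-bit arithmetic, so for the largest Book3E page-size codes the signed intermediate can sign-extend when widened, producing a bogus page count and mask. A minimal user-space illustration of the difference (the shift count is illustrative, not taken from this diff):

	#include <stdio.h>

	int main(void)
	{
		unsigned int shift = 31;	/* stands in for tsize + 10 - PAGE_SHIFT */

		/* "1 << 31" is a 32-bit shift whose negative result sign-extends
		 * into an unsigned long; "1UL << 31" keeps the value correct. */
		unsigned long tsize_pages = 1UL << shift;

		printf("tsize_pages = 0x%lx\n", tsize_pages);	/* 0x80000000 */
		return 0;
	}
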
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 2e51289610e4..6fd2405c7f4a 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -559,6 +559,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
559 else 559 else
560 r = num_online_cpus(); 560 r = num_online_cpus();
561 break; 561 break;
562 case KVM_CAP_NR_MEMSLOTS:
563 r = KVM_USER_MEM_SLOTS;
564 break;
562 case KVM_CAP_MAX_VCPUS: 565 case KVM_CAP_MAX_VCPUS:
563 r = KVM_MAX_VCPUS; 566 r = KVM_MAX_VCPUS;
564 break; 567 break;
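
With this case added, Book3E userspace can discover the memslot limit the same way other architectures already report it; previously the generic handler fell through and returned 0 for this capability. A minimal usage sketch (error handling omitted):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm = open("/dev/kvm", O_RDWR);
		int vm = ioctl(kvm, KVM_CREATE_VM, 0);
		int nr_slots = ioctl(vm, KVM_CHECK_EXTENSION, KVM_CAP_NR_MEMSLOTS);

		printf("KVM_CAP_NR_MEMSLOTS = %d\n", nr_slots);
		return 0;
	}
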
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 7f654308817c..efaac2c3bb77 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -644,5 +644,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot
644static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} 644static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
645static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 645static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
646 struct kvm_memory_slot *slot) {} 646 struct kvm_memory_slot *slot) {}
647static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
648static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
647 649
648#endif 650#endif
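
These empty stubs exist because the generic halt path now brackets its wait loop with per-architecture hooks; s390 has nothing to do there, so it opts out. Roughly where the hooks sit, as a simplified sketch of virt/kvm/kvm_main.c rather than a verbatim copy (halt polling omitted):

	void kvm_vcpu_block(struct kvm_vcpu *vcpu)
	{
		DEFINE_WAIT(wait);

		kvm_arch_vcpu_blocking(vcpu);		/* e.g. switch to a software timer */

		for (;;) {
			prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
			if (kvm_vcpu_check_block(vcpu) < 0)
				break;
			schedule();
		}

		finish_wait(&vcpu->wq, &wait);
		kvm_arch_vcpu_unblocking(vcpu);		/* re-arm the hardware timer */
	}
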
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 7365e8a46032..b4a5aa110cec 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -336,28 +336,28 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
336 return -EOPNOTSUPP; 336 return -EOPNOTSUPP;
337} 337}
338 338
339static const intercept_handler_t intercept_funcs[] = {
340 [0x00 >> 2] = handle_noop,
341 [0x04 >> 2] = handle_instruction,
342 [0x08 >> 2] = handle_prog,
343 [0x10 >> 2] = handle_noop,
344 [0x14 >> 2] = handle_external_interrupt,
345 [0x18 >> 2] = handle_noop,
346 [0x1C >> 2] = kvm_s390_handle_wait,
347 [0x20 >> 2] = handle_validity,
348 [0x28 >> 2] = handle_stop,
349 [0x38 >> 2] = handle_partial_execution,
350};
351
352int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) 339int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
353{ 340{
354 intercept_handler_t func; 341 switch (vcpu->arch.sie_block->icptcode) {
355 u8 code = vcpu->arch.sie_block->icptcode; 342 case 0x00:
356 343 case 0x10:
357 if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs)) 344 case 0x18:
345 return handle_noop(vcpu);
346 case 0x04:
347 return handle_instruction(vcpu);
348 case 0x08:
349 return handle_prog(vcpu);
350 case 0x14:
351 return handle_external_interrupt(vcpu);
352 case 0x1c:
353 return kvm_s390_handle_wait(vcpu);
354 case 0x20:
355 return handle_validity(vcpu);
356 case 0x28:
357 return handle_stop(vcpu);
358 case 0x38:
359 return handle_partial_execution(vcpu);
360 default:
358 return -EOPNOTSUPP; 361 return -EOPNOTSUPP;
359 func = intercept_funcs[code >> 2]; 362 }
360 if (func)
361 return func(vcpu);
362 return -EOPNOTSUPP;
363} 363}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 5c2c169395c3..373e32346d68 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -51,11 +51,9 @@ static int psw_mchk_disabled(struct kvm_vcpu *vcpu)
51 51
52static int psw_interrupts_disabled(struct kvm_vcpu *vcpu) 52static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
53{ 53{
54 if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) || 54 return psw_extint_disabled(vcpu) &&
55 (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO) || 55 psw_ioint_disabled(vcpu) &&
56 (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT)) 56 psw_mchk_disabled(vcpu);
57 return 0;
58 return 1;
59} 57}
60 58
61static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu) 59static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu)
@@ -71,13 +69,8 @@ static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu)
71 69
72static int ckc_irq_pending(struct kvm_vcpu *vcpu) 70static int ckc_irq_pending(struct kvm_vcpu *vcpu)
73{ 71{
74 preempt_disable(); 72 if (vcpu->arch.sie_block->ckc >= kvm_s390_get_tod_clock_fast(vcpu->kvm))
75 if (!(vcpu->arch.sie_block->ckc <
76 get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) {
77 preempt_enable();
78 return 0; 73 return 0;
79 }
80 preempt_enable();
81 return ckc_interrupts_enabled(vcpu); 74 return ckc_interrupts_enabled(vcpu);
82} 75}
83 76
@@ -109,14 +102,10 @@ static inline u8 int_word_to_isc(u32 int_word)
109 return (int_word & 0x38000000) >> 27; 102 return (int_word & 0x38000000) >> 27;
110} 103}
111 104
112static inline unsigned long pending_floating_irqs(struct kvm_vcpu *vcpu) 105static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
113{ 106{
114 return vcpu->kvm->arch.float_int.pending_irqs; 107 return vcpu->kvm->arch.float_int.pending_irqs |
115} 108 vcpu->arch.local_int.pending_irqs;
116
117static inline unsigned long pending_local_irqs(struct kvm_vcpu *vcpu)
118{
119 return vcpu->arch.local_int.pending_irqs;
120} 109}
121 110
122static unsigned long disable_iscs(struct kvm_vcpu *vcpu, 111static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
@@ -135,8 +124,7 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
135{ 124{
136 unsigned long active_mask; 125 unsigned long active_mask;
137 126
138 active_mask = pending_local_irqs(vcpu); 127 active_mask = pending_irqs(vcpu);
139 active_mask |= pending_floating_irqs(vcpu);
140 if (!active_mask) 128 if (!active_mask)
141 return 0; 129 return 0;
142 130
@@ -204,7 +192,7 @@ static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
204 192
205static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) 193static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
206{ 194{
207 if (!(pending_floating_irqs(vcpu) & IRQ_PEND_IO_MASK)) 195 if (!(pending_irqs(vcpu) & IRQ_PEND_IO_MASK))
208 return; 196 return;
209 else if (psw_ioint_disabled(vcpu)) 197 else if (psw_ioint_disabled(vcpu))
210 __set_cpuflag(vcpu, CPUSTAT_IO_INT); 198 __set_cpuflag(vcpu, CPUSTAT_IO_INT);
@@ -214,7 +202,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
214 202
215static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) 203static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
216{ 204{
217 if (!(pending_local_irqs(vcpu) & IRQ_PEND_EXT_MASK)) 205 if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK))
218 return; 206 return;
219 if (psw_extint_disabled(vcpu)) 207 if (psw_extint_disabled(vcpu))
220 __set_cpuflag(vcpu, CPUSTAT_EXT_INT); 208 __set_cpuflag(vcpu, CPUSTAT_EXT_INT);
@@ -224,7 +212,7 @@ static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
224 212
225static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu) 213static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu)
226{ 214{
227 if (!(pending_local_irqs(vcpu) & IRQ_PEND_MCHK_MASK)) 215 if (!(pending_irqs(vcpu) & IRQ_PEND_MCHK_MASK))
228 return; 216 return;
229 if (psw_mchk_disabled(vcpu)) 217 if (psw_mchk_disabled(vcpu))
230 vcpu->arch.sie_block->ictl |= ICTL_LPSW; 218 vcpu->arch.sie_block->ictl |= ICTL_LPSW;
@@ -815,23 +803,21 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu)
815 803
816int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop) 804int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop)
817{ 805{
818 int rc; 806 if (deliverable_irqs(vcpu))
807 return 1;
819 808
820 rc = !!deliverable_irqs(vcpu); 809 if (kvm_cpu_has_pending_timer(vcpu))
821 810 return 1;
822 if (!rc && kvm_cpu_has_pending_timer(vcpu))
823 rc = 1;
824 811
825 /* external call pending and deliverable */ 812 /* external call pending and deliverable */
826 if (!rc && kvm_s390_ext_call_pending(vcpu) && 813 if (kvm_s390_ext_call_pending(vcpu) &&
827 !psw_extint_disabled(vcpu) && 814 !psw_extint_disabled(vcpu) &&
828 (vcpu->arch.sie_block->gcr[0] & 0x2000ul)) 815 (vcpu->arch.sie_block->gcr[0] & 0x2000ul))
829 rc = 1; 816 return 1;
830
831 if (!rc && !exclude_stop && kvm_s390_is_stop_irq_pending(vcpu))
832 rc = 1;
833 817
834 return rc; 818 if (!exclude_stop && kvm_s390_is_stop_irq_pending(vcpu))
819 return 1;
820 return 0;
835} 821}
836 822
837int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) 823int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
@@ -846,7 +832,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
846 vcpu->stat.exit_wait_state++; 832 vcpu->stat.exit_wait_state++;
847 833
848 /* fast path */ 834 /* fast path */
849 if (kvm_cpu_has_pending_timer(vcpu) || kvm_arch_vcpu_runnable(vcpu)) 835 if (kvm_arch_vcpu_runnable(vcpu))
850 return 0; 836 return 0;
851 837
852 if (psw_interrupts_disabled(vcpu)) { 838 if (psw_interrupts_disabled(vcpu)) {
@@ -860,9 +846,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
860 goto no_timer; 846 goto no_timer;
861 } 847 }
862 848
863 preempt_disable(); 849 now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
864 now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
865 preempt_enable();
866 sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); 850 sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
867 851
868 /* underflow */ 852 /* underflow */
@@ -901,9 +885,7 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
901 u64 now, sltime; 885 u64 now, sltime;
902 886
903 vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer); 887 vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
904 preempt_disable(); 888 now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
905 now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
906 preempt_enable();
907 sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); 889 sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
908 890
909 /* 891 /*
@@ -981,39 +963,30 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
981 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, 963 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
982 irq->u.pgm.code, 0); 964 irq->u.pgm.code, 0);
983 965
984 li->irq.pgm = irq->u.pgm; 966 if (irq->u.pgm.code == PGM_PER) {
967 li->irq.pgm.code |= PGM_PER;
968 /* only modify PER related information */
969 li->irq.pgm.per_address = irq->u.pgm.per_address;
970 li->irq.pgm.per_code = irq->u.pgm.per_code;
971 li->irq.pgm.per_atmid = irq->u.pgm.per_atmid;
972 li->irq.pgm.per_access_id = irq->u.pgm.per_access_id;
973 } else if (!(irq->u.pgm.code & PGM_PER)) {
974 li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) |
975 irq->u.pgm.code;
976 /* only modify non-PER information */
977 li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code;
978 li->irq.pgm.mon_code = irq->u.pgm.mon_code;
979 li->irq.pgm.data_exc_code = irq->u.pgm.data_exc_code;
980 li->irq.pgm.mon_class_nr = irq->u.pgm.mon_class_nr;
981 li->irq.pgm.exc_access_id = irq->u.pgm.exc_access_id;
982 li->irq.pgm.op_access_id = irq->u.pgm.op_access_id;
983 } else {
984 li->irq.pgm = irq->u.pgm;
985 }
985 set_bit(IRQ_PEND_PROG, &li->pending_irqs); 986 set_bit(IRQ_PEND_PROG, &li->pending_irqs);
986 return 0; 987 return 0;
987} 988}
988 989
989int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
990{
991 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
992 struct kvm_s390_irq irq;
993
994 spin_lock(&li->lock);
995 irq.u.pgm.code = code;
996 __inject_prog(vcpu, &irq);
997 BUG_ON(waitqueue_active(li->wq));
998 spin_unlock(&li->lock);
999 return 0;
1000}
1001
1002int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
1003 struct kvm_s390_pgm_info *pgm_info)
1004{
1005 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
1006 struct kvm_s390_irq irq;
1007 int rc;
1008
1009 spin_lock(&li->lock);
1010 irq.u.pgm = *pgm_info;
1011 rc = __inject_prog(vcpu, &irq);
1012 BUG_ON(waitqueue_active(li->wq));
1013 spin_unlock(&li->lock);
1014 return rc;
1015}
1016
1017static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) 990static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
1018{ 991{
1019 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 992 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -1390,12 +1363,9 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
1390 1363
1391static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) 1364static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
1392{ 1365{
1393 struct kvm_s390_float_interrupt *fi;
1394 u64 type = READ_ONCE(inti->type); 1366 u64 type = READ_ONCE(inti->type);
1395 int rc; 1367 int rc;
1396 1368
1397 fi = &kvm->arch.float_int;
1398
1399 switch (type) { 1369 switch (type) {
1400 case KVM_S390_MCHK: 1370 case KVM_S390_MCHK:
1401 rc = __inject_float_mchk(kvm, inti); 1371 rc = __inject_float_mchk(kvm, inti);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index c6b4063fce29..8fe2f1c722dc 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -514,35 +514,20 @@ static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
514 514
515 if (gtod_high != 0) 515 if (gtod_high != 0)
516 return -EINVAL; 516 return -EINVAL;
517 VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x\n", gtod_high); 517 VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x", gtod_high);
518 518
519 return 0; 519 return 0;
520} 520}
521 521
522static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) 522static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
523{ 523{
524 struct kvm_vcpu *cur_vcpu; 524 u64 gtod;
525 unsigned int vcpu_idx;
526 u64 host_tod, gtod;
527 int r;
528 525
529 if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod))) 526 if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod)))
530 return -EFAULT; 527 return -EFAULT;
531 528
532 r = store_tod_clock(&host_tod); 529 kvm_s390_set_tod_clock(kvm, gtod);
533 if (r) 530 VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod);
534 return r;
535
536 mutex_lock(&kvm->lock);
537 preempt_disable();
538 kvm->arch.epoch = gtod - host_tod;
539 kvm_s390_vcpu_block_all(kvm);
540 kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm)
541 cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch;
542 kvm_s390_vcpu_unblock_all(kvm);
543 preempt_enable();
544 mutex_unlock(&kvm->lock);
545 VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx\n", gtod);
546 return 0; 531 return 0;
547} 532}
548 533
@@ -574,26 +559,19 @@ static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
574 if (copy_to_user((void __user *)attr->addr, &gtod_high, 559 if (copy_to_user((void __user *)attr->addr, &gtod_high,
575 sizeof(gtod_high))) 560 sizeof(gtod_high)))
576 return -EFAULT; 561 return -EFAULT;
577 VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x\n", gtod_high); 562 VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x", gtod_high);
578 563
579 return 0; 564 return 0;
580} 565}
581 566
582static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) 567static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
583{ 568{
584 u64 host_tod, gtod; 569 u64 gtod;
585 int r;
586 570
587 r = store_tod_clock(&host_tod); 571 gtod = kvm_s390_get_tod_clock_fast(kvm);
588 if (r)
589 return r;
590
591 preempt_disable();
592 gtod = host_tod + kvm->arch.epoch;
593 preempt_enable();
594 if (copy_to_user((void __user *)attr->addr, &gtod, sizeof(gtod))) 572 if (copy_to_user((void __user *)attr->addr, &gtod, sizeof(gtod)))
595 return -EFAULT; 573 return -EFAULT;
596 VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx\n", gtod); 574 VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx", gtod);
597 575
598 return 0; 576 return 0;
599} 577}
@@ -1120,7 +1098,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
1120 if (!kvm->arch.sca) 1098 if (!kvm->arch.sca)
1121 goto out_err; 1099 goto out_err;
1122 spin_lock(&kvm_lock); 1100 spin_lock(&kvm_lock);
1123 sca_offset = (sca_offset + 16) & 0x7f0; 1101 sca_offset += 16;
1102 if (sca_offset + sizeof(struct sca_block) > PAGE_SIZE)
1103 sca_offset = 0;
1124 kvm->arch.sca = (struct sca_block *) ((char *) kvm->arch.sca + sca_offset); 1104 kvm->arch.sca = (struct sca_block *) ((char *) kvm->arch.sca + sca_offset);
1125 spin_unlock(&kvm_lock); 1105 spin_unlock(&kvm_lock);
1126 1106
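
The point of the new test: the old "& 0x7f0" wrap only kept the offset below 0x800, so an SCA larger than 0x810 bytes placed at the highest offsets straddled a page boundary, which the SCA must never do. The replacement wraps whenever the block would no longer fit in the remainder of the page. A worked sketch of the condition (4K pages; the 0x840 size is illustrative, not taken from this diff):

	static unsigned int next_sca_offset(unsigned int offset)
	{
		const unsigned int sca_size = 0x840;	/* illustrative sizeof(struct sca_block) */

		offset += 16;
		if (offset + sca_size > 4096)		/* would straddle a page: start over */
			offset = 0;
		return offset;
	}
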
@@ -1911,6 +1891,22 @@ retry:
1911 return 0; 1891 return 0;
1912} 1892}
1913 1893
1894void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod)
1895{
1896 struct kvm_vcpu *vcpu;
1897 int i;
1898
1899 mutex_lock(&kvm->lock);
1900 preempt_disable();
1901 kvm->arch.epoch = tod - get_tod_clock();
1902 kvm_s390_vcpu_block_all(kvm);
1903 kvm_for_each_vcpu(i, vcpu, kvm)
1904 vcpu->arch.sie_block->epoch = kvm->arch.epoch;
1905 kvm_s390_vcpu_unblock_all(kvm);
1906 preempt_enable();
1907 mutex_unlock(&kvm->lock);
1908}
1909
1914/** 1910/**
1915 * kvm_arch_fault_in_page - fault-in guest page if necessary 1911 * kvm_arch_fault_in_page - fault-in guest page if necessary
1916 * @vcpu: The corresponding virtual cpu 1912 * @vcpu: The corresponding virtual cpu
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index c446aabf60d3..1e70e00d3c5e 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -175,6 +175,7 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
175 return kvm->arch.user_cpu_state_ctrl != 0; 175 return kvm->arch.user_cpu_state_ctrl != 0;
176} 176}
177 177
178/* implemented in interrupt.c */
178int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); 179int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
179void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); 180void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu);
180enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); 181enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
@@ -185,7 +186,25 @@ int __must_check kvm_s390_inject_vm(struct kvm *kvm,
185 struct kvm_s390_interrupt *s390int); 186 struct kvm_s390_interrupt *s390int);
186int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, 187int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
187 struct kvm_s390_irq *irq); 188 struct kvm_s390_irq *irq);
188int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); 189static inline int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
190 struct kvm_s390_pgm_info *pgm_info)
191{
192 struct kvm_s390_irq irq = {
193 .type = KVM_S390_PROGRAM_INT,
194 .u.pgm = *pgm_info,
195 };
196
197 return kvm_s390_inject_vcpu(vcpu, &irq);
198}
199static inline int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
200{
201 struct kvm_s390_irq irq = {
202 .type = KVM_S390_PROGRAM_INT,
203 .u.pgm.code = code,
204 };
205
206 return kvm_s390_inject_vcpu(vcpu, &irq);
207}
189struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, 208struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
190 u64 isc_mask, u32 schid); 209 u64 isc_mask, u32 schid);
191int kvm_s390_reinject_io_int(struct kvm *kvm, 210int kvm_s390_reinject_io_int(struct kvm *kvm,
@@ -212,6 +231,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
212int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); 231int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
213 232
214/* implemented in kvm-s390.c */ 233/* implemented in kvm-s390.c */
234void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
215long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); 235long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
216int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); 236int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
217int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu, 237int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
@@ -231,9 +251,6 @@ extern unsigned long kvm_s390_fac_list_mask[];
231 251
232/* implemented in diag.c */ 252/* implemented in diag.c */
233int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); 253int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
234/* implemented in interrupt.c */
235int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
236 struct kvm_s390_pgm_info *pgm_info);
237 254
238static inline void kvm_s390_vcpu_block_all(struct kvm *kvm) 255static inline void kvm_s390_vcpu_block_all(struct kvm *kvm)
239{ 256{
@@ -254,6 +271,16 @@ static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm)
254 kvm_s390_vcpu_unblock(vcpu); 271 kvm_s390_vcpu_unblock(vcpu);
255} 272}
256 273
274static inline u64 kvm_s390_get_tod_clock_fast(struct kvm *kvm)
275{
276 u64 rc;
277
278 preempt_disable();
279 rc = get_tod_clock_fast() + kvm->arch.epoch;
280 preempt_enable();
281 return rc;
282}
283
257/** 284/**
258 * kvm_s390_inject_prog_cond - conditionally inject a program check 285 * kvm_s390_inject_prog_cond - conditionally inject a program check
259 * @vcpu: virtual cpu 286 * @vcpu: virtual cpu
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 4d21dc4d1a84..77191b85ea7a 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -33,11 +33,9 @@
33/* Handle SCK (SET CLOCK) interception */ 33/* Handle SCK (SET CLOCK) interception */
34static int handle_set_clock(struct kvm_vcpu *vcpu) 34static int handle_set_clock(struct kvm_vcpu *vcpu)
35{ 35{
36 struct kvm_vcpu *cpup; 36 int rc;
37 s64 hostclk, val;
38 int i, rc;
39 ar_t ar; 37 ar_t ar;
40 u64 op2; 38 u64 op2, val;
41 39
42 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 40 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
43 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 41 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -49,19 +47,8 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
49 if (rc) 47 if (rc)
50 return kvm_s390_inject_prog_cond(vcpu, rc); 48 return kvm_s390_inject_prog_cond(vcpu, rc);
51 49
52 if (store_tod_clock(&hostclk)) {
53 kvm_s390_set_psw_cc(vcpu, 3);
54 return 0;
55 }
56 VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val); 50 VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val);
57 val = (val - hostclk) & ~0x3fUL; 51 kvm_s390_set_tod_clock(vcpu->kvm, val);
58
59 mutex_lock(&vcpu->kvm->lock);
60 preempt_disable();
61 kvm_for_each_vcpu(i, cpup, vcpu->kvm)
62 cpup->arch.sie_block->epoch = val;
63 preempt_enable();
64 mutex_unlock(&vcpu->kvm->lock);
65 52
66 kvm_s390_set_psw_cc(vcpu, 0); 53 kvm_s390_set_psw_cc(vcpu, 0);
67 return 0; 54 return 0;
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 046c7fb1ca43..a210eba2727c 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -33,6 +33,11 @@ enum irq_remap_cap {
33 IRQ_POSTING_CAP = 0, 33 IRQ_POSTING_CAP = 0,
34}; 34};
35 35
36struct vcpu_data {
37 u64 pi_desc_addr; /* Physical address of PI Descriptor */
38 u32 vector; /* Guest vector of the interrupt */
39};
40
36#ifdef CONFIG_IRQ_REMAP 41#ifdef CONFIG_IRQ_REMAP
37 42
38extern bool irq_remapping_cap(enum irq_remap_cap cap); 43extern bool irq_remapping_cap(enum irq_remap_cap cap);
@@ -58,11 +63,6 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void)
58 return x86_vector_domain; 63 return x86_vector_domain;
59} 64}
60 65
61struct vcpu_data {
62 u64 pi_desc_addr; /* Physical address of PI Descriptor */
63 u32 vector; /* Guest vector of the interrupt */
64};
65
66#else /* CONFIG_IRQ_REMAP */ 66#else /* CONFIG_IRQ_REMAP */
67 67
68static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } 68static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index e16466ec473c..e9cd7befcb76 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -112,6 +112,16 @@ struct x86_emulate_ops {
112 struct x86_exception *fault); 112 struct x86_exception *fault);
113 113
114 /* 114 /*
115 * read_phys: Read bytes of standard (non-emulated/special) memory.
116 * Used for descriptor reading.
117 * @addr: [IN ] Physical address from which to read.
118 * @val: [OUT] Value read from memory.
119 * @bytes: [IN ] Number of bytes to read from memory.
120 */
121 int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr,
122 void *val, unsigned int bytes);
123
124 /*
115 * write_std: Write bytes of standard (non-emulated/special) memory. 125 * write_std: Write bytes of standard (non-emulated/special) memory.
116 * Used for descriptor writing. 126 * Used for descriptor writing.
117 * @addr: [IN ] Linear address to which to write. 127 * @addr: [IN ] Linear address to which to write.
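
read_std cannot be reused for the SMM state-save area because SMBASE is a physical address, while read_std goes through linear-address translation. Only the hook is declared here; one plausible backing implementation on top of the existing guest-memory helpers (a sketch, not code from this series):

	static int emulator_read_phys(struct x86_emulate_ctxt *ctxt, unsigned long addr,
				      void *val, unsigned int bytes)
	{
		struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
		int rc = kvm_vcpu_read_guest(vcpu, addr, val, bytes);

		return rc < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
	}
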
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3a36ee704c30..9265196e877f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -24,6 +24,7 @@
24#include <linux/perf_event.h> 24#include <linux/perf_event.h>
25#include <linux/pvclock_gtod.h> 25#include <linux/pvclock_gtod.h>
26#include <linux/clocksource.h> 26#include <linux/clocksource.h>
27#include <linux/irqbypass.h>
27 28
28#include <asm/pvclock-abi.h> 29#include <asm/pvclock-abi.h>
29#include <asm/desc.h> 30#include <asm/desc.h>
@@ -176,6 +177,8 @@ enum {
176 */ 177 */
177#define KVM_APIC_PV_EOI_PENDING 1 178#define KVM_APIC_PV_EOI_PENDING 1
178 179
180struct kvm_kernel_irq_routing_entry;
181
179/* 182/*
180 * We don't want allocation failures within the mmu code, so we preallocate 183 * We don't want allocation failures within the mmu code, so we preallocate
181 * enough memory for a single page fault in a cache. 184 * enough memory for a single page fault in a cache.
@@ -374,6 +377,7 @@ struct kvm_mtrr {
374/* Hyper-V per vcpu emulation context */ 377/* Hyper-V per vcpu emulation context */
375struct kvm_vcpu_hv { 378struct kvm_vcpu_hv {
376 u64 hv_vapic; 379 u64 hv_vapic;
380 s64 runtime_offset;
377}; 381};
378 382
379struct kvm_vcpu_arch { 383struct kvm_vcpu_arch {
@@ -396,6 +400,7 @@ struct kvm_vcpu_arch {
396 u64 efer; 400 u64 efer;
397 u64 apic_base; 401 u64 apic_base;
398 struct kvm_lapic *apic; /* kernel irqchip context */ 402 struct kvm_lapic *apic; /* kernel irqchip context */
403 u64 eoi_exit_bitmap[4];
399 unsigned long apic_attention; 404 unsigned long apic_attention;
400 int32_t apic_arb_prio; 405 int32_t apic_arb_prio;
401 int mp_state; 406 int mp_state;
@@ -573,6 +578,9 @@ struct kvm_vcpu_arch {
573 struct { 578 struct {
574 bool pv_unhalted; 579 bool pv_unhalted;
575 } pv; 580 } pv;
581
582 int pending_ioapic_eoi;
583 int pending_external_vector;
576}; 584};
577 585
578struct kvm_lpage_info { 586struct kvm_lpage_info {
@@ -683,6 +691,9 @@ struct kvm_arch {
683 u32 bsp_vcpu_id; 691 u32 bsp_vcpu_id;
684 692
685 u64 disabled_quirks; 693 u64 disabled_quirks;
694
695 bool irqchip_split;
696 u8 nr_reserved_ioapic_pins;
686}; 697};
687 698
688struct kvm_vm_stat { 699struct kvm_vm_stat {
@@ -819,10 +830,10 @@ struct kvm_x86_ops {
819 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 830 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
820 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 831 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
821 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 832 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
822 int (*vm_has_apicv)(struct kvm *kvm); 833 int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu);
823 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); 834 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
824 void (*hwapic_isr_update)(struct kvm *kvm, int isr); 835 void (*hwapic_isr_update)(struct kvm *kvm, int isr);
825 void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); 836 void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu);
826 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); 837 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
827 void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); 838 void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
828 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); 839 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
@@ -887,6 +898,20 @@ struct kvm_x86_ops {
887 gfn_t offset, unsigned long mask); 898 gfn_t offset, unsigned long mask);
888 /* pmu operations of sub-arch */ 899 /* pmu operations of sub-arch */
889 const struct kvm_pmu_ops *pmu_ops; 900 const struct kvm_pmu_ops *pmu_ops;
901
902 /*
903 * Architecture specific hooks for vCPU blocking due to
904 * HLT instruction.
905 * Returns for .pre_block():
906 * - 0 means continue to block the vCPU.
907 * - 1 means we cannot block the vCPU since some event
908 * happens during this period, such as, 'ON' bit in
909 * posted-interrupts descriptor is set.
910 */
911 int (*pre_block)(struct kvm_vcpu *vcpu);
912 void (*post_block)(struct kvm_vcpu *vcpu);
913 int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
914 uint32_t guest_irq, bool set);
890}; 915};
891 916
892struct kvm_arch_async_pf { 917struct kvm_arch_async_pf {
@@ -1231,4 +1256,13 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
1231bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); 1256bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
1232bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); 1257bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
1233 1258
1259bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
1260 struct kvm_vcpu **dest_vcpu);
1261
1262void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
1263 struct kvm_lapic_irq *irq);
1264
1265static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
1266static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
1267
1234#endif /* _ASM_X86_KVM_HOST_H */ 1268#endif /* _ASM_X86_KVM_HOST_H */
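
pre_block()/post_block() are consumed by the x86 run loop rather than by common code: the vCPU is only parked if pre_block() agrees, and post_block() undoes whatever posting setup was done. Roughly, as a simplified sketch of the vcpu_block() changes elsewhere in this series (not a verbatim copy):

	static int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
	{
		/* A non-zero return from pre_block() means an event arrived while
		 * switching over (for instance the posted-interrupt descriptor's
		 * ON bit was set), so the vCPU must not be put to sleep. */
		if (kvm_x86_ops->pre_block && kvm_x86_ops->pre_block(vcpu))
			return 1;

		kvm_vcpu_block(vcpu);

		if (kvm_x86_ops->post_block)
			kvm_x86_ops->post_block(vcpu);

		return 1;
	}
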
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 448b7ca61aee..aa336ff3e03e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -72,7 +72,7 @@
72#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 72#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
73#define SECONDARY_EXEC_ENABLE_PML 0x00020000 73#define SECONDARY_EXEC_ENABLE_PML 0x00020000
74#define SECONDARY_EXEC_XSAVES 0x00100000 74#define SECONDARY_EXEC_XSAVES 0x00100000
75 75#define SECONDARY_EXEC_PCOMMIT 0x00200000
76 76
77#define PIN_BASED_EXT_INTR_MASK 0x00000001 77#define PIN_BASED_EXT_INTR_MASK 0x00000001
78#define PIN_BASED_NMI_EXITING 0x00000008 78#define PIN_BASED_NMI_EXITING 0x00000008
@@ -416,6 +416,7 @@ enum vmcs_field {
416#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 416#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
417#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 417#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
418 418
419#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */
419#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ 420#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
420#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ 421#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
421 422
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index f0412c50c47b..040d4083c24f 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -153,6 +153,12 @@
153/* MSR used to provide vcpu index */ 153/* MSR used to provide vcpu index */
154#define HV_X64_MSR_VP_INDEX 0x40000002 154#define HV_X64_MSR_VP_INDEX 0x40000002
155 155
156/* MSR used to reset the guest OS. */
157#define HV_X64_MSR_RESET 0x40000003
158
159/* MSR used to provide vcpu runtime in 100ns units */
160#define HV_X64_MSR_VP_RUNTIME 0x40000010
161
156/* MSR used to read the per-partition time reference counter */ 162/* MSR used to read the per-partition time reference counter */
157#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 163#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
158 164
@@ -251,4 +257,16 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
251 __s64 tsc_offset; 257 __s64 tsc_offset;
252} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; 258} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
253 259
260/* Define the number of synthetic interrupt sources. */
261#define HV_SYNIC_SINT_COUNT (16)
262/* Define the expected SynIC version. */
263#define HV_SYNIC_VERSION_1 (0x1)
264
265#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0)
266#define HV_SYNIC_SIMP_ENABLE (1ULL << 0)
267#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0)
268#define HV_SYNIC_SINT_MASKED (1ULL << 16)
269#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17)
270#define HV_SYNIC_SINT_VECTOR_MASK (0xFF)
271
254#endif 272#endif
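
The new SINT constants describe the bit layout of the per-source synthetic interrupt registers (SINT0..SINT15) used by the Hyper-V SynIC. A small decode sketch using only the macros added above (user-space style; assumes this uapi header is installed as <asm/hyperv.h>, and the sample value is made up):

	#include <stdint.h>
	#include <stdio.h>
	#include <asm/hyperv.h>

	static void print_sint(uint64_t val)
	{
		printf("vector=%u masked=%d auto_eoi=%d\n",
		       (unsigned int)(val & HV_SYNIC_SINT_VECTOR_MASK),
		       !!(val & HV_SYNIC_SINT_MASKED),
		       !!(val & HV_SYNIC_SINT_AUTO_EOI));
	}

	int main(void)
	{
		print_sint(0x10030);	/* vector 0x30, masked, no auto-EOI */
		return 0;
	}
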
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 37fee272618f..5b15d94a33f8 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -78,6 +78,7 @@
78#define EXIT_REASON_PML_FULL 62 78#define EXIT_REASON_PML_FULL 62
79#define EXIT_REASON_XSAVES 63 79#define EXIT_REASON_XSAVES 63
80#define EXIT_REASON_XRSTORS 64 80#define EXIT_REASON_XRSTORS 64
81#define EXIT_REASON_PCOMMIT 65
81 82
82#define VMX_EXIT_REASONS \ 83#define VMX_EXIT_REASONS \
83 { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ 84 { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -126,7 +127,8 @@
126 { EXIT_REASON_INVVPID, "INVVPID" }, \ 127 { EXIT_REASON_INVVPID, "INVVPID" }, \
127 { EXIT_REASON_INVPCID, "INVPCID" }, \ 128 { EXIT_REASON_INVPCID, "INVPCID" }, \
128 { EXIT_REASON_XSAVES, "XSAVES" }, \ 129 { EXIT_REASON_XSAVES, "XSAVES" }, \
129 { EXIT_REASON_XRSTORS, "XRSTORS" } 130 { EXIT_REASON_XRSTORS, "XRSTORS" }, \
131 { EXIT_REASON_PCOMMIT, "PCOMMIT" }
130 132
131#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 133#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
132#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 134#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 2c7aafa70702..2bd81e302427 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -32,6 +32,7 @@
32static int kvmclock = 1; 32static int kvmclock = 1;
33static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; 33static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
34static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; 34static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
35static cycle_t kvm_sched_clock_offset;
35 36
36static int parse_no_kvmclock(char *arg) 37static int parse_no_kvmclock(char *arg)
37{ 38{
@@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
92 return kvm_clock_read(); 93 return kvm_clock_read();
93} 94}
94 95
96static cycle_t kvm_sched_clock_read(void)
97{
98 return kvm_clock_read() - kvm_sched_clock_offset;
99}
100
101static inline void kvm_sched_clock_init(bool stable)
102{
103 if (!stable) {
104 pv_time_ops.sched_clock = kvm_clock_read;
105 return;
106 }
107
108 kvm_sched_clock_offset = kvm_clock_read();
109 pv_time_ops.sched_clock = kvm_sched_clock_read;
110 set_sched_clock_stable();
111
112 printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
113 kvm_sched_clock_offset);
114
115 BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
116 sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
117}
118
95/* 119/*
96 * If we don't do that, there is the possibility that the guest 120 * If we don't do that, there is the possibility that the guest
97 * will calibrate under heavy load - thus, getting a lower lpj - 121 * will calibrate under heavy load - thus, getting a lower lpj -
@@ -248,7 +272,17 @@ void __init kvmclock_init(void)
248 memblock_free(mem, size); 272 memblock_free(mem, size);
249 return; 273 return;
250 } 274 }
251 pv_time_ops.sched_clock = kvm_clock_read; 275
276 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
277 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
278
279 cpu = get_cpu();
280 vcpu_time = &hv_clock[cpu].pvti;
281 flags = pvclock_read_flags(vcpu_time);
282
283 kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
284 put_cpu();
285
252 x86_platform.calibrate_tsc = kvm_get_tsc_khz; 286 x86_platform.calibrate_tsc = kvm_get_tsc_khz;
253 x86_platform.get_wallclock = kvm_get_wallclock; 287 x86_platform.get_wallclock = kvm_get_wallclock;
254 x86_platform.set_wallclock = kvm_set_wallclock; 288 x86_platform.set_wallclock = kvm_set_wallclock;
@@ -265,16 +299,6 @@ void __init kvmclock_init(void)
265 kvm_get_preset_lpj(); 299 kvm_get_preset_lpj();
266 clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); 300 clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
267 pv_info.name = "KVM"; 301 pv_info.name = "KVM";
268
269 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
270 pvclock_set_flags(~0);
271
272 cpu = get_cpu();
273 vcpu_time = &hv_clock[cpu].pvti;
274 flags = pvclock_read_flags(vcpu_time);
275 if (flags & PVCLOCK_COUNTS_FROM_ZERO)
276 set_sched_clock_stable();
277 put_cpu();
278} 302}
279 303
280int __init kvm_setup_vsyscall_timeinfo(void) 304int __init kvm_setup_vsyscall_timeinfo(void)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index d8a1d56276e1..639a6e34500c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,8 @@ config KVM
28 select ANON_INODES 28 select ANON_INODES
29 select HAVE_KVM_IRQCHIP 29 select HAVE_KVM_IRQCHIP
30 select HAVE_KVM_IRQFD 30 select HAVE_KVM_IRQFD
31 select IRQ_BYPASS_MANAGER
32 select HAVE_KVM_IRQ_BYPASS
31 select HAVE_KVM_IRQ_ROUTING 33 select HAVE_KVM_IRQ_ROUTING
32 select HAVE_KVM_EVENTFD 34 select HAVE_KVM_EVENTFD
33 select KVM_APIC_ARCHITECTURE 35 select KVM_APIC_ARCHITECTURE
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
index d090ecf08809..9dc091acd5fb 100644
--- a/arch/x86/kvm/assigned-dev.c
+++ b/arch/x86/kvm/assigned-dev.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include "irq.h" 22#include "irq.h"
23#include "assigned-dev.h" 23#include "assigned-dev.h"
24#include "trace/events/kvm.h"
24 25
25struct kvm_assigned_dev_kernel { 26struct kvm_assigned_dev_kernel {
26 struct kvm_irq_ack_notifier ack_notifier; 27 struct kvm_irq_ack_notifier ack_notifier;
@@ -131,7 +132,42 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
131 return IRQ_HANDLED; 132 return IRQ_HANDLED;
132} 133}
133 134
134#ifdef __KVM_HAVE_MSI 135/*
136 * Deliver an IRQ in an atomic context if we can, or return a failure,
137 * user can retry in a process context.
138 * Return value:
139 * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
140 * Other values - No need to retry.
141 */
142static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
143 int level)
144{
145 struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
146 struct kvm_kernel_irq_routing_entry *e;
147 int ret = -EINVAL;
148 int idx;
149
150 trace_kvm_set_irq(irq, level, irq_source_id);
151
152 /*
153 * Injection into either PIC or IOAPIC might need to scan all CPUs,
154 * which would need to be retried from thread context; when same GSI
155 * is connected to both PIC and IOAPIC, we'd have to report a
156 * partial failure here.
157 * Since there's no easy way to do this, we only support injecting MSI
158 * which is limited to 1:1 GSI mapping.
159 */
160 idx = srcu_read_lock(&kvm->irq_srcu);
161 if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
162 e = &entries[0];
163 ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id,
164 irq, level);
165 }
166 srcu_read_unlock(&kvm->irq_srcu, idx);
167 return ret;
168}
169
170
135static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) 171static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
136{ 172{
137 struct kvm_assigned_dev_kernel *assigned_dev = dev_id; 173 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -150,9 +186,7 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
150 186
151 return IRQ_HANDLED; 187 return IRQ_HANDLED;
152} 188}
153#endif
154 189
155#ifdef __KVM_HAVE_MSIX
156static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) 190static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
157{ 191{
158 struct kvm_assigned_dev_kernel *assigned_dev = dev_id; 192 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -183,7 +217,6 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
183 217
184 return IRQ_HANDLED; 218 return IRQ_HANDLED;
185} 219}
186#endif
187 220
188/* Ack the irq line for an assigned device */ 221/* Ack the irq line for an assigned device */
189static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) 222static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
@@ -386,7 +419,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
386 return 0; 419 return 0;
387} 420}
388 421
389#ifdef __KVM_HAVE_MSI
390static int assigned_device_enable_host_msi(struct kvm *kvm, 422static int assigned_device_enable_host_msi(struct kvm *kvm,
391 struct kvm_assigned_dev_kernel *dev) 423 struct kvm_assigned_dev_kernel *dev)
392{ 424{
@@ -408,9 +440,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
408 440
409 return 0; 441 return 0;
410} 442}
411#endif
412 443
413#ifdef __KVM_HAVE_MSIX
414static int assigned_device_enable_host_msix(struct kvm *kvm, 444static int assigned_device_enable_host_msix(struct kvm *kvm,
415 struct kvm_assigned_dev_kernel *dev) 445 struct kvm_assigned_dev_kernel *dev)
416{ 446{
@@ -443,8 +473,6 @@ err:
443 return r; 473 return r;
444} 474}
445 475
446#endif
447
448static int assigned_device_enable_guest_intx(struct kvm *kvm, 476static int assigned_device_enable_guest_intx(struct kvm *kvm,
449 struct kvm_assigned_dev_kernel *dev, 477 struct kvm_assigned_dev_kernel *dev,
450 struct kvm_assigned_irq *irq) 478 struct kvm_assigned_irq *irq)
@@ -454,7 +482,6 @@ static int assigned_device_enable_guest_intx(struct kvm *kvm,
454 return 0; 482 return 0;
455} 483}
456 484
457#ifdef __KVM_HAVE_MSI
458static int assigned_device_enable_guest_msi(struct kvm *kvm, 485static int assigned_device_enable_guest_msi(struct kvm *kvm,
459 struct kvm_assigned_dev_kernel *dev, 486 struct kvm_assigned_dev_kernel *dev,
460 struct kvm_assigned_irq *irq) 487 struct kvm_assigned_irq *irq)
@@ -463,9 +490,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
463 dev->ack_notifier.gsi = -1; 490 dev->ack_notifier.gsi = -1;
464 return 0; 491 return 0;
465} 492}
466#endif
467 493
468#ifdef __KVM_HAVE_MSIX
469static int assigned_device_enable_guest_msix(struct kvm *kvm, 494static int assigned_device_enable_guest_msix(struct kvm *kvm,
470 struct kvm_assigned_dev_kernel *dev, 495 struct kvm_assigned_dev_kernel *dev,
471 struct kvm_assigned_irq *irq) 496 struct kvm_assigned_irq *irq)
@@ -474,7 +499,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
474 dev->ack_notifier.gsi = -1; 499 dev->ack_notifier.gsi = -1;
475 return 0; 500 return 0;
476} 501}
477#endif
478 502
479static int assign_host_irq(struct kvm *kvm, 503static int assign_host_irq(struct kvm *kvm,
480 struct kvm_assigned_dev_kernel *dev, 504 struct kvm_assigned_dev_kernel *dev,
@@ -492,16 +516,12 @@ static int assign_host_irq(struct kvm *kvm,
492 case KVM_DEV_IRQ_HOST_INTX: 516 case KVM_DEV_IRQ_HOST_INTX:
493 r = assigned_device_enable_host_intx(kvm, dev); 517 r = assigned_device_enable_host_intx(kvm, dev);
494 break; 518 break;
495#ifdef __KVM_HAVE_MSI
496 case KVM_DEV_IRQ_HOST_MSI: 519 case KVM_DEV_IRQ_HOST_MSI:
497 r = assigned_device_enable_host_msi(kvm, dev); 520 r = assigned_device_enable_host_msi(kvm, dev);
498 break; 521 break;
499#endif
500#ifdef __KVM_HAVE_MSIX
501 case KVM_DEV_IRQ_HOST_MSIX: 522 case KVM_DEV_IRQ_HOST_MSIX:
502 r = assigned_device_enable_host_msix(kvm, dev); 523 r = assigned_device_enable_host_msix(kvm, dev);
503 break; 524 break;
504#endif
505 default: 525 default:
506 r = -EINVAL; 526 r = -EINVAL;
507 } 527 }
@@ -534,16 +554,12 @@ static int assign_guest_irq(struct kvm *kvm,
534 case KVM_DEV_IRQ_GUEST_INTX: 554 case KVM_DEV_IRQ_GUEST_INTX:
535 r = assigned_device_enable_guest_intx(kvm, dev, irq); 555 r = assigned_device_enable_guest_intx(kvm, dev, irq);
536 break; 556 break;
537#ifdef __KVM_HAVE_MSI
538 case KVM_DEV_IRQ_GUEST_MSI: 557 case KVM_DEV_IRQ_GUEST_MSI:
539 r = assigned_device_enable_guest_msi(kvm, dev, irq); 558 r = assigned_device_enable_guest_msi(kvm, dev, irq);
540 break; 559 break;
541#endif
542#ifdef __KVM_HAVE_MSIX
543 case KVM_DEV_IRQ_GUEST_MSIX: 560 case KVM_DEV_IRQ_GUEST_MSIX:
544 r = assigned_device_enable_guest_msix(kvm, dev, irq); 561 r = assigned_device_enable_guest_msix(kvm, dev, irq);
545 break; 562 break;
546#endif
547 default: 563 default:
548 r = -EINVAL; 564 r = -EINVAL;
549 } 565 }
@@ -826,7 +842,6 @@ out:
826} 842}
827 843
828 844
829#ifdef __KVM_HAVE_MSIX
830static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, 845static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
831 struct kvm_assigned_msix_nr *entry_nr) 846 struct kvm_assigned_msix_nr *entry_nr)
832{ 847{
@@ -906,7 +921,6 @@ msix_entry_out:
906 921
907 return r; 922 return r;
908} 923}
909#endif
910 924
911static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, 925static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
912 struct kvm_assigned_pci_dev *assigned_dev) 926 struct kvm_assigned_pci_dev *assigned_dev)
@@ -1012,7 +1026,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
1012 goto out; 1026 goto out;
1013 break; 1027 break;
1014 } 1028 }
1015#ifdef __KVM_HAVE_MSIX
1016 case KVM_ASSIGN_SET_MSIX_NR: { 1029 case KVM_ASSIGN_SET_MSIX_NR: {
1017 struct kvm_assigned_msix_nr entry_nr; 1030 struct kvm_assigned_msix_nr entry_nr;
1018 r = -EFAULT; 1031 r = -EFAULT;
@@ -1033,7 +1046,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
1033 goto out; 1046 goto out;
1034 break; 1047 break;
1035 } 1048 }
1036#endif
1037 case KVM_ASSIGN_SET_INTX_MASK: { 1049 case KVM_ASSIGN_SET_INTX_MASK: {
1038 struct kvm_assigned_pci_dev assigned_dev; 1050 struct kvm_assigned_pci_dev assigned_dev;
1039 1051
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 156441bcaac8..6525e926f566 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
348 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | 348 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
349 F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | 349 F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
350 F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | 350 F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
351 F(AVX512CD); 351 F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
352 352
353 /* cpuid 0xD.1.eax */ 353 /* cpuid 0xD.1.eax */
354 const u32 kvm_supported_word10_x86_features = 354 const u32 kvm_supported_word10_x86_features =
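
Guests discover these flush/persist instructions through CPUID leaf 7, sub-leaf 0, EBX, which is exactly the feature word extended above. A guest-side check, with the bit positions (CLFLUSHOPT=23, CLWB=24, PCOMMIT=22) taken from the SDM rather than from this diff:

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		__cpuid_count(7, 0, eax, ebx, ecx, edx);
		printf("clflushopt=%d clwb=%d pcommit=%d\n",
		       !!(ebx & (1u << 23)),	/* CLFLUSHOPT */
		       !!(ebx & (1u << 24)),	/* CLWB */
		       !!(ebx & (1u << 22)));	/* PCOMMIT */
		return 0;
	}
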
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index dd05b9cef6ae..06332cb7e7d1 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -133,4 +133,41 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
133 best = kvm_find_cpuid_entry(vcpu, 7, 0); 133 best = kvm_find_cpuid_entry(vcpu, 7, 0);
134 return best && (best->ebx & bit(X86_FEATURE_MPX)); 134 return best && (best->ebx & bit(X86_FEATURE_MPX));
135} 135}
136
137static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu)
138{
139 struct kvm_cpuid_entry2 *best;
140
141 best = kvm_find_cpuid_entry(vcpu, 7, 0);
142 return best && (best->ebx & bit(X86_FEATURE_PCOMMIT));
143}
144
145static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
146{
147 struct kvm_cpuid_entry2 *best;
148
149 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
150 return best && (best->edx & bit(X86_FEATURE_RDTSCP));
151}
152
153/*
154 * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3
155 */
156#define BIT_NRIPS 3
157
158static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu)
159{
160 struct kvm_cpuid_entry2 *best;
161
162 best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0);
163
164 /*
165 * NRIPS is a scattered cpuid feature, so we can't use
166 * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit
167 * position 8, not 3).
168 */
169 return best && (best->edx & bit(BIT_NRIPS));
170}
171#undef BIT_NRIPS
172
136#endif 173#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9da95b9daf8d..1505587d06e9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2272,8 +2272,8 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
2272#define GET_SMSTATE(type, smbase, offset) \ 2272#define GET_SMSTATE(type, smbase, offset) \
2273 ({ \ 2273 ({ \
2274 type __val; \ 2274 type __val; \
2275 int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \ 2275 int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \
2276 sizeof(__val), NULL); \ 2276 sizeof(__val)); \
2277 if (r != X86EMUL_CONTINUE) \ 2277 if (r != X86EMUL_CONTINUE) \
2278 return X86EMUL_UNHANDLEABLE; \ 2278 return X86EMUL_UNHANDLEABLE; \
2279 __val; \ 2279 __val; \
@@ -2484,17 +2484,36 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
2484 2484
2485 /* 2485 /*
2486 * Get back to real mode, to prepare a safe state in which to load 2486 * Get back to real mode, to prepare a safe state in which to load
2487 * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed 2487 * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
2488 * to read_std/write_std are not virtual. 2488 * supports long mode.
2489 *
2490 * CR4.PCIDE must be zero, because it is a 64-bit mode only feature.
2491 */ 2489 */
2490 cr4 = ctxt->ops->get_cr(ctxt, 4);
2491 if (emulator_has_longmode(ctxt)) {
2492 struct desc_struct cs_desc;
2493
2494 /* Zero CR4.PCIDE before CR0.PG. */
2495 if (cr4 & X86_CR4_PCIDE) {
2496 ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
2497 cr4 &= ~X86_CR4_PCIDE;
2498 }
2499
2500 /* A 32-bit code segment is required to clear EFER.LMA. */
2501 memset(&cs_desc, 0, sizeof(cs_desc));
2502 cs_desc.type = 0xb;
2503 cs_desc.s = cs_desc.g = cs_desc.p = 1;
2504 ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS);
2505 }
2506
2507 /* For the 64-bit case, this will clear EFER.LMA. */
2492 cr0 = ctxt->ops->get_cr(ctxt, 0); 2508 cr0 = ctxt->ops->get_cr(ctxt, 0);
2493 if (cr0 & X86_CR0_PE) 2509 if (cr0 & X86_CR0_PE)
2494 ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); 2510 ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
2495 cr4 = ctxt->ops->get_cr(ctxt, 4); 2511
2512 /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */
2496 if (cr4 & X86_CR4_PAE) 2513 if (cr4 & X86_CR4_PAE)
2497 ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); 2514 ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
2515
2516 /* And finally go back to 32-bit mode. */
2498 efer = 0; 2517 efer = 0;
2499 ctxt->ops->set_msr(ctxt, MSR_EFER, efer); 2518 ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
2500 2519
@@ -4455,7 +4474,7 @@ static const struct opcode twobyte_table[256] = {
4455 F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, 4474 F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
4456 /* 0xA8 - 0xAF */ 4475 /* 0xA8 - 0xAF */
4457 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), 4476 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
4458 II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm), 4477 II(EmulateOnUD | ImplicitOps, em_rsm, rsm),
4459 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), 4478 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
4460 F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), 4479 F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
4461 F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), 4480 F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index a8160d2ae362..62cf8c915e95 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -41,6 +41,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
41 case HV_X64_MSR_TIME_REF_COUNT: 41 case HV_X64_MSR_TIME_REF_COUNT:
42 case HV_X64_MSR_CRASH_CTL: 42 case HV_X64_MSR_CRASH_CTL:
43 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: 43 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
44 case HV_X64_MSR_RESET:
44 r = true; 45 r = true;
45 break; 46 break;
46 } 47 }
@@ -163,6 +164,12 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
163 data); 164 data);
164 case HV_X64_MSR_CRASH_CTL: 165 case HV_X64_MSR_CRASH_CTL:
165 return kvm_hv_msr_set_crash_ctl(vcpu, data, host); 166 return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
167 case HV_X64_MSR_RESET:
168 if (data == 1) {
169 vcpu_debug(vcpu, "hyper-v reset requested\n");
170 kvm_make_request(KVM_REQ_HV_RESET, vcpu);
171 }
172 break;
166 default: 173 default:
167 vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", 174
168 msr, data); 175 msr, data);
@@ -171,7 +178,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
171 return 0; 178 return 0;
172} 179}
173 180
174static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 181/* Calculate cpu time spent by current task in 100ns units */
182static u64 current_task_runtime_100ns(void)
183{
184 cputime_t utime, stime;
185
186 task_cputime_adjusted(current, &utime, &stime);
187 return div_u64(cputime_to_nsecs(utime + stime), 100);
188}
189
190static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
175{ 191{
176 struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; 192 struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
177 193
@@ -205,6 +221,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
205 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); 221 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
206 case HV_X64_MSR_TPR: 222 case HV_X64_MSR_TPR:
207 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); 223 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
224 case HV_X64_MSR_VP_RUNTIME:
225 if (!host)
226 return 1;
227 hv->runtime_offset = data - current_task_runtime_100ns();
228 break;
208 default: 229 default:
209 vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", 230
210 msr, data); 231 msr, data);
@@ -241,6 +262,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
241 pdata); 262 pdata);
242 case HV_X64_MSR_CRASH_CTL: 263 case HV_X64_MSR_CRASH_CTL:
243 return kvm_hv_msr_get_crash_ctl(vcpu, pdata); 264 return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
265 case HV_X64_MSR_RESET:
266 data = 0;
267 break;
244 default: 268 default:
245 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 269 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
246 return 1; 270 return 1;
@@ -277,6 +301,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
277 case HV_X64_MSR_APIC_ASSIST_PAGE: 301 case HV_X64_MSR_APIC_ASSIST_PAGE:
278 data = hv->hv_vapic; 302 data = hv->hv_vapic;
279 break; 303 break;
304 case HV_X64_MSR_VP_RUNTIME:
305 data = current_task_runtime_100ns() + hv->runtime_offset;
306 break;
280 default: 307 default:
281 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 308 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
282 return 1; 309 return 1;
@@ -295,7 +322,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
295 mutex_unlock(&vcpu->kvm->lock); 322 mutex_unlock(&vcpu->kvm->lock);
296 return r; 323 return r;
297 } else 324 } else
298 return kvm_hv_set_msr(vcpu, msr, data); 325 return kvm_hv_set_msr(vcpu, msr, data, host);
299} 326}
300 327
301int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 328int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
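
The host flag matters for migration: only a host-initiated write (the KVM_SET_MSRS ioctl path) is allowed to set HV_X64_MSR_VP_RUNTIME, so a VMM can restore the counter on the destination while the guest itself sees it as read-only. A hypothetical restore helper (struct layout per the KVM uapi; error handling omitted):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int restore_vp_runtime(int vcpu_fd, __u64 runtime_100ns)
	{
		struct {
			struct kvm_msrs hdr;
			struct kvm_msr_entry entry;
		} msrs;

		memset(&msrs, 0, sizeof(msrs));
		msrs.hdr.nmsrs = 1;
		msrs.entry.index = 0x40000010;	/* HV_X64_MSR_VP_RUNTIME */
		msrs.entry.data = runtime_100ns;

		return ioctl(vcpu_fd, KVM_SET_MSRS, &msrs);
	}
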
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index f90952f64e79..08116ff227cc 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -35,6 +35,7 @@
35#include <linux/kvm_host.h> 35#include <linux/kvm_host.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37 37
38#include "ioapic.h"
38#include "irq.h" 39#include "irq.h"
39#include "i8254.h" 40#include "i8254.h"
40#include "x86.h" 41#include "x86.h"
@@ -333,7 +334,8 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
333 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; 334 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
334 s64 interval; 335 s64 interval;
335 336
336 if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) 337 if (!ioapic_in_kernel(kvm) ||
338 ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
337 return; 339 return;
338 340
339 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 341 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 856f79105bb5..88d0a92d3f94 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -233,21 +233,7 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
233} 233}
234 234
235 235
236static void update_handled_vectors(struct kvm_ioapic *ioapic) 236void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
237{
238 DECLARE_BITMAP(handled_vectors, 256);
239 int i;
240
241 memset(handled_vectors, 0, sizeof(handled_vectors));
242 for (i = 0; i < IOAPIC_NUM_PINS; ++i)
243 __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
244 memcpy(ioapic->handled_vectors, handled_vectors,
245 sizeof(handled_vectors));
246 smp_wmb();
247}
248
249void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
250 u32 *tmr)
251{ 237{
252 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; 238 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
253 union kvm_ioapic_redirect_entry *e; 239 union kvm_ioapic_redirect_entry *e;
@@ -260,13 +246,11 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
260 kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || 246 kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
261 index == RTC_GSI) { 247 index == RTC_GSI) {
262 if (kvm_apic_match_dest(vcpu, NULL, 0, 248 if (kvm_apic_match_dest(vcpu, NULL, 0,
263 e->fields.dest_id, e->fields.dest_mode)) { 249 e->fields.dest_id, e->fields.dest_mode) ||
250 (e->fields.trig_mode == IOAPIC_EDGE_TRIG &&
251 kvm_apic_pending_eoi(vcpu, e->fields.vector)))
264 __set_bit(e->fields.vector, 252 __set_bit(e->fields.vector,
265 (unsigned long *)eoi_exit_bitmap); 253 (unsigned long *)eoi_exit_bitmap);
266 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG)
267 __set_bit(e->fields.vector,
268 (unsigned long *)tmr);
269 }
270 } 254 }
271 } 255 }
272 spin_unlock(&ioapic->lock); 256 spin_unlock(&ioapic->lock);
@@ -315,7 +299,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
315 e->bits |= (u32) val; 299 e->bits |= (u32) val;
316 e->fields.remote_irr = 0; 300 e->fields.remote_irr = 0;
317 } 301 }
318 update_handled_vectors(ioapic);
319 mask_after = e->fields.mask; 302 mask_after = e->fields.mask;
320 if (mask_before != mask_after) 303 if (mask_before != mask_after)
321 kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); 304 kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
@@ -599,7 +582,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
599 ioapic->id = 0; 582 ioapic->id = 0;
600 memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); 583 memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
601 rtc_irq_eoi_tracking_reset(ioapic); 584 rtc_irq_eoi_tracking_reset(ioapic);
602 update_handled_vectors(ioapic);
603} 585}
604 586
605static const struct kvm_io_device_ops ioapic_mmio_ops = { 587static const struct kvm_io_device_ops ioapic_mmio_ops = {
@@ -628,8 +610,10 @@ int kvm_ioapic_init(struct kvm *kvm)
628 if (ret < 0) { 610 if (ret < 0) {
629 kvm->arch.vioapic = NULL; 611 kvm->arch.vioapic = NULL;
630 kfree(ioapic); 612 kfree(ioapic);
613 return ret;
631 } 614 }
632 615
616 kvm_vcpu_request_scan_ioapic(kvm);
633 return ret; 617 return ret;
634} 618}
635 619
@@ -666,7 +650,6 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
666 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); 650 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
667 ioapic->irr = 0; 651 ioapic->irr = 0;
668 ioapic->irr_delivered = 0; 652 ioapic->irr_delivered = 0;
669 update_handled_vectors(ioapic);
670 kvm_vcpu_request_scan_ioapic(kvm); 653 kvm_vcpu_request_scan_ioapic(kvm);
671 kvm_ioapic_inject_all(ioapic, state->irr); 654 kvm_ioapic_inject_all(ioapic, state->irr);
672 spin_unlock(&ioapic->lock); 655 spin_unlock(&ioapic->lock);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index ca0b0b4e6256..084617d37c74 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -9,6 +9,7 @@ struct kvm;
9struct kvm_vcpu; 9struct kvm_vcpu;
10 10
11#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS 11#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
12#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES
12#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ 13#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
13#define IOAPIC_EDGE_TRIG 0 14#define IOAPIC_EDGE_TRIG 0
14#define IOAPIC_LEVEL_TRIG 1 15#define IOAPIC_LEVEL_TRIG 1
@@ -73,7 +74,6 @@ struct kvm_ioapic {
73 struct kvm *kvm; 74 struct kvm *kvm;
74 void (*ack_notifier)(void *opaque, int irq); 75 void (*ack_notifier)(void *opaque, int irq);
75 spinlock_t lock; 76 spinlock_t lock;
76 DECLARE_BITMAP(handled_vectors, 256);
77 struct rtc_status rtc_status; 77 struct rtc_status rtc_status;
78 struct delayed_work eoi_inject; 78 struct delayed_work eoi_inject;
79 u32 irq_eoi[IOAPIC_NUM_PINS]; 79 u32 irq_eoi[IOAPIC_NUM_PINS];
@@ -98,11 +98,12 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
98 return kvm->arch.vioapic; 98 return kvm->arch.vioapic;
99} 99}
100 100
101static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) 101static inline int ioapic_in_kernel(struct kvm *kvm)
102{ 102{
103 struct kvm_ioapic *ioapic = kvm->arch.vioapic; 103 int ret;
104 smp_rmb(); 104
105 return test_bit(vector, ioapic->handled_vectors); 105 ret = (ioapic_irqchip(kvm) != NULL);
106 return ret;
106} 107}
107 108
108void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); 109void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@@ -120,7 +121,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
120 struct kvm_lapic_irq *irq, unsigned long *dest_map); 121 struct kvm_lapic_irq *irq, unsigned long *dest_map);
121int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 122int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
122int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 123int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
123void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, 124void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
124 u32 *tmr); 125void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
125 126
126#endif 127#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index a1ec6a50a05a..097060e33bd6 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -38,14 +38,27 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
38EXPORT_SYMBOL(kvm_cpu_has_pending_timer); 38EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
39 39
40/* 40/*
41 * check if there is a pending userspace external interrupt
42 */
43static int pending_userspace_extint(struct kvm_vcpu *v)
44{
45 return v->arch.pending_external_vector != -1;
46}
47
48/*
41 * check if there is pending interrupt from 49 * check if there is pending interrupt from
42 * non-APIC source without intack. 50 * non-APIC source without intack.
43 */ 51 */
44static int kvm_cpu_has_extint(struct kvm_vcpu *v) 52static int kvm_cpu_has_extint(struct kvm_vcpu *v)
45{ 53{
46 if (kvm_apic_accept_pic_intr(v)) 54 u8 accept = kvm_apic_accept_pic_intr(v);
47 return pic_irqchip(v->kvm)->output; /* PIC */ 55
48 else 56 if (accept) {
57 if (irqchip_split(v->kvm))
58 return pending_userspace_extint(v);
59 else
60 return pic_irqchip(v->kvm)->output;
61 } else
49 return 0; 62 return 0;
50} 63}
51 64
@@ -57,13 +70,13 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
57 */ 70 */
58int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) 71int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
59{ 72{
60 if (!irqchip_in_kernel(v->kvm)) 73 if (!lapic_in_kernel(v))
61 return v->arch.interrupt.pending; 74 return v->arch.interrupt.pending;
62 75
63 if (kvm_cpu_has_extint(v)) 76 if (kvm_cpu_has_extint(v))
64 return 1; 77 return 1;
65 78
66 if (kvm_apic_vid_enabled(v->kvm)) 79 if (kvm_vcpu_apic_vid_enabled(v))
67 return 0; 80 return 0;
68 81
69 return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ 82 return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
@@ -75,7 +88,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
75 */ 88 */
76int kvm_cpu_has_interrupt(struct kvm_vcpu *v) 89int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
77{ 90{
78 if (!irqchip_in_kernel(v->kvm)) 91 if (!lapic_in_kernel(v))
79 return v->arch.interrupt.pending; 92 return v->arch.interrupt.pending;
80 93
81 if (kvm_cpu_has_extint(v)) 94 if (kvm_cpu_has_extint(v))
@@ -91,9 +104,16 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
91 */ 104 */
92static int kvm_cpu_get_extint(struct kvm_vcpu *v) 105static int kvm_cpu_get_extint(struct kvm_vcpu *v)
93{ 106{
94 if (kvm_cpu_has_extint(v)) 107 if (kvm_cpu_has_extint(v)) {
95 return kvm_pic_read_irq(v->kvm); /* PIC */ 108 if (irqchip_split(v->kvm)) {
96 return -1; 109 int vector = v->arch.pending_external_vector;
110
111 v->arch.pending_external_vector = -1;
112 return vector;
113 } else
114 return kvm_pic_read_irq(v->kvm); /* PIC */
115 } else
116 return -1;
97} 117}
98 118
99/* 119/*
@@ -103,7 +123,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
103{ 123{
104 int vector; 124 int vector;
105 125
106 if (!irqchip_in_kernel(v->kvm)) 126 if (!lapic_in_kernel(v))
107 return v->arch.interrupt.nr; 127 return v->arch.interrupt.nr;
108 128
109 vector = kvm_cpu_get_extint(v); 129 vector = kvm_cpu_get_extint(v);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 3d782a2c336a..ae5c78f2337d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -83,13 +83,38 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
83 return kvm->arch.vpic; 83 return kvm->arch.vpic;
84} 84}
85 85
86static inline int pic_in_kernel(struct kvm *kvm)
87{
88 int ret;
89
90 ret = (pic_irqchip(kvm) != NULL);
91 return ret;
92}
93
94static inline int irqchip_split(struct kvm *kvm)
95{
96 return kvm->arch.irqchip_split;
97}
98
86static inline int irqchip_in_kernel(struct kvm *kvm) 99static inline int irqchip_in_kernel(struct kvm *kvm)
87{ 100{
88 struct kvm_pic *vpic = pic_irqchip(kvm); 101 struct kvm_pic *vpic = pic_irqchip(kvm);
102 bool ret;
103
104 ret = (vpic != NULL);
105 ret |= irqchip_split(kvm);
89 106
90 /* Read vpic before kvm->irq_routing. */ 107 /* Read vpic before kvm->irq_routing. */
91 smp_rmb(); 108 smp_rmb();
92 return vpic != NULL; 109 return ret;
110}
111
112static inline int lapic_in_kernel(struct kvm_vcpu *vcpu)
113{
114 /* Same as irqchip_in_kernel(vcpu->kvm), but with less
115 * pointer chasing and no unnecessary memory barriers.
116 */
117 return vcpu->arch.apic != NULL;
93} 118}
94 119
95void kvm_pic_reset(struct kvm_kpic_state *s); 120void kvm_pic_reset(struct kvm_kpic_state *s);
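The new predicates above distinguish three configurations: no in-kernel irqchip, the full in-kernel irqchip (PIC + IOAPIC + LAPIC), and the split mode where only the LAPIC stays in the kernel while PIC/IOAPIC live in userspace. A toy model of how the checks relate follows; the struct and field names are simplified stand-ins, not the kernel's types.

/*
 * Toy model (not kernel code) of pic_in_kernel()/ioapic_in_kernel()/
 * irqchip_split()/irqchip_in_kernel() from the hunks above.
 */
#include <stdbool.h>
#include <stdio.h>

struct vm {
	void *vpic;        /* non-NULL only with the full in-kernel irqchip */
	void *vioapic;     /* likewise */
	bool irqchip_split;
};

static bool pic_in_kernel(struct vm *vm)    { return vm->vpic != NULL; }
static bool ioapic_in_kernel(struct vm *vm) { return vm->vioapic != NULL; }
static bool irqchip_split(struct vm *vm)    { return vm->irqchip_split; }

/* true whenever the kernel emulates at least the local APIC, in either mode */
static bool irqchip_in_kernel(struct vm *vm)
{
	return pic_in_kernel(vm) || irqchip_split(vm);
}

int main(void)
{
	struct vm full  = { .vpic = (void *)1, .vioapic = (void *)1 };
	struct vm split = { .irqchip_split = true };

	printf("full:  in_kernel=%d ioapic=%d split=%d\n",
	       irqchip_in_kernel(&full), ioapic_in_kernel(&full), irqchip_split(&full));
	printf("split: in_kernel=%d ioapic=%d split=%d\n",
	       irqchip_in_kernel(&split), ioapic_in_kernel(&split), irqchip_split(&split));
	return 0;
}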
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 9efff9e5b58c..84b96d319909 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -91,8 +91,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
91 return r; 91 return r;
92} 92}
93 93
94static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, 94void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
95 struct kvm_lapic_irq *irq) 95 struct kvm_lapic_irq *irq)
96{ 96{
97 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); 97 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
98 98
@@ -108,6 +108,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
108 irq->level = 1; 108 irq->level = 1;
109 irq->shorthand = 0; 109 irq->shorthand = 0;
110} 110}
111EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
111 112
112int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, 113int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
113 struct kvm *kvm, int irq_source_id, int level, bool line_status) 114 struct kvm *kvm, int irq_source_id, int level, bool line_status)
@@ -123,12 +124,16 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
123} 124}
124 125
125 126
126static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, 127int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
127 struct kvm *kvm) 128 struct kvm *kvm, int irq_source_id, int level,
129 bool line_status)
128{ 130{
129 struct kvm_lapic_irq irq; 131 struct kvm_lapic_irq irq;
130 int r; 132 int r;
131 133
134 if (unlikely(e->type != KVM_IRQ_ROUTING_MSI))
135 return -EWOULDBLOCK;
136
132 kvm_set_msi_irq(e, &irq); 137 kvm_set_msi_irq(e, &irq);
133 138
134 if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) 139 if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
@@ -137,42 +142,6 @@ static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
137 return -EWOULDBLOCK; 142 return -EWOULDBLOCK;
138} 143}
139 144
140/*
141 * Deliver an IRQ in an atomic context if we can, or return a failure,
142 * user can retry in a process context.
143 * Return value:
144 * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
145 * Other values - No need to retry.
146 */
147int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
148{
149 struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
150 struct kvm_kernel_irq_routing_entry *e;
151 int ret = -EINVAL;
152 int idx;
153
154 trace_kvm_set_irq(irq, level, irq_source_id);
155
156 /*
157 * Injection into either PIC or IOAPIC might need to scan all CPUs,
158 * which would need to be retried from thread context; when same GSI
159 * is connected to both PIC and IOAPIC, we'd have to report a
160 * partial failure here.
161 * Since there's no easy way to do this, we only support injecting MSI
162 * which is limited to 1:1 GSI mapping.
163 */
164 idx = srcu_read_lock(&kvm->irq_srcu);
165 if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
166 e = &entries[0];
167 if (likely(e->type == KVM_IRQ_ROUTING_MSI))
168 ret = kvm_set_msi_inatomic(e, kvm);
169 else
170 ret = -EWOULDBLOCK;
171 }
172 srcu_read_unlock(&kvm->irq_srcu, idx);
173 return ret;
174}
175
176int kvm_request_irq_source_id(struct kvm *kvm) 145int kvm_request_irq_source_id(struct kvm *kvm)
177{ 146{
178 unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; 147 unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
@@ -208,7 +177,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
208 goto unlock; 177 goto unlock;
209 } 178 }
210 clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); 179 clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
211 if (!irqchip_in_kernel(kvm)) 180 if (!ioapic_in_kernel(kvm))
212 goto unlock; 181 goto unlock;
213 182
214 kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); 183 kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
@@ -297,6 +266,33 @@ out:
297 return r; 266 return r;
298} 267}
299 268
269bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
270 struct kvm_vcpu **dest_vcpu)
271{
272 int i, r = 0;
273 struct kvm_vcpu *vcpu;
274
275 if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
276 return true;
277
278 kvm_for_each_vcpu(i, vcpu, kvm) {
279 if (!kvm_apic_present(vcpu))
280 continue;
281
282 if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
283 irq->dest_id, irq->dest_mode))
284 continue;
285
286 if (++r == 2)
287 return false;
288
289 *dest_vcpu = vcpu;
290 }
291
292 return r == 1;
293}
294EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
295
300#define IOAPIC_ROUTING_ENTRY(irq) \ 296#define IOAPIC_ROUTING_ENTRY(irq) \
301 { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ 297 { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
302 .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } 298 .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
@@ -328,3 +324,54 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
328 return kvm_set_irq_routing(kvm, default_routing, 324 return kvm_set_irq_routing(kvm, default_routing,
329 ARRAY_SIZE(default_routing), 0); 325 ARRAY_SIZE(default_routing), 0);
330} 326}
327
328static const struct kvm_irq_routing_entry empty_routing[] = {};
329
330int kvm_setup_empty_irq_routing(struct kvm *kvm)
331{
332 return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
333}
334
335void kvm_arch_irq_routing_update(struct kvm *kvm)
336{
337 if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm))
338 return;
339 kvm_make_scan_ioapic_request(kvm);
340}
341
342void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
343{
344 struct kvm *kvm = vcpu->kvm;
345 struct kvm_kernel_irq_routing_entry *entry;
346 struct kvm_irq_routing_table *table;
347 u32 i, nr_ioapic_pins;
348 int idx;
349
350 /* kvm->irq_routing must be read after clearing
351 * KVM_SCAN_IOAPIC. */
352 smp_mb();
353 idx = srcu_read_lock(&kvm->irq_srcu);
354 table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
355 nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
356 kvm->arch.nr_reserved_ioapic_pins);
357 for (i = 0; i < nr_ioapic_pins; ++i) {
358 hlist_for_each_entry(entry, &table->map[i], link) {
359 u32 dest_id, dest_mode;
360 bool level;
361
362 if (entry->type != KVM_IRQ_ROUTING_MSI)
363 continue;
364 dest_id = (entry->msi.address_lo >> 12) & 0xff;
365 dest_mode = (entry->msi.address_lo >> 2) & 0x1;
366 level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL;
367 if (level && kvm_apic_match_dest(vcpu, NULL, 0,
368 dest_id, dest_mode)) {
369 u32 vector = entry->msi.data & 0xff;
370
371 __set_bit(vector,
372 (unsigned long *) eoi_exit_bitmap);
373 }
374 }
375 }
376 srcu_read_unlock(&kvm->irq_srcu, idx);
377}
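kvm_scan_ioapic_routes() above pulls the destination ID, destination mode, trigger mode and vector straight out of each cached MSI route. A standalone sketch of that decode follows, using the same bit positions as the hunk (destination ID in address bits 19:12, destination mode in bit 2, trigger mode in data bit 15, vector in the low data byte); the struct and macro names are assumptions for illustration only.

/*
 * Standalone sketch (not kernel code) of the MSI address/data decode
 * performed by kvm_scan_ioapic_routes() above.
 */
#include <stdint.h>
#include <stdio.h>

struct msi_route {
	uint32_t address_lo;
	uint32_t data;
};

#define MSI_TRIGGER_LEVEL (1u << 15)   /* plays the role of MSI_DATA_TRIGGER_LEVEL */

static void decode(const struct msi_route *e)
{
	uint32_t dest_id   = (e->address_lo >> 12) & 0xff;
	uint32_t dest_mode = (e->address_lo >> 2) & 0x1;
	uint32_t vector    = e->data & 0xff;
	int level          = !!(e->data & MSI_TRIGGER_LEVEL);

	printf("dest_id=%u dest_mode=%u vector=0x%02x level=%d\n",
	       dest_id, dest_mode, vector, level);
}

int main(void)
{
	/* destination 3, physical mode, level-triggered, vector 0x51 */
	struct msi_route e = {
		.address_lo = 0xfee00000u | (3 << 12),
		.data       = MSI_TRIGGER_LEVEL | 0x51,
	};
	decode(&e);
	return 0;
}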
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 8d9013c5e1ee..ecd4ea1d28a8 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -209,7 +209,7 @@ out:
209 if (old) 209 if (old)
210 kfree_rcu(old, rcu); 210 kfree_rcu(old, rcu);
211 211
212 kvm_vcpu_request_scan_ioapic(kvm); 212 kvm_make_scan_ioapic_request(kvm);
213} 213}
214 214
215static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) 215static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
@@ -348,6 +348,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
348 struct kvm_lapic *apic = vcpu->arch.apic; 348 struct kvm_lapic *apic = vcpu->arch.apic;
349 349
350 __kvm_apic_update_irr(pir, apic->regs); 350 __kvm_apic_update_irr(pir, apic->regs);
351
352 kvm_make_request(KVM_REQ_EVENT, vcpu);
351} 353}
352EXPORT_SYMBOL_GPL(kvm_apic_update_irr); 354EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
353 355
@@ -390,7 +392,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
390 392
391 vcpu = apic->vcpu; 393 vcpu = apic->vcpu;
392 394
393 if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) { 395 if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) {
394 /* try to update RVI */ 396 /* try to update RVI */
395 apic_clear_vector(vec, apic->regs + APIC_IRR); 397 apic_clear_vector(vec, apic->regs + APIC_IRR);
396 kvm_make_request(KVM_REQ_EVENT, vcpu); 398 kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -551,15 +553,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
551 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); 553 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
552} 554}
553 555
554void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
555{
556 struct kvm_lapic *apic = vcpu->arch.apic;
557 int i;
558
559 for (i = 0; i < 8; i++)
560 apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]);
561}
562
563static void apic_update_ppr(struct kvm_lapic *apic) 556static void apic_update_ppr(struct kvm_lapic *apic)
564{ 557{
565 u32 tpr, isrv, ppr, old_ppr; 558 u32 tpr, isrv, ppr, old_ppr;
@@ -764,6 +757,65 @@ out:
764 return ret; 757 return ret;
765} 758}
766 759
760bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
761 struct kvm_vcpu **dest_vcpu)
762{
763 struct kvm_apic_map *map;
764 bool ret = false;
765 struct kvm_lapic *dst = NULL;
766
767 if (irq->shorthand)
768 return false;
769
770 rcu_read_lock();
771 map = rcu_dereference(kvm->arch.apic_map);
772
773 if (!map)
774 goto out;
775
776 if (irq->dest_mode == APIC_DEST_PHYSICAL) {
777 if (irq->dest_id == 0xFF)
778 goto out;
779
780 if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
781 goto out;
782
783 dst = map->phys_map[irq->dest_id];
784 if (dst && kvm_apic_present(dst->vcpu))
785 *dest_vcpu = dst->vcpu;
786 else
787 goto out;
788 } else {
789 u16 cid;
790 unsigned long bitmap = 1;
791 int i, r = 0;
792
793 if (!kvm_apic_logical_map_valid(map))
794 goto out;
795
796 apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
797
798 if (cid >= ARRAY_SIZE(map->logical_map))
799 goto out;
800
801 for_each_set_bit(i, &bitmap, 16) {
802 dst = map->logical_map[cid][i];
803 if (++r == 2)
804 goto out;
805 }
806
807 if (dst && kvm_apic_present(dst->vcpu))
808 *dest_vcpu = dst->vcpu;
809 else
810 goto out;
811 }
812
813 ret = true;
814out:
815 rcu_read_unlock();
816 return ret;
817}
818
767/* 819/*
768 * Add a pending IRQ into lapic. 820 * Add a pending IRQ into lapic.
769 * Return 1 if successfully added and 0 if discarded. 821 * Return 1 if successfully added and 0 if discarded.
@@ -781,6 +833,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
781 case APIC_DM_LOWEST: 833 case APIC_DM_LOWEST:
782 vcpu->arch.apic_arb_prio++; 834 vcpu->arch.apic_arb_prio++;
783 case APIC_DM_FIXED: 835 case APIC_DM_FIXED:
836 if (unlikely(trig_mode && !level))
837 break;
838
784 /* FIXME add logic for vcpu on reset */ 839 /* FIXME add logic for vcpu on reset */
785 if (unlikely(!apic_enabled(apic))) 840 if (unlikely(!apic_enabled(apic)))
786 break; 841 break;
@@ -790,6 +845,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
790 if (dest_map) 845 if (dest_map)
791 __set_bit(vcpu->vcpu_id, dest_map); 846 __set_bit(vcpu->vcpu_id, dest_map);
792 847
848 if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
849 if (trig_mode)
850 apic_set_vector(vector, apic->regs + APIC_TMR);
851 else
852 apic_clear_vector(vector, apic->regs + APIC_TMR);
853 }
854
793 if (kvm_x86_ops->deliver_posted_interrupt) 855 if (kvm_x86_ops->deliver_posted_interrupt)
794 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); 856 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
795 else { 857 else {
@@ -868,16 +930,32 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
868 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; 930 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
869} 931}
870 932
933static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
934{
935 return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap);
936}
937
871static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) 938static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
872{ 939{
873 if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { 940 int trigger_mode;
874 int trigger_mode; 941
875 if (apic_test_vector(vector, apic->regs + APIC_TMR)) 942 /* Eoi the ioapic only if the ioapic doesn't own the vector. */
876 trigger_mode = IOAPIC_LEVEL_TRIG; 943 if (!kvm_ioapic_handles_vector(apic, vector))
877 else 944 return;
878 trigger_mode = IOAPIC_EDGE_TRIG; 945
879 kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); 946 /* Request a KVM exit to inform the userspace IOAPIC. */
947 if (irqchip_split(apic->vcpu->kvm)) {
948 apic->vcpu->arch.pending_ioapic_eoi = vector;
949 kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
950 return;
880 } 951 }
952
953 if (apic_test_vector(vector, apic->regs + APIC_TMR))
954 trigger_mode = IOAPIC_LEVEL_TRIG;
955 else
956 trigger_mode = IOAPIC_EDGE_TRIG;
957
958 kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
881} 959}
882 960
883static int apic_set_eoi(struct kvm_lapic *apic) 961static int apic_set_eoi(struct kvm_lapic *apic)
@@ -1615,7 +1693,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
1615 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); 1693 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
1616 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 1694 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
1617 } 1695 }
1618 apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); 1696 apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu);
1619 apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0; 1697 apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0;
1620 apic->highest_isr_cache = -1; 1698 apic->highest_isr_cache = -1;
1621 update_divide_count(apic); 1699 update_divide_count(apic);
@@ -1838,7 +1916,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
1838 kvm_x86_ops->hwapic_isr_update(vcpu->kvm, 1916 kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
1839 apic_find_highest_isr(apic)); 1917 apic_find_highest_isr(apic));
1840 kvm_make_request(KVM_REQ_EVENT, vcpu); 1918 kvm_make_request(KVM_REQ_EVENT, vcpu);
1841 kvm_rtc_eoi_tracking_restore_one(vcpu); 1919 if (ioapic_in_kernel(vcpu->kvm))
1920 kvm_rtc_eoi_tracking_restore_one(vcpu);
1921
1922 vcpu->arch.apic_arb_prio = 0;
1842} 1923}
1843 1924
1844void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1925void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1922,7 +2003,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
1922 /* Cache not set: could be safe but we don't bother. */ 2003 /* Cache not set: could be safe but we don't bother. */
1923 apic->highest_isr_cache == -1 || 2004 apic->highest_isr_cache == -1 ||
1924 /* Need EOI to update ioapic. */ 2005 /* Need EOI to update ioapic. */
1925 kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { 2006 kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
1926 /* 2007 /*
1927 * PV EOI was disabled by apic_sync_pv_eoi_from_guest 2008 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
1928 * so we need not do anything here. 2009 * so we need not do anything here.
@@ -1978,7 +2059,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1978 struct kvm_lapic *apic = vcpu->arch.apic; 2059 struct kvm_lapic *apic = vcpu->arch.apic;
1979 u32 reg = (msr - APIC_BASE_MSR) << 4; 2060 u32 reg = (msr - APIC_BASE_MSR) << 4;
1980 2061
1981 if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) 2062 if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
1982 return 1; 2063 return 1;
1983 2064
1984 if (reg == APIC_ICR2) 2065 if (reg == APIC_ICR2)
@@ -1995,7 +2076,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
1995 struct kvm_lapic *apic = vcpu->arch.apic; 2076 struct kvm_lapic *apic = vcpu->arch.apic;
1996 u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; 2077 u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
1997 2078
1998 if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) 2079 if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
1999 return 1; 2080 return 1;
2000 2081
2001 if (reg == APIC_DFR || reg == APIC_ICR2) { 2082 if (reg == APIC_DFR || reg == APIC_ICR2) {
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 764037991d26..fde8e35d5850 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -57,7 +57,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
57u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); 57u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
58void kvm_apic_set_version(struct kvm_vcpu *vcpu); 58void kvm_apic_set_version(struct kvm_vcpu *vcpu);
59 59
60void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
61void __kvm_apic_update_irr(u32 *pir, void *regs); 60void __kvm_apic_update_irr(u32 *pir, void *regs);
62void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); 61void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
63int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, 62int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
@@ -144,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
144 return apic->vcpu->arch.apic_base & X2APIC_ENABLE; 143 return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
145} 144}
146 145
147static inline bool kvm_apic_vid_enabled(struct kvm *kvm) 146static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu)
148{ 147{
149 return kvm_x86_ops->vm_has_apicv(kvm); 148 return kvm_x86_ops->cpu_uses_apicv(vcpu);
150} 149}
151 150
152static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) 151static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
@@ -169,4 +168,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
169 168
170void wait_lapic_expire(struct kvm_vcpu *vcpu); 169void wait_lapic_expire(struct kvm_vcpu *vcpu);
171 170
171bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
172 struct kvm_vcpu **dest_vcpu);
172#endif 173#endif
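kvm_intr_is_single_vcpu() above succeeds only when exactly one vCPU matches the interrupt's destination, bailing out as soon as a second match turns up. A minimal sketch of that "exactly one" pattern follows, with a callback standing in for kvm_apic_match_dest(); all names here are illustrative, not the kernel's.

/*
 * Minimal sketch (not kernel code) of the exactly-one-destination
 * check added as kvm_intr_is_single_vcpu() above.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static bool intr_is_single_dest(const int *ids, size_t n,
				bool (*match)(int id), int *dest)
{
	size_t i;
	int found = 0;

	for (i = 0; i < n; i++) {
		if (!match(ids[i]))
			continue;
		if (++found == 2)
			return false;   /* more than one target */
		*dest = ids[i];
	}
	return found == 1;
}

static bool match_id_three(int id) { return id == 3; }

int main(void)
{
	int vcpus[] = { 0, 1, 2, 3 };
	int dest = -1;

	if (intr_is_single_dest(vcpus, 4, match_id_three, &dest))
		printf("single destination: vcpu %d\n", dest);
	return 0;
}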
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff606f507913..7d85bcae3332 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -818,14 +818,11 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
818 kvm->arch.indirect_shadow_pages--; 818 kvm->arch.indirect_shadow_pages--;
819} 819}
820 820
821static int has_wrprotected_page(struct kvm_vcpu *vcpu, 821static int __has_wrprotected_page(gfn_t gfn, int level,
822 gfn_t gfn, 822 struct kvm_memory_slot *slot)
823 int level)
824{ 823{
825 struct kvm_memory_slot *slot;
826 struct kvm_lpage_info *linfo; 824 struct kvm_lpage_info *linfo;
827 825
828 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
829 if (slot) { 826 if (slot) {
830 linfo = lpage_info_slot(gfn, slot, level); 827 linfo = lpage_info_slot(gfn, slot, level);
831 return linfo->write_count; 828 return linfo->write_count;
@@ -834,6 +831,14 @@ static int has_wrprotected_page(struct kvm_vcpu *vcpu,
834 return 1; 831 return 1;
835} 832}
836 833
834static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
835{
836 struct kvm_memory_slot *slot;
837
838 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
839 return __has_wrprotected_page(gfn, level, slot);
840}
841
837static int host_mapping_level(struct kvm *kvm, gfn_t gfn) 842static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
838{ 843{
839 unsigned long page_size; 844 unsigned long page_size;
@@ -851,6 +856,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
851 return ret; 856 return ret;
852} 857}
853 858
859static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
860 bool no_dirty_log)
861{
862 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
863 return false;
864 if (no_dirty_log && slot->dirty_bitmap)
865 return false;
866
867 return true;
868}
869
854static struct kvm_memory_slot * 870static struct kvm_memory_slot *
855gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, 871gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
856 bool no_dirty_log) 872 bool no_dirty_log)
@@ -858,21 +874,25 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
858 struct kvm_memory_slot *slot; 874 struct kvm_memory_slot *slot;
859 875
860 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 876 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
861 if (!slot || slot->flags & KVM_MEMSLOT_INVALID || 877 if (!memslot_valid_for_gpte(slot, no_dirty_log))
862 (no_dirty_log && slot->dirty_bitmap))
863 slot = NULL; 878 slot = NULL;
864 879
865 return slot; 880 return slot;
866} 881}
867 882
868static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) 883static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
869{ 884 bool *force_pt_level)
870 return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
871}
872
873static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
874{ 885{
875 int host_level, level, max_level; 886 int host_level, level, max_level;
887 struct kvm_memory_slot *slot;
888
889 if (unlikely(*force_pt_level))
890 return PT_PAGE_TABLE_LEVEL;
891
892 slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
893 *force_pt_level = !memslot_valid_for_gpte(slot, true);
894 if (unlikely(*force_pt_level))
895 return PT_PAGE_TABLE_LEVEL;
876 896
877 host_level = host_mapping_level(vcpu->kvm, large_gfn); 897 host_level = host_mapping_level(vcpu->kvm, large_gfn);
878 898
@@ -882,7 +902,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
882 max_level = min(kvm_x86_ops->get_lpage_level(), host_level); 902 max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
883 903
884 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) 904 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
885 if (has_wrprotected_page(vcpu, large_gfn, level)) 905 if (__has_wrprotected_page(large_gfn, level, slot))
886 break; 906 break;
887 907
888 return level - 1; 908 return level - 1;
@@ -2962,14 +2982,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2962{ 2982{
2963 int r; 2983 int r;
2964 int level; 2984 int level;
2965 int force_pt_level; 2985 bool force_pt_level = false;
2966 pfn_t pfn; 2986 pfn_t pfn;
2967 unsigned long mmu_seq; 2987 unsigned long mmu_seq;
2968 bool map_writable, write = error_code & PFERR_WRITE_MASK; 2988 bool map_writable, write = error_code & PFERR_WRITE_MASK;
2969 2989
2970 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); 2990 level = mapping_level(vcpu, gfn, &force_pt_level);
2971 if (likely(!force_pt_level)) { 2991 if (likely(!force_pt_level)) {
2972 level = mapping_level(vcpu, gfn);
2973 /* 2992 /*
2974 * This path builds a PAE pagetable - so we can map 2993 * This path builds a PAE pagetable - so we can map
2975 * 2mb pages at maximum. Therefore check if the level 2994 * 2mb pages at maximum. Therefore check if the level
@@ -2979,8 +2998,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2979 level = PT_DIRECTORY_LEVEL; 2998 level = PT_DIRECTORY_LEVEL;
2980 2999
2981 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 3000 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2982 } else 3001 }
2983 level = PT_PAGE_TABLE_LEVEL;
2984 3002
2985 if (fast_page_fault(vcpu, v, level, error_code)) 3003 if (fast_page_fault(vcpu, v, level, error_code))
2986 return 0; 3004 return 0;
@@ -3427,7 +3445,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
3427 3445
3428static bool can_do_async_pf(struct kvm_vcpu *vcpu) 3446static bool can_do_async_pf(struct kvm_vcpu *vcpu)
3429{ 3447{
3430 if (unlikely(!irqchip_in_kernel(vcpu->kvm) || 3448 if (unlikely(!lapic_in_kernel(vcpu) ||
3431 kvm_event_needs_reinjection(vcpu))) 3449 kvm_event_needs_reinjection(vcpu)))
3432 return false; 3450 return false;
3433 3451
@@ -3476,7 +3494,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3476 pfn_t pfn; 3494 pfn_t pfn;
3477 int r; 3495 int r;
3478 int level; 3496 int level;
3479 int force_pt_level; 3497 bool force_pt_level;
3480 gfn_t gfn = gpa >> PAGE_SHIFT; 3498 gfn_t gfn = gpa >> PAGE_SHIFT;
3481 unsigned long mmu_seq; 3499 unsigned long mmu_seq;
3482 int write = error_code & PFERR_WRITE_MASK; 3500 int write = error_code & PFERR_WRITE_MASK;
@@ -3495,20 +3513,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3495 if (r) 3513 if (r)
3496 return r; 3514 return r;
3497 3515
3498 if (mapping_level_dirty_bitmap(vcpu, gfn) || 3516 force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
3499 !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) 3517 PT_DIRECTORY_LEVEL);
3500 force_pt_level = 1; 3518 level = mapping_level(vcpu, gfn, &force_pt_level);
3501 else
3502 force_pt_level = 0;
3503
3504 if (likely(!force_pt_level)) { 3519 if (likely(!force_pt_level)) {
3505 level = mapping_level(vcpu, gfn);
3506 if (level > PT_DIRECTORY_LEVEL && 3520 if (level > PT_DIRECTORY_LEVEL &&
3507 !check_hugepage_cache_consistency(vcpu, gfn, level)) 3521 !check_hugepage_cache_consistency(vcpu, gfn, level))
3508 level = PT_DIRECTORY_LEVEL; 3522 level = PT_DIRECTORY_LEVEL;
3509 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 3523 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
3510 } else 3524 }
3511 level = PT_PAGE_TABLE_LEVEL;
3512 3525
3513 if (fast_page_fault(vcpu, gpa, level, error_code)) 3526 if (fast_page_fault(vcpu, gpa, level, error_code))
3514 return 0; 3527 return 0;
@@ -3706,7 +3719,7 @@ static void
3706__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, 3719__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
3707 int maxphyaddr, bool execonly) 3720 int maxphyaddr, bool execonly)
3708{ 3721{
3709 int pte; 3722 u64 bad_mt_xwr;
3710 3723
3711 rsvd_check->rsvd_bits_mask[0][3] = 3724 rsvd_check->rsvd_bits_mask[0][3] =
3712 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); 3725 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
@@ -3724,14 +3737,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
3724 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); 3737 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
3725 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; 3738 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
3726 3739
3727 for (pte = 0; pte < 64; pte++) { 3740 bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
3728 int rwx_bits = pte & 7; 3741 bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
3729 int mt = pte >> 3; 3742 bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
3730 if (mt == 0x2 || mt == 0x3 || mt == 0x7 || 3743 bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
3731 rwx_bits == 0x2 || rwx_bits == 0x6 || 3744 bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
3732 (rwx_bits == 0x4 && !execonly)) 3745 if (!execonly) {
3733 rsvd_check->bad_mt_xwr |= (1ull << pte); 3746 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
3747 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
3734 } 3748 }
3749 rsvd_check->bad_mt_xwr = bad_mt_xwr;
3735} 3750}
3736 3751
3737static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, 3752static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
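The __reset_rsvds_bits_mask_ept() hunk above replaces a 64-iteration loop with constant masks: bit N of bad_mt_xwr marks as reserved the EPT PTE whose low six bits equal N (memory type in bits 5:3, XWR in bits 2:0). The standalone check below rebuilds both forms for the non-execonly case and confirms they agree; REPEAT_BYTE is redefined locally so the program is self-contained, and this is only an illustrative verification, not the kernel's code.

/*
 * Standalone check (not kernel code) of the bad_mt_xwr construction in
 * the hunk above.  REPEAT_BYTE() expands a byte into all eight byte
 * lanes, i.e. one lane per memory-type value.
 */
#include <stdint.h>
#include <stdio.h>

#define REPEAT_BYTE(x) ((~0ull / 0xff) * (x))

int main(void)
{
	uint64_t bad_mt_xwr, loop_mask = 0;
	int pte;

	/* closed-form version from the patch (execonly == false) */
	bad_mt_xwr  = 0xFFull << (2 * 8);     /* memory type 2 is reserved */
	bad_mt_xwr |= 0xFFull << (3 * 8);     /* memory type 3 is reserved */
	bad_mt_xwr |= 0xFFull << (7 * 8);     /* memory type 7 is reserved */
	bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* XWR == 010 (write-only) */
	bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* XWR == 110 (write+exec, no read) */
	bad_mt_xwr |= REPEAT_BYTE(1ull << 4); /* XWR == 100, reserved without execonly */

	/* the loop the patch removed, for comparison */
	for (pte = 0; pte < 64; pte++) {
		int rwx = pte & 7, mt = pte >> 3;
		if (mt == 2 || mt == 3 || mt == 7 ||
		    rwx == 2 || rwx == 6 || rwx == 4)
			loop_mask |= 1ull << pte;
	}

	printf("match: %s\n", bad_mt_xwr == loop_mask ? "yes" : "no");
	return 0;
}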
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 736e6ab8784d..b41faa91a6f9 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -698,7 +698,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
698 int r; 698 int r;
699 pfn_t pfn; 699 pfn_t pfn;
700 int level = PT_PAGE_TABLE_LEVEL; 700 int level = PT_PAGE_TABLE_LEVEL;
701 int force_pt_level; 701 bool force_pt_level = false;
702 unsigned long mmu_seq; 702 unsigned long mmu_seq;
703 bool map_writable, is_self_change_mapping; 703 bool map_writable, is_self_change_mapping;
704 704
@@ -743,15 +743,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
743 is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, 743 is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
744 &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); 744 &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
745 745
746 if (walker.level >= PT_DIRECTORY_LEVEL) 746 if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) {
747 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) 747 level = mapping_level(vcpu, walker.gfn, &force_pt_level);
748 || is_self_change_mapping; 748 if (likely(!force_pt_level)) {
749 else 749 level = min(walker.level, level);
750 force_pt_level = 1; 750 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
751 if (!force_pt_level) { 751 }
752 level = min(walker.level, mapping_level(vcpu, walker.gfn)); 752 } else
753 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); 753 force_pt_level = true;
754 }
755 754
756 mmu_seq = vcpu->kvm->mmu_notifier_seq; 755 mmu_seq = vcpu->kvm->mmu_notifier_seq;
757 smp_rmb(); 756 smp_rmb();
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2f9ed1ff0632..f2c8e4917688 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -159,6 +159,9 @@ struct vcpu_svm {
159 u32 apf_reason; 159 u32 apf_reason;
160 160
161 u64 tsc_ratio; 161 u64 tsc_ratio;
162
163 /* cached guest cpuid flags for faster access */
164 bool nrips_enabled : 1;
162}; 165};
163 166
164static DEFINE_PER_CPU(u64, current_tsc_ratio); 167static DEFINE_PER_CPU(u64, current_tsc_ratio);
@@ -1086,7 +1089,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1086 return target_tsc - tsc; 1089 return target_tsc - tsc;
1087} 1090}
1088 1091
1089static void init_vmcb(struct vcpu_svm *svm, bool init_event) 1092static void init_vmcb(struct vcpu_svm *svm)
1090{ 1093{
1091 struct vmcb_control_area *control = &svm->vmcb->control; 1094 struct vmcb_control_area *control = &svm->vmcb->control;
1092 struct vmcb_save_area *save = &svm->vmcb->save; 1095 struct vmcb_save_area *save = &svm->vmcb->save;
@@ -1157,8 +1160,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event)
1157 init_sys_seg(&save->ldtr, SEG_TYPE_LDT); 1160 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1158 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 1161 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1159 1162
1160 if (!init_event) 1163 svm_set_efer(&svm->vcpu, 0);
1161 svm_set_efer(&svm->vcpu, 0);
1162 save->dr6 = 0xffff0ff0; 1164 save->dr6 = 0xffff0ff0;
1163 kvm_set_rflags(&svm->vcpu, 2); 1165 kvm_set_rflags(&svm->vcpu, 2);
1164 save->rip = 0x0000fff0; 1166 save->rip = 0x0000fff0;
@@ -1212,7 +1214,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1212 if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) 1214 if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
1213 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 1215 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1214 } 1216 }
1215 init_vmcb(svm, init_event); 1217 init_vmcb(svm);
1216 1218
1217 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); 1219 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1218 kvm_register_write(vcpu, VCPU_REGS_RDX, eax); 1220 kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
@@ -1268,7 +1270,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1268 clear_page(svm->vmcb); 1270 clear_page(svm->vmcb);
1269 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 1271 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1270 svm->asid_generation = 0; 1272 svm->asid_generation = 0;
1271 init_vmcb(svm, false); 1273 init_vmcb(svm);
1272 1274
1273 svm_init_osvw(&svm->vcpu); 1275 svm_init_osvw(&svm->vcpu);
1274 1276
@@ -1890,7 +1892,7 @@ static int shutdown_interception(struct vcpu_svm *svm)
1890 * so reinitialize it. 1892 * so reinitialize it.
1891 */ 1893 */
1892 clear_page(svm->vmcb); 1894 clear_page(svm->vmcb);
1893 init_vmcb(svm, false); 1895 init_vmcb(svm);
1894 1896
1895 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 1897 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1896 return 0; 1898 return 0;
@@ -2365,7 +2367,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
2365 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 2367 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
2366 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 2368 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
2367 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 2369 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2368 nested_vmcb->control.next_rip = vmcb->control.next_rip; 2370
2371 if (svm->nrips_enabled)
2372 nested_vmcb->control.next_rip = vmcb->control.next_rip;
2369 2373
2370 /* 2374 /*
2371 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have 2375 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -3060,7 +3064,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
3060 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 3064 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
3061 /* instruction emulation calls kvm_set_cr8() */ 3065 /* instruction emulation calls kvm_set_cr8() */
3062 r = cr_interception(svm); 3066 r = cr_interception(svm);
3063 if (irqchip_in_kernel(svm->vcpu.kvm)) 3067 if (lapic_in_kernel(&svm->vcpu))
3064 return r; 3068 return r;
3065 if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) 3069 if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
3066 return r; 3070 return r;
@@ -3294,24 +3298,11 @@ static int msr_interception(struct vcpu_svm *svm)
3294 3298
3295static int interrupt_window_interception(struct vcpu_svm *svm) 3299static int interrupt_window_interception(struct vcpu_svm *svm)
3296{ 3300{
3297 struct kvm_run *kvm_run = svm->vcpu.run;
3298
3299 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3301 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3300 svm_clear_vintr(svm); 3302 svm_clear_vintr(svm);
3301 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 3303 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3302 mark_dirty(svm->vmcb, VMCB_INTR); 3304 mark_dirty(svm->vmcb, VMCB_INTR);
3303 ++svm->vcpu.stat.irq_window_exits; 3305 ++svm->vcpu.stat.irq_window_exits;
3304 /*
3305 * If the user space waits to inject interrupts, exit as soon as
3306 * possible
3307 */
3308 if (!irqchip_in_kernel(svm->vcpu.kvm) &&
3309 kvm_run->request_interrupt_window &&
3310 !kvm_cpu_has_interrupt(&svm->vcpu)) {
3311 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3312 return 0;
3313 }
3314
3315 return 1; 3306 return 1;
3316} 3307}
3317 3308
@@ -3659,12 +3650,12 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
3659 return; 3650 return;
3660} 3651}
3661 3652
3662static int svm_vm_has_apicv(struct kvm *kvm) 3653static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu)
3663{ 3654{
3664 return 0; 3655 return 0;
3665} 3656}
3666 3657
3667static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 3658static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu)
3668{ 3659{
3669 return; 3660 return;
3670} 3661}
@@ -4098,6 +4089,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4098 4089
4099static void svm_cpuid_update(struct kvm_vcpu *vcpu) 4090static void svm_cpuid_update(struct kvm_vcpu *vcpu)
4100{ 4091{
4092 struct vcpu_svm *svm = to_svm(vcpu);
4093
4094 /* Update nrips enabled cache */
4095 svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu);
4101} 4096}
4102 4097
4103static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 4098static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -4425,7 +4420,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4425 .enable_irq_window = enable_irq_window, 4420 .enable_irq_window = enable_irq_window,
4426 .update_cr8_intercept = update_cr8_intercept, 4421 .update_cr8_intercept = update_cr8_intercept,
4427 .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, 4422 .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
4428 .vm_has_apicv = svm_vm_has_apicv, 4423 .cpu_uses_apicv = svm_cpu_uses_apicv,
4429 .load_eoi_exitmap = svm_load_eoi_exitmap, 4424 .load_eoi_exitmap = svm_load_eoi_exitmap,
4430 .sync_pir_to_irr = svm_sync_pir_to_irr, 4425 .sync_pir_to_irr = svm_sync_pir_to_irr,
4431 4426
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 4eae7c35ddf5..120302511802 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -129,6 +129,24 @@ TRACE_EVENT(kvm_pio,
129); 129);
130 130
131/* 131/*
132 * Tracepoint for fast mmio.
133 */
134TRACE_EVENT(kvm_fast_mmio,
135 TP_PROTO(u64 gpa),
136 TP_ARGS(gpa),
137
138 TP_STRUCT__entry(
139 __field(u64, gpa)
140 ),
141
142 TP_fast_assign(
143 __entry->gpa = gpa;
144 ),
145
146 TP_printk("fast mmio at gpa 0x%llx", __entry->gpa)
147);
148
149/*
132 * Tracepoint for cpuid. 150 * Tracepoint for cpuid.
133 */ 151 */
134TRACE_EVENT(kvm_cpuid, 152TRACE_EVENT(kvm_cpuid,
@@ -974,6 +992,39 @@ TRACE_EVENT(kvm_enter_smm,
974 __entry->smbase) 992 __entry->smbase)
975); 993);
976 994
995/*
996 * Tracepoint for VT-d posted-interrupts.
997 */
998TRACE_EVENT(kvm_pi_irte_update,
999 TP_PROTO(unsigned int vcpu_id, unsigned int gsi,
1000 unsigned int gvec, u64 pi_desc_addr, bool set),
1001 TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set),
1002
1003 TP_STRUCT__entry(
1004 __field( unsigned int, vcpu_id )
1005 __field( unsigned int, gsi )
1006 __field( unsigned int, gvec )
1007 __field( u64, pi_desc_addr )
1008 __field( bool, set )
1009 ),
1010
1011 TP_fast_assign(
1012 __entry->vcpu_id = vcpu_id;
1013 __entry->gsi = gsi;
1014 __entry->gvec = gvec;
1015 __entry->pi_desc_addr = pi_desc_addr;
1016 __entry->set = set;
1017 ),
1018
1019 TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, "
1020 "gvec: 0x%x, pi_desc_addr: 0x%llx",
1021 __entry->set ? "enabled and being updated" : "disabled",
1022 __entry->vcpu_id,
1023 __entry->gsi,
1024 __entry->gvec,
1025 __entry->pi_desc_addr)
1026);
1027
977#endif /* _TRACE_KVM_H */ 1028#endif /* _TRACE_KVM_H */
978 1029
979#undef TRACE_INCLUDE_PATH 1030#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a8bc64566ab..5eb56ed77c1f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -35,6 +35,7 @@
35#include "kvm_cache_regs.h" 35#include "kvm_cache_regs.h"
36#include "x86.h" 36#include "x86.h"
37 37
38#include <asm/cpu.h>
38#include <asm/io.h> 39#include <asm/io.h>
39#include <asm/desc.h> 40#include <asm/desc.h>
40#include <asm/vmx.h> 41#include <asm/vmx.h>
@@ -45,6 +46,7 @@
45#include <asm/debugreg.h> 46#include <asm/debugreg.h>
46#include <asm/kexec.h> 47#include <asm/kexec.h>
47#include <asm/apic.h> 48#include <asm/apic.h>
49#include <asm/irq_remapping.h>
48 50
49#include "trace.h" 51#include "trace.h"
50#include "pmu.h" 52#include "pmu.h"
@@ -424,6 +426,9 @@ struct nested_vmx {
424 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ 426 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
425 u64 vmcs01_debugctl; 427 u64 vmcs01_debugctl;
426 428
429 u16 vpid02;
430 u16 last_vpid;
431
427 u32 nested_vmx_procbased_ctls_low; 432 u32 nested_vmx_procbased_ctls_low;
428 u32 nested_vmx_procbased_ctls_high; 433 u32 nested_vmx_procbased_ctls_high;
429 u32 nested_vmx_true_procbased_ctls_low; 434 u32 nested_vmx_true_procbased_ctls_low;
@@ -440,14 +445,33 @@ struct nested_vmx {
440 u32 nested_vmx_misc_low; 445 u32 nested_vmx_misc_low;
441 u32 nested_vmx_misc_high; 446 u32 nested_vmx_misc_high;
442 u32 nested_vmx_ept_caps; 447 u32 nested_vmx_ept_caps;
448 u32 nested_vmx_vpid_caps;
443}; 449};
444 450
445#define POSTED_INTR_ON 0 451#define POSTED_INTR_ON 0
452#define POSTED_INTR_SN 1
453
446/* Posted-Interrupt Descriptor */ 454/* Posted-Interrupt Descriptor */
447struct pi_desc { 455struct pi_desc {
448 u32 pir[8]; /* Posted interrupt requested */ 456 u32 pir[8]; /* Posted interrupt requested */
449 u32 control; /* bit 0 of control is outstanding notification bit */ 457 union {
450 u32 rsvd[7]; 458 struct {
459 /* bit 256 - Outstanding Notification */
460 u16 on : 1,
461 /* bit 257 - Suppress Notification */
462 sn : 1,
463 /* bit 271:258 - Reserved */
464 rsvd_1 : 14;
465 /* bit 279:272 - Notification Vector */
466 u8 nv;
467 /* bit 287:280 - Reserved */
468 u8 rsvd_2;
469 /* bit 319:288 - Notification Destination */
470 u32 ndst;
471 };
472 u64 control;
473 };
474 u32 rsvd[6];
451} __aligned(64); 475} __aligned(64);
452 476
453static bool pi_test_and_set_on(struct pi_desc *pi_desc) 477static bool pi_test_and_set_on(struct pi_desc *pi_desc)
@@ -467,6 +491,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
467 return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); 491 return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
468} 492}
469 493
494static inline void pi_clear_sn(struct pi_desc *pi_desc)
495{
496 return clear_bit(POSTED_INTR_SN,
497 (unsigned long *)&pi_desc->control);
498}
499
500static inline void pi_set_sn(struct pi_desc *pi_desc)
501{
502 return set_bit(POSTED_INTR_SN,
503 (unsigned long *)&pi_desc->control);
504}
505
506static inline int pi_test_on(struct pi_desc *pi_desc)
507{
508 return test_bit(POSTED_INTR_ON,
509 (unsigned long *)&pi_desc->control);
510}
511
512static inline int pi_test_sn(struct pi_desc *pi_desc)
513{
514 return test_bit(POSTED_INTR_SN,
515 (unsigned long *)&pi_desc->control);
516}
517
470struct vcpu_vmx { 518struct vcpu_vmx {
471 struct kvm_vcpu vcpu; 519 struct kvm_vcpu vcpu;
472 unsigned long host_rsp; 520 unsigned long host_rsp;
@@ -532,8 +580,6 @@ struct vcpu_vmx {
532 s64 vnmi_blocked_time; 580 s64 vnmi_blocked_time;
533 u32 exit_reason; 581 u32 exit_reason;
534 582
535 bool rdtscp_enabled;
536
537 /* Posted interrupt descriptor */ 583 /* Posted interrupt descriptor */
538 struct pi_desc pi_desc; 584 struct pi_desc pi_desc;
539 585
@@ -563,6 +609,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
563 return container_of(vcpu, struct vcpu_vmx, vcpu); 609 return container_of(vcpu, struct vcpu_vmx, vcpu);
564} 610}
565 611
612static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
613{
614 return &(to_vmx(vcpu)->pi_desc);
615}
616
566#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) 617#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
567#define FIELD(number, name) [number] = VMCS12_OFFSET(name) 618#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
568#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ 619#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
@@ -809,7 +860,7 @@ static void kvm_cpu_vmxon(u64 addr);
809static void kvm_cpu_vmxoff(void); 860static void kvm_cpu_vmxoff(void);
810static bool vmx_mpx_supported(void); 861static bool vmx_mpx_supported(void);
811static bool vmx_xsaves_supported(void); 862static bool vmx_xsaves_supported(void);
812static int vmx_vm_has_apicv(struct kvm *kvm); 863static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu);
813static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 864static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
814static void vmx_set_segment(struct kvm_vcpu *vcpu, 865static void vmx_set_segment(struct kvm_vcpu *vcpu,
815 struct kvm_segment *var, int seg); 866 struct kvm_segment *var, int seg);
@@ -831,6 +882,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
831static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 882static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
832static DEFINE_PER_CPU(struct desc_ptr, host_gdt); 883static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
833 884
885/*
886 * We maintain a per-CPU linked-list of vCPUs, so in wakeup_handler() we
887 * can find which vCPU should be woken up.
888 */
889static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
890static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
891
834static unsigned long *vmx_io_bitmap_a; 892static unsigned long *vmx_io_bitmap_a;
835static unsigned long *vmx_io_bitmap_b; 893static unsigned long *vmx_io_bitmap_b;
836static unsigned long *vmx_msr_bitmap_legacy; 894static unsigned long *vmx_msr_bitmap_legacy;
@@ -946,9 +1004,9 @@ static inline bool cpu_has_vmx_tpr_shadow(void)
946 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; 1004 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
947} 1005}
948 1006
949static inline bool vm_need_tpr_shadow(struct kvm *kvm) 1007static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
950{ 1008{
951 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); 1009 return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
952} 1010}
953 1011
954static inline bool cpu_has_secondary_exec_ctrls(void) 1012static inline bool cpu_has_secondary_exec_ctrls(void)
@@ -983,7 +1041,8 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
983 1041
984static inline bool cpu_has_vmx_posted_intr(void) 1042static inline bool cpu_has_vmx_posted_intr(void)
985{ 1043{
986 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; 1044 return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
1045 vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
987} 1046}
988 1047
989static inline bool cpu_has_vmx_apicv(void) 1048static inline bool cpu_has_vmx_apicv(void)
@@ -1062,9 +1121,9 @@ static inline bool cpu_has_vmx_ple(void)
1062 SECONDARY_EXEC_PAUSE_LOOP_EXITING; 1121 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1063} 1122}
1064 1123
1065static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) 1124static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
1066{ 1125{
1067 return flexpriority_enabled && irqchip_in_kernel(kvm); 1126 return flexpriority_enabled && lapic_in_kernel(vcpu);
1068} 1127}
1069 1128
1070static inline bool cpu_has_vmx_vpid(void) 1129static inline bool cpu_has_vmx_vpid(void)
@@ -1157,6 +1216,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
1157 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 1216 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
1158} 1217}
1159 1218
1219static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
1220{
1221 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
1222}
1223
1160static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) 1224static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
1161{ 1225{
1162 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); 1226 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
@@ -1337,13 +1401,13 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1337 __loaded_vmcs_clear, loaded_vmcs, 1); 1401 __loaded_vmcs_clear, loaded_vmcs, 1);
1338} 1402}
1339 1403
1340static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) 1404static inline void vpid_sync_vcpu_single(int vpid)
1341{ 1405{
1342 if (vmx->vpid == 0) 1406 if (vpid == 0)
1343 return; 1407 return;
1344 1408
1345 if (cpu_has_vmx_invvpid_single()) 1409 if (cpu_has_vmx_invvpid_single())
1346 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); 1410 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
1347} 1411}
1348 1412
1349static inline void vpid_sync_vcpu_global(void) 1413static inline void vpid_sync_vcpu_global(void)
@@ -1352,10 +1416,10 @@ static inline void vpid_sync_vcpu_global(void)
1352 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); 1416 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
1353} 1417}
1354 1418
1355static inline void vpid_sync_context(struct vcpu_vmx *vmx) 1419static inline void vpid_sync_context(int vpid)
1356{ 1420{
1357 if (cpu_has_vmx_invvpid_single()) 1421 if (cpu_has_vmx_invvpid_single())
1358 vpid_sync_vcpu_single(vmx); 1422 vpid_sync_vcpu_single(vpid);
1359 else 1423 else
1360 vpid_sync_vcpu_global(); 1424 vpid_sync_vcpu_global();
1361} 1425}
@@ -1895,6 +1959,52 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
1895 preempt_enable(); 1959 preempt_enable();
1896} 1960}
1897 1961
1962static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
1963{
1964 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
1965 struct pi_desc old, new;
1966 unsigned int dest;
1967
1968 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
1969 !irq_remapping_cap(IRQ_POSTING_CAP))
1970 return;
1971
1972 do {
1973 old.control = new.control = pi_desc->control;
1974
1975 /*
1976 * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
1977 * are two possible cases:
1978 * 1. After running 'pre_block', context switch
1979 * happened. For this case, 'sn' was set in
1980 * vmx_vcpu_put(), so we need to clear it here.
1981 * 2. After running 'pre_block', we were blocked,
1982 * and woken up by someone else. For this case,
1983 * we don't need to do anything; 'pi_post_block'
1984 * will do everything for us. However, we cannot
1985 * check whether it is case #1 or case #2 here
1986 * (and we may not need to), so we also clear 'sn' here,
1987 * which is harmless.
1988 */
1989 if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
1990 if (vcpu->cpu != cpu) {
1991 dest = cpu_physical_id(cpu);
1992
1993 if (x2apic_enabled())
1994 new.ndst = dest;
1995 else
1996 new.ndst = (dest << 8) & 0xFF00;
1997 }
1998
1999 /* set 'NV' to 'notification vector' */
2000 new.nv = POSTED_INTR_VECTOR;
2001 }
2002
2003 /* Allow posting non-urgent interrupts */
2004 new.sn = 0;
2005 } while (cmpxchg(&pi_desc->control, old.control,
2006 new.control) != old.control);
2007}
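
vmx_vcpu_pi_load() above rewrites the posted-interrupt descriptor with a cmpxchg() loop over its 64-bit 'control' word. For orientation, this is roughly the descriptor layout the series works with, a sketch following the VT-d posted-interrupt format rather than a verbatim copy of the definition in vmx.c:

struct pi_desc {
	u32 pir[8];		/* Posted Interrupt Requested: one bit per vector */
	union {
		struct {
			u16	on : 1,		/* Outstanding Notification */
				sn : 1,		/* Suppress Notification */
				rsvd_1 : 14;
			u8	nv;		/* Notification Vector */
			u8	rsvd_2;
			u32	ndst;		/* Notification Destination (APIC ID) */
		};
		u64 control;			/* on/sn/nv/ndst updated atomically via cmpxchg() */
	};
	u32 rsvd[6];
} __aligned(64);
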
1898/* 2008/*
1899 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 2009 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1900 * vcpu mutex is already taken. 2010 * vcpu mutex is already taken.
@@ -1945,10 +2055,27 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1945 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 2055 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1946 vmx->loaded_vmcs->cpu = cpu; 2056 vmx->loaded_vmcs->cpu = cpu;
1947 } 2057 }
2058
2059 vmx_vcpu_pi_load(vcpu, cpu);
2060}
2061
2062static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
2063{
2064 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2065
2066 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
2067 !irq_remapping_cap(IRQ_POSTING_CAP))
2068 return;
2069
2070 /* Set SN when the vCPU is preempted */
2071 if (vcpu->preempted)
2072 pi_set_sn(pi_desc);
1948} 2073}
1949 2074
1950static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 2075static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1951{ 2076{
2077 vmx_vcpu_pi_put(vcpu);
2078
1952 __vmx_load_host_state(to_vmx(vcpu)); 2079 __vmx_load_host_state(to_vmx(vcpu));
1953 if (!vmm_exclusive) { 2080 if (!vmm_exclusive) {
1954 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); 2081 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
@@ -2207,7 +2334,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
2207 if (index >= 0) 2334 if (index >= 0)
2208 move_msr_up(vmx, index, save_nmsrs++); 2335 move_msr_up(vmx, index, save_nmsrs++);
2209 index = __find_msr_index(vmx, MSR_TSC_AUX); 2336 index = __find_msr_index(vmx, MSR_TSC_AUX);
2210 if (index >= 0 && vmx->rdtscp_enabled) 2337 if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu))
2211 move_msr_up(vmx, index, save_nmsrs++); 2338 move_msr_up(vmx, index, save_nmsrs++);
2212 /* 2339 /*
2213 * MSR_STAR is only needed on long mode guests, and only 2340 * MSR_STAR is only needed on long mode guests, and only
@@ -2377,7 +2504,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2377 vmx->nested.nested_vmx_pinbased_ctls_high |= 2504 vmx->nested.nested_vmx_pinbased_ctls_high |=
2378 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2505 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2379 PIN_BASED_VMX_PREEMPTION_TIMER; 2506 PIN_BASED_VMX_PREEMPTION_TIMER;
2380 if (vmx_vm_has_apicv(vmx->vcpu.kvm)) 2507 if (vmx_cpu_uses_apicv(&vmx->vcpu))
2381 vmx->nested.nested_vmx_pinbased_ctls_high |= 2508 vmx->nested.nested_vmx_pinbased_ctls_high |=
2382 PIN_BASED_POSTED_INTR; 2509 PIN_BASED_POSTED_INTR;
2383 2510
@@ -2471,10 +2598,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2471 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2598 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2472 SECONDARY_EXEC_RDTSCP | 2599 SECONDARY_EXEC_RDTSCP |
2473 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2600 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2601 SECONDARY_EXEC_ENABLE_VPID |
2474 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2602 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2475 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2603 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2476 SECONDARY_EXEC_WBINVD_EXITING | 2604 SECONDARY_EXEC_WBINVD_EXITING |
2477 SECONDARY_EXEC_XSAVES; 2605 SECONDARY_EXEC_XSAVES |
2606 SECONDARY_EXEC_PCOMMIT;
2478 2607
2479 if (enable_ept) { 2608 if (enable_ept) {
2480 /* nested EPT: emulate EPT also to L1 */ 2609 /* nested EPT: emulate EPT also to L1 */
@@ -2493,6 +2622,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2493 } else 2622 } else
2494 vmx->nested.nested_vmx_ept_caps = 0; 2623 vmx->nested.nested_vmx_ept_caps = 0;
2495 2624
2625 if (enable_vpid)
2626 vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
2627 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
2628 else
2629 vmx->nested.nested_vmx_vpid_caps = 0;
2630
2496 if (enable_unrestricted_guest) 2631 if (enable_unrestricted_guest)
2497 vmx->nested.nested_vmx_secondary_ctls_high |= 2632 vmx->nested.nested_vmx_secondary_ctls_high |=
2498 SECONDARY_EXEC_UNRESTRICTED_GUEST; 2633 SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -2608,7 +2743,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2608 break; 2743 break;
2609 case MSR_IA32_VMX_EPT_VPID_CAP: 2744 case MSR_IA32_VMX_EPT_VPID_CAP:
2610 /* Currently, no nested vpid support */ 2745 /* Currently, no nested vpid support */
2611 *pdata = vmx->nested.nested_vmx_ept_caps; 2746 *pdata = vmx->nested.nested_vmx_ept_caps |
2747 ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
2612 break; 2748 break;
2613 default: 2749 default:
2614 return 1; 2750 return 1;
@@ -2673,7 +2809,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2673 msr_info->data = vcpu->arch.ia32_xss; 2809 msr_info->data = vcpu->arch.ia32_xss;
2674 break; 2810 break;
2675 case MSR_TSC_AUX: 2811 case MSR_TSC_AUX:
2676 if (!to_vmx(vcpu)->rdtscp_enabled) 2812 if (!guest_cpuid_has_rdtscp(vcpu))
2677 return 1; 2813 return 1;
2678 /* Otherwise falls through */ 2814 /* Otherwise falls through */
2679 default: 2815 default:
@@ -2779,7 +2915,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2779 clear_atomic_switch_msr(vmx, MSR_IA32_XSS); 2915 clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
2780 break; 2916 break;
2781 case MSR_TSC_AUX: 2917 case MSR_TSC_AUX:
2782 if (!vmx->rdtscp_enabled) 2918 if (!guest_cpuid_has_rdtscp(vcpu))
2783 return 1; 2919 return 1;
2784 /* Check reserved bit, higher 32 bits should be zero */ 2920 /* Check reserved bit, higher 32 bits should be zero */
2785 if ((data >> 32) != 0) 2921 if ((data >> 32) != 0)
@@ -2874,6 +3010,8 @@ static int hardware_enable(void)
2874 return -EBUSY; 3010 return -EBUSY;
2875 3011
2876 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 3012 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
3013 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
3014 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
2877 3015
2878 /* 3016 /*
2879 * Now we can enable the vmclear operation in kdump 3017 * Now we can enable the vmclear operation in kdump
@@ -3015,7 +3153,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
3015 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 3153 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3016 SECONDARY_EXEC_SHADOW_VMCS | 3154 SECONDARY_EXEC_SHADOW_VMCS |
3017 SECONDARY_EXEC_XSAVES | 3155 SECONDARY_EXEC_XSAVES |
3018 SECONDARY_EXEC_ENABLE_PML; 3156 SECONDARY_EXEC_ENABLE_PML |
3157 SECONDARY_EXEC_PCOMMIT;
3019 if (adjust_vmx_controls(min2, opt2, 3158 if (adjust_vmx_controls(min2, opt2,
3020 MSR_IA32_VMX_PROCBASED_CTLS2, 3159 MSR_IA32_VMX_PROCBASED_CTLS2,
3021 &_cpu_based_2nd_exec_control) < 0) 3160 &_cpu_based_2nd_exec_control) < 0)
@@ -3441,9 +3580,9 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
3441 3580
3442#endif 3581#endif
3443 3582
3444static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 3583static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
3445{ 3584{
3446 vpid_sync_context(to_vmx(vcpu)); 3585 vpid_sync_context(vpid);
3447 if (enable_ept) { 3586 if (enable_ept) {
3448 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3587 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3449 return; 3588 return;
@@ -3451,6 +3590,11 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
3451 } 3590 }
3452} 3591}
3453 3592
3593static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
3594{
3595 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
3596}
3597
3454static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 3598static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
3455{ 3599{
3456 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 3600 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -3644,20 +3788,21 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3644 if (!is_paging(vcpu)) { 3788 if (!is_paging(vcpu)) {
3645 hw_cr4 &= ~X86_CR4_PAE; 3789 hw_cr4 &= ~X86_CR4_PAE;
3646 hw_cr4 |= X86_CR4_PSE; 3790 hw_cr4 |= X86_CR4_PSE;
3647 /*
3648 * SMEP/SMAP is disabled if CPU is in non-paging mode
3649 * in hardware. However KVM always uses paging mode to
3650 * emulate guest non-paging mode with TDP.
3651 * To emulate this behavior, SMEP/SMAP needs to be
3652 * manually disabled when guest switches to non-paging
3653 * mode.
3654 */
3655 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
3656 } else if (!(cr4 & X86_CR4_PAE)) { 3791 } else if (!(cr4 & X86_CR4_PAE)) {
3657 hw_cr4 &= ~X86_CR4_PAE; 3792 hw_cr4 &= ~X86_CR4_PAE;
3658 } 3793 }
3659 } 3794 }
3660 3795
3796 if (!enable_unrestricted_guest && !is_paging(vcpu))
3797 /*
3798 * SMEP/SMAP is disabled if the CPU is in non-paging mode in
3799 * hardware. However, KVM always uses paging mode without
3800 * unrestricted guest.
3801 * To emulate this behavior, SMEP/SMAP needs to be manually
3802 * disabled when guest switches to non-paging mode.
3803 */
3804 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
3805
3661 vmcs_writel(CR4_READ_SHADOW, cr4); 3806 vmcs_writel(CR4_READ_SHADOW, cr4);
3662 vmcs_writel(GUEST_CR4, hw_cr4); 3807 vmcs_writel(GUEST_CR4, hw_cr4);
3663 return 0; 3808 return 0;
@@ -4146,29 +4291,28 @@ static int alloc_identity_pagetable(struct kvm *kvm)
4146 return r; 4291 return r;
4147} 4292}
4148 4293
4149static void allocate_vpid(struct vcpu_vmx *vmx) 4294static int allocate_vpid(void)
4150{ 4295{
4151 int vpid; 4296 int vpid;
4152 4297
4153 vmx->vpid = 0;
4154 if (!enable_vpid) 4298 if (!enable_vpid)
4155 return; 4299 return 0;
4156 spin_lock(&vmx_vpid_lock); 4300 spin_lock(&vmx_vpid_lock);
4157 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 4301 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
4158 if (vpid < VMX_NR_VPIDS) { 4302 if (vpid < VMX_NR_VPIDS)
4159 vmx->vpid = vpid;
4160 __set_bit(vpid, vmx_vpid_bitmap); 4303 __set_bit(vpid, vmx_vpid_bitmap);
4161 } 4304 else
4305 vpid = 0;
4162 spin_unlock(&vmx_vpid_lock); 4306 spin_unlock(&vmx_vpid_lock);
4307 return vpid;
4163} 4308}
4164 4309
4165static void free_vpid(struct vcpu_vmx *vmx) 4310static void free_vpid(int vpid)
4166{ 4311{
4167 if (!enable_vpid) 4312 if (!enable_vpid || vpid == 0)
4168 return; 4313 return;
4169 spin_lock(&vmx_vpid_lock); 4314 spin_lock(&vmx_vpid_lock);
4170 if (vmx->vpid != 0) 4315 __clear_bit(vpid, vmx_vpid_bitmap);
4171 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
4172 spin_unlock(&vmx_vpid_lock); 4316 spin_unlock(&vmx_vpid_lock);
4173} 4317}
4174 4318
@@ -4323,9 +4467,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
4323 msr, MSR_TYPE_W); 4467 msr, MSR_TYPE_W);
4324} 4468}
4325 4469
4326static int vmx_vm_has_apicv(struct kvm *kvm) 4470static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu)
4327{ 4471{
4328 return enable_apicv && irqchip_in_kernel(kvm); 4472 return enable_apicv && lapic_in_kernel(vcpu);
4329} 4473}
4330 4474
4331static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 4475static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -4369,6 +4513,22 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
4369{ 4513{
4370#ifdef CONFIG_SMP 4514#ifdef CONFIG_SMP
4371 if (vcpu->mode == IN_GUEST_MODE) { 4515 if (vcpu->mode == IN_GUEST_MODE) {
4516 struct vcpu_vmx *vmx = to_vmx(vcpu);
4517
4518 /*
4519 * Currently, we don't support urgent interrupts;
4520 * all interrupts are recognized as non-urgent
4521 * interrupts, so we cannot post interrupts when
4522 * 'SN' is set.
4523 *
4524 * If the vcpu is in guest mode, it is running
4525 * rather than being scheduled out and waiting
4526 * in the run queue, and that is currently the
4527 * only case in which 'SN' is set, so warn if
4528 * 'SN' is set.
4529 */
4530 WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
4531
4372 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), 4532 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
4373 POSTED_INTR_VECTOR); 4533 POSTED_INTR_VECTOR);
4374 return true; 4534 return true;
@@ -4505,7 +4665,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4505{ 4665{
4506 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4666 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4507 4667
4508 if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) 4668 if (!vmx_cpu_uses_apicv(&vmx->vcpu))
4509 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4669 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4510 return pin_based_exec_ctrl; 4670 return pin_based_exec_ctrl;
4511} 4671}
@@ -4517,7 +4677,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4517 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4677 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4518 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4678 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4519 4679
4520 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { 4680 if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
4521 exec_control &= ~CPU_BASED_TPR_SHADOW; 4681 exec_control &= ~CPU_BASED_TPR_SHADOW;
4522#ifdef CONFIG_X86_64 4682#ifdef CONFIG_X86_64
4523 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4683 exec_control |= CPU_BASED_CR8_STORE_EXITING |
@@ -4534,7 +4694,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4534static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4694static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4535{ 4695{
4536 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4696 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4537 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) 4697 if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu))
4538 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4698 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4539 if (vmx->vpid == 0) 4699 if (vmx->vpid == 0)
4540 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4700 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
@@ -4548,7 +4708,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4548 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4708 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4549 if (!ple_gap) 4709 if (!ple_gap)
4550 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4710 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4551 if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) 4711 if (!vmx_cpu_uses_apicv(&vmx->vcpu))
4552 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4712 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4553 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4713 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4554 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4714 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
@@ -4558,8 +4718,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4558 a current VMCS12 4718 a current VMCS12
4559 */ 4719 */
4560 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4720 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4561 /* PML is enabled/disabled in creating/destorying vcpu */ 4721
4562 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4722 if (!enable_pml)
4723 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4724
 4725 /* Currently, we allow the L1 guest to run the pcommit instruction directly. */
4726 exec_control &= ~SECONDARY_EXEC_PCOMMIT;
4563 4727
4564 return exec_control; 4728 return exec_control;
4565} 4729}
@@ -4604,12 +4768,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
4604 4768
4605 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); 4769 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
4606 4770
4607 if (cpu_has_secondary_exec_ctrls()) { 4771 if (cpu_has_secondary_exec_ctrls())
4608 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 4772 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
4609 vmx_secondary_exec_control(vmx)); 4773 vmx_secondary_exec_control(vmx));
4610 }
4611 4774
4612 if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { 4775 if (vmx_cpu_uses_apicv(&vmx->vcpu)) {
4613 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4776 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4614 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4777 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4615 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4778 vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -4753,7 +4916,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4753 4916
4754 if (cpu_has_vmx_tpr_shadow() && !init_event) { 4917 if (cpu_has_vmx_tpr_shadow() && !init_event) {
4755 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4918 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4756 if (vm_need_tpr_shadow(vcpu->kvm)) 4919 if (cpu_need_tpr_shadow(vcpu))
4757 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4920 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4758 __pa(vcpu->arch.apic->regs)); 4921 __pa(vcpu->arch.apic->regs));
4759 vmcs_write32(TPR_THRESHOLD, 0); 4922 vmcs_write32(TPR_THRESHOLD, 0);
@@ -4761,7 +4924,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4761 4924
4762 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4925 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4763 4926
4764 if (vmx_vm_has_apicv(vcpu->kvm)) 4927 if (vmx_cpu_uses_apicv(vcpu))
4765 memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); 4928 memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
4766 4929
4767 if (vmx->vpid != 0) 4930 if (vmx->vpid != 0)
@@ -4771,12 +4934,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4771 vmx_set_cr0(vcpu, cr0); /* enter rmode */ 4934 vmx_set_cr0(vcpu, cr0); /* enter rmode */
4772 vmx->vcpu.arch.cr0 = cr0; 4935 vmx->vcpu.arch.cr0 = cr0;
4773 vmx_set_cr4(vcpu, 0); 4936 vmx_set_cr4(vcpu, 0);
4774 if (!init_event) 4937 vmx_set_efer(vcpu, 0);
4775 vmx_set_efer(vcpu, 0);
4776 vmx_fpu_activate(vcpu); 4938 vmx_fpu_activate(vcpu);
4777 update_exception_bitmap(vcpu); 4939 update_exception_bitmap(vcpu);
4778 4940
4779 vpid_sync_context(vmx); 4941 vpid_sync_context(vmx->vpid);
4780} 4942}
4781 4943
4782/* 4944/*
@@ -5296,7 +5458,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
5296 u8 cr8 = (u8)val; 5458 u8 cr8 = (u8)val;
5297 err = kvm_set_cr8(vcpu, cr8); 5459 err = kvm_set_cr8(vcpu, cr8);
5298 kvm_complete_insn_gp(vcpu, err); 5460 kvm_complete_insn_gp(vcpu, err);
5299 if (irqchip_in_kernel(vcpu->kvm)) 5461 if (lapic_in_kernel(vcpu))
5300 return 1; 5462 return 1;
5301 if (cr8_prev <= cr8) 5463 if (cr8_prev <= cr8)
5302 return 1; 5464 return 1;
@@ -5510,17 +5672,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5510 kvm_make_request(KVM_REQ_EVENT, vcpu); 5672 kvm_make_request(KVM_REQ_EVENT, vcpu);
5511 5673
5512 ++vcpu->stat.irq_window_exits; 5674 ++vcpu->stat.irq_window_exits;
5513
5514 /*
5515 * If the user space waits to inject interrupts, exit as soon as
5516 * possible
5517 */
5518 if (!irqchip_in_kernel(vcpu->kvm) &&
5519 vcpu->run->request_interrupt_window &&
5520 !kvm_cpu_has_interrupt(vcpu)) {
5521 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
5522 return 0;
5523 }
5524 return 1; 5675 return 1;
5525} 5676}
5526 5677
@@ -5753,6 +5904,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5753 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5904 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5754 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5905 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5755 skip_emulated_instruction(vcpu); 5906 skip_emulated_instruction(vcpu);
5907 trace_kvm_fast_mmio(gpa);
5756 return 1; 5908 return 1;
5757 } 5909 }
5758 5910
@@ -5910,6 +6062,25 @@ static void update_ple_window_actual_max(void)
5910 ple_window_grow, INT_MIN); 6062 ple_window_grow, INT_MIN);
5911} 6063}
5912 6064
6065/*
6066 * Handler for POSTED_INTR_WAKEUP_VECTOR.
6067 */
6068static void wakeup_handler(void)
6069{
6070 struct kvm_vcpu *vcpu;
6071 int cpu = smp_processor_id();
6072
6073 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
6074 list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
6075 blocked_vcpu_list) {
6076 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
6077
6078 if (pi_test_on(pi_desc) == 1)
6079 kvm_vcpu_kick(vcpu);
6080 }
6081 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
6082}
6083
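
wakeup_handler() is not wired to an interrupt line here; hardware_setup() below registers it through kvm_set_posted_intr_wakeup_handler(). A hedged sketch of the host-side plumbing this assumes (added elsewhere in the series, in the arch/x86 interrupt code; names and bodies are approximate, not quoted from that file):

static void dummy_wakeup_handler(void)
{
}

static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_wakeup_handler;

void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
{
	kvm_posted_intr_wakeup_handler = handler ? handler : dummy_wakeup_handler;
}

/* Entry point for POSTED_INTR_WAKEUP_VECTOR: ack the IPI, then call the hook. */
__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
{
	ack_APIC_irq();
	kvm_posted_intr_wakeup_handler();
}
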
5913static __init int hardware_setup(void) 6084static __init int hardware_setup(void)
5914{ 6085{
5915 int r = -ENOMEM, i, msr; 6086 int r = -ENOMEM, i, msr;
@@ -6096,6 +6267,8 @@ static __init int hardware_setup(void)
6096 kvm_x86_ops->enable_log_dirty_pt_masked = NULL; 6267 kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
6097 } 6268 }
6098 6269
6270 kvm_set_posted_intr_wakeup_handler(wakeup_handler);
6271
6099 return alloc_kvm_area(); 6272 return alloc_kvm_area();
6100 6273
6101out8: 6274out8:
@@ -6627,7 +6800,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
6627 6800
6628static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) 6801static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
6629{ 6802{
6630 u32 exec_control;
6631 if (vmx->nested.current_vmptr == -1ull) 6803 if (vmx->nested.current_vmptr == -1ull)
6632 return; 6804 return;
6633 6805
@@ -6640,9 +6812,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
6640 they were modified */ 6812 they were modified */
6641 copy_shadow_to_vmcs12(vmx); 6813 copy_shadow_to_vmcs12(vmx);
6642 vmx->nested.sync_shadow_vmcs = false; 6814 vmx->nested.sync_shadow_vmcs = false;
6643 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6815 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
6644 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 6816 SECONDARY_EXEC_SHADOW_VMCS);
6645 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
6646 vmcs_write64(VMCS_LINK_POINTER, -1ull); 6817 vmcs_write64(VMCS_LINK_POINTER, -1ull);
6647 } 6818 }
6648 vmx->nested.posted_intr_nv = -1; 6819 vmx->nested.posted_intr_nv = -1;
@@ -6662,6 +6833,7 @@ static void free_nested(struct vcpu_vmx *vmx)
6662 return; 6833 return;
6663 6834
6664 vmx->nested.vmxon = false; 6835 vmx->nested.vmxon = false;
6836 free_vpid(vmx->nested.vpid02);
6665 nested_release_vmcs12(vmx); 6837 nested_release_vmcs12(vmx);
6666 if (enable_shadow_vmcs) 6838 if (enable_shadow_vmcs)
6667 free_vmcs(vmx->nested.current_shadow_vmcs); 6839 free_vmcs(vmx->nested.current_shadow_vmcs);
@@ -7038,7 +7210,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
7038{ 7210{
7039 struct vcpu_vmx *vmx = to_vmx(vcpu); 7211 struct vcpu_vmx *vmx = to_vmx(vcpu);
7040 gpa_t vmptr; 7212 gpa_t vmptr;
7041 u32 exec_control;
7042 7213
7043 if (!nested_vmx_check_permission(vcpu)) 7214 if (!nested_vmx_check_permission(vcpu))
7044 return 1; 7215 return 1;
@@ -7070,9 +7241,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
7070 vmx->nested.current_vmcs12 = new_vmcs12; 7241 vmx->nested.current_vmcs12 = new_vmcs12;
7071 vmx->nested.current_vmcs12_page = page; 7242 vmx->nested.current_vmcs12_page = page;
7072 if (enable_shadow_vmcs) { 7243 if (enable_shadow_vmcs) {
7073 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 7244 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
7074 exec_control |= SECONDARY_EXEC_SHADOW_VMCS; 7245 SECONDARY_EXEC_SHADOW_VMCS);
7075 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7076 vmcs_write64(VMCS_LINK_POINTER, 7246 vmcs_write64(VMCS_LINK_POINTER,
7077 __pa(vmx->nested.current_shadow_vmcs)); 7247 __pa(vmx->nested.current_shadow_vmcs));
7078 vmx->nested.sync_shadow_vmcs = true; 7248 vmx->nested.sync_shadow_vmcs = true;
@@ -7178,7 +7348,63 @@ static int handle_invept(struct kvm_vcpu *vcpu)
7178 7348
7179static int handle_invvpid(struct kvm_vcpu *vcpu) 7349static int handle_invvpid(struct kvm_vcpu *vcpu)
7180{ 7350{
7181 kvm_queue_exception(vcpu, UD_VECTOR); 7351 struct vcpu_vmx *vmx = to_vmx(vcpu);
7352 u32 vmx_instruction_info;
7353 unsigned long type, types;
7354 gva_t gva;
7355 struct x86_exception e;
7356 int vpid;
7357
7358 if (!(vmx->nested.nested_vmx_secondary_ctls_high &
7359 SECONDARY_EXEC_ENABLE_VPID) ||
7360 !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) {
7361 kvm_queue_exception(vcpu, UD_VECTOR);
7362 return 1;
7363 }
7364
7365 if (!nested_vmx_check_permission(vcpu))
7366 return 1;
7367
7368 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7369 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
7370
7371 types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7;
7372
7373 if (!(types & (1UL << type))) {
7374 nested_vmx_failValid(vcpu,
7375 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
7376 return 1;
7377 }
7378
 7379 /* According to the Intel VMX instruction reference, the memory
7380 * operand is read even if it isn't needed (e.g., for type==global)
7381 */
7382 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
7383 vmx_instruction_info, false, &gva))
7384 return 1;
7385 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid,
7386 sizeof(u32), &e)) {
7387 kvm_inject_page_fault(vcpu, &e);
7388 return 1;
7389 }
7390
7391 switch (type) {
7392 case VMX_VPID_EXTENT_ALL_CONTEXT:
7393 if (get_vmcs12(vcpu)->virtual_processor_id == 0) {
7394 nested_vmx_failValid(vcpu,
7395 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
7396 return 1;
7397 }
7398 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
7399 nested_vmx_succeed(vcpu);
7400 break;
7401 default:
7402 /* Trap single context invalidation invvpid calls */
7403 BUG_ON(1);
7404 break;
7405 }
7406
7407 skip_emulated_instruction(vcpu);
7182 return 1; 7408 return 1;
7183} 7409}
7184 7410
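
The type check in handle_invvpid() above is easier to follow with concrete numbers. Assuming the usual encodings behind the VMX_VPID_* macros (INVVPID support in bit 0 of the caps word, global-context support in bit 10, and invvpid type 2 meaning all-context invalidation), the arithmetic works out as:

nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT
                     = 0x001 | 0x400 = 0x401

types = (0x401 >> 8) & 0x7 = 0x4        /* only bit 2 set */

So only type 2 (VMX_VPID_EXTENT_ALL_CONTEXT) passes the check; types 0, 1 and 3 fail with VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID before reaching the switch, which is why the default branch can afford to BUG().
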
@@ -7207,6 +7433,13 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
7207 return 1; 7433 return 1;
7208} 7434}
7209 7435
7436static int handle_pcommit(struct kvm_vcpu *vcpu)
7437{
 7438 /* We never intercept the pcommit instruction for the L1 guest. */
7439 WARN_ON(1);
7440 return 1;
7441}
7442
7210/* 7443/*
7211 * The exit handlers return 1 if the exit was handled fully and guest execution 7444 * The exit handlers return 1 if the exit was handled fully and guest execution
7212 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 7445 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7257,6 +7490,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
7257 [EXIT_REASON_XSAVES] = handle_xsaves, 7490 [EXIT_REASON_XSAVES] = handle_xsaves,
7258 [EXIT_REASON_XRSTORS] = handle_xrstors, 7491 [EXIT_REASON_XRSTORS] = handle_xrstors,
7259 [EXIT_REASON_PML_FULL] = handle_pml_full, 7492 [EXIT_REASON_PML_FULL] = handle_pml_full,
7493 [EXIT_REASON_PCOMMIT] = handle_pcommit,
7260}; 7494};
7261 7495
7262static const int kvm_vmx_max_exit_handlers = 7496static const int kvm_vmx_max_exit_handlers =
@@ -7558,6 +7792,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7558 * the XSS exit bitmap in vmcs12. 7792 * the XSS exit bitmap in vmcs12.
7559 */ 7793 */
7560 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 7794 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
7795 case EXIT_REASON_PCOMMIT:
7796 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
7561 default: 7797 default:
7562 return true; 7798 return true;
7563 } 7799 }
@@ -7569,10 +7805,9 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
7569 *info2 = vmcs_read32(VM_EXIT_INTR_INFO); 7805 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
7570} 7806}
7571 7807
7572static int vmx_enable_pml(struct vcpu_vmx *vmx) 7808static int vmx_create_pml_buffer(struct vcpu_vmx *vmx)
7573{ 7809{
7574 struct page *pml_pg; 7810 struct page *pml_pg;
7575 u32 exec_control;
7576 7811
7577 pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); 7812 pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
7578 if (!pml_pg) 7813 if (!pml_pg)
@@ -7583,24 +7818,15 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx)
7583 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 7818 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
7584 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 7819 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
7585 7820
7586 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7587 exec_control |= SECONDARY_EXEC_ENABLE_PML;
7588 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7589
7590 return 0; 7821 return 0;
7591} 7822}
7592 7823
7593static void vmx_disable_pml(struct vcpu_vmx *vmx) 7824static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
7594{ 7825{
7595 u32 exec_control; 7826 if (vmx->pml_pg) {
7596 7827 __free_page(vmx->pml_pg);
7597 ASSERT(vmx->pml_pg); 7828 vmx->pml_pg = NULL;
7598 __free_page(vmx->pml_pg); 7829 }
7599 vmx->pml_pg = NULL;
7600
7601 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7602 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
7603 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7604} 7830}
7605 7831
7606static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 7832static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
@@ -7924,10 +8150,10 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
7924 * apicv 8150 * apicv
7925 */ 8151 */
7926 if (!cpu_has_vmx_virtualize_x2apic_mode() || 8152 if (!cpu_has_vmx_virtualize_x2apic_mode() ||
7927 !vmx_vm_has_apicv(vcpu->kvm)) 8153 !vmx_cpu_uses_apicv(vcpu))
7928 return; 8154 return;
7929 8155
7930 if (!vm_need_tpr_shadow(vcpu->kvm)) 8156 if (!cpu_need_tpr_shadow(vcpu))
7931 return; 8157 return;
7932 8158
7933 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 8159 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -8029,9 +8255,10 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
8029 } 8255 }
8030} 8256}
8031 8257
8032static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 8258static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu)
8033{ 8259{
8034 if (!vmx_vm_has_apicv(vcpu->kvm)) 8260 u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap;
8261 if (!vmx_cpu_uses_apicv(vcpu))
8035 return; 8262 return;
8036 8263
8037 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 8264 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
@@ -8477,8 +8704,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
8477 struct vcpu_vmx *vmx = to_vmx(vcpu); 8704 struct vcpu_vmx *vmx = to_vmx(vcpu);
8478 8705
8479 if (enable_pml) 8706 if (enable_pml)
8480 vmx_disable_pml(vmx); 8707 vmx_destroy_pml_buffer(vmx);
8481 free_vpid(vmx); 8708 free_vpid(vmx->vpid);
8482 leave_guest_mode(vcpu); 8709 leave_guest_mode(vcpu);
8483 vmx_load_vmcs01(vcpu); 8710 vmx_load_vmcs01(vcpu);
8484 free_nested(vmx); 8711 free_nested(vmx);
@@ -8497,7 +8724,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8497 if (!vmx) 8724 if (!vmx)
8498 return ERR_PTR(-ENOMEM); 8725 return ERR_PTR(-ENOMEM);
8499 8726
8500 allocate_vpid(vmx); 8727 vmx->vpid = allocate_vpid();
8501 8728
8502 err = kvm_vcpu_init(&vmx->vcpu, kvm, id); 8729 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
8503 if (err) 8730 if (err)
@@ -8530,7 +8757,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8530 put_cpu(); 8757 put_cpu();
8531 if (err) 8758 if (err)
8532 goto free_vmcs; 8759 goto free_vmcs;
8533 if (vm_need_virtualize_apic_accesses(kvm)) { 8760 if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
8534 err = alloc_apic_access_page(kvm); 8761 err = alloc_apic_access_page(kvm);
8535 if (err) 8762 if (err)
8536 goto free_vmcs; 8763 goto free_vmcs;
@@ -8545,8 +8772,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8545 goto free_vmcs; 8772 goto free_vmcs;
8546 } 8773 }
8547 8774
8548 if (nested) 8775 if (nested) {
8549 nested_vmx_setup_ctls_msrs(vmx); 8776 nested_vmx_setup_ctls_msrs(vmx);
8777 vmx->nested.vpid02 = allocate_vpid();
8778 }
8550 8779
8551 vmx->nested.posted_intr_nv = -1; 8780 vmx->nested.posted_intr_nv = -1;
8552 vmx->nested.current_vmptr = -1ull; 8781 vmx->nested.current_vmptr = -1ull;
@@ -8559,7 +8788,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8559 * for the guest, etc. 8788 * for the guest, etc.
8560 */ 8789 */
8561 if (enable_pml) { 8790 if (enable_pml) {
8562 err = vmx_enable_pml(vmx); 8791 err = vmx_create_pml_buffer(vmx);
8563 if (err) 8792 if (err)
8564 goto free_vmcs; 8793 goto free_vmcs;
8565 } 8794 }
@@ -8567,13 +8796,14 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8567 return &vmx->vcpu; 8796 return &vmx->vcpu;
8568 8797
8569free_vmcs: 8798free_vmcs:
8799 free_vpid(vmx->nested.vpid02);
8570 free_loaded_vmcs(vmx->loaded_vmcs); 8800 free_loaded_vmcs(vmx->loaded_vmcs);
8571free_msrs: 8801free_msrs:
8572 kfree(vmx->guest_msrs); 8802 kfree(vmx->guest_msrs);
8573uninit_vcpu: 8803uninit_vcpu:
8574 kvm_vcpu_uninit(&vmx->vcpu); 8804 kvm_vcpu_uninit(&vmx->vcpu);
8575free_vcpu: 8805free_vcpu:
8576 free_vpid(vmx); 8806 free_vpid(vmx->vpid);
8577 kmem_cache_free(kvm_vcpu_cache, vmx); 8807 kmem_cache_free(kvm_vcpu_cache, vmx);
8578 return ERR_PTR(err); 8808 return ERR_PTR(err);
8579} 8809}
@@ -8648,49 +8878,67 @@ static int vmx_get_lpage_level(void)
8648 return PT_PDPE_LEVEL; 8878 return PT_PDPE_LEVEL;
8649} 8879}
8650 8880
8881static void vmcs_set_secondary_exec_control(u32 new_ctl)
8882{
8883 /*
8884 * These bits in the secondary execution controls field
8885 * are dynamic; the others are mostly based on the hypervisor
8886 * architecture and the guest's CPUID. Do not touch the
8887 * dynamic bits.
8888 */
8889 u32 mask =
8890 SECONDARY_EXEC_SHADOW_VMCS |
8891 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
8892 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
8893
8894 u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
8895
8896 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
8897 (new_ctl & ~mask) | (cur_ctl & mask));
8898}
8899
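
A quick worked example of the merge done by vmcs_set_secondary_exec_control(), with hypothetical values: suppose the VMCS currently has SECONDARY_EXEC_SHADOW_VMCS set (turned on earlier at VMPTRLD time) while the freshly computed control word does not, and the new word enables SECONDARY_EXEC_RDTSCP. With BASE standing for the bits common to both words:

mask    = SHADOW_VMCS | VIRTUALIZE_X2APIC_MODE | VIRTUALIZE_APIC_ACCESSES
cur_ctl = BASE | SHADOW_VMCS          /* what the VMCS holds right now */
new_ctl = BASE | RDTSCP               /* what vmx_secondary_exec_control() computed */

written = (new_ctl & ~mask) | (cur_ctl & mask)
        = BASE | RDTSCP | SHADOW_VMCS

RDTSCP is taken from the recomputed word, while the dynamic SHADOW_VMCS bit set at VMPTRLD time survives the CPUID update untouched.
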
8651static void vmx_cpuid_update(struct kvm_vcpu *vcpu) 8900static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
8652{ 8901{
8653 struct kvm_cpuid_entry2 *best; 8902 struct kvm_cpuid_entry2 *best;
8654 struct vcpu_vmx *vmx = to_vmx(vcpu); 8903 struct vcpu_vmx *vmx = to_vmx(vcpu);
8655 u32 exec_control; 8904 u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx);
8656 8905
8657 vmx->rdtscp_enabled = false;
8658 if (vmx_rdtscp_supported()) { 8906 if (vmx_rdtscp_supported()) {
8659 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 8907 bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu);
8660 if (exec_control & SECONDARY_EXEC_RDTSCP) { 8908 if (!rdtscp_enabled)
8661 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 8909 secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP;
8662 if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) 8910
8663 vmx->rdtscp_enabled = true; 8911 if (nested) {
8664 else { 8912 if (rdtscp_enabled)
8665 exec_control &= ~SECONDARY_EXEC_RDTSCP; 8913 vmx->nested.nested_vmx_secondary_ctls_high |=
8666 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 8914 SECONDARY_EXEC_RDTSCP;
8667 exec_control); 8915 else
8668 } 8916 vmx->nested.nested_vmx_secondary_ctls_high &=
8917 ~SECONDARY_EXEC_RDTSCP;
8669 } 8918 }
8670 if (nested && !vmx->rdtscp_enabled)
8671 vmx->nested.nested_vmx_secondary_ctls_high &=
8672 ~SECONDARY_EXEC_RDTSCP;
8673 } 8919 }
8674 8920
8675 /* Exposing INVPCID only when PCID is exposed */ 8921 /* Exposing INVPCID only when PCID is exposed */
8676 best = kvm_find_cpuid_entry(vcpu, 0x7, 0); 8922 best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
8677 if (vmx_invpcid_supported() && 8923 if (vmx_invpcid_supported() &&
8678 best && (best->ebx & bit(X86_FEATURE_INVPCID)) && 8924 (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) ||
8679 guest_cpuid_has_pcid(vcpu)) { 8925 !guest_cpuid_has_pcid(vcpu))) {
8680 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 8926 secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID;
8681 exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; 8927
8682 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
8683 exec_control);
8684 } else {
8685 if (cpu_has_secondary_exec_ctrls()) {
8686 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
8687 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
8688 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
8689 exec_control);
8690 }
8691 if (best) 8928 if (best)
8692 best->ebx &= ~bit(X86_FEATURE_INVPCID); 8929 best->ebx &= ~bit(X86_FEATURE_INVPCID);
8693 } 8930 }
8931
8932 vmcs_set_secondary_exec_control(secondary_exec_ctl);
8933
8934 if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
8935 if (guest_cpuid_has_pcommit(vcpu))
8936 vmx->nested.nested_vmx_secondary_ctls_high |=
8937 SECONDARY_EXEC_PCOMMIT;
8938 else
8939 vmx->nested.nested_vmx_secondary_ctls_high &=
8940 ~SECONDARY_EXEC_PCOMMIT;
8941 }
8694} 8942}
8695 8943
8696static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 8944static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9298,13 +9546,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
9298 9546
9299 if (cpu_has_secondary_exec_ctrls()) { 9547 if (cpu_has_secondary_exec_ctrls()) {
9300 exec_control = vmx_secondary_exec_control(vmx); 9548 exec_control = vmx_secondary_exec_control(vmx);
9301 if (!vmx->rdtscp_enabled) 9549
9302 exec_control &= ~SECONDARY_EXEC_RDTSCP;
9303 /* Take the following fields only from vmcs12 */ 9550 /* Take the following fields only from vmcs12 */
9304 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 9551 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
9305 SECONDARY_EXEC_RDTSCP | 9552 SECONDARY_EXEC_RDTSCP |
9306 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 9553 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
9307 SECONDARY_EXEC_APIC_REGISTER_VIRT); 9554 SECONDARY_EXEC_APIC_REGISTER_VIRT |
9555 SECONDARY_EXEC_PCOMMIT);
9308 if (nested_cpu_has(vmcs12, 9556 if (nested_cpu_has(vmcs12,
9309 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 9557 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
9310 exec_control |= vmcs12->secondary_vm_exec_control; 9558 exec_control |= vmcs12->secondary_vm_exec_control;
@@ -9323,7 +9571,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
9323 vmcs_write64(APIC_ACCESS_ADDR, 9571 vmcs_write64(APIC_ACCESS_ADDR,
9324 page_to_phys(vmx->nested.apic_access_page)); 9572 page_to_phys(vmx->nested.apic_access_page));
9325 } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && 9573 } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
9326 (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { 9574 cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
9327 exec_control |= 9575 exec_control |=
9328 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 9576 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
9329 kvm_vcpu_reload_apic_access_page(vcpu); 9577 kvm_vcpu_reload_apic_access_page(vcpu);
@@ -9433,12 +9681,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
9433 9681
9434 if (enable_vpid) { 9682 if (enable_vpid) {
9435 /* 9683 /*
9436 * Trivially support vpid by letting L2s share their parent 9684 * There is no direct mapping between vpid02 and vpid12, the
9437 * L1's vpid. TODO: move to a more elaborate solution, giving 9685 * vpid02 is per-vCPU for L0 and reused while the value of
9438 * each L2 its own vpid and exposing the vpid feature to L1. 9686 * vpid12 is changed w/ one invvpid during nested vmentry.
9687 * The vpid12 is allocated by L1 for L2, so it will not
 9688 * influence the global bitmap (for vpid01 and vpid02 allocation)
 9689 * even if L1 spawns a lot of nested vCPUs.
9439 */ 9690 */
9440 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 9691 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
9441 vmx_flush_tlb(vcpu); 9692 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
9693 if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
9694 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
9695 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
9696 }
9697 } else {
9698 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
9699 vmx_flush_tlb(vcpu);
9700 }
9701
9442 } 9702 }
9443 9703
9444 if (nested_cpu_has_ept(vmcs12)) { 9704 if (nested_cpu_has_ept(vmcs12)) {
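
To make the comment above concrete, a hypothetical mapping (the numbers are invented for illustration):

/*
 * L0's VPID allocator              what L1 programs
 * -------------------------        ----------------------------------
 * vpid01 = 3  (L1 itself)          n/a
 * vpid02 = 7  (reused for L2)      vmcs12->virtual_processor_id = 5
 *
 * If L1 later rewrites vmcs12->virtual_processor_id to 9, that value
 * never reaches hardware.  prepare_vmcs02() notices 9 != last_vpid (5),
 * records the new value and issues a single
 * __vmx_flush_tlb(vcpu, vpid02 = 7), so stale translations tagged with
 * VPID 7 are gone while the global VPID bitmap stays untouched.
 */
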
@@ -10278,6 +10538,201 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
10278 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); 10538 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
10279} 10539}
10280 10540
10541/*
10542 * This routine does the following things for a vCPU that is about
10543 * to be blocked, if VT-d PI is enabled:
10544 * - Add the vCPU to the wakeup list, so when interrupts happen
10545 * we can find the right vCPU to wake up.
10546 * - Change the Posted-interrupt descriptor as below:
10547 * 'NDST' <-- vcpu->pre_pcpu
10548 * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
10549 * - If 'ON' is set during this process, meaning at least one
10550 * interrupt is posted for this vCPU, we cannot block it; in
10551 * this case, return 1. Otherwise, return 0.
10552 *
10553 */
10554static int vmx_pre_block(struct kvm_vcpu *vcpu)
10555{
10556 unsigned long flags;
10557 unsigned int dest;
10558 struct pi_desc old, new;
10559 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
10560
10561 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
10562 !irq_remapping_cap(IRQ_POSTING_CAP))
10563 return 0;
10564
10565 vcpu->pre_pcpu = vcpu->cpu;
10566 spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
10567 vcpu->pre_pcpu), flags);
10568 list_add_tail(&vcpu->blocked_vcpu_list,
10569 &per_cpu(blocked_vcpu_on_cpu,
10570 vcpu->pre_pcpu));
10571 spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
10572 vcpu->pre_pcpu), flags);
10573
10574 do {
10575 old.control = new.control = pi_desc->control;
10576
10577 /*
10578 * We should not block the vCPU if
10579 * an interrupt is posted for it.
10580 */
10581 if (pi_test_on(pi_desc) == 1) {
10582 spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
10583 vcpu->pre_pcpu), flags);
10584 list_del(&vcpu->blocked_vcpu_list);
10585 spin_unlock_irqrestore(
10586 &per_cpu(blocked_vcpu_on_cpu_lock,
10587 vcpu->pre_pcpu), flags);
10588 vcpu->pre_pcpu = -1;
10589
10590 return 1;
10591 }
10592
10593 WARN((pi_desc->sn == 1),
10594 "Warning: SN field of posted-interrupts "
10595 "is set before blocking\n");
10596
10597 /*
10598 * Since the vCPU can be preempted during this process,
10599 * vcpu->cpu could differ from pre_pcpu, so we need to
10600 * set pre_pcpu as the destination of the wakeup
10601 * notification event; then we can find the right vCPU
10602 * to wake up in the wakeup handler if interrupts arrive
10603 * while the vCPU is blocked.
10604 */
10605 dest = cpu_physical_id(vcpu->pre_pcpu);
10606
10607 if (x2apic_enabled())
10608 new.ndst = dest;
10609 else
10610 new.ndst = (dest << 8) & 0xFF00;
10611
10612 /* set 'NV' to 'wakeup vector' */
10613 new.nv = POSTED_INTR_WAKEUP_VECTOR;
10614 } while (cmpxchg(&pi_desc->control, old.control,
10615 new.control) != old.control);
10616
10617 return 0;
10618}
10619
10620static void vmx_post_block(struct kvm_vcpu *vcpu)
10621{
10622 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
10623 struct pi_desc old, new;
10624 unsigned int dest;
10625 unsigned long flags;
10626
10627 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
10628 !irq_remapping_cap(IRQ_POSTING_CAP))
10629 return;
10630
10631 do {
10632 old.control = new.control = pi_desc->control;
10633
10634 dest = cpu_physical_id(vcpu->cpu);
10635
10636 if (x2apic_enabled())
10637 new.ndst = dest;
10638 else
10639 new.ndst = (dest << 8) & 0xFF00;
10640
10641 /* Allow posting non-urgent interrupts */
10642 new.sn = 0;
10643
10644 /* set 'NV' to 'notification vector' */
10645 new.nv = POSTED_INTR_VECTOR;
10646 } while (cmpxchg(&pi_desc->control, old.control,
10647 new.control) != old.control);
10648
10649 if (vcpu->pre_pcpu != -1) {
10650 spin_lock_irqsave(
10651 &per_cpu(blocked_vcpu_on_cpu_lock,
10652 vcpu->pre_pcpu), flags);
10653 list_del(&vcpu->blocked_vcpu_list);
10654 spin_unlock_irqrestore(
10655 &per_cpu(blocked_vcpu_on_cpu_lock,
10656 vcpu->pre_pcpu), flags);
10657 vcpu->pre_pcpu = -1;
10658 }
10659}
10660
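
vmx_pre_block()/vmx_post_block() only matter once the generic halt path calls them; the callers live outside this hunk and are reached through the new pre_block/post_block members of kvm_x86_ops hooked up further down. Roughly, and with the surrounding logic paraphrased rather than quoted, the x86 blocking path is assumed to look like:

static int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
	if (!kvm_arch_vcpu_runnable(vcpu) &&
	    (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
		/*
		 * pre_block() returned 0: no interrupt was posted while
		 * notifications were being redirected to the wakeup vector,
		 * so it is safe to actually halt.
		 */
		kvm_vcpu_block(vcpu);

		if (kvm_x86_ops->post_block)
			kvm_x86_ops->post_block(vcpu);
	}

	/* ... existing unhalt/request handling ... */
	return 1;
}
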
10661/*
10662 * vmx_update_pi_irte - set IRTE for Posted-Interrupts
10663 *
10664 * @kvm: kvm
10665 * @host_irq: host irq of the interrupt
10666 * @guest_irq: gsi of the interrupt
10667 * @set: set or unset PI
10668 * returns 0 on success, < 0 on failure
10669 */
10670static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
10671 uint32_t guest_irq, bool set)
10672{
10673 struct kvm_kernel_irq_routing_entry *e;
10674 struct kvm_irq_routing_table *irq_rt;
10675 struct kvm_lapic_irq irq;
10676 struct kvm_vcpu *vcpu;
10677 struct vcpu_data vcpu_info;
10678 int idx, ret = -EINVAL;
10679
10680 if (!kvm_arch_has_assigned_device(kvm) ||
10681 !irq_remapping_cap(IRQ_POSTING_CAP))
10682 return 0;
10683
10684 idx = srcu_read_lock(&kvm->irq_srcu);
10685 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
10686 BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
10687
10688 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
10689 if (e->type != KVM_IRQ_ROUTING_MSI)
10690 continue;
10691 /*
10692 * VT-d PI cannot post multicast/broadcast
10693 * interrupts to a vCPU; we keep using interrupt remapping
10694 * for these kinds of interrupts.
10695 *
10696 * For lowest-priority interrupts, we only support
10697 * those with a single CPU as the destination, e.g. the user
10698 * configures the interrupts via /proc/irq or uses
10699 * irqbalance to make the interrupts single-CPU.
10700 *
10701 * Full lowest-priority interrupt support will be added later.
10702 */
10703
10704 kvm_set_msi_irq(e, &irq);
10705 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
10706 continue;
10707
10708 vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
10709 vcpu_info.vector = irq.vector;
10710
10711 trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
10712 vcpu_info.vector, vcpu_info.pi_desc_addr, set);
10713
10714 if (set)
10715 ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
10716 else {
10717 /* suppress notification event before unposting */
10718 pi_set_sn(vcpu_to_pi_desc(vcpu));
10719 ret = irq_set_vcpu_affinity(host_irq, NULL);
10720 pi_clear_sn(vcpu_to_pi_desc(vcpu));
10721 }
10722
10723 if (ret < 0) {
10724 printk(KERN_INFO "%s: failed to update PI IRTE\n",
10725 __func__);
10726 goto out;
10727 }
10728 }
10729
10730 ret = 0;
10731out:
10732 srcu_read_unlock(&kvm->irq_srcu, idx);
10733 return ret;
10734}
10735
10281static struct kvm_x86_ops vmx_x86_ops = { 10736static struct kvm_x86_ops vmx_x86_ops = {
10282 .cpu_has_kvm_support = cpu_has_kvm_support, 10737 .cpu_has_kvm_support = cpu_has_kvm_support,
10283 .disabled_by_bios = vmx_disabled_by_bios, 10738 .disabled_by_bios = vmx_disabled_by_bios,
@@ -10347,7 +10802,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
10347 .update_cr8_intercept = update_cr8_intercept, 10802 .update_cr8_intercept = update_cr8_intercept,
10348 .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, 10803 .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
10349 .set_apic_access_page_addr = vmx_set_apic_access_page_addr, 10804 .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
10350 .vm_has_apicv = vmx_vm_has_apicv, 10805 .cpu_uses_apicv = vmx_cpu_uses_apicv,
10351 .load_eoi_exitmap = vmx_load_eoi_exitmap, 10806 .load_eoi_exitmap = vmx_load_eoi_exitmap,
10352 .hwapic_irr_update = vmx_hwapic_irr_update, 10807 .hwapic_irr_update = vmx_hwapic_irr_update,
10353 .hwapic_isr_update = vmx_hwapic_isr_update, 10808 .hwapic_isr_update = vmx_hwapic_isr_update,
@@ -10394,7 +10849,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
10394 .flush_log_dirty = vmx_flush_log_dirty, 10849 .flush_log_dirty = vmx_flush_log_dirty,
10395 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, 10850 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
10396 10851
10852 .pre_block = vmx_pre_block,
10853 .post_block = vmx_post_block,
10854
10397 .pmu_ops = &intel_pmu_ops, 10855 .pmu_ops = &intel_pmu_ops,
10856
10857 .update_pi_irte = vmx_update_pi_irte,
10398}; 10858};
10399 10859
10400static int __init vmx_init(void) 10860static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bda65690788e..4a6eff166fc6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -51,6 +51,8 @@
51#include <linux/pci.h> 51#include <linux/pci.h>
52#include <linux/timekeeper_internal.h> 52#include <linux/timekeeper_internal.h>
53#include <linux/pvclock_gtod.h> 53#include <linux/pvclock_gtod.h>
54#include <linux/kvm_irqfd.h>
55#include <linux/irqbypass.h>
54#include <trace/events/kvm.h> 56#include <trace/events/kvm.h>
55 57
56#define CREATE_TRACE_POINTS 58#define CREATE_TRACE_POINTS
@@ -64,6 +66,7 @@
64#include <asm/fpu/internal.h> /* Ugh! */ 66#include <asm/fpu/internal.h> /* Ugh! */
65#include <asm/pvclock.h> 67#include <asm/pvclock.h>
66#include <asm/div64.h> 68#include <asm/div64.h>
69#include <asm/irq_remapping.h>
67 70
68#define MAX_IO_MSRS 256 71#define MAX_IO_MSRS 256
69#define KVM_MAX_MCE_BANKS 32 72#define KVM_MAX_MCE_BANKS 32
@@ -622,7 +625,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
622 if ((cr0 ^ old_cr0) & update_bits) 625 if ((cr0 ^ old_cr0) & update_bits)
623 kvm_mmu_reset_context(vcpu); 626 kvm_mmu_reset_context(vcpu);
624 627
625 if ((cr0 ^ old_cr0) & X86_CR0_CD) 628 if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
629 kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
630 !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
626 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); 631 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
627 632
628 return 0; 633 return 0;
@@ -789,7 +794,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
789{ 794{
790 if (cr8 & CR8_RESERVED_BITS) 795 if (cr8 & CR8_RESERVED_BITS)
791 return 1; 796 return 1;
792 if (irqchip_in_kernel(vcpu->kvm)) 797 if (lapic_in_kernel(vcpu))
793 kvm_lapic_set_tpr(vcpu, cr8); 798 kvm_lapic_set_tpr(vcpu, cr8);
794 else 799 else
795 vcpu->arch.cr8 = cr8; 800 vcpu->arch.cr8 = cr8;
@@ -799,7 +804,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
799 804
800unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 805unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
801{ 806{
802 if (irqchip_in_kernel(vcpu->kvm)) 807 if (lapic_in_kernel(vcpu))
803 return kvm_lapic_get_cr8(vcpu); 808 return kvm_lapic_get_cr8(vcpu);
804 else 809 else
805 return vcpu->arch.cr8; 810 return vcpu->arch.cr8;
@@ -953,6 +958,9 @@ static u32 emulated_msrs[] = {
953 HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, 958 HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
954 HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, 959 HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
955 HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, 960 HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
961 HV_X64_MSR_RESET,
962 HV_X64_MSR_VP_INDEX,
963 HV_X64_MSR_VP_RUNTIME,
956 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 964 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
957 MSR_KVM_PV_EOI_EN, 965 MSR_KVM_PV_EOI_EN,
958 966
@@ -1898,6 +1906,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu)
1898 1906
1899static void record_steal_time(struct kvm_vcpu *vcpu) 1907static void record_steal_time(struct kvm_vcpu *vcpu)
1900{ 1908{
1909 accumulate_steal_time(vcpu);
1910
1901 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) 1911 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1902 return; 1912 return;
1903 1913
@@ -2048,12 +2058,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2048 if (!(data & KVM_MSR_ENABLED)) 2058 if (!(data & KVM_MSR_ENABLED))
2049 break; 2059 break;
2050 2060
2051 vcpu->arch.st.last_steal = current->sched_info.run_delay;
2052
2053 preempt_disable();
2054 accumulate_steal_time(vcpu);
2055 preempt_enable();
2056
2057 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 2061 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2058 2062
2059 break; 2063 break;
@@ -2449,6 +2453,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2449 case KVM_CAP_ENABLE_CAP_VM: 2453 case KVM_CAP_ENABLE_CAP_VM:
2450 case KVM_CAP_DISABLE_QUIRKS: 2454 case KVM_CAP_DISABLE_QUIRKS:
2451 case KVM_CAP_SET_BOOT_CPU_ID: 2455 case KVM_CAP_SET_BOOT_CPU_ID:
2456 case KVM_CAP_SPLIT_IRQCHIP:
2452#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 2457#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2453 case KVM_CAP_ASSIGN_DEV_IRQ: 2458 case KVM_CAP_ASSIGN_DEV_IRQ:
2454 case KVM_CAP_PCI_2_3: 2459 case KVM_CAP_PCI_2_3:
@@ -2628,7 +2633,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2628 vcpu->cpu = cpu; 2633 vcpu->cpu = cpu;
2629 } 2634 }
2630 2635
2631 accumulate_steal_time(vcpu);
2632 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 2636 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2633} 2637}
2634 2638
@@ -2662,12 +2666,24 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2662{ 2666{
2663 if (irq->irq >= KVM_NR_INTERRUPTS) 2667 if (irq->irq >= KVM_NR_INTERRUPTS)
2664 return -EINVAL; 2668 return -EINVAL;
2665 if (irqchip_in_kernel(vcpu->kvm)) 2669
2670 if (!irqchip_in_kernel(vcpu->kvm)) {
2671 kvm_queue_interrupt(vcpu, irq->irq, false);
2672 kvm_make_request(KVM_REQ_EVENT, vcpu);
2673 return 0;
2674 }
2675
2676 /*
2677 * With in-kernel LAPIC, we only use this to inject EXTINT, so
2678 * fail for in-kernel 8259.
2679 */
2680 if (pic_in_kernel(vcpu->kvm))
2666 return -ENXIO; 2681 return -ENXIO;
2667 2682
2668 kvm_queue_interrupt(vcpu, irq->irq, false); 2683 if (vcpu->arch.pending_external_vector != -1)
2669 kvm_make_request(KVM_REQ_EVENT, vcpu); 2684 return -EEXIST;
2670 2685
2686 vcpu->arch.pending_external_vector = irq->irq;
2671 return 0; 2687 return 0;
2672} 2688}
2673 2689
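
From userspace nothing changes: the same vCPU ioctl is used whether the vector is queued directly (no in-kernel irqchip) or recorded as a pending EXTINT for the split-irqchip case. A minimal sketch of the caller (vcpu_fd and the vector value are placeholders):

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <err.h>

static void inject_extint(int vcpu_fd, unsigned int vector)
{
	struct kvm_interrupt irq = { .irq = vector };

	/* Fails with errno ENXIO if a full in-kernel irqchip (8259) is
	 * present, and with EEXIST if an EXTINT is already pending. */
	if (ioctl(vcpu_fd, KVM_INTERRUPT, &irq) < 0)
		err(1, "KVM_INTERRUPT");
}
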
@@ -3176,7 +3192,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
3176 struct kvm_vapic_addr va; 3192 struct kvm_vapic_addr va;
3177 3193
3178 r = -EINVAL; 3194 r = -EINVAL;
3179 if (!irqchip_in_kernel(vcpu->kvm)) 3195 if (!lapic_in_kernel(vcpu))
3180 goto out; 3196 goto out;
3181 r = -EFAULT; 3197 r = -EFAULT;
3182 if (copy_from_user(&va, argp, sizeof va)) 3198 if (copy_from_user(&va, argp, sizeof va))
@@ -3425,41 +3441,35 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
3425 3441
3426static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 3442static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3427{ 3443{
3428 int r = 0;
3429
3430 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3444 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3431 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 3445 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
3432 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3446 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3433 return r; 3447 return 0;
3434} 3448}
3435 3449
3436static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 3450static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3437{ 3451{
3438 int r = 0;
3439
3440 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3452 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3441 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 3453 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
3442 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); 3454 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
3443 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3455 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3444 return r; 3456 return 0;
3445} 3457}
3446 3458
3447static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 3459static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3448{ 3460{
3449 int r = 0;
3450
3451 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3461 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3452 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, 3462 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
3453 sizeof(ps->channels)); 3463 sizeof(ps->channels));
3454 ps->flags = kvm->arch.vpit->pit_state.flags; 3464 ps->flags = kvm->arch.vpit->pit_state.flags;
3455 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3465 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3456 memset(&ps->reserved, 0, sizeof(ps->reserved)); 3466 memset(&ps->reserved, 0, sizeof(ps->reserved));
3457 return r; 3467 return 0;
3458} 3468}
3459 3469
3460static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 3470static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3461{ 3471{
3462 int r = 0, start = 0; 3472 int start = 0;
3463 u32 prev_legacy, cur_legacy; 3473 u32 prev_legacy, cur_legacy;
3464 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3474 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3465 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; 3475 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
@@ -3471,7 +3481,7 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3471 kvm->arch.vpit->pit_state.flags = ps->flags; 3481 kvm->arch.vpit->pit_state.flags = ps->flags;
3472 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); 3482 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
3473 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3483 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3474 return r; 3484 return 0;
3475} 3485}
3476 3486
3477static int kvm_vm_ioctl_reinject(struct kvm *kvm, 3487static int kvm_vm_ioctl_reinject(struct kvm *kvm,
@@ -3556,6 +3566,28 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
3556 kvm->arch.disabled_quirks = cap->args[0]; 3566 kvm->arch.disabled_quirks = cap->args[0];
3557 r = 0; 3567 r = 0;
3558 break; 3568 break;
3569 case KVM_CAP_SPLIT_IRQCHIP: {
3570 mutex_lock(&kvm->lock);
3571 r = -EINVAL;
3572 if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
3573 goto split_irqchip_unlock;
3574 r = -EEXIST;
3575 if (irqchip_in_kernel(kvm))
3576 goto split_irqchip_unlock;
3577 if (atomic_read(&kvm->online_vcpus))
3578 goto split_irqchip_unlock;
3579 r = kvm_setup_empty_irq_routing(kvm);
3580 if (r)
3581 goto split_irqchip_unlock;
3582 /* Pairs with irqchip_in_kernel. */
3583 smp_wmb();
3584 kvm->arch.irqchip_split = true;
3585 kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
3586 r = 0;
3587split_irqchip_unlock:
3588 mutex_unlock(&kvm->lock);
3589 break;
3590 }
3559 default: 3591 default:
3560 r = -EINVAL; 3592 r = -EINVAL;
3561 break; 3593 break;
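For context, userspace opts in to the new split irqchip through the existing KVM_ENABLE_CAP VM ioctl, with args[0] carrying the number of reserved IOAPIC pins checked above. A minimal sketch, assuming an open VM file descriptor and a typical pin count of 24 (neither is part of this patch):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: request LAPIC-in-kernel with userspace IOAPIC/PIC/PIT.  Must be
 * called before any vCPU is created, per the online_vcpus check above. */
static int enable_split_irqchip(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap     = KVM_CAP_SPLIT_IRQCHIP,
                .args[0] = 24,  /* assumed nr_reserved_ioapic_pins */
        };

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}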
@@ -3669,7 +3701,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
3669 } 3701 }
3670 3702
3671 r = -ENXIO; 3703 r = -ENXIO;
3672 if (!irqchip_in_kernel(kvm)) 3704 if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
3673 goto get_irqchip_out; 3705 goto get_irqchip_out;
3674 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 3706 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
3675 if (r) 3707 if (r)
@@ -3693,7 +3725,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
3693 } 3725 }
3694 3726
3695 r = -ENXIO; 3727 r = -ENXIO;
3696 if (!irqchip_in_kernel(kvm)) 3728 if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
3697 goto set_irqchip_out; 3729 goto set_irqchip_out;
3698 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 3730 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
3699 if (r) 3731 if (r)
@@ -4060,6 +4092,15 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
4060 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); 4092 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
4061} 4093}
4062 4094
4095static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
4096 unsigned long addr, void *val, unsigned int bytes)
4097{
4098 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4099 int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
4100
4101 return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
4102}
4103
4063int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, 4104int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
4064 gva_t addr, void *val, 4105 gva_t addr, void *val,
4065 unsigned int bytes, 4106 unsigned int bytes,
@@ -4795,6 +4836,7 @@ static const struct x86_emulate_ops emulate_ops = {
4795 .write_gpr = emulator_write_gpr, 4836 .write_gpr = emulator_write_gpr,
4796 .read_std = kvm_read_guest_virt_system, 4837 .read_std = kvm_read_guest_virt_system,
4797 .write_std = kvm_write_guest_virt_system, 4838 .write_std = kvm_write_guest_virt_system,
4839 .read_phys = kvm_read_guest_phys_system,
4798 .fetch = kvm_fetch_guest_virt, 4840 .fetch = kvm_fetch_guest_virt,
4799 .read_emulated = emulator_read_emulated, 4841 .read_emulated = emulator_read_emulated,
4800 .write_emulated = emulator_write_emulated, 4842 .write_emulated = emulator_write_emulated,
@@ -5667,7 +5709,7 @@ void kvm_arch_exit(void)
5667int kvm_vcpu_halt(struct kvm_vcpu *vcpu) 5709int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
5668{ 5710{
5669 ++vcpu->stat.halt_exits; 5711 ++vcpu->stat.halt_exits;
5670 if (irqchip_in_kernel(vcpu->kvm)) { 5712 if (lapic_in_kernel(vcpu)) {
5671 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 5713 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
5672 return 1; 5714 return 1;
5673 } else { 5715 } else {
@@ -5774,9 +5816,15 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5774 */ 5816 */
5775static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) 5817static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
5776{ 5818{
5777 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 5819 if (!vcpu->run->request_interrupt_window || pic_in_kernel(vcpu->kvm))
5778 vcpu->run->request_interrupt_window && 5820 return false;
5779 kvm_arch_interrupt_allowed(vcpu)); 5821
5822 if (kvm_cpu_has_interrupt(vcpu))
5823 return false;
5824
5825 return (irqchip_split(vcpu->kvm)
5826 ? kvm_apic_accept_pic_intr(vcpu)
5827 : kvm_arch_interrupt_allowed(vcpu));
5780} 5828}
5781 5829
5782static void post_kvm_run_save(struct kvm_vcpu *vcpu) 5830static void post_kvm_run_save(struct kvm_vcpu *vcpu)
@@ -5787,13 +5835,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
5787 kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; 5835 kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
5788 kvm_run->cr8 = kvm_get_cr8(vcpu); 5836 kvm_run->cr8 = kvm_get_cr8(vcpu);
5789 kvm_run->apic_base = kvm_get_apic_base(vcpu); 5837 kvm_run->apic_base = kvm_get_apic_base(vcpu);
5790 if (irqchip_in_kernel(vcpu->kvm)) 5838 if (!irqchip_in_kernel(vcpu->kvm))
5791 kvm_run->ready_for_interrupt_injection = 1;
5792 else
5793 kvm_run->ready_for_interrupt_injection = 5839 kvm_run->ready_for_interrupt_injection =
5794 kvm_arch_interrupt_allowed(vcpu) && 5840 kvm_arch_interrupt_allowed(vcpu) &&
5795 !kvm_cpu_has_interrupt(vcpu) && 5841 !kvm_cpu_has_interrupt(vcpu) &&
5796 !kvm_event_needs_reinjection(vcpu); 5842 !kvm_event_needs_reinjection(vcpu);
5843 else if (!pic_in_kernel(vcpu->kvm))
5844 kvm_run->ready_for_interrupt_injection =
5845 kvm_apic_accept_pic_intr(vcpu) &&
5846 !kvm_cpu_has_interrupt(vcpu);
5847 else
5848 kvm_run->ready_for_interrupt_injection = 1;
5797} 5849}
5798 5850
5799static void update_cr8_intercept(struct kvm_vcpu *vcpu) 5851static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -6144,18 +6196,18 @@ static void process_smi(struct kvm_vcpu *vcpu)
6144 6196
6145static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) 6197static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
6146{ 6198{
6147 u64 eoi_exit_bitmap[4];
6148 u32 tmr[8];
6149
6150 if (!kvm_apic_hw_enabled(vcpu->arch.apic)) 6199 if (!kvm_apic_hw_enabled(vcpu->arch.apic))
6151 return; 6200 return;
6152 6201
6153 memset(eoi_exit_bitmap, 0, 32); 6202 memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8);
6154 memset(tmr, 0, 32);
6155 6203
6156 kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr); 6204 if (irqchip_split(vcpu->kvm))
6157 kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); 6205 kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap);
6158 kvm_apic_update_tmr(vcpu, tmr); 6206 else {
6207 kvm_x86_ops->sync_pir_to_irr(vcpu);
6208 kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
6209 }
6210 kvm_x86_ops->load_eoi_exitmap(vcpu);
6159} 6211}
6160 6212
6161static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) 6213static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
@@ -6168,7 +6220,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
6168{ 6220{
6169 struct page *page = NULL; 6221 struct page *page = NULL;
6170 6222
6171 if (!irqchip_in_kernel(vcpu->kvm)) 6223 if (!lapic_in_kernel(vcpu))
6172 return; 6224 return;
6173 6225
6174 if (!kvm_x86_ops->set_apic_access_page_addr) 6226 if (!kvm_x86_ops->set_apic_access_page_addr)
@@ -6206,7 +6258,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
6206static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 6258static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6207{ 6259{
6208 int r; 6260 int r;
6209 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 6261 bool req_int_win = !lapic_in_kernel(vcpu) &&
6210 vcpu->run->request_interrupt_window; 6262 vcpu->run->request_interrupt_window;
6211 bool req_immediate_exit = false; 6263 bool req_immediate_exit = false;
6212 6264
@@ -6258,6 +6310,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6258 kvm_pmu_handle_event(vcpu); 6310 kvm_pmu_handle_event(vcpu);
6259 if (kvm_check_request(KVM_REQ_PMI, vcpu)) 6311 if (kvm_check_request(KVM_REQ_PMI, vcpu))
6260 kvm_pmu_deliver_pmi(vcpu); 6312 kvm_pmu_deliver_pmi(vcpu);
6313 if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
6314 BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
6315 if (test_bit(vcpu->arch.pending_ioapic_eoi,
6316 (void *) vcpu->arch.eoi_exit_bitmap)) {
6317 vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
6318 vcpu->run->eoi.vector =
6319 vcpu->arch.pending_ioapic_eoi;
6320 r = 0;
6321 goto out;
6322 }
6323 }
6261 if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) 6324 if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
6262 vcpu_scan_ioapic(vcpu); 6325 vcpu_scan_ioapic(vcpu);
6263 if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) 6326 if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
@@ -6268,6 +6331,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6268 r = 0; 6331 r = 0;
6269 goto out; 6332 goto out;
6270 } 6333 }
6334 if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
6335 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
6336 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
6337 r = 0;
6338 goto out;
6339 }
6340 }
6341
6342 /*
6343 * KVM_REQ_EVENT is not set when posted interrupts are set by
6344 * VT-d hardware, so we have to update RVI unconditionally.
6345 */
6346 if (kvm_lapic_enabled(vcpu)) {
6347 /*
6348 * Update architecture specific hints for APIC
6349 * virtual interrupt delivery.
6350 */
6351 if (kvm_x86_ops->hwapic_irr_update)
6352 kvm_x86_ops->hwapic_irr_update(vcpu,
6353 kvm_lapic_find_highest_irr(vcpu));
6271 } 6354 }
6272 6355
6273 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 6356 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -6286,13 +6369,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6286 kvm_x86_ops->enable_irq_window(vcpu); 6369 kvm_x86_ops->enable_irq_window(vcpu);
6287 6370
6288 if (kvm_lapic_enabled(vcpu)) { 6371 if (kvm_lapic_enabled(vcpu)) {
6289 /*
6290 * Update architecture specific hints for APIC
6291 * virtual interrupt delivery.
6292 */
6293 if (kvm_x86_ops->hwapic_irr_update)
6294 kvm_x86_ops->hwapic_irr_update(vcpu,
6295 kvm_lapic_find_highest_irr(vcpu));
6296 update_cr8_intercept(vcpu); 6372 update_cr8_intercept(vcpu);
6297 kvm_lapic_sync_to_vapic(vcpu); 6373 kvm_lapic_sync_to_vapic(vcpu);
6298 } 6374 }
@@ -6428,10 +6504,15 @@ out:
6428 6504
6429static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) 6505static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
6430{ 6506{
6431 if (!kvm_arch_vcpu_runnable(vcpu)) { 6507 if (!kvm_arch_vcpu_runnable(vcpu) &&
6508 (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
6432 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 6509 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6433 kvm_vcpu_block(vcpu); 6510 kvm_vcpu_block(vcpu);
6434 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 6511 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6512
6513 if (kvm_x86_ops->post_block)
6514 kvm_x86_ops->post_block(vcpu);
6515
6435 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) 6516 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
6436 return 1; 6517 return 1;
6437 } 6518 }
@@ -6468,10 +6549,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
6468 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 6549 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6469 6550
6470 for (;;) { 6551 for (;;) {
6471 if (kvm_vcpu_running(vcpu)) 6552 if (kvm_vcpu_running(vcpu)) {
6472 r = vcpu_enter_guest(vcpu); 6553 r = vcpu_enter_guest(vcpu);
6473 else 6554 } else {
6474 r = vcpu_block(kvm, vcpu); 6555 r = vcpu_block(kvm, vcpu);
6556 }
6557
6475 if (r <= 0) 6558 if (r <= 0)
6476 break; 6559 break;
6477 6560
@@ -6480,8 +6563,8 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
6480 kvm_inject_pending_timer_irqs(vcpu); 6563 kvm_inject_pending_timer_irqs(vcpu);
6481 6564
6482 if (dm_request_for_irq_injection(vcpu)) { 6565 if (dm_request_for_irq_injection(vcpu)) {
6483 r = -EINTR; 6566 r = 0;
6484 vcpu->run->exit_reason = KVM_EXIT_INTR; 6567 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
6485 ++vcpu->stat.request_irq_exits; 6568 ++vcpu->stat.request_irq_exits;
6486 break; 6569 break;
6487 } 6570 }
@@ -6608,7 +6691,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
6608 } 6691 }
6609 6692
6610 /* re-sync apic's tpr */ 6693 /* re-sync apic's tpr */
6611 if (!irqchip_in_kernel(vcpu->kvm)) { 6694 if (!lapic_in_kernel(vcpu)) {
6612 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { 6695 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
6613 r = -EINVAL; 6696 r = -EINVAL;
6614 goto out; 6697 goto out;
@@ -7308,7 +7391,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
7308 7391
7309bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) 7392bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
7310{ 7393{
7311 return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); 7394 return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
7312} 7395}
7313 7396
7314struct static_key kvm_no_apic_vcpu __read_mostly; 7397struct static_key kvm_no_apic_vcpu __read_mostly;
@@ -7377,6 +7460,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
7377 kvm_async_pf_hash_reset(vcpu); 7460 kvm_async_pf_hash_reset(vcpu);
7378 kvm_pmu_init(vcpu); 7461 kvm_pmu_init(vcpu);
7379 7462
7463 vcpu->arch.pending_external_vector = -1;
7464
7380 return 0; 7465 return 0;
7381 7466
7382fail_free_mce_banks: 7467fail_free_mce_banks:
@@ -7402,7 +7487,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
7402 kvm_mmu_destroy(vcpu); 7487 kvm_mmu_destroy(vcpu);
7403 srcu_read_unlock(&vcpu->kvm->srcu, idx); 7488 srcu_read_unlock(&vcpu->kvm->srcu, idx);
7404 free_page((unsigned long)vcpu->arch.pio_data); 7489 free_page((unsigned long)vcpu->arch.pio_data);
7405 if (!irqchip_in_kernel(vcpu->kvm)) 7490 if (!lapic_in_kernel(vcpu))
7406 static_key_slow_dec(&kvm_no_apic_vcpu); 7491 static_key_slow_dec(&kvm_no_apic_vcpu);
7407} 7492}
7408 7493
@@ -8029,7 +8114,59 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
8029} 8114}
8030EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); 8115EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
8031 8116
8117int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
8118 struct irq_bypass_producer *prod)
8119{
8120 struct kvm_kernel_irqfd *irqfd =
8121 container_of(cons, struct kvm_kernel_irqfd, consumer);
8122
8123 if (kvm_x86_ops->update_pi_irte) {
8124 irqfd->producer = prod;
8125 return kvm_x86_ops->update_pi_irte(irqfd->kvm,
8126 prod->irq, irqfd->gsi, 1);
8127 }
8128
8129 return -EINVAL;
8130}
8131
8132void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
8133 struct irq_bypass_producer *prod)
8134{
8135 int ret;
8136 struct kvm_kernel_irqfd *irqfd =
8137 container_of(cons, struct kvm_kernel_irqfd, consumer);
8138
8139 if (!kvm_x86_ops->update_pi_irte) {
8140 WARN_ON(irqfd->producer != NULL);
8141 return;
8142 }
8143
8144 WARN_ON(irqfd->producer != prod);
8145 irqfd->producer = NULL;
8146
8147 /*
 8148 * When the producer or the consumer is unregistered, we change back to
 8149 * remapped mode, so we can re-use the current implementation
 8150 * when the irq is masked/disabled or the consumer side (KVM
 8151 * in this case) doesn't want to receive the interrupts.
8152 */
8153 ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
8154 if (ret)
8155 printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
8156 " fails: %d\n", irqfd->consumer.token, ret);
8157}
8158
8159int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
8160 uint32_t guest_irq, bool set)
8161{
8162 if (!kvm_x86_ops->update_pi_irte)
8163 return -EINVAL;
8164
8165 return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
8166}
8167
8032EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 8168EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
8169EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
8033EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 8170EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
8034EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 8171EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
8035EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 8172EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
@@ -8044,3 +8181,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
8044EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); 8181EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
8045EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); 8182EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
8046EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); 8183EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
8184EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 3d70e36c918e..3782636562a1 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -63,9 +63,6 @@ enum hv_cpuid_function {
63/* Define version of the synthetic interrupt controller. */ 63/* Define version of the synthetic interrupt controller. */
64#define HV_SYNIC_VERSION (1) 64#define HV_SYNIC_VERSION (1)
65 65
66/* Define the expected SynIC version. */
67#define HV_SYNIC_VERSION_1 (0x1)
68
69/* Define synthetic interrupt controller message constants. */ 66/* Define synthetic interrupt controller message constants. */
70#define HV_MESSAGE_SIZE (256) 67#define HV_MESSAGE_SIZE (256)
71#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) 68#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240)
@@ -105,8 +102,6 @@ enum hv_message_type {
105 HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 102 HVMSG_X64_LEGACY_FP_ERROR = 0x80010005
106}; 103};
107 104
108/* Define the number of synthetic interrupt sources. */
109#define HV_SYNIC_SINT_COUNT (16)
110#define HV_SYNIC_STIMER_COUNT (4) 105#define HV_SYNIC_STIMER_COUNT (4)
111 106
112/* Define invalid partition identifier. */ 107/* Define invalid partition identifier. */
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index 913455a5fd40..8adaaeae3268 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -22,7 +22,7 @@ int irq_remap_broken;
22int disable_sourceid_checking; 22int disable_sourceid_checking;
23int no_x2apic_optout; 23int no_x2apic_optout;
24 24
25int disable_irq_post = 1; 25int disable_irq_post = 0;
26 26
27static int disable_irq_remap; 27static int disable_irq_remap;
28static struct irq_remap_ops *remap_ops; 28static struct irq_remap_ops *remap_ops;
@@ -58,14 +58,18 @@ static __init int setup_irqremap(char *str)
58 return -EINVAL; 58 return -EINVAL;
59 59
60 while (*str) { 60 while (*str) {
61 if (!strncmp(str, "on", 2)) 61 if (!strncmp(str, "on", 2)) {
62 disable_irq_remap = 0; 62 disable_irq_remap = 0;
63 else if (!strncmp(str, "off", 3)) 63 disable_irq_post = 0;
64 } else if (!strncmp(str, "off", 3)) {
64 disable_irq_remap = 1; 65 disable_irq_remap = 1;
65 else if (!strncmp(str, "nosid", 5)) 66 disable_irq_post = 1;
67 } else if (!strncmp(str, "nosid", 5))
66 disable_sourceid_checking = 1; 68 disable_sourceid_checking = 1;
67 else if (!strncmp(str, "no_x2apic_optout", 16)) 69 else if (!strncmp(str, "no_x2apic_optout", 16))
68 no_x2apic_optout = 1; 70 no_x2apic_optout = 1;
71 else if (!strncmp(str, "nopost", 6))
72 disable_irq_post = 1;
69 73
70 str += strcspn(str, ","); 74 str += strcspn(str, ",");
71 while (*str == ',') 75 while (*str == ',')
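With disable_irq_post now defaulting to 0, interrupt posting simply follows the remapping setting: "on" enables both, "off" disables both, and the new "nosid"-style keyword "nopost" keeps remapping while disabling posting only. For example, booting with intremap=nopost would leave remapping active but turn posted interrupts off (the intremap= parameter name is assumed from the parser's registration, which is outside this hunk).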
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 454017928ed0..850d86ca685b 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -33,3 +33,4 @@ menuconfig VFIO
33 33
34source "drivers/vfio/pci/Kconfig" 34source "drivers/vfio/pci/Kconfig"
35source "drivers/vfio/platform/Kconfig" 35source "drivers/vfio/platform/Kconfig"
36source "virt/lib/Kconfig"
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 579d83bf5358..02912f180c6d 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -2,6 +2,7 @@ config VFIO_PCI
2 tristate "VFIO support for PCI devices" 2 tristate "VFIO support for PCI devices"
3 depends on VFIO && PCI && EVENTFD 3 depends on VFIO && PCI && EVENTFD
4 select VFIO_VIRQFD 4 select VFIO_VIRQFD
5 select IRQ_BYPASS_MANAGER
5 help 6 help
6 Support for the PCI VFIO bus driver. This is required to make 7 Support for the PCI VFIO bus driver. This is required to make
7 use of PCI drivers using the VFIO framework. 8 use of PCI drivers using the VFIO framework.
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 1f577b4ac126..3b3ba15558b7 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -319,6 +319,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
319 319
320 if (vdev->ctx[vector].trigger) { 320 if (vdev->ctx[vector].trigger) {
321 free_irq(irq, vdev->ctx[vector].trigger); 321 free_irq(irq, vdev->ctx[vector].trigger);
322 irq_bypass_unregister_producer(&vdev->ctx[vector].producer);
322 kfree(vdev->ctx[vector].name); 323 kfree(vdev->ctx[vector].name);
323 eventfd_ctx_put(vdev->ctx[vector].trigger); 324 eventfd_ctx_put(vdev->ctx[vector].trigger);
324 vdev->ctx[vector].trigger = NULL; 325 vdev->ctx[vector].trigger = NULL;
@@ -360,6 +361,14 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
360 return ret; 361 return ret;
361 } 362 }
362 363
364 vdev->ctx[vector].producer.token = trigger;
365 vdev->ctx[vector].producer.irq = irq;
366 ret = irq_bypass_register_producer(&vdev->ctx[vector].producer);
367 if (unlikely(ret))
368 dev_info(&pdev->dev,
369 "irq bypass producer (token %p) registration fails: %d\n",
370 vdev->ctx[vector].producer.token, ret);
371
363 vdev->ctx[vector].trigger = trigger; 372 vdev->ctx[vector].trigger = trigger;
364 373
365 return 0; 374 return 0;
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index ae0e1b4c1711..0e7394f8f69b 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -13,6 +13,7 @@
13 13
14#include <linux/mutex.h> 14#include <linux/mutex.h>
15#include <linux/pci.h> 15#include <linux/pci.h>
16#include <linux/irqbypass.h>
16 17
17#ifndef VFIO_PCI_PRIVATE_H 18#ifndef VFIO_PCI_PRIVATE_H
18#define VFIO_PCI_PRIVATE_H 19#define VFIO_PCI_PRIVATE_H
@@ -29,6 +30,7 @@ struct vfio_pci_irq_ctx {
29 struct virqfd *mask; 30 struct virqfd *mask;
30 char *name; 31 char *name;
31 bool masked; 32 bool masked;
33 struct irq_bypass_producer producer;
32}; 34};
33 35
34struct vfio_pci_device { 36struct vfio_pci_device {
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index e1e4d7c38dda..1800227af9d6 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -51,7 +51,7 @@ struct arch_timer_cpu {
51 bool armed; 51 bool armed;
52 52
53 /* Timer IRQ */ 53 /* Timer IRQ */
54 const struct kvm_irq_level *irq; 54 struct kvm_irq_level irq;
55 55
56 /* VGIC mapping */ 56 /* VGIC mapping */
57 struct irq_phys_map *map; 57 struct irq_phys_map *map;
@@ -71,5 +71,7 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
71int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); 71int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
72 72
73bool kvm_timer_should_fire(struct kvm_vcpu *vcpu); 73bool kvm_timer_should_fire(struct kvm_vcpu *vcpu);
74void kvm_timer_schedule(struct kvm_vcpu *vcpu);
75void kvm_timer_unschedule(struct kvm_vcpu *vcpu);
74 76
75#endif 77#endif
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 6a3538ef7275..9c747cb14ad8 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -112,7 +112,6 @@ struct vgic_vmcr {
112struct vgic_ops { 112struct vgic_ops {
113 struct vgic_lr (*get_lr)(const struct kvm_vcpu *, int); 113 struct vgic_lr (*get_lr)(const struct kvm_vcpu *, int);
114 void (*set_lr)(struct kvm_vcpu *, int, struct vgic_lr); 114 void (*set_lr)(struct kvm_vcpu *, int, struct vgic_lr);
115 void (*sync_lr_elrsr)(struct kvm_vcpu *, int, struct vgic_lr);
116 u64 (*get_elrsr)(const struct kvm_vcpu *vcpu); 115 u64 (*get_elrsr)(const struct kvm_vcpu *vcpu);
117 u64 (*get_eisr)(const struct kvm_vcpu *vcpu); 116 u64 (*get_eisr)(const struct kvm_vcpu *vcpu);
118 void (*clear_eisr)(struct kvm_vcpu *vcpu); 117 void (*clear_eisr)(struct kvm_vcpu *vcpu);
@@ -159,7 +158,6 @@ struct irq_phys_map {
159 u32 virt_irq; 158 u32 virt_irq;
160 u32 phys_irq; 159 u32 phys_irq;
161 u32 irq; 160 u32 irq;
162 bool active;
163}; 161};
164 162
165struct irq_phys_map_entry { 163struct irq_phys_map_entry {
@@ -296,22 +294,16 @@ struct vgic_v3_cpu_if {
296}; 294};
297 295
298struct vgic_cpu { 296struct vgic_cpu {
299 /* per IRQ to LR mapping */
300 u8 *vgic_irq_lr_map;
301
302 /* Pending/active/both interrupts on this VCPU */ 297 /* Pending/active/both interrupts on this VCPU */
303 DECLARE_BITMAP( pending_percpu, VGIC_NR_PRIVATE_IRQS); 298 DECLARE_BITMAP(pending_percpu, VGIC_NR_PRIVATE_IRQS);
304 DECLARE_BITMAP( active_percpu, VGIC_NR_PRIVATE_IRQS); 299 DECLARE_BITMAP(active_percpu, VGIC_NR_PRIVATE_IRQS);
305 DECLARE_BITMAP( pend_act_percpu, VGIC_NR_PRIVATE_IRQS); 300 DECLARE_BITMAP(pend_act_percpu, VGIC_NR_PRIVATE_IRQS);
306 301
307 /* Pending/active/both shared interrupts, dynamically sized */ 302 /* Pending/active/both shared interrupts, dynamically sized */
308 unsigned long *pending_shared; 303 unsigned long *pending_shared;
309 unsigned long *active_shared; 304 unsigned long *active_shared;
310 unsigned long *pend_act_shared; 305 unsigned long *pend_act_shared;
311 306
312 /* Bitmap of used/free list registers */
313 DECLARE_BITMAP( lr_used, VGIC_V2_MAX_LRS);
314
315 /* Number of list registers on this CPU */ 307 /* Number of list registers on this CPU */
316 int nr_lr; 308 int nr_lr;
317 309
@@ -354,8 +346,6 @@ int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
354struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, 346struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
355 int virt_irq, int irq); 347 int virt_irq, int irq);
356int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map); 348int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
357bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map);
358void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active);
359 349
360#define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) 350#define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel))
361#define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus)) 351#define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus))
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 54733d5b503e..8fdc17b84739 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -26,6 +26,7 @@
26#define _HYPERV_H 26#define _HYPERV_H
27 27
28#include <uapi/linux/hyperv.h> 28#include <uapi/linux/hyperv.h>
29#include <uapi/asm/hyperv.h>
29 30
30#include <linux/types.h> 31#include <linux/types.h>
31#include <linux/scatterlist.h> 32#include <linux/scatterlist.h>
diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h
new file mode 100644
index 000000000000..1551b5b2f4c2
--- /dev/null
+++ b/include/linux/irqbypass.h
@@ -0,0 +1,90 @@
1/*
2 * IRQ offload/bypass manager
3 *
4 * Copyright (C) 2015 Red Hat, Inc.
5 * Copyright (c) 2015 Linaro Ltd.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#ifndef IRQBYPASS_H
12#define IRQBYPASS_H
13
14#include <linux/list.h>
15
16struct irq_bypass_consumer;
17
18/*
19 * Theory of operation
20 *
21 * The IRQ bypass manager is a simple set of lists and callbacks that allows
22 * IRQ producers (ex. physical interrupt sources) to be matched to IRQ
23 * consumers (ex. virtualization hardware that allows IRQ bypass or offload)
24 * via a shared token (ex. eventfd_ctx). Producers and consumers register
25 * independently. When a token match is found, the optional @stop callback
26 * will be called for each participant. The pair will then be connected via
27 * the @add_* callbacks, and finally the optional @start callback will allow
28 * any final coordination. When either participant is unregistered, the
29 * process is repeated using the @del_* callbacks in place of the @add_*
 30 * callbacks. Match tokens must be unique per producer/consumer; 1:N pairings
31 * are not supported.
32 */
33
34/**
35 * struct irq_bypass_producer - IRQ bypass producer definition
36 * @node: IRQ bypass manager private list management
37 * @token: opaque token to match between producer and consumer
38 * @irq: Linux IRQ number for the producer device
39 * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional)
40 * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional)
41 * @stop: Perform any quiesce operations necessary prior to add/del (optional)
42 * @start: Perform any startup operations necessary after add/del (optional)
43 *
44 * The IRQ bypass producer structure represents an interrupt source for
45 * participation in possible host bypass, for instance an interrupt vector
46 * for a physical device assigned to a VM.
47 */
48struct irq_bypass_producer {
49 struct list_head node;
50 void *token;
51 int irq;
52 int (*add_consumer)(struct irq_bypass_producer *,
53 struct irq_bypass_consumer *);
54 void (*del_consumer)(struct irq_bypass_producer *,
55 struct irq_bypass_consumer *);
56 void (*stop)(struct irq_bypass_producer *);
57 void (*start)(struct irq_bypass_producer *);
58};
59
60/**
61 * struct irq_bypass_consumer - IRQ bypass consumer definition
62 * @node: IRQ bypass manager private list management
63 * @token: opaque token to match between producer and consumer
64 * @add_producer: Connect the IRQ consumer to an IRQ producer
65 * @del_producer: Disconnect the IRQ consumer from an IRQ producer
66 * @stop: Perform any quiesce operations necessary prior to add/del (optional)
67 * @start: Perform any startup operations necessary after add/del (optional)
68 *
69 * The IRQ bypass consumer structure represents an interrupt sink for
 70 * participation in possible host bypass; for instance, a hypervisor may
71 * support offloads to allow bypassing the host entirely or offload
72 * portions of the interrupt handling to the VM.
73 */
74struct irq_bypass_consumer {
75 struct list_head node;
76 void *token;
77 int (*add_producer)(struct irq_bypass_consumer *,
78 struct irq_bypass_producer *);
79 void (*del_producer)(struct irq_bypass_consumer *,
80 struct irq_bypass_producer *);
81 void (*stop)(struct irq_bypass_consumer *);
82 void (*start)(struct irq_bypass_consumer *);
83};
84
85int irq_bypass_register_producer(struct irq_bypass_producer *);
86void irq_bypass_unregister_producer(struct irq_bypass_producer *);
87int irq_bypass_register_consumer(struct irq_bypass_consumer *);
88void irq_bypass_unregister_consumer(struct irq_bypass_consumer *);
89
90#endif /* IRQBYPASS_H */
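To make the pairing described in the "Theory of operation" comment concrete, here is a rough sketch of the registration pattern the manager expects. The callbacks, token, and IRQ number are hypothetical stand-ins; in this series the producer side is VFIO's MSI vector and the consumer side is KVM's irqfd, both keyed by the irqfd's eventfd context.

#include <linux/irqbypass.h>

/* Hypothetical consumer callbacks; a real consumer (e.g. KVM) wires these
 * to its posted-interrupt or forwarding machinery. */
static int  demo_add_producer(struct irq_bypass_consumer *cons,
                              struct irq_bypass_producer *prod) { return 0; }
static void demo_del_producer(struct irq_bypass_consumer *cons,
                              struct irq_bypass_producer *prod) { }

static struct irq_bypass_producer demo_prod;
static struct irq_bypass_consumer demo_cons = {
        .add_producer = demo_add_producer,
        .del_producer = demo_del_producer,
};

/* Both sides register independently; the manager connects them when the
 * opaque tokens match (here the same eventfd_ctx-style pointer). */
static void demo_connect(void *token, int host_irq)
{
        demo_prod.token = token;
        demo_prod.irq   = host_irq;
        demo_cons.token = token;

        WARN_ON(irq_bypass_register_producer(&demo_prod));
        WARN_ON(irq_bypass_register_consumer(&demo_cons));
}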
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1bef9e21e725..242a6d2b53ff 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -24,6 +24,7 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/irqflags.h> 25#include <linux/irqflags.h>
26#include <linux/context_tracking.h> 26#include <linux/context_tracking.h>
27#include <linux/irqbypass.h>
27#include <asm/signal.h> 28#include <asm/signal.h>
28 29
29#include <linux/kvm.h> 30#include <linux/kvm.h>
@@ -140,6 +141,8 @@ static inline bool is_error_page(struct page *page)
140#define KVM_REQ_APIC_PAGE_RELOAD 25 141#define KVM_REQ_APIC_PAGE_RELOAD 25
141#define KVM_REQ_SMI 26 142#define KVM_REQ_SMI 26
142#define KVM_REQ_HV_CRASH 27 143#define KVM_REQ_HV_CRASH 27
144#define KVM_REQ_IOAPIC_EOI_EXIT 28
145#define KVM_REQ_HV_RESET 29
143 146
144#define KVM_USERSPACE_IRQ_SOURCE_ID 0 147#define KVM_USERSPACE_IRQ_SOURCE_ID 0
145#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 148#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
@@ -231,6 +234,9 @@ struct kvm_vcpu {
231 unsigned long requests; 234 unsigned long requests;
232 unsigned long guest_debug; 235 unsigned long guest_debug;
233 236
237 int pre_pcpu;
238 struct list_head blocked_vcpu_list;
239
234 struct mutex mutex; 240 struct mutex mutex;
235 struct kvm_run *run; 241 struct kvm_run *run;
236 242
@@ -329,6 +335,18 @@ struct kvm_kernel_irq_routing_entry {
329 struct hlist_node link; 335 struct hlist_node link;
330}; 336};
331 337
338#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
339struct kvm_irq_routing_table {
340 int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
341 u32 nr_rt_entries;
342 /*
343 * Array indexed by gsi. Each entry contains list of irq chips
344 * the gsi is connected to.
345 */
346 struct hlist_head map[0];
347};
348#endif
349
332#ifndef KVM_PRIVATE_MEM_SLOTS 350#ifndef KVM_PRIVATE_MEM_SLOTS
333#define KVM_PRIVATE_MEM_SLOTS 0 351#define KVM_PRIVATE_MEM_SLOTS 0
334#endif 352#endif
@@ -455,10 +473,14 @@ void vcpu_put(struct kvm_vcpu *vcpu);
455 473
456#ifdef __KVM_HAVE_IOAPIC 474#ifdef __KVM_HAVE_IOAPIC
457void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); 475void kvm_vcpu_request_scan_ioapic(struct kvm *kvm);
476void kvm_arch_irq_routing_update(struct kvm *kvm);
458#else 477#else
459static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) 478static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
460{ 479{
461} 480}
481static inline void kvm_arch_irq_routing_update(struct kvm *kvm)
482{
483}
462#endif 484#endif
463 485
464#ifdef CONFIG_HAVE_KVM_IRQFD 486#ifdef CONFIG_HAVE_KVM_IRQFD
@@ -625,6 +647,8 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
625void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); 647void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
626 648
627void kvm_vcpu_block(struct kvm_vcpu *vcpu); 649void kvm_vcpu_block(struct kvm_vcpu *vcpu);
650void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu);
651void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu);
628void kvm_vcpu_kick(struct kvm_vcpu *vcpu); 652void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
629int kvm_vcpu_yield_to(struct kvm_vcpu *target); 653int kvm_vcpu_yield_to(struct kvm_vcpu *target);
630void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); 654void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
@@ -803,10 +827,13 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin);
803 827
804int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, 828int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
805 bool line_status); 829 bool line_status);
806int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level);
807int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, 830int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
808 int irq_source_id, int level, bool line_status); 831 int irq_source_id, int level, bool line_status);
832int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
833 struct kvm *kvm, int irq_source_id,
834 int level, bool line_status);
809bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); 835bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
836void kvm_notify_acked_gsi(struct kvm *kvm, int gsi);
810void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); 837void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
811void kvm_register_irq_ack_notifier(struct kvm *kvm, 838void kvm_register_irq_ack_notifier(struct kvm *kvm,
812 struct kvm_irq_ack_notifier *kian); 839 struct kvm_irq_ack_notifier *kian);
@@ -1002,6 +1029,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
1002#endif 1029#endif
1003 1030
1004int kvm_setup_default_irq_routing(struct kvm *kvm); 1031int kvm_setup_default_irq_routing(struct kvm *kvm);
1032int kvm_setup_empty_irq_routing(struct kvm *kvm);
1005int kvm_set_irq_routing(struct kvm *kvm, 1033int kvm_set_irq_routing(struct kvm *kvm,
1006 const struct kvm_irq_routing_entry *entries, 1034 const struct kvm_irq_routing_entry *entries,
1007 unsigned nr, 1035 unsigned nr,
@@ -1144,5 +1172,15 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
1144{ 1172{
1145} 1173}
1146#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ 1174#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
1147#endif
1148 1175
1176#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
1177int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *,
1178 struct irq_bypass_producer *);
1179void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *,
1180 struct irq_bypass_producer *);
1181void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *);
1182void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
1183int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
1184 uint32_t guest_irq, bool set);
1185#endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
1186#endif
diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
new file mode 100644
index 000000000000..0c1de05098c8
--- /dev/null
+++ b/include/linux/kvm_irqfd.h
@@ -0,0 +1,71 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * irqfd: Allows an fd to be used to inject an interrupt to the guest
12 * Credit goes to Avi Kivity for the original idea.
13 */
14
15#ifndef __LINUX_KVM_IRQFD_H
16#define __LINUX_KVM_IRQFD_H
17
18#include <linux/kvm_host.h>
19#include <linux/poll.h>
20
21/*
22 * Resampling irqfds are a special variety of irqfds used to emulate
23 * level triggered interrupts. The interrupt is asserted on eventfd
24 * trigger. On acknowledgment through the irq ack notifier, the
25 * interrupt is de-asserted and userspace is notified through the
26 * resamplefd. All resamplers on the same gsi are de-asserted
27 * together, so we don't need to track the state of each individual
28 * user. We can also therefore share the same irq source ID.
29 */
30struct kvm_kernel_irqfd_resampler {
31 struct kvm *kvm;
32 /*
33 * List of resampling struct _irqfd objects sharing this gsi.
34 * RCU list modified under kvm->irqfds.resampler_lock
35 */
36 struct list_head list;
37 struct kvm_irq_ack_notifier notifier;
38 /*
39 * Entry in list of kvm->irqfd.resampler_list. Use for sharing
40 * resamplers among irqfds on the same gsi.
41 * Accessed and modified under kvm->irqfds.resampler_lock
42 */
43 struct list_head link;
44};
45
46struct kvm_kernel_irqfd {
47 /* Used for MSI fast-path */
48 struct kvm *kvm;
49 wait_queue_t wait;
50 /* Update side is protected by irqfds.lock */
51 struct kvm_kernel_irq_routing_entry irq_entry;
52 seqcount_t irq_entry_sc;
53 /* Used for level IRQ fast-path */
54 int gsi;
55 struct work_struct inject;
56 /* The resampler used by this irqfd (resampler-only) */
57 struct kvm_kernel_irqfd_resampler *resampler;
58 /* Eventfd notified on resample (resampler-only) */
59 struct eventfd_ctx *resamplefd;
60 /* Entry in list of irqfds for a resampler (resampler-only) */
61 struct list_head resampler_link;
62 /* Used for setup/shutdown */
63 struct eventfd_ctx *eventfd;
64 struct list_head list;
65 poll_table pt;
66 struct work_struct shutdown;
67 struct irq_bypass_consumer consumer;
68 struct irq_bypass_producer *producer;
69};
70
71#endif /* __LINUX_KVM_IRQFD_H */
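The resampling flow described in the comment above is driven from userspace through the existing KVM_IRQFD ioctl. A minimal sketch, assuming an open VM fd and an arbitrary GSI (both assumptions, not part of this patch):

#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: attach a level-triggered GSI to a resampling irqfd.  Writing to
 * trigger_fd asserts the interrupt; resample_fd is signalled on guest EOI,
 * so the device model can decide whether to re-assert the line. */
static int add_resampling_irqfd(int vm_fd, unsigned int gsi,
                                int *trigger_fd, int *resample_fd)
{
        struct kvm_irqfd irqfd = { 0 };

        *trigger_fd  = eventfd(0, EFD_CLOEXEC);
        *resample_fd = eventfd(0, EFD_CLOEXEC);

        irqfd.fd         = *trigger_fd;
        irqfd.gsi        = gsi;
        irqfd.flags      = KVM_IRQFD_FLAG_RESAMPLE;
        irqfd.resamplefd = *resample_fd;

        return ioctl(vm_fd, KVM_IRQFD, &irqfd);
}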
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a9256f0331ae..03f3618612aa 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -183,6 +183,7 @@ struct kvm_s390_skeys {
183#define KVM_EXIT_EPR 23 183#define KVM_EXIT_EPR 23
184#define KVM_EXIT_SYSTEM_EVENT 24 184#define KVM_EXIT_SYSTEM_EVENT 24
185#define KVM_EXIT_S390_STSI 25 185#define KVM_EXIT_S390_STSI 25
186#define KVM_EXIT_IOAPIC_EOI 26
186 187
187/* For KVM_EXIT_INTERNAL_ERROR */ 188/* For KVM_EXIT_INTERNAL_ERROR */
188/* Emulate instruction failed. */ 189/* Emulate instruction failed. */
@@ -333,6 +334,10 @@ struct kvm_run {
333 __u8 sel1; 334 __u8 sel1;
334 __u16 sel2; 335 __u16 sel2;
335 } s390_stsi; 336 } s390_stsi;
337 /* KVM_EXIT_IOAPIC_EOI */
338 struct {
339 __u8 vector;
340 } eoi;
336 /* Fix the size of the union. */ 341 /* Fix the size of the union. */
337 char padding[256]; 342 char padding[256];
338 }; 343 };
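A VMM using the split irqchip is expected to consume this exit by forwarding the vector to its userspace IOAPIC model, which can then clear Remote IRR and resample the level-triggered line. A rough sketch of the run-loop fragment; handle_ioapic_eoi() is an assumed userspace helper, not part of this patch:

/* 'run' is the mmap'ed struct kvm_run shared with KVM_RUN. */
switch (run->exit_reason) {
case KVM_EXIT_IOAPIC_EOI:
        /* Guest EOI'd an IOAPIC-routed vector: hand it to the userspace
         * IOAPIC so it can resample the level-triggered interrupt. */
        handle_ioapic_eoi(run->eoi.vector);
        break;
        /* ... other exit reasons ... */
}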
@@ -824,6 +829,8 @@ struct kvm_ppc_smmu_info {
824#define KVM_CAP_MULTI_ADDRESS_SPACE 118 829#define KVM_CAP_MULTI_ADDRESS_SPACE 118
825#define KVM_CAP_GUEST_DEBUG_HW_BPS 119 830#define KVM_CAP_GUEST_DEBUG_HW_BPS 119
826#define KVM_CAP_GUEST_DEBUG_HW_WPS 120 831#define KVM_CAP_GUEST_DEBUG_HW_WPS 120
832#define KVM_CAP_SPLIT_IRQCHIP 121
833#define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
827 834
828#ifdef KVM_CAP_IRQ_ROUTING 835#ifdef KVM_CAP_IRQ_ROUTING
829 836
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8cbc3db671df..26a54461bf59 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
444 *ut = p->utime; 444 *ut = p->utime;
445 *st = p->stime; 445 *st = p->stime;
446} 446}
447EXPORT_SYMBOL_GPL(task_cputime_adjusted);
447 448
448void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 449void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
449{ 450{
@@ -652,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
652 task_cputime(p, &cputime.utime, &cputime.stime); 653 task_cputime(p, &cputime.utime, &cputime.stime);
653 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 654 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
654} 655}
656EXPORT_SYMBOL_GPL(task_cputime_adjusted);
655 657
656void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 658void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
657{ 659{
diff --git a/virt/Makefile b/virt/Makefile
new file mode 100644
index 000000000000..be783472ac81
--- /dev/null
+++ b/virt/Makefile
@@ -0,0 +1 @@
1obj-y += lib/
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index e2c876d5a03b..7a79b6853583 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -46,4 +46,7 @@ config KVM_GENERIC_DIRTYLOG_READ_PROTECT
46 46
47config KVM_COMPAT 47config KVM_COMPAT
48 def_bool y 48 def_bool y
49 depends on COMPAT && !S390 49 depends on KVM && COMPAT && !S390
50
51config HAVE_KVM_IRQ_BYPASS
52 bool
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index b9d3a32cbc04..21a0ab2d8919 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -28,6 +28,8 @@
28#include <kvm/arm_vgic.h> 28#include <kvm/arm_vgic.h>
29#include <kvm/arm_arch_timer.h> 29#include <kvm/arm_arch_timer.h>
30 30
31#include "trace.h"
32
31static struct timecounter *timecounter; 33static struct timecounter *timecounter;
32static struct workqueue_struct *wqueue; 34static struct workqueue_struct *wqueue;
33static unsigned int host_vtimer_irq; 35static unsigned int host_vtimer_irq;
@@ -59,18 +61,6 @@ static void timer_disarm(struct arch_timer_cpu *timer)
59 } 61 }
60} 62}
61 63
62static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
63{
64 int ret;
65 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
66
67 kvm_vgic_set_phys_irq_active(timer->map, true);
68 ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
69 timer->map,
70 timer->irq->level);
71 WARN_ON(ret);
72}
73
74static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) 64static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
75{ 65{
76 struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; 66 struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
@@ -111,14 +101,20 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
111 return HRTIMER_NORESTART; 101 return HRTIMER_NORESTART;
112} 102}
113 103
104static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu)
105{
106 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
107
108 return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
109 (timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE);
110}
111
114bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) 112bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
115{ 113{
116 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 114 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
117 cycle_t cval, now; 115 cycle_t cval, now;
118 116
119 if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) || 117 if (!kvm_timer_irq_can_fire(vcpu))
120 !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) ||
121 kvm_vgic_get_phys_irq_active(timer->map))
122 return false; 118 return false;
123 119
124 cval = timer->cntv_cval; 120 cval = timer->cntv_cval;
@@ -127,12 +123,94 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
127 return cval <= now; 123 return cval <= now;
128} 124}
129 125
126static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level)
127{
128 int ret;
129 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
130
131 BUG_ON(!vgic_initialized(vcpu->kvm));
132
133 timer->irq.level = new_level;
134 trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->map->virt_irq,
135 timer->irq.level);
136 ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
137 timer->map,
138 timer->irq.level);
139 WARN_ON(ret);
140}
141
142/*
143 * Check if there was a change in the timer state (should we raise or lower
144 * the line level to the GIC).
145 */
146static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
147{
148 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
149
150 /*
151 * If userspace modified the timer registers via SET_ONE_REG before
152 * the vgic was initialized, we mustn't set the timer->irq.level value
153 * because the guest would never see the interrupt. Instead wait
154 * until we call this function from kvm_timer_flush_hwstate.
155 */
156 if (!vgic_initialized(vcpu->kvm))
157 return;
158
159 if (kvm_timer_should_fire(vcpu) != timer->irq.level)
160 kvm_timer_update_irq(vcpu, !timer->irq.level);
161}
162
163/*
164 * Schedule the background timer before calling kvm_vcpu_block, so that this
165 * thread is removed from its waitqueue and made runnable when there's a timer
166 * interrupt to handle.
167 */
168void kvm_timer_schedule(struct kvm_vcpu *vcpu)
169{
170 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
171 u64 ns;
172 cycle_t cval, now;
173
174 BUG_ON(timer_is_armed(timer));
175
176 /*
177 * No need to schedule a background timer if the guest timer has
178 * already expired, because kvm_vcpu_block will return before putting
179 * the thread to sleep.
180 */
181 if (kvm_timer_should_fire(vcpu))
182 return;
183
184 /*
185 * If the timer is not capable of raising interrupts (disabled or
186 * masked), then there's no more work for us to do.
187 */
188 if (!kvm_timer_irq_can_fire(vcpu))
189 return;
190
191 /* The timer has not yet expired, schedule a background timer */
192 cval = timer->cntv_cval;
193 now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
194
195 ns = cyclecounter_cyc2ns(timecounter->cc,
196 cval - now,
197 timecounter->mask,
198 &timecounter->frac);
199 timer_arm(timer, ns);
200}
201
202void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
203{
204 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
205 timer_disarm(timer);
206}
207
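These two hooks are meant to bracket kvm_vcpu_block(): the generic code calls kvm_arch_vcpu_blocking()/kvm_arch_vcpu_unblocking() (declared in kvm_host.h later in this patch) around the wait, and on ARM they are expected to reduce to the following. This is a sketch of how the rest of the series wires it up, not part of this hunk:

/* Sketch of the arch/arm side: only the timer needs the background
 * hrtimer while the vCPU sleeps on its waitqueue. */
void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
        kvm_timer_schedule(vcpu);
}

void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
        kvm_timer_unschedule(vcpu);
}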
130/** 208/**
131 * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu 209 * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu
132 * @vcpu: The vcpu pointer 210 * @vcpu: The vcpu pointer
133 * 211 *
134 * Disarm any pending soft timers, since the world-switch code will write the 212 * Check if the virtual timer has expired while we were running in the host,
135 * virtual timer state back to the physical CPU. 213 * and inject an interrupt if that was the case.
136 */ 214 */
137void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) 215void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
138{ 216{
@@ -140,28 +218,20 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
140 bool phys_active; 218 bool phys_active;
141 int ret; 219 int ret;
142 220
143 /* 221 kvm_timer_update_state(vcpu);
144 * We're about to run this vcpu again, so there is no need to
145 * keep the background timer running, as we're about to
146 * populate the CPU timer again.
147 */
148 timer_disarm(timer);
149 222
150 /* 223 /*
151 * If the timer expired while we were not scheduled, now is the time 224 * If we enter the guest with the virtual input level to the VGIC
152 * to inject it. 225 * asserted, then we have already told the VGIC what we need to, and
226 * we don't need to exit from the guest until the guest deactivates
 227 * the already injected interrupt, so we should set the
228 * hardware active state to prevent unnecessary exits from the guest.
229 *
230 * Conversely, if the virtual input level is deasserted, then always
231 * clear the hardware active state to ensure that hardware interrupts
232 * from the timer triggers a guest exit.
153 */ 233 */
154 if (kvm_timer_should_fire(vcpu)) 234 if (timer->irq.level)
155 kvm_timer_inject_irq(vcpu);
156
157 /*
158 * We keep track of whether the edge-triggered interrupt has been
159 * signalled to the vgic/guest, and if so, we mask the interrupt and
160 * the physical distributor to prevent the timer from raising a
161 * physical interrupt whenever we run a guest, preventing forward
162 * VCPU progress.
163 */
164 if (kvm_vgic_get_phys_irq_active(timer->map))
165 phys_active = true; 235 phys_active = true;
166 else 236 else
167 phys_active = false; 237 phys_active = false;
@@ -176,32 +246,20 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
176 * kvm_timer_sync_hwstate - sync timer state from cpu 246 * kvm_timer_sync_hwstate - sync timer state from cpu
177 * @vcpu: The vcpu pointer 247 * @vcpu: The vcpu pointer
178 * 248 *
179 * Check if the virtual timer was armed and either schedule a corresponding 249 * Check if the virtual timer has expired while we were running in the guest,
180 * soft timer or inject directly if already expired. 250 * and inject an interrupt if that was the case.
181 */ 251 */
182void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) 252void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
183{ 253{
184 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 254 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
185 cycle_t cval, now;
186 u64 ns;
187 255
188 BUG_ON(timer_is_armed(timer)); 256 BUG_ON(timer_is_armed(timer));
189 257
190 if (kvm_timer_should_fire(vcpu)) { 258 /*
191 /* 259 * The guest could have modified the timer registers or the timer
192 * Timer has already expired while we were not 260 * could have expired, update the timer state.
193 * looking. Inject the interrupt and carry on. 261 */
194 */ 262 kvm_timer_update_state(vcpu);
195 kvm_timer_inject_irq(vcpu);
196 return;
197 }
198
199 cval = timer->cntv_cval;
200 now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
201
202 ns = cyclecounter_cyc2ns(timecounter->cc, cval - now, timecounter->mask,
203 &timecounter->frac);
204 timer_arm(timer, ns);
205} 263}
206 264
207int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, 265int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
@@ -216,7 +274,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
216 * kvm_vcpu_set_target(). To handle this, we determine 274 * kvm_vcpu_set_target(). To handle this, we determine
217 * vcpu timer irq number when the vcpu is reset. 275 * vcpu timer irq number when the vcpu is reset.
218 */ 276 */
219 timer->irq = irq; 277 timer->irq.irq = irq->irq;
220 278
221 /* 279 /*
222 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 280 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
@@ -225,6 +283,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
225 * the ARMv7 architecture. 283 * the ARMv7 architecture.
226 */ 284 */
227 timer->cntv_ctl = 0; 285 timer->cntv_ctl = 0;
286 kvm_timer_update_state(vcpu);
228 287
229 /* 288 /*
230 * Tell the VGIC that the virtual interrupt is tied to a 289 * Tell the VGIC that the virtual interrupt is tied to a
@@ -269,6 +328,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
269 default: 328 default:
270 return -1; 329 return -1;
271 } 330 }
331
332 kvm_timer_update_state(vcpu);
272 return 0; 333 return 0;
273} 334}
274 335
diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h
new file mode 100644
index 000000000000..37d8b98867d5
--- /dev/null
+++ b/virt/kvm/arm/trace.h
@@ -0,0 +1,63 @@
1#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
2#define _TRACE_KVM_H
3
4#include <linux/tracepoint.h>
5
6#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm
8
9/*
10 * Tracepoints for vgic
11 */
12TRACE_EVENT(vgic_update_irq_pending,
13 TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level),
14 TP_ARGS(vcpu_id, irq, level),
15
16 TP_STRUCT__entry(
17 __field( unsigned long, vcpu_id )
18 __field( __u32, irq )
19 __field( bool, level )
20 ),
21
22 TP_fast_assign(
23 __entry->vcpu_id = vcpu_id;
24 __entry->irq = irq;
25 __entry->level = level;
26 ),
27
28 TP_printk("VCPU: %ld, IRQ %d, level: %d",
29 __entry->vcpu_id, __entry->irq, __entry->level)
30);
31
32/*
33 * Tracepoints for arch_timer
34 */
35TRACE_EVENT(kvm_timer_update_irq,
36 TP_PROTO(unsigned long vcpu_id, __u32 irq, int level),
37 TP_ARGS(vcpu_id, irq, level),
38
39 TP_STRUCT__entry(
40 __field( unsigned long, vcpu_id )
41 __field( __u32, irq )
42 __field( int, level )
43 ),
44
45 TP_fast_assign(
46 __entry->vcpu_id = vcpu_id;
47 __entry->irq = irq;
48 __entry->level = level;
49 ),
50
51 TP_printk("VCPU: %ld, IRQ %d, level %d",
52 __entry->vcpu_id, __entry->irq, __entry->level)
53);
54
55#endif /* _TRACE_KVM_H */
56
57#undef TRACE_INCLUDE_PATH
58#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm
59#undef TRACE_INCLUDE_FILE
60#define TRACE_INCLUDE_FILE trace
61
62/* This part must be outside protection */
63#include <trace/define_trace.h>
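Both tracepoints land in the existing kvm trace system, so they can be enabled at runtime in the usual way, e.g. via tracefs with echo 1 > /sys/kernel/debug/tracing/events/kvm/kvm_timer_update_irq/enable (the path assumes a debugfs-mounted tracefs).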
diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
index 8d7b04db8471..ff02f08df74d 100644
--- a/virt/kvm/arm/vgic-v2.c
+++ b/virt/kvm/arm/vgic-v2.c
@@ -79,11 +79,7 @@ static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
79 lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT); 79 lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
80 80
81 vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val; 81 vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
82}
83 82
84static void vgic_v2_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
85 struct vgic_lr lr_desc)
86{
87 if (!(lr_desc.state & LR_STATE_MASK)) 83 if (!(lr_desc.state & LR_STATE_MASK))
88 vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr); 84 vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr);
89 else 85 else
@@ -158,6 +154,7 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu)
158 * anyway. 154 * anyway.
159 */ 155 */
160 vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; 156 vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
157 vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;
161 158
162 /* Get the show on the road... */ 159 /* Get the show on the road... */
163 vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; 160 vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
@@ -166,7 +163,6 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu)
166static const struct vgic_ops vgic_v2_ops = { 163static const struct vgic_ops vgic_v2_ops = {
167 .get_lr = vgic_v2_get_lr, 164 .get_lr = vgic_v2_get_lr,
168 .set_lr = vgic_v2_set_lr, 165 .set_lr = vgic_v2_set_lr,
169 .sync_lr_elrsr = vgic_v2_sync_lr_elrsr,
170 .get_elrsr = vgic_v2_get_elrsr, 166 .get_elrsr = vgic_v2_get_elrsr,
171 .get_eisr = vgic_v2_get_eisr, 167 .get_eisr = vgic_v2_get_eisr,
172 .clear_eisr = vgic_v2_clear_eisr, 168 .clear_eisr = vgic_v2_clear_eisr,
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
index 7dd5d62f10a1..487d6357b7e7 100644
--- a/virt/kvm/arm/vgic-v3.c
+++ b/virt/kvm/arm/vgic-v3.c
@@ -112,11 +112,7 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
112 } 112 }
113 113
114 vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val; 114 vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val;
115}
116 115
117static void vgic_v3_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
118 struct vgic_lr lr_desc)
119{
120 if (!(lr_desc.state & LR_STATE_MASK)) 116 if (!(lr_desc.state & LR_STATE_MASK))
121 vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr); 117 vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr);
122 else 118 else
@@ -193,6 +189,7 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu)
193 * anyway. 189 * anyway.
194 */ 190 */
195 vgic_v3->vgic_vmcr = 0; 191 vgic_v3->vgic_vmcr = 0;
192 vgic_v3->vgic_elrsr = ~0;
196 193
197 /* 194 /*
198 * If we are emulating a GICv3, we do it in an non-GICv2-compatible 195 * If we are emulating a GICv3, we do it in an non-GICv2-compatible
@@ -211,7 +208,6 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu)
211static const struct vgic_ops vgic_v3_ops = { 208static const struct vgic_ops vgic_v3_ops = {
212 .get_lr = vgic_v3_get_lr, 209 .get_lr = vgic_v3_get_lr,
213 .set_lr = vgic_v3_set_lr, 210 .set_lr = vgic_v3_set_lr,
214 .sync_lr_elrsr = vgic_v3_sync_lr_elrsr,
215 .get_elrsr = vgic_v3_get_elrsr, 211 .get_elrsr = vgic_v3_get_elrsr,
216 .get_eisr = vgic_v3_get_eisr, 212 .get_eisr = vgic_v3_get_eisr,
217 .clear_eisr = vgic_v3_clear_eisr, 213 .clear_eisr = vgic_v3_clear_eisr,
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 30489181922d..533538385d5d 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -34,6 +34,9 @@
34#include <asm/kvm.h> 34#include <asm/kvm.h>
35#include <kvm/iodev.h> 35#include <kvm/iodev.h>
36 36
37#define CREATE_TRACE_POINTS
38#include "trace.h"
39
37/* 40/*
38 * How the whole thing works (courtesy of Christoffer Dall): 41 * How the whole thing works (courtesy of Christoffer Dall):
39 * 42 *
@@ -102,11 +105,13 @@
102#include "vgic.h" 105#include "vgic.h"
103 106
104static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); 107static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
105static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu); 108static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu);
106static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr); 109static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
107static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc); 110static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
111static u64 vgic_get_elrsr(struct kvm_vcpu *vcpu);
108static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu, 112static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
109 int virt_irq); 113 int virt_irq);
114static int compute_pending_for_cpu(struct kvm_vcpu *vcpu);
110 115
111static const struct vgic_ops *vgic_ops; 116static const struct vgic_ops *vgic_ops;
112static const struct vgic_params *vgic; 117static const struct vgic_params *vgic;
@@ -357,6 +362,11 @@ static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq)
357 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 362 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
358 363
359 vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0); 364 vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0);
365 if (!vgic_dist_irq_get_level(vcpu, irq)) {
366 vgic_dist_irq_clear_pending(vcpu, irq);
367 if (!compute_pending_for_cpu(vcpu))
368 clear_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
369 }
360} 370}
361 371
362static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) 372static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
@@ -531,34 +541,6 @@ bool vgic_handle_set_pending_reg(struct kvm *kvm,
531 return false; 541 return false;
532} 542}
533 543
534/*
535 * If a mapped interrupt's state has been modified by the guest such that it
536 * is no longer active or pending, without it have gone through the sync path,
537 * then the map->active field must be cleared so the interrupt can be taken
538 * again.
539 */
540static void vgic_handle_clear_mapped_irq(struct kvm_vcpu *vcpu)
541{
542 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
543 struct list_head *root;
544 struct irq_phys_map_entry *entry;
545 struct irq_phys_map *map;
546
547 rcu_read_lock();
548
549 /* Check for PPIs */
550 root = &vgic_cpu->irq_phys_map_list;
551 list_for_each_entry_rcu(entry, root, entry) {
552 map = &entry->map;
553
554 if (!vgic_dist_irq_is_pending(vcpu, map->virt_irq) &&
555 !vgic_irq_is_active(vcpu, map->virt_irq))
556 map->active = false;
557 }
558
559 rcu_read_unlock();
560}
561
562bool vgic_handle_clear_pending_reg(struct kvm *kvm, 544bool vgic_handle_clear_pending_reg(struct kvm *kvm,
563 struct kvm_exit_mmio *mmio, 545 struct kvm_exit_mmio *mmio,
564 phys_addr_t offset, int vcpu_id) 546 phys_addr_t offset, int vcpu_id)
@@ -589,7 +571,6 @@ bool vgic_handle_clear_pending_reg(struct kvm *kvm,
589 vcpu_id, offset); 571 vcpu_id, offset);
590 vgic_reg_access(mmio, reg, offset, mode); 572 vgic_reg_access(mmio, reg, offset, mode);
591 573
592 vgic_handle_clear_mapped_irq(kvm_get_vcpu(kvm, vcpu_id));
593 vgic_update_state(kvm); 574 vgic_update_state(kvm);
594 return true; 575 return true;
595 } 576 }
@@ -627,7 +608,6 @@ bool vgic_handle_clear_active_reg(struct kvm *kvm,
627 ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); 608 ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
628 609
629 if (mmio->is_write) { 610 if (mmio->is_write) {
630 vgic_handle_clear_mapped_irq(kvm_get_vcpu(kvm, vcpu_id));
631 vgic_update_state(kvm); 611 vgic_update_state(kvm);
632 return true; 612 return true;
633 } 613 }
@@ -684,10 +664,9 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
684 vgic_reg_access(mmio, &val, offset, 664 vgic_reg_access(mmio, &val, offset,
685 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); 665 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
686 if (mmio->is_write) { 666 if (mmio->is_write) {
687 if (offset < 8) { 667 /* Ignore writes to read-only SGI and PPI bits */
688 *reg = ~0U; /* Force PPIs/SGIs to 1 */ 668 if (offset < 8)
689 return false; 669 return false;
690 }
691 670
692 val = vgic_cfg_compress(val); 671 val = vgic_cfg_compress(val);
693 if (offset & 4) { 672 if (offset & 4) {
@@ -713,9 +692,11 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
713void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) 692void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
714{ 693{
715 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 694 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
695 u64 elrsr = vgic_get_elrsr(vcpu);
696 unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
716 int i; 697 int i;
717 698
718 for_each_set_bit(i, vgic_cpu->lr_used, vgic_cpu->nr_lr) { 699 for_each_clear_bit(i, elrsr_ptr, vgic_cpu->nr_lr) {
719 struct vgic_lr lr = vgic_get_lr(vcpu, i); 700 struct vgic_lr lr = vgic_get_lr(vcpu, i);
720 701
721 /* 702 /*
@@ -736,30 +717,14 @@ void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
736 * interrupt then move the active state to the 717 * interrupt then move the active state to the
737 * distributor tracking bit. 718 * distributor tracking bit.
738 */ 719 */
739 if (lr.state & LR_STATE_ACTIVE) { 720 if (lr.state & LR_STATE_ACTIVE)
740 vgic_irq_set_active(vcpu, lr.irq); 721 vgic_irq_set_active(vcpu, lr.irq);
741 lr.state &= ~LR_STATE_ACTIVE;
742 }
743 722
744 /* 723 /*
745 * Reestablish the pending state on the distributor and the 724 * Reestablish the pending state on the distributor and the
746 * CPU interface. It may have already been pending, but that 725 * CPU interface and mark the LR as free for other use.
747 * is fine, then we are only setting a few bits that were
748 * already set.
749 */ 726 */
750 if (lr.state & LR_STATE_PENDING) { 727 vgic_retire_lr(i, vcpu);
751 vgic_dist_irq_set_pending(vcpu, lr.irq);
752 lr.state &= ~LR_STATE_PENDING;
753 }
754
755 vgic_set_lr(vcpu, i, lr);
756
757 /*
758 * Mark the LR as free for other use.
759 */
760 BUG_ON(lr.state & LR_STATE_MASK);
761 vgic_retire_lr(i, lr.irq, vcpu);
762 vgic_irq_clear_queued(vcpu, lr.irq);
763 728
764 /* Finally update the VGIC state. */ 729 /* Finally update the VGIC state. */
765 vgic_update_state(vcpu->kvm); 730 vgic_update_state(vcpu->kvm);
@@ -1067,12 +1032,6 @@ static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr,
1067 vgic_ops->set_lr(vcpu, lr, vlr); 1032 vgic_ops->set_lr(vcpu, lr, vlr);
1068} 1033}
1069 1034
1070static void vgic_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
1071 struct vgic_lr vlr)
1072{
1073 vgic_ops->sync_lr_elrsr(vcpu, lr, vlr);
1074}
1075
1076static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu) 1035static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu)
1077{ 1036{
1078 return vgic_ops->get_elrsr(vcpu); 1037 return vgic_ops->get_elrsr(vcpu);
@@ -1118,25 +1077,23 @@ static inline void vgic_enable(struct kvm_vcpu *vcpu)
1118 vgic_ops->enable(vcpu); 1077 vgic_ops->enable(vcpu);
1119} 1078}
1120 1079
1121static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu) 1080static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu)
1122{ 1081{
1123 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1124 struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr); 1082 struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr);
1125 1083
1084 vgic_irq_clear_queued(vcpu, vlr.irq);
1085
1126 /* 1086 /*
1127 * We must transfer the pending state back to the distributor before 1087 * We must transfer the pending state back to the distributor before
 1128	 * retiring the LR, otherwise we may lose edge-triggered interrupts. 1088	 * retiring the LR, otherwise we may lose edge-triggered interrupts.
1129 */ 1089 */
1130 if (vlr.state & LR_STATE_PENDING) { 1090 if (vlr.state & LR_STATE_PENDING) {
1131 vgic_dist_irq_set_pending(vcpu, irq); 1091 vgic_dist_irq_set_pending(vcpu, vlr.irq);
1132 vlr.hwirq = 0; 1092 vlr.hwirq = 0;
1133 } 1093 }
1134 1094
1135 vlr.state = 0; 1095 vlr.state = 0;
1136 vgic_set_lr(vcpu, lr_nr, vlr); 1096 vgic_set_lr(vcpu, lr_nr, vlr);
1137 clear_bit(lr_nr, vgic_cpu->lr_used);
1138 vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY;
1139 vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
1140} 1097}
1141 1098
1142/* 1099/*
@@ -1150,17 +1107,15 @@ static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu)
1150 */ 1107 */
1151static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) 1108static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
1152{ 1109{
1153 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 1110 u64 elrsr = vgic_get_elrsr(vcpu);
1111 unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
1154 int lr; 1112 int lr;
1155 1113
1156 for_each_set_bit(lr, vgic_cpu->lr_used, vgic->nr_lr) { 1114 for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
1157 struct vgic_lr vlr = vgic_get_lr(vcpu, lr); 1115 struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
1158 1116
1159 if (!vgic_irq_is_enabled(vcpu, vlr.irq)) { 1117 if (!vgic_irq_is_enabled(vcpu, vlr.irq))
1160 vgic_retire_lr(lr, vlr.irq, vcpu); 1118 vgic_retire_lr(lr, vcpu);
1161 if (vgic_irq_is_queued(vcpu, vlr.irq))
1162 vgic_irq_clear_queued(vcpu, vlr.irq);
1163 }
1164 } 1119 }
1165} 1120}
1166 1121
@@ -1200,7 +1155,6 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
1200 } 1155 }
1201 1156
1202 vgic_set_lr(vcpu, lr_nr, vlr); 1157 vgic_set_lr(vcpu, lr_nr, vlr);
1203 vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
1204} 1158}
1205 1159
1206/* 1160/*
@@ -1210,8 +1164,9 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
1210 */ 1164 */
1211bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) 1165bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
1212{ 1166{
1213 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1214 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 1167 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1168 u64 elrsr = vgic_get_elrsr(vcpu);
1169 unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
1215 struct vgic_lr vlr; 1170 struct vgic_lr vlr;
1216 int lr; 1171 int lr;
1217 1172
@@ -1222,28 +1177,22 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
1222 1177
1223 kvm_debug("Queue IRQ%d\n", irq); 1178 kvm_debug("Queue IRQ%d\n", irq);
1224 1179
1225 lr = vgic_cpu->vgic_irq_lr_map[irq];
1226
1227 /* Do we have an active interrupt for the same CPUID? */ 1180 /* Do we have an active interrupt for the same CPUID? */
1228 if (lr != LR_EMPTY) { 1181 for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
1229 vlr = vgic_get_lr(vcpu, lr); 1182 vlr = vgic_get_lr(vcpu, lr);
1230 if (vlr.source == sgi_source_id) { 1183 if (vlr.irq == irq && vlr.source == sgi_source_id) {
1231 kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq); 1184 kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq);
1232 BUG_ON(!test_bit(lr, vgic_cpu->lr_used));
1233 vgic_queue_irq_to_lr(vcpu, irq, lr, vlr); 1185 vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
1234 return true; 1186 return true;
1235 } 1187 }
1236 } 1188 }
1237 1189
1238 /* Try to use another LR for this interrupt */ 1190 /* Try to use another LR for this interrupt */
1239 lr = find_first_zero_bit((unsigned long *)vgic_cpu->lr_used, 1191 lr = find_first_bit(elrsr_ptr, vgic->nr_lr);
1240 vgic->nr_lr);
1241 if (lr >= vgic->nr_lr) 1192 if (lr >= vgic->nr_lr)
1242 return false; 1193 return false;
1243 1194
1244 kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id); 1195 kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id);
1245 vgic_cpu->vgic_irq_lr_map[irq] = lr;
1246 set_bit(lr, vgic_cpu->lr_used);
1247 1196
1248 vlr.irq = irq; 1197 vlr.irq = irq;
1249 vlr.source = sgi_source_id; 1198 vlr.source = sgi_source_id;
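The vgic.c hunks above drop the per-vcpu lr_used bitmap and vgic_irq_lr_map in favour of the hardware-maintained ELRSR, in which a set bit means the corresponding List Register is empty. In-use LRs are therefore walked with for_each_clear_bit() and a free LR is picked with find_first_bit(). The fragment below is a standalone user-space sketch of that bit convention only (made-up values, not code from this merge):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t elrsr = 0xfffffffffffffff4ULL; /* bits 0, 1, 3 clear: LR0, LR1, LR3 in use */
		int nr_lr = 4;
		int lr;

		/* kernel equivalent: for_each_clear_bit(lr, elrsr_ptr, nr_lr) */
		for (lr = 0; lr < nr_lr; lr++)
			if (!(elrsr & (1ULL << lr)))
				printf("LR%d holds an interrupt\n", lr);

		/* kernel equivalent: lr = find_first_bit(elrsr_ptr, nr_lr) */
		for (lr = 0; lr < nr_lr; lr++)
			if (elrsr & (1ULL << lr)) {
				printf("LR%d is free for a new interrupt\n", lr);
				break;
			}

		return 0;
	}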
@@ -1338,12 +1287,60 @@ epilog:
1338 } 1287 }
1339} 1288}
1340 1289
1290static int process_queued_irq(struct kvm_vcpu *vcpu,
1291 int lr, struct vgic_lr vlr)
1292{
1293 int pending = 0;
1294
1295 /*
1296 * If the IRQ was EOIed (called from vgic_process_maintenance) or it
1297 * went from active to non-active (called from vgic_sync_hwirq) it was
 1298	 * also ACKed and we therefore assume we can clear the soft pending
 1299	 * state (should it have been set) for this interrupt.
1300 *
1301 * Note: if the IRQ soft pending state was set after the IRQ was
1302 * acked, it actually shouldn't be cleared, but we have no way of
1303 * knowing that unless we start trapping ACKs when the soft-pending
1304 * state is set.
1305 */
1306 vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
1307
1308 /*
1309 * Tell the gic to start sampling this interrupt again.
1310 */
1311 vgic_irq_clear_queued(vcpu, vlr.irq);
1312
1313 /* Any additional pending interrupt? */
1314 if (vgic_irq_is_edge(vcpu, vlr.irq)) {
1315 BUG_ON(!(vlr.state & LR_HW));
1316 pending = vgic_dist_irq_is_pending(vcpu, vlr.irq);
1317 } else {
1318 if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
1319 vgic_cpu_irq_set(vcpu, vlr.irq);
1320 pending = 1;
1321 } else {
1322 vgic_dist_irq_clear_pending(vcpu, vlr.irq);
1323 vgic_cpu_irq_clear(vcpu, vlr.irq);
1324 }
1325 }
1326
1327 /*
1328 * Despite being EOIed, the LR may not have
1329 * been marked as empty.
1330 */
1331 vlr.state = 0;
1332 vlr.hwirq = 0;
1333 vgic_set_lr(vcpu, lr, vlr);
1334
1335 return pending;
1336}
1337
1341static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) 1338static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
1342{ 1339{
1343 u32 status = vgic_get_interrupt_status(vcpu); 1340 u32 status = vgic_get_interrupt_status(vcpu);
1344 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 1341 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1345 bool level_pending = false;
1346 struct kvm *kvm = vcpu->kvm; 1342 struct kvm *kvm = vcpu->kvm;
1343 int level_pending = 0;
1347 1344
1348 kvm_debug("STATUS = %08x\n", status); 1345 kvm_debug("STATUS = %08x\n", status);
1349 1346
@@ -1358,54 +1355,22 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
1358 1355
1359 for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) { 1356 for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) {
1360 struct vgic_lr vlr = vgic_get_lr(vcpu, lr); 1357 struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
1361 WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq));
1362 1358
1363 spin_lock(&dist->lock); 1359 WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq));
1364 vgic_irq_clear_queued(vcpu, vlr.irq);
1365 WARN_ON(vlr.state & LR_STATE_MASK); 1360 WARN_ON(vlr.state & LR_STATE_MASK);
1366 vlr.state = 0;
1367 vgic_set_lr(vcpu, lr, vlr);
1368 1361
1369 /*
1370 * If the IRQ was EOIed it was also ACKed and we we
1371 * therefore assume we can clear the soft pending
1372 * state (should it had been set) for this interrupt.
1373 *
1374 * Note: if the IRQ soft pending state was set after
1375 * the IRQ was acked, it actually shouldn't be
1376 * cleared, but we have no way of knowing that unless
1377 * we start trapping ACKs when the soft-pending state
1378 * is set.
1379 */
1380 vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
1381 1362
1382 /* 1363 /*
1383 * kvm_notify_acked_irq calls kvm_set_irq() 1364 * kvm_notify_acked_irq calls kvm_set_irq()
1384 * to reset the IRQ level. Need to release the 1365 * to reset the IRQ level, which grabs the dist->lock
1385 * lock for kvm_set_irq to grab it. 1366 * so we call this before taking the dist->lock.
1386 */ 1367 */
1387 spin_unlock(&dist->lock);
1388
1389 kvm_notify_acked_irq(kvm, 0, 1368 kvm_notify_acked_irq(kvm, 0,
1390 vlr.irq - VGIC_NR_PRIVATE_IRQS); 1369 vlr.irq - VGIC_NR_PRIVATE_IRQS);
1391 spin_lock(&dist->lock);
1392
1393 /* Any additional pending interrupt? */
1394 if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
1395 vgic_cpu_irq_set(vcpu, vlr.irq);
1396 level_pending = true;
1397 } else {
1398 vgic_dist_irq_clear_pending(vcpu, vlr.irq);
1399 vgic_cpu_irq_clear(vcpu, vlr.irq);
1400 }
1401 1370
1371 spin_lock(&dist->lock);
1372 level_pending |= process_queued_irq(vcpu, lr, vlr);
1402 spin_unlock(&dist->lock); 1373 spin_unlock(&dist->lock);
1403
1404 /*
1405 * Despite being EOIed, the LR may not have
1406 * been marked as empty.
1407 */
1408 vgic_sync_lr_elrsr(vcpu, lr, vlr);
1409 } 1374 }
1410 } 1375 }
1411 1376
@@ -1426,35 +1391,40 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
1426/* 1391/*
1427 * Save the physical active state, and reset it to inactive. 1392 * Save the physical active state, and reset it to inactive.
1428 * 1393 *
1429 * Return 1 if HW interrupt went from active to inactive, and 0 otherwise. 1394 * Return true if there's a pending forwarded interrupt to queue.
1430 */ 1395 */
1431static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr) 1396static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr)
1432{ 1397{
1398 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1433 struct irq_phys_map *map; 1399 struct irq_phys_map *map;
1400 bool phys_active;
1401 bool level_pending;
1434 int ret; 1402 int ret;
1435 1403
1436 if (!(vlr.state & LR_HW)) 1404 if (!(vlr.state & LR_HW))
1437 return 0; 1405 return false;
1438 1406
1439 map = vgic_irq_map_search(vcpu, vlr.irq); 1407 map = vgic_irq_map_search(vcpu, vlr.irq);
1440 BUG_ON(!map); 1408 BUG_ON(!map);
1441 1409
1442 ret = irq_get_irqchip_state(map->irq, 1410 ret = irq_get_irqchip_state(map->irq,
1443 IRQCHIP_STATE_ACTIVE, 1411 IRQCHIP_STATE_ACTIVE,
1444 &map->active); 1412 &phys_active);
1445 1413
1446 WARN_ON(ret); 1414 WARN_ON(ret);
1447 1415
1448 if (map->active) 1416 if (phys_active)
1449 return 0; 1417 return 0;
1450 1418
1451 return 1; 1419 spin_lock(&dist->lock);
1420 level_pending = process_queued_irq(vcpu, lr, vlr);
1421 spin_unlock(&dist->lock);
1422 return level_pending;
1452} 1423}
1453 1424
1454/* Sync back the VGIC state after a guest run */ 1425/* Sync back the VGIC state after a guest run */
1455static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) 1426static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
1456{ 1427{
1457 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1458 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 1428 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1459 u64 elrsr; 1429 u64 elrsr;
1460 unsigned long *elrsr_ptr; 1430 unsigned long *elrsr_ptr;
@@ -1462,40 +1432,18 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
1462 bool level_pending; 1432 bool level_pending;
1463 1433
1464 level_pending = vgic_process_maintenance(vcpu); 1434 level_pending = vgic_process_maintenance(vcpu);
1465 elrsr = vgic_get_elrsr(vcpu);
1466 elrsr_ptr = u64_to_bitmask(&elrsr);
1467 1435
1468 /* Deal with HW interrupts, and clear mappings for empty LRs */ 1436 /* Deal with HW interrupts, and clear mappings for empty LRs */
1469 for (lr = 0; lr < vgic->nr_lr; lr++) { 1437 for (lr = 0; lr < vgic->nr_lr; lr++) {
1470 struct vgic_lr vlr; 1438 struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
1471
1472 if (!test_bit(lr, vgic_cpu->lr_used))
1473 continue;
1474
1475 vlr = vgic_get_lr(vcpu, lr);
1476 if (vgic_sync_hwirq(vcpu, vlr)) {
1477 /*
1478 * So this is a HW interrupt that the guest
1479 * EOI-ed. Clean the LR state and allow the
1480 * interrupt to be sampled again.
1481 */
1482 vlr.state = 0;
1483 vlr.hwirq = 0;
1484 vgic_set_lr(vcpu, lr, vlr);
1485 vgic_irq_clear_queued(vcpu, vlr.irq);
1486 set_bit(lr, elrsr_ptr);
1487 }
1488
1489 if (!test_bit(lr, elrsr_ptr))
1490 continue;
1491
1492 clear_bit(lr, vgic_cpu->lr_used);
1493 1439
1440 level_pending |= vgic_sync_hwirq(vcpu, lr, vlr);
1494 BUG_ON(vlr.irq >= dist->nr_irqs); 1441 BUG_ON(vlr.irq >= dist->nr_irqs);
1495 vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
1496 } 1442 }
1497 1443
1498 /* Check if we still have something up our sleeve... */ 1444 /* Check if we still have something up our sleeve... */
1445 elrsr = vgic_get_elrsr(vcpu);
1446 elrsr_ptr = u64_to_bitmask(&elrsr);
1499 pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr); 1447 pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr);
1500 if (level_pending || pending < vgic->nr_lr) 1448 if (level_pending || pending < vgic->nr_lr)
1501 set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); 1449 set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
@@ -1585,6 +1533,8 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
1585 int enabled; 1533 int enabled;
1586 bool ret = true, can_inject = true; 1534 bool ret = true, can_inject = true;
1587 1535
1536 trace_vgic_update_irq_pending(cpuid, irq_num, level);
1537
1588 if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020)) 1538 if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
1589 return -EINVAL; 1539 return -EINVAL;
1590 1540
@@ -1864,30 +1814,6 @@ static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
1864} 1814}
1865 1815
1866/** 1816/**
1867 * kvm_vgic_get_phys_irq_active - Return the active state of a mapped IRQ
1868 *
1869 * Return the logical active state of a mapped interrupt. This doesn't
1870 * necessarily reflects the current HW state.
1871 */
1872bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map)
1873{
1874 BUG_ON(!map);
1875 return map->active;
1876}
1877
1878/**
1879 * kvm_vgic_set_phys_irq_active - Set the active state of a mapped IRQ
1880 *
1881 * Set the logical active state of a mapped interrupt. This doesn't
1882 * immediately affects the HW state.
1883 */
1884void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active)
1885{
1886 BUG_ON(!map);
1887 map->active = active;
1888}
1889
1890/**
1891 * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping 1817 * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
1892 * @vcpu: The VCPU pointer 1818 * @vcpu: The VCPU pointer
1893 * @map: The pointer to a mapping obtained through kvm_vgic_map_phys_irq 1819 * @map: The pointer to a mapping obtained through kvm_vgic_map_phys_irq
@@ -1942,12 +1868,10 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
1942 kfree(vgic_cpu->pending_shared); 1868 kfree(vgic_cpu->pending_shared);
1943 kfree(vgic_cpu->active_shared); 1869 kfree(vgic_cpu->active_shared);
1944 kfree(vgic_cpu->pend_act_shared); 1870 kfree(vgic_cpu->pend_act_shared);
1945 kfree(vgic_cpu->vgic_irq_lr_map);
1946 vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list); 1871 vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list);
1947 vgic_cpu->pending_shared = NULL; 1872 vgic_cpu->pending_shared = NULL;
1948 vgic_cpu->active_shared = NULL; 1873 vgic_cpu->active_shared = NULL;
1949 vgic_cpu->pend_act_shared = NULL; 1874 vgic_cpu->pend_act_shared = NULL;
1950 vgic_cpu->vgic_irq_lr_map = NULL;
1951} 1875}
1952 1876
1953static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs) 1877static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
@@ -1958,18 +1882,14 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
1958 vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL); 1882 vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
1959 vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL); 1883 vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL);
1960 vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL); 1884 vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL);
1961 vgic_cpu->vgic_irq_lr_map = kmalloc(nr_irqs, GFP_KERNEL);
1962 1885
1963 if (!vgic_cpu->pending_shared 1886 if (!vgic_cpu->pending_shared
1964 || !vgic_cpu->active_shared 1887 || !vgic_cpu->active_shared
1965 || !vgic_cpu->pend_act_shared 1888 || !vgic_cpu->pend_act_shared) {
1966 || !vgic_cpu->vgic_irq_lr_map) {
1967 kvm_vgic_vcpu_destroy(vcpu); 1889 kvm_vgic_vcpu_destroy(vcpu);
1968 return -ENOMEM; 1890 return -ENOMEM;
1969 } 1891 }
1970 1892
1971 memset(vgic_cpu->vgic_irq_lr_map, LR_EMPTY, nr_irqs);
1972
1973 /* 1893 /*
1974 * Store the number of LRs per vcpu, so we don't have to go 1894 * Store the number of LRs per vcpu, so we don't have to go
1975 * all the way to the distributor structure to find out. Only 1895 * all the way to the distributor structure to find out. Only
@@ -2111,14 +2031,24 @@ int vgic_init(struct kvm *kvm)
2111 break; 2031 break;
2112 } 2032 }
2113 2033
2114 for (i = 0; i < dist->nr_irqs; i++) { 2034 /*
 2115	 if (i < VGIC_NR_PPIS) 2035	 * Enable and configure all SGIs to be edge-triggered and
2036 * configure all PPIs as level-triggered.
2037 */
2038 for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
2039 if (i < VGIC_NR_SGIS) {
2040 /* SGIs */
2116 vgic_bitmap_set_irq_val(&dist->irq_enabled, 2041 vgic_bitmap_set_irq_val(&dist->irq_enabled,
2117 vcpu->vcpu_id, i, 1); 2042 vcpu->vcpu_id, i, 1);
2118 if (i < VGIC_NR_PRIVATE_IRQS)
2119 vgic_bitmap_set_irq_val(&dist->irq_cfg, 2043 vgic_bitmap_set_irq_val(&dist->irq_cfg,
2120 vcpu->vcpu_id, i, 2044 vcpu->vcpu_id, i,
2121 VGIC_CFG_EDGE); 2045 VGIC_CFG_EDGE);
2046 } else if (i < VGIC_NR_PRIVATE_IRQS) {
2047 /* PPIs */
2048 vgic_bitmap_set_irq_val(&dist->irq_cfg,
2049 vcpu->vcpu_id, i,
2050 VGIC_CFG_LEVEL);
2051 }
2122 } 2052 }
2123 2053
2124 vgic_enable(vcpu); 2054 vgic_enable(vcpu);
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 44660aee335f..77d42be6970e 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -94,6 +94,10 @@ static void async_pf_execute(struct work_struct *work)
94 94
95 trace_kvm_async_pf_completed(addr, gva); 95 trace_kvm_async_pf_completed(addr, gva);
96 96
97 /*
98 * This memory barrier pairs with prepare_to_wait's set_current_state()
99 */
100 smp_mb();
97 if (waitqueue_active(&vcpu->wq)) 101 if (waitqueue_active(&vcpu->wq))
98 wake_up_interruptible(&vcpu->wq); 102 wake_up_interruptible(&vcpu->wq);
99 103
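The smp_mb() added above is the usual waker-side half of the sleep/wakeup pairing; a simplified, comment-only sketch of the race it closes (not code from this merge):

	/*
	 *  waiter (kvm_vcpu_block)            waker (async_pf_execute)
	 *  -----------------------            ------------------------
	 *  prepare_to_wait()                  publish completed work
	 *    set_current_state()              smp_mb()
	 *  re-check condition                 if (waitqueue_active(&vcpu->wq))
	 *  schedule()                                 wake_up_interruptible(&vcpu->wq)
	 *
	 * Without the barrier, the waker's waitqueue_active() load could be
	 * reordered before its earlier stores: it may see no waiters while the
	 * vCPU, not yet seeing the completed work, goes to sleep -- a missed
	 * wakeup. The barrier pairs with the full barrier implied by
	 * set_current_state() in prepare_to_wait() on the waiter side.
	 */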
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 79db45336e3a..46dbc0a7dfc1 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -23,6 +23,7 @@
23 23
24#include <linux/kvm_host.h> 24#include <linux/kvm_host.h>
25#include <linux/kvm.h> 25#include <linux/kvm.h>
26#include <linux/kvm_irqfd.h>
26#include <linux/workqueue.h> 27#include <linux/workqueue.h>
27#include <linux/syscalls.h> 28#include <linux/syscalls.h>
28#include <linux/wait.h> 29#include <linux/wait.h>
@@ -34,73 +35,20 @@
34#include <linux/srcu.h> 35#include <linux/srcu.h>
35#include <linux/slab.h> 36#include <linux/slab.h>
36#include <linux/seqlock.h> 37#include <linux/seqlock.h>
38#include <linux/irqbypass.h>
37#include <trace/events/kvm.h> 39#include <trace/events/kvm.h>
38 40
39#include <kvm/iodev.h> 41#include <kvm/iodev.h>
40 42
41#ifdef CONFIG_HAVE_KVM_IRQFD 43#ifdef CONFIG_HAVE_KVM_IRQFD
42/*
43 * --------------------------------------------------------------------
44 * irqfd: Allows an fd to be used to inject an interrupt to the guest
45 *
46 * Credit goes to Avi Kivity for the original idea.
47 * --------------------------------------------------------------------
48 */
49
50/*
51 * Resampling irqfds are a special variety of irqfds used to emulate
52 * level triggered interrupts. The interrupt is asserted on eventfd
53 * trigger. On acknowledgement through the irq ack notifier, the
54 * interrupt is de-asserted and userspace is notified through the
55 * resamplefd. All resamplers on the same gsi are de-asserted
56 * together, so we don't need to track the state of each individual
57 * user. We can also therefore share the same irq source ID.
58 */
59struct _irqfd_resampler {
60 struct kvm *kvm;
61 /*
62 * List of resampling struct _irqfd objects sharing this gsi.
63 * RCU list modified under kvm->irqfds.resampler_lock
64 */
65 struct list_head list;
66 struct kvm_irq_ack_notifier notifier;
67 /*
68 * Entry in list of kvm->irqfd.resampler_list. Use for sharing
69 * resamplers among irqfds on the same gsi.
70 * Accessed and modified under kvm->irqfds.resampler_lock
71 */
72 struct list_head link;
73};
74
75struct _irqfd {
76 /* Used for MSI fast-path */
77 struct kvm *kvm;
78 wait_queue_t wait;
79 /* Update side is protected by irqfds.lock */
80 struct kvm_kernel_irq_routing_entry irq_entry;
81 seqcount_t irq_entry_sc;
82 /* Used for level IRQ fast-path */
83 int gsi;
84 struct work_struct inject;
85 /* The resampler used by this irqfd (resampler-only) */
86 struct _irqfd_resampler *resampler;
87 /* Eventfd notified on resample (resampler-only) */
88 struct eventfd_ctx *resamplefd;
89 /* Entry in list of irqfds for a resampler (resampler-only) */
90 struct list_head resampler_link;
91 /* Used for setup/shutdown */
92 struct eventfd_ctx *eventfd;
93 struct list_head list;
94 poll_table pt;
95 struct work_struct shutdown;
96};
97 44
98static struct workqueue_struct *irqfd_cleanup_wq; 45static struct workqueue_struct *irqfd_cleanup_wq;
99 46
100static void 47static void
101irqfd_inject(struct work_struct *work) 48irqfd_inject(struct work_struct *work)
102{ 49{
103 struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); 50 struct kvm_kernel_irqfd *irqfd =
51 container_of(work, struct kvm_kernel_irqfd, inject);
104 struct kvm *kvm = irqfd->kvm; 52 struct kvm *kvm = irqfd->kvm;
105 53
106 if (!irqfd->resampler) { 54 if (!irqfd->resampler) {
@@ -121,12 +69,13 @@ irqfd_inject(struct work_struct *work)
121static void 69static void
122irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) 70irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
123{ 71{
124 struct _irqfd_resampler *resampler; 72 struct kvm_kernel_irqfd_resampler *resampler;
125 struct kvm *kvm; 73 struct kvm *kvm;
126 struct _irqfd *irqfd; 74 struct kvm_kernel_irqfd *irqfd;
127 int idx; 75 int idx;
128 76
129 resampler = container_of(kian, struct _irqfd_resampler, notifier); 77 resampler = container_of(kian,
78 struct kvm_kernel_irqfd_resampler, notifier);
130 kvm = resampler->kvm; 79 kvm = resampler->kvm;
131 80
132 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, 81 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
@@ -141,9 +90,9 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
141} 90}
142 91
143static void 92static void
144irqfd_resampler_shutdown(struct _irqfd *irqfd) 93irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
145{ 94{
146 struct _irqfd_resampler *resampler = irqfd->resampler; 95 struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
147 struct kvm *kvm = resampler->kvm; 96 struct kvm *kvm = resampler->kvm;
148 97
149 mutex_lock(&kvm->irqfds.resampler_lock); 98 mutex_lock(&kvm->irqfds.resampler_lock);
@@ -168,7 +117,8 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd)
168static void 117static void
169irqfd_shutdown(struct work_struct *work) 118irqfd_shutdown(struct work_struct *work)
170{ 119{
171 struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown); 120 struct kvm_kernel_irqfd *irqfd =
121 container_of(work, struct kvm_kernel_irqfd, shutdown);
172 u64 cnt; 122 u64 cnt;
173 123
174 /* 124 /*
@@ -191,6 +141,9 @@ irqfd_shutdown(struct work_struct *work)
191 /* 141 /*
192 * It is now safe to release the object's resources 142 * It is now safe to release the object's resources
193 */ 143 */
144#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
145 irq_bypass_unregister_consumer(&irqfd->consumer);
146#endif
194 eventfd_ctx_put(irqfd->eventfd); 147 eventfd_ctx_put(irqfd->eventfd);
195 kfree(irqfd); 148 kfree(irqfd);
196} 149}
@@ -198,7 +151,7 @@ irqfd_shutdown(struct work_struct *work)
198 151
199/* assumes kvm->irqfds.lock is held */ 152/* assumes kvm->irqfds.lock is held */
200static bool 153static bool
201irqfd_is_active(struct _irqfd *irqfd) 154irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
202{ 155{
203 return list_empty(&irqfd->list) ? false : true; 156 return list_empty(&irqfd->list) ? false : true;
204} 157}
@@ -209,7 +162,7 @@ irqfd_is_active(struct _irqfd *irqfd)
209 * assumes kvm->irqfds.lock is held 162 * assumes kvm->irqfds.lock is held
210 */ 163 */
211static void 164static void
212irqfd_deactivate(struct _irqfd *irqfd) 165irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
213{ 166{
214 BUG_ON(!irqfd_is_active(irqfd)); 167 BUG_ON(!irqfd_is_active(irqfd));
215 168
@@ -218,13 +171,23 @@ irqfd_deactivate(struct _irqfd *irqfd)
218 queue_work(irqfd_cleanup_wq, &irqfd->shutdown); 171 queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
219} 172}
220 173
174int __attribute__((weak)) kvm_arch_set_irq_inatomic(
175 struct kvm_kernel_irq_routing_entry *irq,
176 struct kvm *kvm, int irq_source_id,
177 int level,
178 bool line_status)
179{
180 return -EWOULDBLOCK;
181}
182
221/* 183/*
222 * Called with wqh->lock held and interrupts disabled 184 * Called with wqh->lock held and interrupts disabled
223 */ 185 */
224static int 186static int
225irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) 187irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
226{ 188{
227 struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); 189 struct kvm_kernel_irqfd *irqfd =
190 container_of(wait, struct kvm_kernel_irqfd, wait);
228 unsigned long flags = (unsigned long)key; 191 unsigned long flags = (unsigned long)key;
229 struct kvm_kernel_irq_routing_entry irq; 192 struct kvm_kernel_irq_routing_entry irq;
230 struct kvm *kvm = irqfd->kvm; 193 struct kvm *kvm = irqfd->kvm;
@@ -238,10 +201,9 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
238 irq = irqfd->irq_entry; 201 irq = irqfd->irq_entry;
239 } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq)); 202 } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
240 /* An event has been signaled, inject an interrupt */ 203 /* An event has been signaled, inject an interrupt */
241 if (irq.type == KVM_IRQ_ROUTING_MSI) 204 if (kvm_arch_set_irq_inatomic(&irq, kvm,
242 kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, 205 KVM_USERSPACE_IRQ_SOURCE_ID, 1,
243 false); 206 false) == -EWOULDBLOCK)
244 else
245 schedule_work(&irqfd->inject); 207 schedule_work(&irqfd->inject);
246 srcu_read_unlock(&kvm->irq_srcu, idx); 208 srcu_read_unlock(&kvm->irq_srcu, idx);
247 } 209 }
@@ -274,37 +236,54 @@ static void
274irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, 236irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
275 poll_table *pt) 237 poll_table *pt)
276{ 238{
277 struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt); 239 struct kvm_kernel_irqfd *irqfd =
240 container_of(pt, struct kvm_kernel_irqfd, pt);
278 add_wait_queue(wqh, &irqfd->wait); 241 add_wait_queue(wqh, &irqfd->wait);
279} 242}
280 243
281/* Must be called under irqfds.lock */ 244/* Must be called under irqfds.lock */
282static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd) 245static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
283{ 246{
284 struct kvm_kernel_irq_routing_entry *e; 247 struct kvm_kernel_irq_routing_entry *e;
285 struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; 248 struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
286 int i, n_entries; 249 int n_entries;
287 250
288 n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); 251 n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);
289 252
290 write_seqcount_begin(&irqfd->irq_entry_sc); 253 write_seqcount_begin(&irqfd->irq_entry_sc);
291 254
292 irqfd->irq_entry.type = 0;
293
294 e = entries; 255 e = entries;
295 for (i = 0; i < n_entries; ++i, ++e) { 256 if (n_entries == 1)
296 /* Only fast-path MSI. */ 257 irqfd->irq_entry = *e;
297 if (e->type == KVM_IRQ_ROUTING_MSI) 258 else
298 irqfd->irq_entry = *e; 259 irqfd->irq_entry.type = 0;
299 }
300 260
301 write_seqcount_end(&irqfd->irq_entry_sc); 261 write_seqcount_end(&irqfd->irq_entry_sc);
302} 262}
303 263
264#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
265void __attribute__((weak)) kvm_arch_irq_bypass_stop(
266 struct irq_bypass_consumer *cons)
267{
268}
269
270void __attribute__((weak)) kvm_arch_irq_bypass_start(
271 struct irq_bypass_consumer *cons)
272{
273}
274
275int __attribute__((weak)) kvm_arch_update_irqfd_routing(
276 struct kvm *kvm, unsigned int host_irq,
277 uint32_t guest_irq, bool set)
278{
279 return 0;
280}
281#endif
282
304static int 283static int
305kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) 284kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
306{ 285{
307 struct _irqfd *irqfd, *tmp; 286 struct kvm_kernel_irqfd *irqfd, *tmp;
308 struct fd f; 287 struct fd f;
309 struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; 288 struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
310 int ret; 289 int ret;
@@ -340,7 +319,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
340 irqfd->eventfd = eventfd; 319 irqfd->eventfd = eventfd;
341 320
342 if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) { 321 if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
343 struct _irqfd_resampler *resampler; 322 struct kvm_kernel_irqfd_resampler *resampler;
344 323
345 resamplefd = eventfd_ctx_fdget(args->resamplefd); 324 resamplefd = eventfd_ctx_fdget(args->resamplefd);
346 if (IS_ERR(resamplefd)) { 325 if (IS_ERR(resamplefd)) {
@@ -428,6 +407,17 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
428 * we might race against the POLLHUP 407 * we might race against the POLLHUP
429 */ 408 */
430 fdput(f); 409 fdput(f);
410#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
411 irqfd->consumer.token = (void *)irqfd->eventfd;
412 irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
413 irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
414 irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
415 irqfd->consumer.start = kvm_arch_irq_bypass_start;
416 ret = irq_bypass_register_consumer(&irqfd->consumer);
417 if (ret)
418 pr_info("irq bypass consumer (token %p) registration fails: %d\n",
419 irqfd->consumer.token, ret);
420#endif
431 421
432 return 0; 422 return 0;
433 423
@@ -469,9 +459,18 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
469} 459}
470EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); 460EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
471 461
472void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) 462void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
473{ 463{
474 struct kvm_irq_ack_notifier *kian; 464 struct kvm_irq_ack_notifier *kian;
465
466 hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
467 link)
468 if (kian->gsi == gsi)
469 kian->irq_acked(kian);
470}
471
472void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
473{
475 int gsi, idx; 474 int gsi, idx;
476 475
477 trace_kvm_ack_irq(irqchip, pin); 476 trace_kvm_ack_irq(irqchip, pin);
@@ -479,10 +478,7 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
479 idx = srcu_read_lock(&kvm->irq_srcu); 478 idx = srcu_read_lock(&kvm->irq_srcu);
480 gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); 479 gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
481 if (gsi != -1) 480 if (gsi != -1)
482 hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, 481 kvm_notify_acked_gsi(kvm, gsi);
483 link)
484 if (kian->gsi == gsi)
485 kian->irq_acked(kian);
486 srcu_read_unlock(&kvm->irq_srcu, idx); 482 srcu_read_unlock(&kvm->irq_srcu, idx);
487} 483}
488 484
@@ -525,7 +521,7 @@ kvm_eventfd_init(struct kvm *kvm)
525static int 521static int
526kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) 522kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
527{ 523{
528 struct _irqfd *irqfd, *tmp; 524 struct kvm_kernel_irqfd *irqfd, *tmp;
529 struct eventfd_ctx *eventfd; 525 struct eventfd_ctx *eventfd;
530 526
531 eventfd = eventfd_ctx_fdget(args->fd); 527 eventfd = eventfd_ctx_fdget(args->fd);
@@ -581,7 +577,7 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
581void 577void
582kvm_irqfd_release(struct kvm *kvm) 578kvm_irqfd_release(struct kvm *kvm)
583{ 579{
584 struct _irqfd *irqfd, *tmp; 580 struct kvm_kernel_irqfd *irqfd, *tmp;
585 581
586 spin_lock_irq(&kvm->irqfds.lock); 582 spin_lock_irq(&kvm->irqfds.lock);
587 583
@@ -604,13 +600,23 @@ kvm_irqfd_release(struct kvm *kvm)
604 */ 600 */
605void kvm_irq_routing_update(struct kvm *kvm) 601void kvm_irq_routing_update(struct kvm *kvm)
606{ 602{
607 struct _irqfd *irqfd; 603 struct kvm_kernel_irqfd *irqfd;
608 604
609 spin_lock_irq(&kvm->irqfds.lock); 605 spin_lock_irq(&kvm->irqfds.lock);
610 606
611 list_for_each_entry(irqfd, &kvm->irqfds.items, list) 607 list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
612 irqfd_update(kvm, irqfd); 608 irqfd_update(kvm, irqfd);
613 609
610#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
611 if (irqfd->producer) {
612 int ret = kvm_arch_update_irqfd_routing(
613 irqfd->kvm, irqfd->producer->irq,
614 irqfd->gsi, 1);
615 WARN_ON(ret);
616 }
617#endif
618 }
619
614 spin_unlock_irq(&kvm->irqfds.lock); 620 spin_unlock_irq(&kvm->irqfds.lock);
615} 621}
616 622
@@ -914,9 +920,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
914 return -EINVAL; 920 return -EINVAL;
915 921
916 /* ioeventfd with no length can't be combined with DATAMATCH */ 922 /* ioeventfd with no length can't be combined with DATAMATCH */
917 if (!args->len && 923 if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
918 args->flags & (KVM_IOEVENTFD_FLAG_PIO |
919 KVM_IOEVENTFD_FLAG_DATAMATCH))
920 return -EINVAL; 924 return -EINVAL;
921 925
922 ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args); 926 ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index d7ea8e20dae4..f0b08a2a48ba 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -31,16 +31,6 @@
31#include <trace/events/kvm.h> 31#include <trace/events/kvm.h>
32#include "irq.h" 32#include "irq.h"
33 33
34struct kvm_irq_routing_table {
35 int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
36 u32 nr_rt_entries;
37 /*
38 * Array indexed by gsi. Each entry contains list of irq chips
39 * the gsi is connected to.
40 */
41 struct hlist_head map[0];
42};
43
44int kvm_irq_map_gsi(struct kvm *kvm, 34int kvm_irq_map_gsi(struct kvm *kvm,
45 struct kvm_kernel_irq_routing_entry *entries, int gsi) 35 struct kvm_kernel_irq_routing_entry *entries, int gsi)
46{ 36{
@@ -154,11 +144,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
154 144
155 /* 145 /*
156 * Do not allow GSI to be mapped to the same irqchip more than once. 146 * Do not allow GSI to be mapped to the same irqchip more than once.
157 * Allow only one to one mapping between GSI and MSI. 147 * Allow only one to one mapping between GSI and non-irqchip routing.
158 */ 148 */
159 hlist_for_each_entry(ei, &rt->map[ue->gsi], link) 149 hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
160 if (ei->type == KVM_IRQ_ROUTING_MSI || 150 if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
161 ue->type == KVM_IRQ_ROUTING_MSI || 151 ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
162 ue->u.irqchip.irqchip == ei->irqchip.irqchip) 152 ue->u.irqchip.irqchip == ei->irqchip.irqchip)
163 return r; 153 return r;
164 154
@@ -231,6 +221,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
231 kvm_irq_routing_update(kvm); 221 kvm_irq_routing_update(kvm);
232 mutex_unlock(&kvm->irq_lock); 222 mutex_unlock(&kvm->irq_lock);
233 223
224 kvm_arch_irq_routing_update(kvm);
225
234 synchronize_srcu_expedited(&kvm->irq_srcu); 226 synchronize_srcu_expedited(&kvm->irq_srcu);
235 227
236 new = old; 228 new = old;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8db1d9361993..484079efea5b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -230,6 +230,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
230 init_waitqueue_head(&vcpu->wq); 230 init_waitqueue_head(&vcpu->wq);
231 kvm_async_pf_vcpu_init(vcpu); 231 kvm_async_pf_vcpu_init(vcpu);
232 232
233 vcpu->pre_pcpu = -1;
234 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
235
233 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 236 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
234 if (!page) { 237 if (!page) {
235 r = -ENOMEM; 238 r = -ENOMEM;
@@ -2018,6 +2021,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
2018 } while (single_task_running() && ktime_before(cur, stop)); 2021 } while (single_task_running() && ktime_before(cur, stop));
2019 } 2022 }
2020 2023
2024 kvm_arch_vcpu_blocking(vcpu);
2025
2021 for (;;) { 2026 for (;;) {
2022 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 2027 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
2023 2028
@@ -2031,6 +2036,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
2031 finish_wait(&vcpu->wq, &wait); 2036 finish_wait(&vcpu->wq, &wait);
2032 cur = ktime_get(); 2037 cur = ktime_get();
2033 2038
2039 kvm_arch_vcpu_unblocking(vcpu);
2034out: 2040out:
2035 block_ns = ktime_to_ns(cur) - ktime_to_ns(start); 2041 block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
2036 2042
@@ -2718,6 +2724,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
2718 case KVM_CAP_IRQFD: 2724 case KVM_CAP_IRQFD:
2719 case KVM_CAP_IRQFD_RESAMPLE: 2725 case KVM_CAP_IRQFD_RESAMPLE:
2720#endif 2726#endif
2727 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
2721 case KVM_CAP_CHECK_EXTENSION_VM: 2728 case KVM_CAP_CHECK_EXTENSION_VM:
2722 return 1; 2729 return 1;
2723#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 2730#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
@@ -3341,7 +3348,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
3341 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) 3348 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
3342 return -ENOSPC; 3349 return -ENOSPC;
3343 3350
3344 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * 3351 new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
3345 sizeof(struct kvm_io_range)), GFP_KERNEL); 3352 sizeof(struct kvm_io_range)), GFP_KERNEL);
3346 if (!new_bus) 3353 if (!new_bus)
3347 return -ENOMEM; 3354 return -ENOMEM;
@@ -3373,7 +3380,7 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
3373 if (r) 3380 if (r)
3374 return r; 3381 return r;
3375 3382
3376 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) * 3383 new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
3377 sizeof(struct kvm_io_range)), GFP_KERNEL); 3384 sizeof(struct kvm_io_range)), GFP_KERNEL);
3378 if (!new_bus) 3385 if (!new_bus)
3379 return -ENOMEM; 3386 return -ENOMEM;
diff --git a/virt/lib/Kconfig b/virt/lib/Kconfig
new file mode 100644
index 000000000000..89a414f815d2
--- /dev/null
+++ b/virt/lib/Kconfig
@@ -0,0 +1,2 @@
1config IRQ_BYPASS_MANAGER
2 tristate
diff --git a/virt/lib/Makefile b/virt/lib/Makefile
new file mode 100644
index 000000000000..901228d1ffbc
--- /dev/null
+++ b/virt/lib/Makefile
@@ -0,0 +1 @@
1obj-$(CONFIG_IRQ_BYPASS_MANAGER) += irqbypass.o
diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c
new file mode 100644
index 000000000000..09a03b5a21ff
--- /dev/null
+++ b/virt/lib/irqbypass.c
@@ -0,0 +1,257 @@
1/*
2 * IRQ offload/bypass manager
3 *
4 * Copyright (C) 2015 Red Hat, Inc.
5 * Copyright (c) 2015 Linaro Ltd.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Various virtualization hardware acceleration techniques allow bypassing or
12 * offloading interrupts received from devices around the host kernel. Posted
13 * Interrupts on Intel VT-d systems can allow interrupts to be received
14 * directly by a virtual machine. ARM IRQ Forwarding allows forwarded physical
15 * interrupts to be directly deactivated by the guest. This manager allows
16 * interrupt producers and consumers to find each other to enable this sort of
17 * bypass.
18 */
19
20#include <linux/irqbypass.h>
21#include <linux/list.h>
22#include <linux/module.h>
23#include <linux/mutex.h>
24
25MODULE_LICENSE("GPL v2");
26MODULE_DESCRIPTION("IRQ bypass manager utility module");
27
28static LIST_HEAD(producers);
29static LIST_HEAD(consumers);
30static DEFINE_MUTEX(lock);
31
32/* @lock must be held when calling connect */
33static int __connect(struct irq_bypass_producer *prod,
34 struct irq_bypass_consumer *cons)
35{
36 int ret = 0;
37
38 if (prod->stop)
39 prod->stop(prod);
40 if (cons->stop)
41 cons->stop(cons);
42
43 if (prod->add_consumer)
44 ret = prod->add_consumer(prod, cons);
45
46 if (!ret) {
47 ret = cons->add_producer(cons, prod);
48 if (ret && prod->del_consumer)
49 prod->del_consumer(prod, cons);
50 }
51
52 if (cons->start)
53 cons->start(cons);
54 if (prod->start)
55 prod->start(prod);
56
57 return ret;
58}
59
60/* @lock must be held when calling disconnect */
61static void __disconnect(struct irq_bypass_producer *prod,
62 struct irq_bypass_consumer *cons)
63{
64 if (prod->stop)
65 prod->stop(prod);
66 if (cons->stop)
67 cons->stop(cons);
68
69 cons->del_producer(cons, prod);
70
71 if (prod->del_consumer)
72 prod->del_consumer(prod, cons);
73
74 if (cons->start)
75 cons->start(cons);
76 if (prod->start)
77 prod->start(prod);
78}
79
80/**
81 * irq_bypass_register_producer - register IRQ bypass producer
82 * @producer: pointer to producer structure
83 *
84 * Add the provided IRQ producer to the list of producers and connect
85 * with any matching token found on the IRQ consumers list.
86 */
87int irq_bypass_register_producer(struct irq_bypass_producer *producer)
88{
89 struct irq_bypass_producer *tmp;
90 struct irq_bypass_consumer *consumer;
91
92 might_sleep();
93
94 if (!try_module_get(THIS_MODULE))
95 return -ENODEV;
96
97 mutex_lock(&lock);
98
99 list_for_each_entry(tmp, &producers, node) {
100 if (tmp->token == producer->token) {
101 mutex_unlock(&lock);
102 module_put(THIS_MODULE);
103 return -EBUSY;
104 }
105 }
106
107 list_for_each_entry(consumer, &consumers, node) {
108 if (consumer->token == producer->token) {
109 int ret = __connect(producer, consumer);
110 if (ret) {
111 mutex_unlock(&lock);
112 module_put(THIS_MODULE);
113 return ret;
114 }
115 break;
116 }
117 }
118
119 list_add(&producer->node, &producers);
120
121 mutex_unlock(&lock);
122
123 return 0;
124}
125EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
126
127/**
128 * irq_bypass_unregister_producer - unregister IRQ bypass producer
129 * @producer: pointer to producer structure
130 *
131 * Remove a previously registered IRQ producer from the list of producers
132 * and disconnect it from any connected IRQ consumer.
133 */
134void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
135{
136 struct irq_bypass_producer *tmp;
137 struct irq_bypass_consumer *consumer;
138
139 might_sleep();
140
141 if (!try_module_get(THIS_MODULE))
142 return; /* nothing in the list anyway */
143
144 mutex_lock(&lock);
145
146 list_for_each_entry(tmp, &producers, node) {
147 if (tmp->token != producer->token)
148 continue;
149
150 list_for_each_entry(consumer, &consumers, node) {
151 if (consumer->token == producer->token) {
152 __disconnect(producer, consumer);
153 break;
154 }
155 }
156
157 list_del(&producer->node);
158 module_put(THIS_MODULE);
159 break;
160 }
161
162 mutex_unlock(&lock);
163
164 module_put(THIS_MODULE);
165}
166EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);
167
168/**
169 * irq_bypass_register_consumer - register IRQ bypass consumer
170 * @consumer: pointer to consumer structure
171 *
172 * Add the provided IRQ consumer to the list of consumers and connect
173 * with any matching token found on the IRQ producer list.
174 */
175int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
176{
177 struct irq_bypass_consumer *tmp;
178 struct irq_bypass_producer *producer;
179
180 if (!consumer->add_producer || !consumer->del_producer)
181 return -EINVAL;
182
183 might_sleep();
184
185 if (!try_module_get(THIS_MODULE))
186 return -ENODEV;
187
188 mutex_lock(&lock);
189
190 list_for_each_entry(tmp, &consumers, node) {
191 if (tmp->token == consumer->token) {
192 mutex_unlock(&lock);
193 module_put(THIS_MODULE);
194 return -EBUSY;
195 }
196 }
197
198 list_for_each_entry(producer, &producers, node) {
199 if (producer->token == consumer->token) {
200 int ret = __connect(producer, consumer);
201 if (ret) {
202 mutex_unlock(&lock);
203 module_put(THIS_MODULE);
204 return ret;
205 }
206 break;
207 }
208 }
209
210 list_add(&consumer->node, &consumers);
211
212 mutex_unlock(&lock);
213
214 return 0;
215}
216EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
217
218/**
219 * irq_bypass_unregister_consumer - unregister IRQ bypass consumer
220 * @consumer: pointer to consumer structure
221 *
222 * Remove a previously registered IRQ consumer from the list of consumers
223 * and disconnect it from any connected IRQ producer.
224 */
225void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
226{
227 struct irq_bypass_consumer *tmp;
228 struct irq_bypass_producer *producer;
229
230 might_sleep();
231
232 if (!try_module_get(THIS_MODULE))
233 return; /* nothing in the list anyway */
234
235 mutex_lock(&lock);
236
237 list_for_each_entry(tmp, &consumers, node) {
238 if (tmp->token != consumer->token)
239 continue;
240
241 list_for_each_entry(producer, &producers, node) {
242 if (producer->token == consumer->token) {
243 __disconnect(producer, consumer);
244 break;
245 }
246 }
247
248 list_del(&consumer->node);
249 module_put(THIS_MODULE);
250 break;
251 }
252
253 mutex_unlock(&lock);
254
255 module_put(THIS_MODULE);
256}
257EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);
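Finally, for orientation on the new manager: a consumer plugs in by filling an irq_bypass_consumer with a token shared with its producer plus add_producer/del_producer callbacks, then calling irq_bypass_register_consumer(), exactly as the irqfd hunk in virt/kvm/eventfd.c above does. A hedged sketch of that shape, with hypothetical my_* names that are not part of this merge:

	/* Sketch only: would live in a module that includes <linux/irqbypass.h>. */
	static int my_add_producer(struct irq_bypass_consumer *cons,
				   struct irq_bypass_producer *prod)
	{
		/* Wire prod->irq through to the consumer (e.g. posted-interrupt setup). */
		return 0;
	}

	static void my_del_producer(struct irq_bypass_consumer *cons,
				    struct irq_bypass_producer *prod)
	{
		/* Undo whatever my_add_producer() configured. */
	}

	static struct irq_bypass_consumer my_consumer = {
		.add_producer = my_add_producer,	/* required by register_consumer() */
		.del_producer = my_del_producer,	/* required by register_consumer() */
	};

	/*
	 * my_consumer.token = shared_token;	// must equal the producer's token
	 * ret = irq_bypass_register_consumer(&my_consumer);
	 */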