From 2cef4deb4018c02fb3cd08f76c8a988f7ddee480 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 25 Mar 2013 17:22:48 +0100 Subject: KVM: s390: Don't do a gmap update on minor memslot changes Some memslot updates don't affect the gmap implementation, e.g. setting/unsetting dirty tracking. Since a gmap update will cause tlb flushes and segment table invalidations we want to avoid that. Signed-off-by: Christian Borntraeger Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/kvm-s390.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 33161b4a8280..f241e3315ebb 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1007,6 +1007,16 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, { int rc; + /* If the basics of the memslot do not change, we do not want + * to update the gmap. Every update causes several unnecessary + * segment translation exceptions. This is usually handled just + * fine by the normal fault handler + gmap, but it will also + * cause faults on the prefix page of running guest CPUs. + */ + if (old->userspace_addr == mem->userspace_addr && + old->base_gfn * PAGE_SIZE == mem->guest_phys_addr && + old->npages * PAGE_SIZE == mem->memory_size) + return; rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr, mem->guest_phys_addr, mem->memory_size); -- cgit v1.2.2 From d21683ea1f1b03823928a98b6380332b9385e3a7 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Mar 2013 17:22:49 +0100 Subject: KVM: s390: fix 24 bit psw handling in lpsw/lpswe handler When checking for validity, the lpsw/lpswe handlers check that only the lower 20 bits instead of 24 bits have a non-zero value, thereby treating valid psws as invalid ones. Fix the 24 bit psw mask. Signed-off-by: Heiko Carstens Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/priv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 7db2ad076f31..7b397b37d11a 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -255,7 +255,7 @@ static void handle_new_psw(struct kvm_vcpu *vcpu) #define PSW_MASK_ADDR_MODE (PSW_MASK_EA | PSW_MASK_BA) #define PSW_MASK_UNASSIGNED 0xb80800fe7fffffffUL -#define PSW_ADDR_24 0x00000000000fffffUL +#define PSW_ADDR_24 0x0000000000ffffffUL #define PSW_ADDR_31 0x000000007fffffffUL int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From ace5058763b72d128efcbe27969e89226c9c593a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Mar 2013 17:22:50 +0100 Subject: KVM: s390: fix psw conversion in lpsw handler When converting a 64 bit psw to a 128 bit psw the addressing mode bit of the "addr" part of the 64 bit psw must be moved to the basic addressing mode bit of the "mask" part of the 128 bit psw. In addition the addressing mode bit must be cleared when moved to the "addr" part of the 128 bit psw. Otherwise an invalid psw would be generated if the original psw was in the 31 bit addressing mode.
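For illustration (not part of the patch), the conversion can be sketched as a standalone snippet. The PSW32_* constants mirror the s390 compat definitions of that era and are assumptions here, not authoritative values; note that the addressing mode bit of the compat address word lands, unshifted, on the basic addressing (BA) bit in the low word of the 64 bit mask:

    #include <stdint.h>

    #define PSW32_MASK_BASE  0x00080000u  /* assumed value, cf. asm/compat.h */
    #define PSW32_ADDR_AMODE 0x80000000u  /* assumed value, cf. asm/compat.h */

    struct psw128 {
            uint64_t mask;
            uint64_t addr;
    };

    /* Expand a 64 bit compat psw (mask32:addr32) into a 128 bit psw,
     * mirroring the corrected lpsw handler logic. */
    static struct psw128 expand_compat_psw(uint32_t mask32, uint32_t addr32)
    {
            struct psw128 p;

            /* widen the mask; the base bit has no 128 bit counterpart */
            p.mask = (uint64_t)(mask32 & ~PSW32_MASK_BASE) << 32;
            /* move the addressing mode bit into the mask (it becomes BA)... */
            p.mask |= addr32 & PSW32_ADDR_AMODE;
            /* ...and clear it from the address, otherwise a psw built from
             * a 31 bit mode psw would be invalid */
            p.addr = addr32 & ~PSW32_ADDR_AMODE;
            return p;
    }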
Signed-off-by: Heiko Carstens Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/priv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 7b397b37d11a..844a2b986112 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -286,7 +286,8 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gpsw.mask = (new_psw.mask & ~PSW32_MASK_BASE) << 32; - vcpu->arch.sie_block->gpsw.addr = new_psw.addr; + vcpu->arch.sie_block->gpsw.mask |= new_psw.addr & PSW32_ADDR_AMODE; + vcpu->arch.sie_block->gpsw.addr = new_psw.addr & ~PSW32_ADDR_AMODE; if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) || (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) && -- cgit v1.2.2 From 6fd0fcc93b1eaf82911782de5c7aa35c174bf620 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Mar 2013 17:22:51 +0100 Subject: KVM: s390: fix return code handling in lpsw/lpswe handlers kvm_s390_inject_program_int() may return with a non-zero return value, in case of an error (out of memory). Report that to the calling functions instead of ignoring the error case. Signed-off-by: Heiko Carstens Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/priv.c | 44 ++++++++++++++------------------------------ 1 file changed, 14 insertions(+), 30 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 844a2b986112..9d32c56fb02c 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -269,20 +269,14 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) addr = kvm_s390_get_base_disp_s(vcpu); - if (addr & 7) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (addr & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out; - } + if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (!(new_psw.mask & PSW32_MASK_BASE)) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (!(new_psw.mask & PSW32_MASK_BASE)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); vcpu->arch.sie_block->gpsw.mask = (new_psw.mask & ~PSW32_MASK_BASE) << 32; @@ -293,13 +287,10 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) && (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) || ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) == - PSW_MASK_EA)) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + PSW_MASK_EA)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); handle_new_psw(vcpu); -out: return 0; } @@ -310,15 +301,11 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) addr = kvm_s390_get_base_disp_s(vcpu); - if (addr & 7) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (addr & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out; - } + if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); vcpu->arch.sie_block->gpsw.mask = new_psw.mask; vcpu->arch.sie_block->gpsw.addr = new_psw.addr; @@ -330,13 +317,10 @@ static 
int handle_lpswe(struct kvm_vcpu *vcpu) (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) && (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) || ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) == - PSW_MASK_EA)) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + PSW_MASK_EA)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); handle_new_psw(vcpu); -out: return 0; } -- cgit v1.2.2 From 3736b874a39a1df2a94186c357aabeb6a7d7d4f6 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Mar 2013 17:22:52 +0100 Subject: KVM: s390: make if statements in lpsw/lpswe handlers readable Being unable to parse the 5- and 8-line if statements I had to split them to be able to make any sense of them and verify that they match the architecture. So change the code since I guess that other people will also have a hard time parsing such long conditional statements with line breaks. Introduce a common is_valid_psw() function which does all the checks needed. In case of lpsw (64 bit psw -> 128 bit psw conversion) it will do some not needed additional checks, since a couple of bits can't be set anyway, but that doesn't hurt. Signed-off-by: Heiko Carstens Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/priv.c | 58 ++++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 34 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 9d32c56fb02c..05d186c21eca 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -258,68 +258,58 @@ static void handle_new_psw(struct kvm_vcpu *vcpu) #define PSW_ADDR_24 0x0000000000ffffffUL #define PSW_ADDR_31 0x000000007fffffffUL +static int is_valid_psw(psw_t *psw) { + if (psw->mask & PSW_MASK_UNASSIGNED) + return 0; + if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_BA) { + if (psw->addr & ~PSW_ADDR_31) + return 0; + } + if (!(psw->mask & PSW_MASK_ADDR_MODE) && (psw->addr & ~PSW_ADDR_24)) + return 0; + if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_EA) + return 0; + return 1; +} + int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) { - u64 addr; + psw_t *gpsw = &vcpu->arch.sie_block->gpsw; psw_compat_t new_psw; + u64 addr; - if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + if (gpsw->mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OPERATION); - addr = kvm_s390_get_base_disp_s(vcpu); - if (addr & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (!(new_psw.mask & PSW32_MASK_BASE)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - - vcpu->arch.sie_block->gpsw.mask = - (new_psw.mask & ~PSW32_MASK_BASE) << 32; - vcpu->arch.sie_block->gpsw.mask |= new_psw.addr & PSW32_ADDR_AMODE; - vcpu->arch.sie_block->gpsw.addr = new_psw.addr & ~PSW32_ADDR_AMODE; - - if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) || - (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) && - (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) || - ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) == - PSW_MASK_EA)) + gpsw->mask = (new_psw.mask & ~PSW32_MASK_BASE) << 32; + gpsw->mask |= new_psw.addr & PSW32_ADDR_AMODE; + gpsw->addr = new_psw.addr & ~PSW32_ADDR_AMODE; + if (!is_valid_psw(gpsw)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - handle_new_psw(vcpu); return 0; } static int handle_lpswe(struct kvm_vcpu *vcpu) 
{ - u64 addr; psw_t new_psw; + u64 addr; addr = kvm_s390_get_base_disp_s(vcpu); - if (addr & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - - vcpu->arch.sie_block->gpsw.mask = new_psw.mask; - vcpu->arch.sie_block->gpsw.addr = new_psw.addr; - - if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) || - (((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) == - PSW_MASK_BA) && - (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_31)) || - (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) && - (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) || - ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) == - PSW_MASK_EA)) + vcpu->arch.sie_block->gpsw = new_psw; + if (!is_valid_psw(&vcpu->arch.sie_block->gpsw)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - handle_new_psw(vcpu); return 0; } -- cgit v1.2.2 From db4a29cb6ac7b2fda505923bdbc58fc35a719f62 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Mar 2013 17:22:53 +0100 Subject: KVM: s390: fix and enforce return code handling for irq injections kvm_s390_inject_program_int() and friends may fail if no memory is available. This must be reported to the calling functions, so that this gets passed down to user space which should fix the situation. Alternatively we end up with guest state corruption. So fix this and enforce return value checking by adding a __must_check annotation to all of these function prototypes. Signed-off-by: Heiko Carstens Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/intercept.c | 12 +++---- arch/s390/kvm/kvm-s390.c | 3 +- arch/s390/kvm/kvm-s390.h | 12 +++---- arch/s390/kvm/priv.c | 83 +++++++++++++++-------------------------------- 4 files changed, 37 insertions(+), 73 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index c6ba4dfd7f1e..b7d1b2edeeb3 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -45,10 +45,8 @@ static int handle_lctlg(struct kvm_vcpu *vcpu) do { rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg], (u64 __user *) useraddr); - if (rc) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - break; - } + if (rc) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); useraddr += 8; if (reg == reg3) break; @@ -79,10 +77,8 @@ static int handle_lctl(struct kvm_vcpu *vcpu) reg = reg1; do { rc = get_guest(vcpu, val, (u32 __user *) useraddr); - if (rc) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - break; - } + if (rc) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul; vcpu->arch.sie_block->gcr[reg] |= val; useraddr += 4; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f241e3315ebb..d05a59c1eea7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -633,8 +633,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) } else { VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); trace_kvm_s390_sie_fault(vcpu); - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - rc = 0; + rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); } } VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 4d89d64a8161..efc14f687265 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -110,12 +110,12 @@ enum hrtimer_restart 
kvm_s390_idle_wakeup(struct hrtimer *timer); void kvm_s390_tasklet(unsigned long parm); void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu); void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu); -int kvm_s390_inject_vm(struct kvm *kvm, - struct kvm_s390_interrupt *s390int); -int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, - struct kvm_s390_interrupt *s390int); -int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); -int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action); +int __must_check kvm_s390_inject_vm(struct kvm *kvm, + struct kvm_s390_interrupt *s390int); +int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, + struct kvm_s390_interrupt *s390int); +int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); +int __must_check kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action); struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, u64 cr6, u64 schid); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 05d186c21eca..23a8370b1045 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -36,31 +36,24 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) operand2 = kvm_s390_get_base_disp_s(vcpu); /* must be word boundary */ - if (operand2 & 3) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (operand2 & 3) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); /* get the value */ - if (get_guest(vcpu, address, (u32 __user *) operand2)) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out; - } + if (get_guest(vcpu, address, (u32 __user *) operand2)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); address = address & 0x7fffe000u; /* make sure that the new value is valid memory */ if (copy_from_guest_absolute(vcpu, &tmp, address, 1) || - (copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1))) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out; - } + (copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1))) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); kvm_s390_set_prefix(vcpu, address); VCPU_EVENT(vcpu, 5, "setting prefix to %x", address); trace_kvm_s390_handle_prefix(vcpu, 1, address); -out: return 0; } @@ -74,49 +67,37 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu) operand2 = kvm_s390_get_base_disp_s(vcpu); /* must be word boundary */ - if (operand2 & 3) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (operand2 & 3) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); address = vcpu->arch.sie_block->prefix; address = address & 0x7fffe000u; /* get the value */ - if (put_guest(vcpu, address, (u32 __user *)operand2)) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out; - } + if (put_guest(vcpu, address, (u32 __user *)operand2)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); VCPU_EVENT(vcpu, 5, "storing prefix to %x", address); trace_kvm_s390_handle_prefix(vcpu, 0, address); -out: return 0; } static int handle_store_cpu_address(struct kvm_vcpu *vcpu) { u64 useraddr; - int rc; vcpu->stat.instruction_stap++; useraddr = kvm_s390_get_base_disp_s(vcpu); - if (useraddr & 1) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (useraddr & 1) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - rc = put_guest(vcpu, vcpu->vcpu_id, (u16 __user *)useraddr); - if (rc) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out; - } + if (put_guest(vcpu, 
vcpu->vcpu_id, (u16 __user *)useraddr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr); trace_kvm_s390_handle_stap(vcpu, useraddr); -out: return 0; } @@ -135,10 +116,8 @@ static int handle_tpi(struct kvm_vcpu *vcpu) int cc; addr = kvm_s390_get_base_disp_s(vcpu); - if (addr & 3) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (addr & 3) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); cc = 0; inti = kvm_s390_get_io_int(vcpu->kvm, vcpu->run->s.regs.crs[6], 0); if (!inti) @@ -167,7 +146,6 @@ no_interrupt: /* Set condition code and we're done. */ vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44; -out: return 0; } @@ -237,12 +215,9 @@ static int handle_stfl(struct kvm_vcpu *vcpu) rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), &facility_list, sizeof(facility_list)); if (rc) - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - else { - VCPU_EVENT(vcpu, 5, "store facility list value %x", - facility_list); - trace_kvm_s390_handle_stfl(vcpu, facility_list); - } + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + VCPU_EVENT(vcpu, 5, "store facility list value %x", facility_list); + trace_kvm_s390_handle_stfl(vcpu, facility_list); return 0; } @@ -317,25 +292,18 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) static int handle_stidp(struct kvm_vcpu *vcpu) { u64 operand2; - int rc; vcpu->stat.instruction_stidp++; operand2 = kvm_s390_get_base_disp_s(vcpu); - if (operand2 & 7) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (operand2 & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - rc = put_guest(vcpu, vcpu->arch.stidp_data, (u64 __user *)operand2); - if (rc) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out; - } + if (put_guest(vcpu, vcpu->arch.stidp_data, (u64 __user *)operand2)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); VCPU_EVENT(vcpu, 5, "%s", "store cpu id"); -out: return 0; } @@ -377,6 +345,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff; u64 operand2; unsigned long mem; + int rc = 0; vcpu->stat.instruction_stsi++; VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2); @@ -412,7 +381,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) } if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); goto out_mem; } trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); @@ -425,7 +394,7 @@ out_mem: out_fail: /* condition code 3 */ vcpu->arch.sie_block->gpsw.mask |= 3ul << 44; - return 0; + return rc; } static const intercept_handler_t b2_handlers[256] = { -- cgit v1.2.2 From c51f068c23c76a86d427260b8219430ee6f99516 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Mar 2013 17:22:54 +0100 Subject: KVM: s390: fix stsi exception handling In case of an exception the guest psw condition code should be left alone. 
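As background for the hunks below (an illustration, not part of the patch): the s390 condition code occupies PSW bits 18-19, which the 64 bit mask word addresses as bits 44-45 counted from the least significant end, hence the 3ul << 44 constants in handle_stsi(). A hypothetical helper showing the update pattern:

    #include <stdint.h>

    /* Set the condition code in a 64 bit psw mask word (sketch). */
    static void set_condition_code(uint64_t *psw_mask, unsigned int cc)
    {
            *psw_mask &= ~(3ul << 44);              /* clear the old cc */
            *psw_mask |= (uint64_t)(cc & 3) << 44;  /* set the new cc */
    }

The point of the fix is that on an addressing exception this update must not happen at all, so the guest's previous condition code survives.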
Signed-off-by: Heiko Carstens Acked-By: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/priv.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 23a8370b1045..de1b1b6128e1 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -343,8 +343,8 @@ static int handle_stsi(struct kvm_vcpu *vcpu) int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28; int sel1 = vcpu->run->s.regs.gprs[0] & 0xff; int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff; + unsigned long mem = 0; u64 operand2; - unsigned long mem; int rc = 0; vcpu->stat.instruction_stsi++; @@ -364,36 +364,36 @@ static int handle_stsi(struct kvm_vcpu *vcpu) case 2: mem = get_zeroed_page(GFP_KERNEL); if (!mem) - goto out_fail; + goto out_no_data; if (stsi((void *) mem, fc, sel1, sel2)) - goto out_mem; + goto out_no_data; break; case 3: if (sel1 != 2 || sel2 != 2) - goto out_fail; + goto out_no_data; mem = get_zeroed_page(GFP_KERNEL); if (!mem) - goto out_fail; + goto out_no_data; handle_stsi_3_2_2(vcpu, (void *) mem); break; default: - goto out_fail; + goto out_no_data; } if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) { rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out_mem; + goto out_exception; } trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); free_page(mem); vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); vcpu->run->s.regs.gprs[0] = 0; return 0; -out_mem: - free_page(mem); -out_fail: +out_no_data: /* condition code 3 */ vcpu->arch.sie_block->gpsw.mask |= 3ul << 44; +out_exception: + free_page(mem); return rc; } -- cgit v1.2.2 From b13b5dc7c96d40ebdadbdb752a92ecde5a9f2914 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Mar 2013 17:22:55 +0100 Subject: KVM: s390: fix compile with !CONFIG_COMPAT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/s390/kvm/priv.c should include both linux/compat.h and asm/compat.h. Fixes this one: In file included from arch/s390/kvm/priv.c:23:0: arch/s390/include/asm/compat.h: In function ‘arch_compat_alloc_user_space’: arch/s390/include/asm/compat.h:258:2: error: implicit declaration of function ‘is_compat_task’ Signed-off-by: Heiko Carstens Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/priv.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index de1b1b6128e1..6bbd7b5a0bbe 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.2 From dd2887e7c36d0be986ef17a9dbec904e3e334566 Mon Sep 17 00:00:00 2001 From: Nick Wang Date: Mon, 25 Mar 2013 17:22:57 +0100 Subject: KVM: s390: Remove the sanity checks for kvm memory slot To model the standby memory with memory_region_add_subregion and friends, the guest would have one or more regions of ram. Remove the check allowing only one memory slot and the check requiring the real address of memory slot starts at zero. 
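What remains after the removal is the alignment requirement visible in the hunk below: a slot must start and end on an s390 segment boundary. A standalone sketch of that predicate (names hypothetical; 0xfffff in the kernel check is SEGMENT_SIZE - 1):

    #include <stdbool.h>
    #include <stdint.h>

    #define SEGMENT_SIZE (1UL << 20)        /* 1MB s390 segment */

    /* A slot is acceptable if its userspace start address and its size
     * are both segment aligned. */
    static bool slot_segment_aligned(uint64_t userspace_addr, uint64_t size)
    {
            return !(userspace_addr & (SEGMENT_SIZE - 1)) &&
                   !(size & (SEGMENT_SIZE - 1));
    }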
Signed-off-by: Nick Wang Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/kvm-s390.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d05a59c1eea7..b322ff15751d 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -977,18 +977,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { - /* A few sanity checks. We can have exactly one memory slot which has - to start at guest virtual zero and which has to be located at a - page boundary in userland and which has to end at a page boundary. - The memory in userland is ok to be fragmented into various different - vmas. It is okay to mmap() and munmap() stuff in this slot after - doing this call at any time */ - - if (mem->slot) - return -EINVAL; - - if (mem->guest_phys_addr) - return -EINVAL; + /* A few sanity checks. We can have memory slots which have to be + located/ended at a segment boundary (1MB). The memory in userland is + ok to be fragmented into various different vmas. It is okay to mmap() + and munmap() stuff in this slot after doing this call at any time */ if (mem->userspace_addr & 0xffffful) return -EINVAL; -- cgit v1.2.2 From e1e2e605c2ad6791ce6346b22443ce611709fa65 Mon Sep 17 00:00:00 2001 From: Nick Wang Date: Mon, 25 Mar 2013 17:22:58 +0100 Subject: KVM: s390: Enable KVM_CAP_NR_MEMSLOTS on s390 Return KVM_USER_MEM_SLOTS in kvm_dev_ioctl_check_extension(). Signed-off-by: Nick Wang Reviewed-by: Christian Borntraeger Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/kvm-s390.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b322ff15751d..c1c7c683fa26 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -149,6 +149,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; + case KVM_CAP_NR_MEMSLOTS: + r = KVM_USER_MEM_SLOTS; + break; case KVM_CAP_S390_COW: r = MACHINE_HAS_ESOP; break; -- cgit v1.2.2 From afd80d85aefac27e6e2f9dc10f60515357c504d2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 28 Mar 2013 17:18:35 +0100 Subject: pmu: prepare for migration support In order to migrate the PMU state correctly, we need to restore the values of MSR_CORE_PERF_GLOBAL_STATUS (a read-only register) and MSR_CORE_PERF_GLOBAL_OVF_CTRL (which has side effects when written). We also need to write the full 40-bit value of the performance counter, which would only be possible with a v3 architectural PMU's full-width counter MSRs. To distinguish host-initiated writes from the guest's, pass the full struct msr_data to kvm_pmu_set_msr. 
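As an illustration of the interface shape (reconstructed from the hunks below; the field names follow the kernel's struct msr_data, which already exists at this point): the structure bundles the MSR index and value with a host_initiated flag, so a handler can tell a migration restore issued via KVM_SET_MSRS from a WRMSR executed by the guest.

    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical mirror of the kernel's struct msr_data. */
    struct msr_data_sketch {
            bool host_initiated;    /* write came from a userspace ioctl */
            uint32_t index;         /* MSR number */
            uint64_t data;          /* value to write */
    };

    /* Sketch of the dispatch pattern for a read-only status MSR, as in
     * the MSR_CORE_PERF_GLOBAL_STATUS hunk: host-initiated writes may
     * restore the saved state, guest writes keep faulting. */
    static int set_global_status(const struct msr_data_sketch *msr,
                                 uint64_t *global_status)
    {
            if (msr->host_initiated) {      /* migration restore */
                    *global_status = msr->data;
                    return 0;
            }
            return 1;                       /* RO MSR for the guest */
    }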
Signed-off-by: Paolo Bonzini Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/pmu.c | 14 +++++++++++--- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b5a64621d5af..3dd84c996d56 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1030,7 +1030,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); -int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); void kvm_deliver_pmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index cfc258a6bf97..c53e797e7369 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -360,10 +360,12 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) return 1; } -int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = &vcpu->arch.pmu; struct kvm_pmc *pmc; + u32 index = msr_info->index; + u64 data = msr_info->data; switch (index) { case MSR_CORE_PERF_FIXED_CTR_CTRL: @@ -375,6 +377,10 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) } break; case MSR_CORE_PERF_GLOBAL_STATUS: + if (msr_info->host_initiated) { + pmu->global_status = data; + return 0; + } break; /* RO MSR */ case MSR_CORE_PERF_GLOBAL_CTRL: if (pmu->global_ctrl == data) @@ -386,7 +392,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { - pmu->global_status &= ~data; + if (!msr_info->host_initiated) + pmu->global_status &= ~data; pmu->global_ovf_ctrl = data; return 0; } @@ -394,7 +401,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) default: if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || (pmc = get_fixed_pmc(pmu, index))) { - data = (s64)(s32)data; + if (!msr_info->host_initiated) + data = (s64)(s32)data; pmc->counter += data - read_pmc(pmc); return 0; } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2aaba814f1c8..78c6f90a60cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2040,7 +2040,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_set_msr(vcpu, msr, data); + return kvm_pmu_set_msr(vcpu, msr_info); if (pr || data != 0) vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " @@ -2086,7 +2086,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_set_msr(vcpu, msr, data); + return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); -- cgit v1.2.2 From 450e0b411f56db9839c9725d4d49e70e24f87ba3 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 29 Mar 2013 14:05:26 +0900 Subject: Revert "KVM: MMU: Move kvm_mmu_free_some_pages() 
into kvm_mmu_alloc_page()" With the following commit, shadow pages can be zapped at random during a shadow page talbe walk: KVM: MMU: Move kvm_mmu_free_some_pages() into kvm_mmu_alloc_page() 7ddca7e43c8f28f9419da81a0e7730b66aa60fe9 This patch reverts it and fixes __direct_map() and FNAME(fetch)(). Signed-off-by: Takuya Yoshikawa Signed-off-by: Gleb Natapov --- arch/x86/kvm/mmu.c | 11 +++++++---- arch/x86/kvm/paging_tmpl.h | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 633e30cfbd63..004cc87b781c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1501,15 +1501,11 @@ static void drop_parent_pte(struct kvm_mmu_page *sp, mmu_spte_clear_no_track(parent_pte); } -static void make_mmu_pages_available(struct kvm_vcpu *vcpu); - static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; - make_mmu_pages_available(vcpu); - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); if (!direct) @@ -2806,6 +2802,7 @@ exit: static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); +static void make_mmu_pages_available(struct kvm_vcpu *vcpu); static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, gfn_t gfn, bool prefault) @@ -2847,6 +2844,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; + make_mmu_pages_available(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, @@ -2924,6 +2922,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { spin_lock(&vcpu->kvm->mmu_lock); + make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL, NULL); ++sp->root_count; @@ -2935,6 +2934,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); + make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL, @@ -2973,6 +2973,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); + make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, 0, ACC_ALL, NULL); root = __pa(sp->spt); @@ -3006,6 +3007,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) return 1; } spin_lock(&vcpu->kvm->mmu_lock); + make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, 0, ACC_ALL, NULL); @@ -3311,6 +3313,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; + make_mmu_pages_available(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index af143f065532..da20860b457a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -627,6 +627,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, goto out_unlock; kvm_mmu_audit(vcpu, 
AUDIT_PRE_PAGE_FAULT); + make_mmu_pages_available(vcpu); if (!force_pt_level) transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, -- cgit v1.2.2 From 44944d4d28948c71b110b09a2e924e505cd39e8b Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Sun, 7 Apr 2013 08:25:18 +0800 Subject: KVM: Call kvm_apic_match_dest() to check destination vcpu For a given vcpu, kvm_apic_match_dest() will tell you whether the vcpu in the destination list quickly. Drop kvm_calculate_eoi_exitmap() and use kvm_apic_match_dest() instead. Signed-off-by: Yang Zhang Signed-off-by: Gleb Natapov --- arch/x86/kvm/lapic.c | 47 ----------------------------------------------- arch/x86/kvm/lapic.h | 4 ---- 2 files changed, 51 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a8e9369f41c5..e2274745ab3d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -145,53 +145,6 @@ static inline int kvm_apic_id(struct kvm_lapic *apic) return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } -void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, - struct kvm_lapic_irq *irq, - u64 *eoi_exit_bitmap) -{ - struct kvm_lapic **dst; - struct kvm_apic_map *map; - unsigned long bitmap = 1; - int i; - - rcu_read_lock(); - map = rcu_dereference(vcpu->kvm->arch.apic_map); - - if (unlikely(!map)) { - __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap); - goto out; - } - - if (irq->dest_mode == 0) { /* physical mode */ - if (irq->delivery_mode == APIC_DM_LOWEST || - irq->dest_id == 0xff) { - __set_bit(irq->vector, - (unsigned long *)eoi_exit_bitmap); - goto out; - } - dst = &map->phys_map[irq->dest_id & 0xff]; - } else { - u32 mda = irq->dest_id << (32 - map->ldr_bits); - - dst = map->logical_map[apic_cluster_id(map, mda)]; - - bitmap = apic_logical_id(map, mda); - } - - for_each_set_bit(i, &bitmap, 16) { - if (!dst[i]) - continue; - if (dst[i]->vcpu == vcpu) { - __set_bit(irq->vector, - (unsigned long *)eoi_exit_bitmap); - break; - } - } - -out: - rcu_read_unlock(); -} - static void recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 2c721b986eec..baa20cfb95ae 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -160,10 +160,6 @@ static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) return ldr & map->lid_mask; } -void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, - struct kvm_lapic_irq *irq, - u64 *eoi_bitmap); - static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) { return vcpu->arch.apic->pending_events; -- cgit v1.2.2 From b8c07d55d010702eff61562cf9a77366833d9da2 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 6 Apr 2013 13:51:21 +0200 Subject: KVM: nVMX: Check exit control for VM_EXIT_SAVE_IA32_PAT, not entry controls Obviously a copy&paste mistake: prepare_vmcs12 has to check L1's exit controls for VM_EXIT_SAVE_IA32_PAT. 
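Schematically (an editorial contrast, not extra code in the patch): VM_EXIT_SAVE_IA32_PAT is defined within the VM-exit controls field, so testing it inside vm_entry_controls compares against unrelated or reserved entry-control bits and the old check could pass or fail arbitrarily.

    /* before: an exit-control bit tested in the entry-controls word (bogus) */
    if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT) ...
    /* after: the bit is tested in the field where it is defined */
    if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) ...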
Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 03f574641852..b67f35475b09 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7402,7 +7402,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) /* TODO: These cannot have changed unless we have MSR bitmaps and * the relevant bit asks not to trap the change */ vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT) + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); -- cgit v1.2.2 From 458f212e36b291067b74c0cac2bdcf8278817ee7 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Mon, 8 Apr 2013 15:26:33 +0800 Subject: KVM: x86: fix memory leak in vmx_init Free vmx_msr_bitmap_longmode_x2apic and vmx_msr_bitmap_longmode if kvm_init() fails. Signed-off-by: Yang Zhang Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b67f35475b09..1cf202cb47d5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7741,7 +7741,7 @@ static int __init vmx_init(void) r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) - goto out3; + goto out5; #ifdef CONFIG_KEXEC rcu_assign_pointer(crash_vmclear_loaded_vmcss, @@ -7789,6 +7789,8 @@ static int __init vmx_init(void) return 0; +out5: + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); out4: free_page((unsigned long)vmx_msr_bitmap_longmode); out3: -- cgit v1.2.2 From a63cb56061239372fce2452dbedd35c95bd665aa Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 8 Apr 2013 11:07:46 +0200 Subject: KVM: VMX: Add missing braces to avoid redundant error check The code was already properly aligned, now also add the braces to avoid that err is checked even if alloc_apic_access_page didn't run and change it. Found via Coccinelle by Fengguang Wu. Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1cf202cb47d5..669b80378731 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6785,10 +6785,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) put_cpu(); if (err) goto free_vmcs; - if (vm_need_virtualize_apic_accesses(kvm)) + if (vm_need_virtualize_apic_accesses(kvm)) { err = alloc_apic_access_page(kvm); if (err) goto free_vmcs; + } if (enable_ept) { if (!kvm->arch.ept_identity_map_addr) -- cgit v1.2.2 From fc1b74925f87f6aca5432eb73f6a57eff30afde7 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Fri, 5 Apr 2013 19:20:30 +0000 Subject: KVM: Move vm_list kvm_lock declarations out of x86 The variables vm_list and kvm_lock are common to all architectures, so move the declarations from arch/x86/include/asm/kvm_host.h to include/linux/kvm_host.h. Fixes sparse warnings like these when building for arm64: virt/kvm/kvm_main.c: warning: symbol 'kvm_lock' was not declared. Should it be static? virt/kvm/kvm_main.c: warning: symbol 'vm_list' was not declared. Should it be static? 
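Concretely, the two declarations removed below move verbatim into the generic header; schematically (destination per the commit intent):

    /* include/linux/kvm_host.h - new home for the common declarations */
    extern raw_spinlock_t kvm_lock;
    extern struct list_head vm_list;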
Signed-off-by: Geoff Levand Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3dd84c996d56..628163c0c6e4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -94,9 +94,6 @@ #define ASYNC_PF_PER_VCPU 64 -extern raw_spinlock_t kvm_lock; -extern struct list_head vm_list; - struct kvm_vcpu; struct kvm; struct kvm_async_pf; -- cgit v1.2.2 From e3ba45b8041740f4ab8bbba3c6239876661aeed6 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Fri, 5 Apr 2013 19:20:30 +0000 Subject: KVM: Move kvm_spurious_fault to x86.c The routine kvm_spurious_fault() is an x86 specific routine, so move it from virt/kvm/kvm_main.c to arch/x86/kvm/x86.c. Fixes this sparse warning when building on arm64: virt/kvm/kvm_main.c:warning: symbol 'kvm_spurious_fault' was not declared. Should it be static? Signed-off-by: Geoff Levand Signed-off-by: Gleb Natapov --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78c6f90a60cc..eb9927e0af11 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -261,6 +261,13 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); +asmlinkage void kvm_spurious_fault(void) +{ + /* Fault while not rebooting. We want the trace. */ + BUG(); +} +EXPORT_SYMBOL_GPL(kvm_spurious_fault); + #define EXCPT_BENIGN 0 #define EXCPT_CONTRIBUTORY 1 #define EXCPT_PF 2 -- cgit v1.2.2 From 8b415dcd762607379cf0a69c9dd25940da1d174e Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Fri, 5 Apr 2013 19:20:30 +0000 Subject: KVM: Move kvm_rebooting declaration out of x86 The variable kvm_rebooting is a common kvm variable, so move its declaration from arch/x86/include/asm/kvm_host.h to include/asm/kvm_host.h. Fixes this sparse warning when building on arm64: virt/kvm/kvm_main.c:warning: symbol 'kvm_rebooting' was not declared. Should it be static? Signed-off-by: Geoff Levand Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 628163c0c6e4..b2c7263c93b6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -972,7 +972,6 @@ enum { * Trap the fault and ignore the instruction if that happens. */ asmlinkage void kvm_spurious_fault(void); -extern bool kvm_rebooting; #define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ "666: " insn "\n\t" \ -- cgit v1.2.2 From f8da94e9e44b237fa5cc8521faeb714dc2e83b54 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 11 Apr 2013 14:06:03 +0200 Subject: KVM: x86 emulator: Fix segment loading in VM86 This fixes a regression introduced in commit 03ebebeb1 ("KVM: x86 emulator: Leave segment limit and attributs alone in real mode"). The mentioned commit changed the segment descriptors for both real mode and VM86 to only update the segment base instead of creating a completely new descriptor with limit 0xffff so that unreal mode keeps working across a segment register reload. 
This leads to an invalid segment descriptor in the eyes of VMX, which seems to be okay for real mode because KVM will fix it up before the next VM entry or emulate the state, but it doesn't do this if the guest is in VM86, so we end up with: KVM: entry failed, hardware error 0x80000021 Fix this by effectively reverting commit 03ebebeb1 for VM86 and leaving it only in place for real mode, which is where it's really needed. Signed-off-by: Kevin Wolf Signed-off-by: Gleb Natapov --- arch/x86/kvm/emulate.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a335cc6cde72..069d79926e2b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1578,12 +1578,21 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, memset(&seg_desc, 0, sizeof seg_desc); - if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) - || ctxt->mode == X86EMUL_MODE_REAL) { - /* set real mode segment descriptor */ + if (ctxt->mode == X86EMUL_MODE_REAL) { + /* set real mode segment descriptor (keep limit etc. for + * unreal mode) */ ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg); set_desc_base(&seg_desc, selector << 4); goto load; + } else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) { + /* VM86 needs a clean new segment descriptor */ + set_desc_base(&seg_desc, selector << 4); + set_desc_limit(&seg_desc, 0xffff); + seg_desc.type = 3; + seg_desc.p = 1; + seg_desc.s = 1; + seg_desc.dpl = 3; + goto load; } rpl = selector & 3; -- cgit v1.2.2 From 0b789eee2c0204da83278f181428560faf6efefb Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 11 Apr 2013 11:59:55 +0300 Subject: KVM: emulator: fix unimplemented instruction detection Unimplemented instruction detection is broken for group instructions since it relies on "flags" field of opcode to be zero, but all instructions in a group inherit flags from a group encoding. Fix that by having a separate flag for unimplemented instructions. Signed-off-by: Gleb Natapov --- arch/x86/kvm/emulate.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 069d79926e2b..ab6fda4eb98f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -132,8 +132,9 @@ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) #define PageTable (1 << 29) /* instruction used to write page table */ +#define NotImpl (1 << 30) /* instruction is not implemented */ /* Source 2 operand type */ -#define Src2Shift (30) +#define Src2Shift (31) #define Src2None (OpNone << Src2Shift) #define Src2CL (OpCL << Src2Shift) #define Src2ImmByte (OpImmByte << Src2Shift) @@ -3624,7 +3625,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } #define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ .check_perm = (_p) } -#define N D(0) +#define N D(NotImpl) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } @@ -4382,7 +4383,7 @@ done_prefixes: ctxt->intercept = opcode.intercept; /* Unrecognised? 
*/ - if (ctxt->d == 0 || (ctxt->d & Undefined)) + if (ctxt->d == 0 || (ctxt->d & NotImpl) || (ctxt->d & Undefined)) return EMULATION_FAILED; if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) -- cgit v1.2.2 From 991eebf9f8e523e7ff1e4d31ac80641582b2e57a Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 11 Apr 2013 12:10:51 +0300 Subject: KVM: VMX: do not try to reexecute failed instruction while emulating invalid guest state During invalid guest state emulation vcpu cannot enter guest mode to try to reexecute instruction that emulator failed to emulate, so emulation will happen again and again. Prevent that by telling the emulator that instruction reexecution should not be attempted. Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 13 +++++++++---- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b2c7263c93b6..82f1dc67782f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -795,6 +795,7 @@ enum emulation_result { #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) #define EMULTYPE_RETRY (1 << 3) +#define EMULTYPE_NO_REEXECUTE (1 << 4) int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, void *insn, int insn_len); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 669b80378731..d2686771481f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5189,7 +5189,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) return 1; - err = emulate_instruction(vcpu, 0); + err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); if (err == EMULATE_DO_MMIO) { ret = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eb9927e0af11..999d1243a6ce 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4765,11 +4765,15 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) } static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, - bool write_fault_to_shadow_pgtable) + bool write_fault_to_shadow_pgtable, + int emulation_type) { gpa_t gpa = cr2; pfn_t pfn; + if (emulation_type & EMULTYPE_NO_REEXECUTE) + return false; + if (!vcpu->arch.mmu.direct_map) { /* * Write permission should be allowed since only @@ -4912,8 +4916,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (r != EMULATION_OK) { if (emulation_type & EMULTYPE_TRAP_UD) return EMULATE_FAIL; - if (reexecute_instruction(vcpu, cr2, - write_fault_to_spt)) + if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, + emulation_type)) return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) return EMULATE_FAIL; @@ -4943,7 +4947,8 @@ restart: return EMULATE_DONE; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2, write_fault_to_spt)) + if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, + emulation_type)) return EMULATE_DONE; return handle_emulation_failure(vcpu); -- cgit v1.2.2 From 1146a78b8d88fc0e0ca2ab9c549821242e289432 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 11 Apr 2013 12:30:01 +0300 Subject: KVM: emulator: Do not fail on emulation of undefined opcode Emulation of undefined opcode should inject #UD instead of causing emulation failure. Do that by moving Undefined flag check to emulation stage and injection #UD there. 
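Taken together with the previous patch, the decode/execute split now looks roughly like this (a simplified sketch of the emulator flow, not the literal code):

    /* decode stage: a gap in the emulator, not a guest-visible fault */
    if (ctxt->d == 0 || (ctxt->d & NotImpl))
            return EMULATION_FAILED;

    /* execute stage: the CPU itself would raise #UD here, so the
     * guest must observe the exception */
    if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
        (ctxt->d & Undefined))
            return emulate_ud(ctxt);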
Signed-off-by: Gleb Natapov --- arch/x86/kvm/emulate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ab6fda4eb98f..77b56d5be3e1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4383,7 +4383,7 @@ done_prefixes: ctxt->intercept = opcode.intercept; /* Unrecognised? */ - if (ctxt->d == 0 || (ctxt->d & NotImpl) || (ctxt->d & Undefined)) + if (ctxt->d == 0 || (ctxt->d & NotImpl)) return EMULATION_FAILED; if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) @@ -4521,7 +4521,8 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) ctxt->mem_read.pos = 0; - if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) { + if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || + (ctxt->d & Undefined)) { rc = emulate_ud(ctxt); goto done; } -- cgit v1.2.2 From 188424ba10313bbbc015727d418b7f18593becdd Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 11 Apr 2013 12:32:14 +0300 Subject: KVM: emulator: mark 0xff 0x7d opcode as undefined. Signed-off-by: Gleb Natapov --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 77b56d5be3e1..46f63b8d09f4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3723,7 +3723,7 @@ static const struct opcode group5[] = { I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), I(SrcMem | Stack, em_grp45), I(SrcMemFAddr | ImplicitOps, em_grp45), - I(SrcMem | Stack, em_grp45), N, + I(SrcMem | Stack, em_grp45), D(Undefined), }; static const struct opcode group6[] = { -- cgit v1.2.2 From 3b656cf764cbc43d3efb9bf5f45c618d4cf0989f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 14 Apr 2013 12:12:45 +0200 Subject: KVM: nVMX: Fix injection of PENDING_INTERRUPT and NMI_WINDOW exits to L1 Check if the interrupt or NMI window exit is for L1 by testing if it has the corresponding controls enabled. This is required when we allow direct injection from L0 to L2 Signed-off-by: Jan Kiszka Reviewed-by: Gleb Natapov Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d2686771481f..5ae969b741f8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6112,14 +6112,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_TRIPLE_FAULT: return 1; case EXIT_REASON_PENDING_INTERRUPT: + return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); case EXIT_REASON_NMI_WINDOW: - /* - * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit - * (aka Interrupt Window Exiting) only when L1 turned it on, - * so if we got a PENDING_INTERRUPT exit, this must be for L1. - * Same for NMI Window Exiting. - */ - return 1; + return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); case EXIT_REASON_TASK_SWITCH: return 1; case EXIT_REASON_CPUID: -- cgit v1.2.2 From 5f3d5799974b89100268ba813cec8db7bd0693fb Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 14 Apr 2013 12:12:46 +0200 Subject: KVM: nVMX: Rework event injection and recovery The basic idea is to always transfer the pending event injection on vmexit into the architectural state of the VCPU and then drop it from there if it turns out that we left L2 to enter L1, i.e. if we enter prepare_vmcs12. vmcs12_save_pending_events takes care to transfer pending L0 events into the queue of L1. 
That is mandatory as L1 may decide to switch the guest state completely, invalidating or preserving the pending events for later injection (including on a different node, once we support migration). This concept is based on the rule that a pending vmlaunch/vmresume is not canceled. Otherwise, we would risk to lose injected events or leak them into the wrong queues. Encode this rule via a WARN_ON_ONCE at the entry of nested_vmx_vmexit. Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 99 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 35 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5ae969b741f8..4fb72a764dbd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6493,8 +6493,6 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { - if (is_guest_mode(&vmx->vcpu)) - return; __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, VM_EXIT_INSTRUCTION_LEN, IDT_VECTORING_ERROR_CODE); @@ -6502,8 +6500,6 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) static void vmx_cancel_injection(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) - return; __vmx_complete_interrupts(vcpu, vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), VM_ENTRY_INSTRUCTION_LEN, @@ -6535,21 +6531,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long debugctlmsr; - if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (vmcs12->idt_vectoring_info_field & - VECTORING_INFO_VALID_MASK) { - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - vmcs12->idt_vectoring_info_field); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs12->vm_exit_instruction_len); - if (vmcs12->idt_vectoring_info_field & - VECTORING_INFO_DELIVER_CODE_MASK) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs12->idt_vectoring_error_code); - } - } - /* Record the guest's net vcpu time for enforced NMI injections. 
*/ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) vmx->entry_time = ktime_get(); @@ -6708,17 +6689,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if (is_guest_mode(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info; - if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - vmcs12->idt_vectoring_error_code = - vmcs_read32(IDT_VECTORING_ERROR_CODE); - vmcs12->vm_exit_instruction_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - } - } - vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); @@ -7326,6 +7296,48 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vcpu->arch.cr4_guest_owned_bits)); } +static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 idt_vectoring; + unsigned int nr; + + if (vcpu->arch.exception.pending) { + nr = vcpu->arch.exception.nr; + idt_vectoring = nr | VECTORING_INFO_VALID_MASK; + + if (kvm_exception_is_soft(nr)) { + vmcs12->vm_exit_instruction_len = + vcpu->arch.event_exit_inst_len; + idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; + } else + idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; + + if (vcpu->arch.exception.has_error_code) { + idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; + vmcs12->idt_vectoring_error_code = + vcpu->arch.exception.error_code; + } + + vmcs12->idt_vectoring_info_field = idt_vectoring; + } else if (vcpu->arch.nmi_pending) { + vmcs12->idt_vectoring_info_field = + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; + } else if (vcpu->arch.interrupt.pending) { + nr = vcpu->arch.interrupt.nr; + idt_vectoring = nr | VECTORING_INFO_VALID_MASK; + + if (vcpu->arch.interrupt.soft) { + idt_vectoring |= INTR_TYPE_SOFT_INTR; + vmcs12->vm_entry_instruction_len = + vcpu->arch.event_exit_inst_len; + } else + idt_vectoring |= INTR_TYPE_EXT_INTR; + + vmcs12->idt_vectoring_info_field = idt_vectoring; + } +} + /* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), @@ -7411,15 +7423,29 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - vmcs12->idt_vectoring_info_field = to_vmx(vcpu)->idt_vectoring_info; - vmcs12->idt_vectoring_error_code = - vmcs_read32(IDT_VECTORING_ERROR_CODE); + vmcs12->idt_vectoring_info_field = 0; vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - /* clear vm-entry fields which are to be cleared on exit */ - if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + /* vm_entry_intr_info_field is cleared on exit. Emulate this + * instead of reading the real value. */ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; + + /* + * Transfer the event that L0 or L1 may wanted to inject into + * L2 to IDT_VECTORING_INFO_FIELD. + */ + vmcs12_save_pending_event(vcpu, vmcs12); + } + + /* + * Drop what we picked up for L2 via vmx_complete_interrupts. It is + * preserved above and would only end up incorrectly in L1. 
+ */ + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); } /* @@ -7519,6 +7545,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) int cpu; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + /* trying to cancel vmlaunch/vmresume is a bug */ + WARN_ON_ONCE(vmx->nested.nested_run_pending); + leave_guest_mode(vcpu); prepare_vmcs12(vcpu, vmcs12); -- cgit v1.2.2 From e8457c67a4ec1268ec616bd8be1d9f1cc20f1493 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 14 Apr 2013 12:12:48 +0200 Subject: KVM: nVMX: Fix conditions for interrupt injection If we are entering guest mode, we do not want L0 to interrupt this vmentry with all its side effects on the vmcs. Therefore, injection shall be disallowed during L1->L2 transitions, as in the previous version. However, this check is conceptually independent of nested_exit_on_intr, so decouple it. If L1 traps external interrupts, we can kick the guest from L2 to L1, also just like the previous code worked. But we no longer need to consider L1's idt_vectoring_info_field. It will always be empty at this point. Instead, if L2 has pending events, those are now found in the architectural queues and will, thus, prevent vmx_interrupt_allowed from being called at all. Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4fb72a764dbd..5e6391112275 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4325,16 +4325,20 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { + if (is_guest_mode(vcpu)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (to_vmx(vcpu)->nested.nested_run_pending || - (vmcs12->idt_vectoring_info_field & - VECTORING_INFO_VALID_MASK)) + + if (to_vmx(vcpu)->nested.nested_run_pending) return 0; - nested_vmx_vmexit(vcpu); - vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; - vmcs12->vm_exit_intr_info = 0; - /* fall through to normal code, but now in L1, not L2 */ + if (nested_exit_on_intr(vcpu)) { + nested_vmx_vmexit(vcpu); + vmcs12->vm_exit_reason = + EXIT_REASON_EXTERNAL_INTERRUPT; + vmcs12->vm_exit_intr_info = 0; + /* + * fall through to normal code, but now in L1, not L2 + */ + } } return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && -- cgit v1.2.2 From c0d1c770c05ac7051df86914f9627b68f29c1d67 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 14 Apr 2013 12:12:50 +0200 Subject: KVM: nVMX: Avoid reading VM_EXIT_INTR_ERROR_CODE needlessly on nested exits We only need to update vm_exit_intr_error_code if there is a valid exit interruption information and it comes with a valid error code. 
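For illustration, the condition being introduced reads as a small predicate (hypothetical helper name; the patch below open-codes it):

static inline bool exit_intr_has_error_code(u32 intr_info)
{
	u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK;

	/* VM_EXIT_INTR_ERROR_CODE is only meaningful when the exit
	 * interruption info is valid AND flags a delivered error code. */
	return (intr_info & mask) == mask;
}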
Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5e6391112275..71755573b7ca 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7426,7 +7426,11 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + if ((vmcs12->vm_exit_intr_info & + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) + vmcs12->vm_exit_intr_error_code = + vmcs_read32(VM_EXIT_INTR_ERROR_CODE); vmcs12->idt_vectoring_info_field = 0; vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); -- cgit v1.2.2 From 1fcc7890dbf32571c25278803ee19182c801c006 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:21:35 +0800 Subject: KVM: Add vcpu info to ioapic_update_eoi() Add vcpu info to ioapic_update_eoi, so we can know which vcpu issued this EOI. Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e2274745ab3d..3e22536a7031 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -739,7 +739,7 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); } } -- cgit v1.2.2 From b4f2225c07dd4d8eef7aa7f5b36a3b72c3cbbe5b Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:21:37 +0800 Subject: KVM: Return destination vcpu on interrupt injection Add a new parameter so callers can learn which vcpus received the interrupt.
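As a usage sketch (hypothetical caller; the real consumers of dest_map arrive in the follow-up RTC patches), the new bitmap lets a sender record which vcpus accepted the interrupt, indexed by vcpu_id:

static void probe_irq_destinations(struct kvm *kvm, struct kvm_lapic_irq *irq)
{
	DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS);
	int r, idx;

	bitmap_zero(dest_map, KVM_MAX_VCPUS);
	/* Fast path only; a false return means the slow scan is needed. */
	if (!kvm_irq_delivery_to_apic_fast(kvm, NULL, irq, &r, dest_map))
		return;

	for_each_set_bit(idx, dest_map, KVM_MAX_VCPUS)
		pr_debug("vcpu %d received vector %d\n", idx, irq->vector);
}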
Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 25 ++++++++++++++++--------- arch/x86/kvm/lapic.h | 5 +++-- 2 files changed, 19 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3e22536a7031..0b734025fc87 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -384,14 +384,16 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) } static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode); + int vector, int level, int trig_mode, + unsigned long *dest_map); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, + unsigned long *dest_map) { struct kvm_lapic *apic = vcpu->arch.apic; return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, - irq->level, irq->trig_mode); + irq->level, irq->trig_mode, dest_map); } static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) @@ -564,7 +566,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, } bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r) + struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map) { struct kvm_apic_map *map; unsigned long bitmap = 1; @@ -575,7 +577,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, *r = -1; if (irq->shorthand == APIC_DEST_SELF) { - *r = kvm_apic_set_irq(src->vcpu, irq); + *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); return true; } @@ -620,7 +622,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, continue; if (*r < 0) *r = 0; - *r += kvm_apic_set_irq(dst[i]->vcpu, irq); + *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } ret = true; @@ -634,7 +636,8 @@ out: * Return 1 if successfully added and 0 if discarded. 
*/ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode) + int vector, int level, int trig_mode, + unsigned long *dest_map) { int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; @@ -647,6 +650,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (unlikely(!apic_enabled(apic))) break; + if (dest_map) + __set_bit(vcpu->vcpu_id, dest_map); + if (trig_mode) { apic_debug("level trig mode for vector %d", vector); apic_set_vector(vector, apic->regs + APIC_TMR); @@ -805,7 +811,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, irq.vector); - kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); + kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } static u32 apic_get_tmcct(struct kvm_lapic *apic) @@ -1441,7 +1447,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; - return __apic_accept_irq(apic, mode, vector, 1, trig_mode); + return __apic_accept_irq(apic, mode, vector, 1, trig_mode, + NULL); } return 0; } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index baa20cfb95ae..3e5a43160cb7 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -55,11 +55,12 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, + unsigned long *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r); + struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); -- cgit v1.2.2 From 106069193ce501ea68649037bde7ea50ed492948 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:21:38 +0800 Subject: KVM: Add reset/restore rtc_status support restore rtc_status from migration or save/restore Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 9 +++++++++ arch/x86/kvm/lapic.h | 2 ++ 2 files changed, 11 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0b734025fc87..67962188c775 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -94,6 +94,14 @@ static inline int apic_test_vector(int vec, void *bitmap) return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + return apic_test_vector(vector, apic->regs + APIC_ISR) || + apic_test_vector(vector, apic->regs + APIC_IRR); +} + static inline void apic_set_vector(int vec, void *bitmap) { set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -1618,6 +1626,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, apic->highest_isr_cache = -1; kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_rtc_eoi_tracking_restore_one(vcpu); } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 3e5a43160cb7..16304b1a8cba 100644 --- 
a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -166,4 +166,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) return vcpu->arch.apic->pending_events; } +bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); + #endif -- cgit v1.2.2 From aa2fbe6d44892070d78995f0df875ce930904e29 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:21:40 +0800 Subject: KVM: Let ioapic know the irq line status Userspace may deliver an RTC interrupt without querying the status. So we want to track RTC EOI for this case. Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 4 ++-- arch/x86/kvm/x86.c | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c1d30b2fc9bb..412a5aa0ef94 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -290,8 +290,8 @@ static void pit_do_work(struct kthread_work *work) } spin_unlock(&ps->inject_lock); if (inject) { - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false); + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false); /* * Provides NMI watchdog support via Virtual Wire mode. diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 999d1243a6ce..ae9744d03c83 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3491,13 +3491,15 @@ out: return r; } -int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, + bool line_status) { if (!irqchip_in_kernel(kvm)) return -ENXIO; irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event->irq, irq_event->level); + irq_event->irq, irq_event->level, + line_status); return 0; } -- cgit v1.2.2 From a547c6db4d2f16ba5ce8e7054bffad6acc248d40 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:25:10 +0800 Subject: KVM: VMX: Enable acknowledge interrupt on vmexit The "acknowledge interrupt on exit" feature controls processor behavior for external interrupt acknowledgement. When this control is set, the processor acknowledges the interrupt controller to acquire the interrupt vector on VM exit. After enabling this feature, an interrupt that arrives while the target cpu is running in vmx non-root mode will be handled by the vmx handler instead of the handler in the idt. Currently, the vmx handler only fakes an interrupt stack frame and jumps to the idt table to let the real handler handle it. In a further step, we will recognize the interrupt and deliver only interrupts that do not belong to the current vcpu through the idt table; interrupts belonging to the current vcpu will be handled inside the vmx handler. This will reduce KVM's interrupt handling cost. The interrupt enable logic also changes when this feature is turned on: before this patch, the hypervisor called local_irq_enable() to enable interrupts directly. Now the IF bit is set on the interrupt stack frame, and interrupts are enabled on return from the interrupt handler if an external interrupt exists. If there is no external interrupt, local_irq_enable() is still called. Refer to Intel SDM volume 3, chapter 33.2.
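The heart of the patch is an assembly block that hand-builds the stack frame an interrupt would have pushed and calls into the host IDT. A simplified C-flavored sketch of the intent (is_valid_external_interrupt() and call_host_handler() are hypothetical; the real dispatch must be asm to control the stack layout):

u32 info = vmcs_read32(VM_EXIT_INTR_INFO);

if (is_valid_external_interrupt(info)) {
	unsigned int vector = info & INTR_INFO_VECTOR_MASK;
	gate_desc *desc = (gate_desc *)host_idt_base + vector;

	/* Push SS:RSP (64-bit only), EFLAGS with IF set so the
	 * handler's iret re-enables interrupts, then CS and a return
	 * RIP via the call itself, and jump to the host handler. */
	call_host_handler(gate_offset(*desc));
} else {
	local_irq_enable();
}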
Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 6 +++++ arch/x86/kvm/vmx.c | 58 ++++++++++++++++++++++++++++++++++++++--- arch/x86/kvm/x86.c | 4 ++- 4 files changed, 64 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 82f1dc67782f..68d438630dd3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -727,6 +727,7 @@ struct kvm_x86_ops { int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage); + void (*handle_external_intr)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7a46c1f46861..2f8fe3f06837 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4233,6 +4233,11 @@ out: return ret; } +static void svm_handle_external_intr(struct kvm_vcpu *vcpu) +{ + local_irq_enable(); +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -4328,6 +4333,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_tdp_cr3 = set_tdp_cr3, .check_intercept = svm_check_intercept, + .handle_external_intr = svm_handle_external_intr, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 71755573b7ca..7a7605f0444b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -378,6 +378,7 @@ struct vcpu_vmx { struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; + unsigned long host_idt_base; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; @@ -2627,7 +2628,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif - opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; + opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | + VM_EXIT_ACK_INTR_ON_EXIT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -3879,7 +3881,7 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr) * Note that host-state that does change is set elsewhere. E.g., host-state * that is set differently for each CPU is set in vmx_vcpu_load(), not here. */ -static void vmx_set_constant_host_state(void) +static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) { u32 low32, high32; unsigned long tmpl; @@ -3907,6 +3909,7 @@ static void vmx_set_constant_host_state(void) native_store_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ + vmx->host_idt_base = dt.address; vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ @@ -4039,7 +4042,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ - vmx_set_constant_host_state(); + vmx_set_constant_host_state(vmx); #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ @@ -6399,6 +6402,52 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) } } +static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) +{ + u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + /* + * If external interrupt exists, IF bit is set in rflags/eflags on the + * interrupt stack frame, and interrupt will be enabled on a return + * from interrupt handler. 
+ */ + if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { + unsigned int vector; + unsigned long entry; + gate_desc *desc; + struct vcpu_vmx *vmx = to_vmx(vcpu); +#ifdef CONFIG_X86_64 + unsigned long tmp; +#endif + + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + desc = (gate_desc *)vmx->host_idt_base + vector; + entry = gate_offset(*desc); + asm volatile( +#ifdef CONFIG_X86_64 + "mov %%" _ASM_SP ", %[sp]\n\t" + "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" + "push $%c[ss]\n\t" + "push %[sp]\n\t" +#endif + "pushf\n\t" + "orl $0x200, (%%" _ASM_SP ")\n\t" + __ASM_SIZE(push) " $%c[cs]\n\t" + "call *%[entry]\n\t" + : +#ifdef CONFIG_X86_64 + [sp]"=&r"(tmp) +#endif + : + [entry]"r"(entry), + [ss]"i"(__KERNEL_DS), + [cs]"i"(__KERNEL_CS) + ); + } else + local_irq_enable(); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -7041,7 +7090,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * Other fields are different per CPU, and will be set later when * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. */ - vmx_set_constant_host_state(); + vmx_set_constant_host_state(vmx); /* * HOST_RSP is normally set correctly in vmx_vcpu_run() just before @@ -7718,6 +7767,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_tdp_cr3 = vmx_set_cr3, .check_intercept = vmx_check_intercept, + .handle_external_intr = vmx_handle_external_intr, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ae9744d03c83..1562671a8e18 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5820,7 +5820,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - local_irq_enable(); + + /* Interrupt is enabled by handle_external_intr() */ + kvm_x86_ops->handle_external_intr(vcpu); ++vcpu->stat.exits; -- cgit v1.2.2 From d78f2664832f8d70e36422af9a10e44276dced48 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:25:11 +0800 Subject: KVM: VMX: Register a new IPI for posted interrupt The Posted Interrupt feature requires a special IPI to deliver posted interrupts to the guest, and it should have a high priority so the interrupt will not be blocked by others. Normally, the posted interrupt is consumed by the vcpu while the target vcpu is running, transparently to the OS. But in some cases the interrupt will arrive when the target vcpu is scheduled out, and the host will see it. So we need to register a dummy handler for it.
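For orientation, a condensed sketch of the plumbing the diff below adds (all names from the patch; the IPI is only seen by this handler when it arrives in root mode):

/* irq_vectors.h: reserve a vector in the highest priority class. */
#define POSTED_INTR_VECTOR	0xf2

/* irqinit.c: install the gate during APIC init. */
alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);

/* irq.c: smp_kvm_posted_intr_ipi() merely acks the APIC and bumps a
 * per-cpu counter; if the target vcpu is in non-root mode the CPU
 * consumes the notification itself and this handler never runs. */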
Signed-off-by: Yang Zhang Acked-by: Ingo Molnar Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/entry_arch.h | 4 ++++ arch/x86/include/asm/hardirq.h | 3 +++ arch/x86/include/asm/hw_irq.h | 1 + arch/x86/include/asm/irq_vectors.h | 5 +++++ arch/x86/kernel/entry_64.S | 5 +++++ arch/x86/kernel/irq.c | 22 ++++++++++++++++++++++ arch/x86/kernel/irqinit.c | 4 ++++ 7 files changed, 44 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 40afa0005c69..9bd4ecac72be 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -19,6 +19,10 @@ BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) +#ifdef CONFIG_HAVE_KVM +BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR) +#endif + /* * every pentium local APIC has two 'local interrupts', with a * soft-definable vector attached to both interrupts, one of diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 81f04cee5f74..ab0ae1aa6d0a 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -11,6 +11,9 @@ typedef struct { unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; unsigned int icr_read_retry_count; +#endif +#ifdef CONFIG_HAVE_KVM + unsigned int kvm_posted_intr_ipis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 10a78c3d3d5a..1da97efad08a 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -28,6 +28,7 @@ /* Interrupt handlers registered during init_IRQ */ extern void apic_timer_interrupt(void); extern void x86_platform_ipi(void); +extern void kvm_posted_intr_ipi(void); extern void error_interrupt(void); extern void irq_work_interrupt(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index aac5fa62a86c..5702d7e3111d 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -102,6 +102,11 @@ */ #define X86_PLATFORM_IPI_VECTOR 0xf7 +/* Vector for KVM to deliver posted interrupt IPI */ +#ifdef CONFIG_HAVE_KVM +#define POSTED_INTR_VECTOR 0xf2 +#endif + /* * IRQ work vector: */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c1d01e6ca790..727208941030 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1166,6 +1166,11 @@ apicinterrupt LOCAL_TIMER_VECTOR \ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi +#ifdef CONFIG_HAVE_KVM +apicinterrupt POSTED_INTR_VECTOR \ + kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +#endif + apicinterrupt THRESHOLD_APIC_VECTOR \ threshold_interrupt smp_threshold_interrupt apicinterrupt THERMAL_APIC_VECTOR \ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e4595f105910..6ae6ea1d27d9 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -228,6 +228,28 @@ void smp_x86_platform_ipi(struct pt_regs *regs) set_irq_regs(old_regs); } +#ifdef CONFIG_HAVE_KVM +/* + * Handler for POSTED_INTERRUPT_VECTOR. 
+ */ +void smp_kvm_posted_intr_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + + irq_enter(); + + exit_idle(); + + inc_irq_stat(kvm_posted_intr_ipis); + + irq_exit(); + + set_irq_regs(old_regs); +} +#endif + EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 7dc4e459c2b3..a2a1fbc594ff 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -172,6 +172,10 @@ static void __init apic_intr_init(void) /* IPI for X86 platform specific use */ alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); +#ifdef CONFIG_HAVE_KVM + /* IPI for KVM to deliver posted interrupt */ + alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); +#endif /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); -- cgit v1.2.2 From 01e439be7753c163932538276f04f95cb1b66697 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:25:12 +0800 Subject: KVM: VMX: Check the posted interrupt capability Detect the posted interrupt feature. If it exists, then set it in vmcs_config. Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/vmx.h | 4 +++ arch/x86/kvm/vmx.c | 82 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 66 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index fc1c3134473b..6f07f1999138 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -71,6 +71,7 @@ #define PIN_BASED_NMI_EXITING 0x00000008 #define PIN_BASED_VIRTUAL_NMIS 0x00000020 #define PIN_BASED_VMX_PREEMPTION_TIMER 0x00000040 +#define PIN_BASED_POSTED_INTR 0x00000080 #define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016 @@ -102,6 +103,7 @@ /* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, + POSTED_INTR_NV = 0x00000002, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -136,6 +138,8 @@ enum vmcs_field { VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, APIC_ACCESS_ADDR = 0x00002014, APIC_ACCESS_ADDR_HIGH = 0x00002015, + POSTED_INTR_DESC_ADDR = 0x00002016, + POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, EPT_POINTER = 0x0000201a, EPT_POINTER_HIGH = 0x0000201b, EOI_EXIT_BITMAP0 = 0x0000201c, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7a7605f0444b..7607c6c09c74 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -84,7 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); -static bool __read_mostly enable_apicv_reg_vid; +static bool __read_mostly enable_apicv; +module_param(enable_apicv, bool, S_IRUGO); /* * If nested=1, nested virtualization is supported, i.e., guests may use @@ -366,6 +367,14 @@ struct nested_vmx { struct page *apic_access_page; }; +#define POSTED_INTR_ON 0 +/* Posted-Interrupt Descriptor */ +struct pi_desc { + u32 pir[8]; /* Posted interrupt requested */ + u32 control; /* bit 0 of control is outstanding notification bit */ + u32 rsvd[7]; +} __aligned(64); + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -430,6 +439,9 @@ struct vcpu_vmx { bool rdtscp_enabled; + /* Posted interrupt descriptor */ + struct pi_desc pi_desc; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; }; @@ -785,6 +797,18 @@ static inline bool 
cpu_has_vmx_virtual_intr_delivery(void) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } +static inline bool cpu_has_vmx_posted_intr(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; +} + +static inline bool cpu_has_vmx_apicv(void) +{ + return cpu_has_vmx_apic_register_virt() && + cpu_has_vmx_virtual_intr_delivery() && + cpu_has_vmx_posted_intr(); +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && @@ -2552,12 +2576,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) u32 _vmexit_control = 0; u32 _vmentry_control = 0; - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, - &_pin_based_exec_control) < 0) - return -EIO; - min = CPU_BASED_HLT_EXITING | #ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | @@ -2634,6 +2652,17 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_vmexit_control) < 0) return -EIO; + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, + &_pin_based_exec_control) < 0) + return -EIO; + + if (!(_cpu_based_2nd_exec_control & + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || + !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) + _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; + min = 0; opt = VM_ENTRY_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, @@ -2812,11 +2841,10 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ple()) ple_gap = 0; - if (!cpu_has_vmx_apic_register_virt() || - !cpu_has_vmx_virtual_intr_delivery()) - enable_apicv_reg_vid = 0; + if (!cpu_has_vmx_apicv()) + enable_apicv = 0; - if (enable_apicv_reg_vid) + if (enable_apicv) kvm_x86_ops->update_cr8_intercept = NULL; else kvm_x86_ops->hwapic_irr_update = NULL; @@ -3875,6 +3903,11 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr) msr, MSR_TYPE_W); } +static int vmx_vm_has_apicv(struct kvm *kvm) +{ + return enable_apicv && irqchip_in_kernel(kvm); +} + /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. 
@@ -3935,6 +3968,15 @@ static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); } +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) +{ + u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; + + if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + return pin_based_exec_ctrl; +} + static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; @@ -3952,11 +3994,6 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } -static int vmx_vm_has_apicv(struct kvm *kvm) -{ - return enable_apicv_reg_vid && irqchip_in_kernel(kvm); -} - static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; @@ -4012,8 +4049,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - vmcs_config.pin_based_exec_ctrl); + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); @@ -4022,13 +4058,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx_secondary_exec_control(vmx)); } - if (enable_apicv_reg_vid) { + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); vmcs_write64(EOI_EXIT_BITMAP3, 0); vmcs_write16(GUEST_INTR_STATUS, 0); + + vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } if (ple_gap) { @@ -4170,6 +4209,9 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + if (vmx_vm_has_apicv(vcpu->kvm)) + memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); + if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); @@ -7842,7 +7884,7 @@ static int __init vmx_init(void) memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, PAGE_SIZE); - if (enable_apicv_reg_vid) { + if (enable_apicv) { for (msr = 0x800; msr <= 0x8ff; msr++) vmx_disable_intercept_msr_read_x2apic(msr); -- cgit v1.2.2 From 3d81bc7e96d6bca0b8f8b7d1bf6ea72caa3aac57 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:25:13 +0800 Subject: KVM: Call common update function when ioapic entry changed. Both the TMR and the EOI exit bitmap need to be updated when the ioapic changes or a vcpu's id/ldr/dfr changes. So use a common update function instead of the EOI-exit-bitmap-specific one.
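The common helper lives in virt/kvm/ and is therefore cropped from this (limited to 'arch') diff; presumably it amounts to a sketch like the following, requesting a rescan on every vcpu:

void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu;
	int i;

	/* Each vcpu rebuilds its EOI exit bitmap and TMR on next entry. */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_make_request(KVM_REQ_SCAN_IOAPIC, vcpu);
}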
Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/ia64/kvm/lapic.h | 6 ------ arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/vmx.c | 3 +++ arch/x86/kvm/x86.c | 11 +++++++---- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h index c3e2935b6db4..c5f92a926a9a 100644 --- a/arch/ia64/kvm/lapic.h +++ b/arch/ia64/kvm/lapic.h @@ -27,10 +27,4 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); #define kvm_apic_present(x) (true) #define kvm_lapic_enabled(x) (true) -static inline bool kvm_apic_vid_enabled(void) -{ - /* IA64 has no apicv supporting, do nothing here */ - return false; -} - #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 67962188c775..34a8ca845280 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -217,7 +217,7 @@ out: if (old) kfree_rcu(old, rcu); - kvm_ioapic_make_eoibitmap_request(kvm); + kvm_vcpu_request_scan_ioapic(kvm); } static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7607c6c09c74..8535519d0352 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6414,6 +6414,9 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { + if (!vmx_vm_has_apicv(vcpu->kvm)) + return; + vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1562671a8e18..87a05df3eae4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5661,13 +5661,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) #endif } -static void update_eoi_exitmap(struct kvm_vcpu *vcpu) +static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; + if (!kvm_apic_hw_enabled(vcpu->arch.apic)) + return; + memset(eoi_exit_bitmap, 0, 32); - kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap); + kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap); kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } @@ -5724,8 +5727,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_handle_pmu_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) kvm_deliver_pmi(vcpu); - if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu)) - update_eoi_exitmap(vcpu); + if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) + vcpu_scan_ioapic(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { -- cgit v1.2.2 From cf9e65b773394c5ad8fa7455c43268bc8ec2109f Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:25:14 +0800 Subject: KVM: Set TMR when programming ioapic entry We already know the trigger mode of a given interrupt when programming the ioapic entry. So it's not necessary to set it on each interrupt delivery.
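The ioapic-side computation is likewise outside the 'arch' filter; a sketch of the idea, assuming kvm_ioapic_scan_entry() walks each redirection entry e that targets this vcpu:

/* Record level-triggered vectors in the tmr bitmap while scanning,
 * instead of toggling APIC_TMR on every delivery. */
if (kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id,
			e->fields.dest_mode) &&
    e->fields.trig_mode == IOAPIC_LEVEL_TRIG)
	__set_bit(e->fields.vector, (unsigned long *)tmr);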
Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 15 +++++++++------ arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/x86.c | 5 ++++- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 34a8ca845280..d197579435d0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -468,6 +468,15 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) return result; } +void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + int i; + + for (i = 0; i < 8; i++) + apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]); +} + static void apic_update_ppr(struct kvm_lapic *apic) { u32 tpr, isrv, ppr, old_ppr; @@ -661,12 +670,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); - if (trig_mode) { - apic_debug("level trig mode for vector %d", vector); - apic_set_vector(vector, apic->regs + APIC_TMR); - } else - apic_clear_vector(vector, apic->regs + APIC_TMR); - result = !apic_test_and_set_irr(vector, apic); trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector, !result); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 16304b1a8cba..7fe0c9180ea1 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -53,6 +53,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); +void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 87a05df3eae4..276b4a9a5605 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5664,14 +5664,17 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; + u32 tmr[8]; if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; memset(eoi_exit_bitmap, 0, 32); + memset(tmr, 0, 32); - kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap); + kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr); kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); + kvm_apic_update_tmr(vcpu, tmr); } static int vcpu_enter_guest(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From a20ed54d6e470bf0d28921b7aadb6ca0da0ff0c3 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:25:15 +0800 Subject: KVM: VMX: Add the deliver posted interrupt algorithm Only deliver the posted interrupt when target vcpu is running and there is no previous interrupt pending in pir. 
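For orientation, an annotated restatement of the vmx_deliver_posted_interrupt() added by the diff below:

static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int r;

	/* Mark the vector pending in PIR; if the bit was already set,
	 * an earlier, still-outstanding notification covers it. */
	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
		return;

	/* Raise ON; r != 0 means a notification IPI is already in flight. */
	r = pi_test_and_set_on(&vmx->pi_desc);
	kvm_make_request(KVM_REQ_EVENT, vcpu);
	if (!r && (vcpu->mode == IN_GUEST_MODE))
		/* Running in non-root mode: the CPU consumes the IPI and
		 * syncs PIR into the vIRR atomically, without an exit. */
		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
				    POSTED_INTR_VECTOR);
	else
		/* Not running: kick so the next vmentry picks up PIR. */
		kvm_vcpu_kick(vcpu);
}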
Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/lapic.c | 13 +++++++++ arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/svm.c | 6 ++++ arch/x86/kvm/vmx.c | 64 ++++++++++++++++++++++++++++++++++++++++- 5 files changed, 85 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 68d438630dd3..599f98b612d4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -701,6 +701,8 @@ struct kvm_x86_ops { void (*hwapic_isr_update)(struct kvm *kvm, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); + void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); + void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d197579435d0..dbf74c922aa1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -318,6 +318,19 @@ static u8 count_vectors(void *bitmap) return count; } +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +{ + u32 i, pir_val; + struct kvm_lapic *apic = vcpu->arch.apic; + + for (i = 0; i <= 7; i++) { + pir_val = xchg(&pir[i], 0); + if (pir_val) + *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val; + } +} +EXPORT_SYMBOL_GPL(kvm_apic_update_irr); + static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) { apic->irr_pending = true; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 7fe0c9180ea1..c730ac9fe801 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -54,6 +54,7 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2f8fe3f06837..d6713e18bbc1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3577,6 +3577,11 @@ static void svm_hwapic_isr_update(struct kvm *kvm, int isr) return; } +static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + return; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4305,6 +4310,7 @@ static struct kvm_x86_ops svm_x86_ops = { .vm_has_apicv = svm_vm_has_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_isr_update = svm_hwapic_isr_update, + .sync_pir_to_irr = svm_sync_pir_to_irr, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8535519d0352..3a14d8a0ee46 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -375,6 +375,23 @@ struct pi_desc { u32 rsvd[7]; } __aligned(64); +static bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + 
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -639,6 +656,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); +static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -2846,8 +2864,11 @@ static __init int hardware_setup(void) if (enable_apicv) kvm_x86_ops->update_cr8_intercept = NULL; - else + else { kvm_x86_ops->hwapic_irr_update = NULL; + kvm_x86_ops->deliver_posted_interrupt = NULL; + kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; + } if (nested) nested_vmx_setup_ctls_msrs(); @@ -3908,6 +3929,45 @@ static int vmx_vm_has_apicv(struct kvm *kvm) return enable_apicv && irqchip_in_kernel(kvm); } +/* + * Send interrupt to vcpu via posted interrupt way. + * 1. If target vcpu is running(non-root mode), send posted interrupt + * notification to vcpu and hardware will sync PIR to vIRR atomically. + * 2. If target vcpu isn't running(root mode), kick it to pick up the + * interrupt from PIR in next vmentry. + */ +static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int r; + + if (pi_test_and_set_pir(vector, &vmx->pi_desc)) + return; + + r = pi_test_and_set_on(&vmx->pi_desc); + kvm_make_request(KVM_REQ_EVENT, vcpu); + if (!r && (vcpu->mode == IN_GUEST_MODE)) + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), + POSTED_INTR_VECTOR); + else + kvm_vcpu_kick(vcpu); +} + +static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!pi_test_and_clear_on(&vmx->pi_desc)) + return; + + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); +} + +static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu) +{ + return; +} + /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -7784,6 +7844,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, + .sync_pir_to_irr = vmx_sync_pir_to_irr, + .deliver_posted_interrupt = vmx_deliver_posted_interrupt, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, -- cgit v1.2.2 From 5a71785dde307f6ac80e83c0ad3fd694912010a1 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:25:16 +0800 Subject: KVM: VMX: Use posted interrupt to deliver virtual interrupt If posted interrupts are available, use them to inject virtual interrupts into the guest.
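In outline, the fixed-interrupt branch of __apic_accept_irq() in the diff below becomes:

if (kvm_x86_ops->deliver_posted_interrupt) {
	/* APICv: set the bit in PIR and notify; hardware (or the next
	 * vmentry via sync_pir_to_irr) moves it into the vIRR. */
	result = 1;
	kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
} else {
	/* Legacy: set the vIRR in software and wake the vcpu. */
	result = !apic_test_and_set_irr(vector, apic);
	kvm_make_request(KVM_REQ_EVENT, vcpu);
	kvm_vcpu_kick(vcpu);
}

Readers of the IRR must correspondingly pull PIR in first, which is why apic_find_highest_irr() now calls kvm_x86_ops->sync_pir_to_irr().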
Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 30 +++++++++++++++++++----------- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 1 + 3 files changed, 21 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index dbf74c922aa1..e29883c604ff 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -353,6 +353,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) if (!apic->irr_pending) return -1; + kvm_x86_ops->sync_pir_to_irr(apic->vcpu); result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); @@ -683,18 +684,25 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); - result = !apic_test_and_set_irr(vector, apic); - trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, !result); - if (!result) { - if (trig_mode) - apic_debug("level trig mode repeatedly for " - "vector %d", vector); - break; - } + if (kvm_x86_ops->deliver_posted_interrupt) { + result = 1; + kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); + } else { + result = !apic_test_and_set_irr(vector, apic); - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_vcpu_kick(vcpu); + if (!result) { + if (trig_mode) + apic_debug("level trig mode repeatedly " + "for vector %d", vector); + goto out; + } + + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } +out: + trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector, !result); break; case APIC_DM_REMRD: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3a14d8a0ee46..5a87a58af49d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -84,7 +84,7 @@ module_param(vmm_exclusive, bool, S_IRUGO); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); -static bool __read_mostly enable_apicv; +static bool __read_mostly enable_apicv = 1; module_param(enable_apicv, bool, S_IRUGO); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 276b4a9a5605..50e2e10b8041 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2692,6 +2692,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { + kvm_x86_ops->sync_pir_to_irr(vcpu); memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); return 0; -- cgit v1.2.2 From 79558f112fc0352e057f7b5e158e3d88b8b62c60 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 16 Apr 2013 19:21:41 +0200 Subject: KVM: ARM: Fix kvm_vm_ioctl_irq_line Commit aa2fbe6d broke the ARM KVM target by introducing a new parameter to irq handling functions. Fix the function prototype to get things compiling again and ignore the parameter just like we did before Signed-off-by: Alexander Graf Acked-by: Christoffer Dall Signed-off-by: Marcelo Tosatti --- arch/arm/kvm/arm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index e4ad0bb01843..678596f699f3 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -805,7 +805,8 @@ static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level) return 0; } -int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level) +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, + bool line_status) { u32 irq = irq_level->irq; unsigned int irq_type, vcpu_idx, irq_num; -- cgit v1.2.2