author	Avi Kivity <avi.kivity@gmail.com>	2013-01-04 09:18:48 -0500
committer	Marcelo Tosatti <mtosatti@redhat.com>	2013-01-09 14:39:17 -0500
commit	e28bbd44dad134046ef9463cbb8c1cf81f53de5e (patch)
tree	ba58057fd3ad51430fffe57e41d7bc3be62b91db
parent	b09408d00fd82be80289a329dd94d1a0d6b77dc2 (diff)
KVM: x86 emulator: framework for streamlining arithmetic opcodes
We emulate arithmetic opcodes by executing a "similar" instruction (same operation, different operands) on the cpu. This ensures accurate emulation, especially wrt. eflags. However, the prologue and epilogue around the opcode are fairly long, consisting of a switch (for the operand size) and code to load and save the operands. This is repeated for every opcode.

This patch introduces an alternative way to emulate arithmetic opcodes. Instead of the above, we have four (three on i386) functions consisting of just the opcode and a ret; one for each operand size. For example:

   .align 8
   em_notb:
	not %al
	ret

   .align 8
   em_notw:
	not %ax
	ret

   .align 8
   em_notl:
	not %eax
	ret

   .align 8
   em_notq:
	not %rax
	ret

The prologue and epilogue are shared across all opcodes. Note that the functions use a special calling convention; notably, eflags is an input/output parameter and is not clobbered. Rather than dispatching the four functions through a jump table, the functions are declared with a constant size (8 bytes), so their addresses can be calculated directly.

Acked-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi.kivity@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
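For context, beyond the patch text: the dispatch relies on each sized variant sitting exactly FASTOP_SIZE (8) bytes after the previous one, so the operand size alone determines the entry point. A minimal user-space sketch of that calculation follows; it is not part of the patch, and __builtin_ctzl() merely stands in for the kernel's __ffs():

   #include <stdio.h>

   #define FASTOP_SIZE 8

   /* stand-in for the kernel's __ffs(): index of the lowest set bit */
   static unsigned long ffs_index(unsigned long x)
   {
           return (unsigned long)__builtin_ctzl(x);
   }

   int main(void)
   {
           unsigned long bytes;

           /* operand sizes 1, 2, 4, 8 map to entry offsets 0, 8, 16, 24 */
           for (bytes = 1; bytes <= 8; bytes <<= 1)
                   printf("%lu-byte operand -> entry at +%lu bytes\n",
                          bytes, ffs_index(bytes) * FASTOP_SIZE);
           return 0;
   }

Calling the size-appropriate variant thus reduces to "base address + __ffs(bytes) * 8", which is exactly what the fastop() helper added by the diff below does.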
-rw-r--r--	arch/x86/kvm/emulate.c	41
1 file changed, 41 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 53c5ad6851d1..dd71567d7c71 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -149,6 +149,7 @@
 #define Aligned     ((u64)1 << 41)  /* Explicitly aligned (e.g. MOVDQA) */
 #define Unaligned   ((u64)1 << 42)  /* Explicitly unaligned (e.g. MOVDQU) */
 #define Avx         ((u64)1 << 43)  /* Advanced Vector Extensions */
+#define Fastop      ((u64)1 << 44)  /* Use opcode::u.fastop */
 
 #define X2(x...) x, x
 #define X3(x...) X2(x), x
@@ -159,6 +160,27 @@
 #define X8(x...) X4(x), X4(x)
 #define X16(x...) X8(x), X8(x)
 
+#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
+#define FASTOP_SIZE 8
+
+/*
+ * fastop functions have a special calling convention:
+ *
+ * dst:    [rdx]:rax  (in/out)
+ * src:    rbx        (in/out)
+ * src2:   rcx        (in)
+ * flags:  rflags     (in/out)
+ *
+ * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
+ * different operand sizes can be reached by calculation, rather than a jump
+ * table (which would be bigger than the code).
+ *
+ * fastop functions are declared as taking a never-defined fastop parameter,
+ * so they can't be called from C directly.
+ */
+
+struct fastop;
+
 struct opcode {
 	u64 flags : 56;
 	u64 intercept : 8;
@@ -168,6 +190,7 @@ struct opcode {
 		const struct group_dual *gdual;
 		const struct gprefix *gprefix;
 		const struct escape *esc;
+		void (*fastop)(struct fastop *fake);
 	} u;
 	int (*check_perm)(struct x86_emulate_ctxt *ctxt);
 };
@@ -3646,6 +3669,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
 #define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
 #define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
 #define II(_f, _e, _i) \
 	{ .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
 #define IIP(_f, _e, _i, _p) \
@@ -4502,6 +4526,16 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
 		read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
 }
 
+static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
+{
+	ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
+	fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
+	asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
+	    : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags)
+	    : "c"(ctxt->src2.val), [fastop]"S"(fop));
+	ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
+	return X86EMUL_CONTINUE;
+}
 
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
@@ -4631,6 +4665,13 @@ special_insn:
 	}
 
 	if (ctxt->execute) {
+		if (ctxt->d & Fastop) {
+			void (*fop)(struct fastop *) = (void *)ctxt->execute;
+			rc = fastop(ctxt, fop);
+			if (rc != X86EMUL_CONTINUE)
+				goto done;
+			goto writeback;
+		}
 		rc = ctxt->execute(ctxt);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
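This commit only adds the framework; the fastop bodies themselves are left to later patches. For illustration only, here is a hedged sketch of what such a body could look like inside emulate.c, matching the layout fastop() expects. The em_not label, the section directives, and the table flags below are illustrative assumptions, not code taken from this commit, and the sketch covers x86_64 only:

   /*
    * Hypothetical sketch: four sized NOT variants, each padded to
    * FASTOP_SIZE (8) bytes, so fastop() can reach any of them by adding
    * __ffs(bytes) * FASTOP_SIZE to the em_not entry point.
    */
   asm(".pushsection .text, \"ax\"\n"
       ".global em_not\n"
       ".align 8\n"
       "em_not: not %al\n\tret\n"   /* 1-byte operand, offset  0 */
       ".align 8\n"
       "not %ax\n\tret\n"           /* 2-byte operand, offset  8 */
       ".align 8\n"
       "not %eax\n\tret\n"          /* 4-byte operand, offset 16 */
       ".align 8\n"
       "not %rax\n\tret\n"          /* 8-byte operand, offset 24 */
       ".popsection");

   /* never called from C directly; only reachable through fastop() */
   extern void em_not(struct fastop *fake);

A decode-table entry could then switch from I() to the new F() macro, e.g. F(DstMem | SrcNone | Lock, em_not) with flags chosen for illustration: setting the Fastop flag makes x86_emulate_insn() route the instruction through fastop() rather than calling ctxt->execute() as a regular emulation callback.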