1 files changed, 133 insertions, 210 deletions
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 42786f59d9c0..88e15deb8b82 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -1,117 +1,43 @@
 /*
-        Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+ * Asm versions of Xen pv-ops, suitable for either direct use or
-        The inline versions are the same as the direct-use versions, with the
+ * inlining.  The inline versions are the same as the direct-use
-        pre- and post-amble chopped off.
+ * versions, with the pre- and post-amble chopped off.
+ *
-        This code is encoded for size rather than absolute efficiency,
+ * This code is encoded for size rather than absolute efficiency, with
-        with a view to being able to inline as much as possible.
+ * a view to being able to inline as much as possible.
+ *
-        We only bother with direct forms (ie, vcpu in pda) of the operations
+ * We only bother with direct forms (ie, vcpu in pda) of the
-        here; the indirect forms are better handled in C, since they're
+ * operations here; the indirect forms are better handled in C, since
-        generally too large to inline anyway.
+ * they're generally too large to inline anyway.
 */
-#include <linux/linkage.h>
-#include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
-#include <asm/percpu.h>
 #include <asm/processor-flags.h>
 #include <asm/segment.h>
 #include <xen/interface/xen.h>
-#define RELOC(x, v)     .globl x##_reloc; x##_reloc=v
+#include "xen-asm.h"
-#define ENDPATCH(x)     .globl x##_end; x##_end=.
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI  0x80000000
-/*
-        Enable events.  This clears the event mask and tests the pending
-        event status with one and operation.  If there are pending
-        events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
-        /* Unmask events */
-        movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-        /* Preempt here doesn't matter because that will deal with
-           any pending interrupts.  The pending check may end up being
-           run on the wrong CPU, but that doesn't hurt. */
-        /* Test for pending */
-        testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
-        jz 1f
-2:      call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
-        ret
-        ENDPROC(xen_irq_enable_direct)
-        RELOC(xen_irq_enable_direct, 2b+1)
-/*
-        Disabling events is simply a matter of making the event mask
-        non-zero.
- */
-ENTRY(xen_irq_disable_direct)
-        movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
-        ret
-        ENDPROC(xen_irq_disable_direct)
-        RELOC(xen_irq_disable_direct, 0)
 /*
-        (xen_)save_fl is used to get the current interrupt enable status.
+ * Force an event check by making a hypercall, but preserve regs
-        Callers expect the status to be in X86_EFLAGS_IF, and other bits
+ * before making the call.
-        may be set in the return value.  We take advantage of this by
-        making sure that X86_EFLAGS_IF has the right value (and other bits
-        in that byte are 0), but other bits in the return value are
-        undefined.  We need to toggle the state of the bit, because
-        Xen and x86 use opposite senses (mask vs enable).
 */
-ENTRY(xen_save_fl_direct)
+check_events:
-        testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+        push %eax
-        setz %ah
+        push %ecx
-        addb %ah,%ah
+        push %edx
-ENDPATCH(xen_save_fl_direct)
+        call xen_force_evtchn_callback
-        ret
+        pop %edx
-        ENDPROC(xen_save_fl_direct)
+        pop %ecx
-        RELOC(xen_save_fl_direct, 0)
+        pop %eax
-/*
-        In principle the caller should be passing us a value return
-        from xen_save_fl_direct, but for robustness sake we test only
-        the X86_EFLAGS_IF flag rather than the whole byte. After
-        setting the interrupt mask state, it checks for unmasked
-        pending events and enters the hypervisor to get them delivered
-        if so.
- */
-ENTRY(xen_restore_fl_direct)
-        testb $X86_EFLAGS_IF>>8, %ah
-        setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-        /* Preempt here doesn't matter because that will deal with
-           any pending interrupts.  The pending check may end up being
-           run on the wrong CPU, but that doesn't hurt. */
-        /* check for unmasked and pending */
-        cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
-        jz 1f
-2:      call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
        ret
-        ENDPROC(xen_restore_fl_direct)
-        RELOC(xen_restore_fl_direct, 2b+1)
 /*
-        We can't use sysexit directly, because we're not running in ring0.
+ * We can't use sysexit directly, because we're not running in ring0.
-        But we can easily fake it up using iret.  Assuming xen_sysexit
+ * But we can easily fake it up using iret.  Assuming xen_sysexit is
-        is jumped to with a standard stack frame, we can just strip it
+ * jumped to with a standard stack frame, we can just strip it back to
-        back to a standard iret frame and use iret.
+ * a standard iret frame and use iret.
 */
 ENTRY(xen_sysexit)
        movl PT_EAX(%esp), %eax                 /* Shouldn't be necessary? */
@@ -122,33 +48,31 @@ ENTRY(xen_sysexit)
 ENDPROC(xen_sysexit)
 /*
-        This is run where a normal iret would be run, with the same stack setup:
+ * This is run where a normal iret would be run, with the same stack setup:
-              8: eflags
+ *      8: eflags
-              4: cs
+ *      4: cs
-        esp-> 0: eip
+ *      esp-> 0: eip
+ *
-        This attempts to make sure that any pending events are dealt
+ * This attempts to make sure that any pending events are dealt with
-        with on return to usermode, but there is a small window in
+ * on return to usermode, but there is a small window in which an
-        which an event can happen just before entering usermode.  If
+ * event can happen just before entering usermode.  If the nested
-        the nested interrupt ends up setting one of the TIF_WORK_MASK
+ * interrupt ends up setting one of the TIF_WORK_MASK pending work
-        pending work flags, they will not be tested again before
+ * flags, they will not be tested again before returning to
-        returning to usermode. This means that a process can end up
+ * usermode. This means that a process can end up with pending work,
-        with pending work, which will be unprocessed until the process
+ * which will be unprocessed until the process enters and leaves the
-        enters and leaves the kernel again, which could be an
+ * kernel again, which could be an unbounded amount of time.  This
-        unbounded amount of time.  This means that a pending signal or
+ * means that a pending signal or reschedule event could be
-        reschedule event could be indefinitely delayed.
+ * indefinitely delayed.
+ *
-        The fix is to notice a nested interrupt in the critical
+ * The fix is to notice a nested interrupt in the critical window, and
-        window, and if one occurs, then fold the nested interrupt into
+ * if one occurs, then fold the nested interrupt into the current
-        the current interrupt stack frame, and re-process it
+ * interrupt stack frame, and re-process it iteratively rather than
-        iteratively rather than recursively.  This means that it will
+ * recursively.  This means that it will exit via the normal path, and
-        exit via the normal path, and all pending work will be dealt
+ * all pending work will be dealt with appropriately.
-        with appropriately.
+ *
+ * Because the nested interrupt handler needs to deal with the current
-        Because the nested interrupt handler needs to deal with the
+ * stack state in whatever form it's in, we keep things simple by only
-        current stack state in whatever form its in, we keep things
+ * using a single register which is pushed/popped on the stack.
-        simple by only using a single register which is pushed/popped
-        on the stack.
 */
 ENTRY(xen_iret)
        /* test eflags for special cases */
@@ -158,13 +82,15 @@ ENTRY(xen_iret)
        push %eax
        ESP_OFFSET=4    # bytes pushed onto stack
-        /* Store vcpu_info pointer for easy access.  Do it this
+        /*
-           way to avoid having to reload %fs */
+         * Store vcpu_info pointer for easy access.  Do it this way to
+         * avoid having to reload %fs
+         */
 #ifdef CONFIG_SMP
        GET_THREAD_INFO(%eax)
-        movl TI_cpu(%eax),%eax
+        movl TI_cpu(%eax), %eax
-        movl __per_cpu_offset(,%eax,4),%eax
+        movl __per_cpu_offset(,%eax,4), %eax
-        mov per_cpu__xen_vcpu(%eax),%eax
+        mov per_cpu__xen_vcpu(%eax), %eax
 #else
        movl per_cpu__xen_vcpu, %eax
 #endif
@@ -172,37 +98,46 @@ ENTRY(xen_iret)
        /* check IF state we're restoring */
        testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
-        /* Maybe enable events.  Once this happens we could get a
+        /*
-           recursive event, so the critical region starts immediately
+         * Maybe enable events.  Once this happens we could get a
-           afterwards.  However, if that happens we don't end up
+         * recursive event, so the critical region starts immediately
-           resuming the code, so we don't have to be worried about
+         * afterwards.  However, if that happens we don't end up
-           being preempted to another CPU. */
+         * resuming the code, so we don't have to be worried about
+         * being preempted to another CPU.
+         */
        setz XEN_vcpu_info_mask(%eax)
 xen_iret_start_crit:
        /* check for unmasked and pending */
        cmpw $0x0001, XEN_vcpu_info_pending(%eax)
-        /* If there's something pending, mask events again so we
+        /*
-           can jump back into xen_hypervisor_callback */
+         * If there's something pending, mask events again so we can
+         * jump back into xen_hypervisor_callback
+         */
        sete XEN_vcpu_info_mask(%eax)
        popl %eax
-        /* From this point on the registers are restored and the stack
+        /*
-           updated, so we don't need to worry about it if we're preempted */
+         * From this point on the registers are restored and the stack
+         * updated, so we don't need to worry about it if we're
+         * preempted
+         */
 iret_restore_end:
-        /* Jump to hypervisor_callback after fixing up the stack.
+        /*
-           Events are masked, so jumping out of the critical
+         * Jump to hypervisor_callback after fixing up the stack.
-           region is OK. */
+         * Events are masked, so jumping out of the critical region is
+         * OK.
+         */
        je xen_hypervisor_callback
 1:      iret
 xen_iret_end_crit:
-.section __ex_table,"a"
+.section __ex_table, "a"
        .align 4
-        .long 1b,iret_exc
+        .long 1b, iret_exc
 .previous
 hyper_iret:
@@ -212,55 +147,55 @@ hyper_iret:
        .globl xen_iret_start_crit, xen_iret_end_crit
 /*
-   This is called by xen_hypervisor_callback in entry.S when it sees
+ * This is called by xen_hypervisor_callback in entry.S when it sees
-   that the EIP at the time of interrupt was between xen_iret_start_crit
+ * that the EIP at the time of interrupt was between
-   and xen_iret_end_crit.  We're passed the EIP in %eax so we can do
+ * xen_iret_start_crit and xen_iret_end_crit.  We're passed the EIP in
-   a more refined determination of what to do.
+ * %eax so we can do a more refined determination of what to do.
+ *
-   The stack format at this point is:
+ * The stack format at this point is:
-        ----------------
+ *      ----------------
-         ss             : (ss/esp may be present if we came from usermode)
+ *       ss             : (ss/esp may be present if we came from usermode)
-         esp            :
+ *       esp            :
-         eflags         }  outer exception info
+ *       eflags         }  outer exception info
-         cs             }
+ *       cs             }
-         eip            }
+ *       eip            }
-        ---------------- <- edi (copy dest)
+ *      ---------------- <- edi (copy dest)
-         eax            :  outer eax if it hasn't been restored
+ *       eax            :  outer eax if it hasn't been restored
-        ----------------
+ *      ----------------
-         eflags         }  nested exception info
+ *       eflags         }  nested exception info
-         cs             }   (no ss/esp because we're nested
+ *       cs             }   (no ss/esp because we're nested
-         eip            }    from the same ring)
+ *       eip            }    from the same ring)
-         orig_eax       }<- esi (copy src)
+ *       orig_eax       }<- esi (copy src)
-         - - - - - - - -
+ *       - - - - - - - -
-         fs             }
+ *       fs             }
-         es             }
+ *       es             }
-         ds             }  SAVE_ALL state
+ *       ds             }  SAVE_ALL state
-         eax            }
+ *       eax            }
-          :             :
+ *        :             :
-         ebx            }<- esp
+ *       ebx            }<- esp
-        ----------------
+ *      ----------------
+ *
-   In order to deliver the nested exception properly, we need to shift
+ * In order to deliver the nested exception properly, we need to shift
-   everything from the return addr up to the error code so it
+ * everything from the return addr up to the error code so it sits
-   sits just under the outer exception info.  This means that when we
+ * just under the outer exception info.  This means that when we
-   handle the exception, we do it in the context of the outer exception
+ * handle the exception, we do it in the context of the outer
-   rather than starting a new one.
+ * exception rather than starting a new one.
+ *
-   The only caveat is that if the outer eax hasn't been
+ * The only caveat is that if the outer eax hasn't been restored yet
-   restored yet (ie, it's still on stack), we need to insert
+ * (ie, it's still on stack), we need to insert its value into the
-   its value into the SAVE_ALL state before going on, since
+ * SAVE_ALL state before going on, since it's usermode state which we
-   it's usermode state which we eventually need to restore.
+ * eventually need to restore.
 */
 ENTRY(xen_iret_crit_fixup)
        /*
-           Paranoia: Make sure we're really coming from kernel space.
+         * Paranoia: Make sure we're really coming from kernel space.
-           One could imagine a case where userspace jumps into the
+         * One could imagine a case where userspace jumps into the
-           critical range address, but just before the CPU delivers a GP,
+         * critical range address, but just before the CPU delivers a
-           it decides to deliver an interrupt instead.  Unlikely?
+         * GP, it decides to deliver an interrupt instead.  Unlikely?
-           Definitely.  Easy to avoid?  Yes.  The Intel documents
+         * Definitely.  Easy to avoid?  Yes.  The Intel documents
-           explicitly say that the reported EIP for a bad jump is the
+         * explicitly say that the reported EIP for a bad jump is the
-           jump instruction itself, not the destination, but some virtual
+         * jump instruction itself, not the destination, but some
-           environments get this wrong.
+         * virtual environments get this wrong.
         */
        movl PT_CS(%esp), %ecx
        andl $SEGMENT_RPL_MASK, %ecx
@@ -270,15 +205,17 @@ ENTRY(xen_iret_crit_fixup)
        lea PT_ORIG_EAX(%esp), %esi
        lea PT_EFLAGS(%esp), %edi
-        /* If eip is before iret_restore_end then stack
+        /*
-           hasn't been restored yet. */
+         * If eip is before iret_restore_end then stack
+         * hasn't been restored yet.
+         */
        cmp $iret_restore_end, %eax
        jae 1f
-        movl 0+4(%edi),%eax             /* copy EAX (just above top of frame) */
+        movl 0+4(%edi), %eax            /* copy EAX (just above top of frame) */
        movl %eax, PT_EAX(%esp)
-        lea ESP_OFFSET(%edi),%edi       /* move dest up over saved regs */
+        lea ESP_OFFSET(%edi), %edi      /* move dest up over saved regs */
        /* set up the copy */
 1:      std
@@ -286,20 +223,6 @@ ENTRY(xen_iret_crit_fixup)
        rep movsl
        cld
-        lea 4(%edi),%esp                /* point esp to new frame */
+        lea 4(%edi), %esp               /* point esp to new frame */
 2:      jmp xen_do_upcall
-/*
-        Force an event check by making a hypercall,
-        but preserve regs before making the call.
- */
-check_events:
-        push %eax
-        push %ecx
-        push %edx
-        call xen_force_evtchn_callback
-        pop %edx
-        pop %ecx
-        pop %eax
-        ret

diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 42786f59d9c0..88e15deb8b82 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -1,117 +1,43 @@
1	/*	1	/*
2	Asm versions of Xen pv-ops, suitable for either direct use or inlining.	2	* Asm versions of Xen pv-ops, suitable for either direct use or
3	The inline versions are the same as the direct-use versions, with the	3	* inlining. The inline versions are the same as the direct-use
4	pre- and post-amble chopped off.	4	* versions, with the pre- and post-amble chopped off.
5		5	*
6	This code is encoded for size rather than absolute efficiency,	6	* This code is encoded for size rather than absolute efficiency, with
7	with a view to being able to inline as much as possible.	7	* a view to being able to inline as much as possible.
8		8	*
9	We only bother with direct forms (ie, vcpu in pda) of the operations	9	* We only bother with direct forms (ie, vcpu in pda) of the
10	here; the indirect forms are better handled in C, since they're	10	* operations here; the indirect forms are better handled in C, since
11	generally too large to inline anyway.	11	* they're generally too large to inline anyway.
12	*/	12	*/
13		13
14	#include <linux/linkage.h>
15
16	#include <asm/asm-offsets.h>
17	#include <asm/thread_info.h>	14	#include <asm/thread_info.h>
18	#include <asm/percpu.h>
19	#include <asm/processor-flags.h>	15	#include <asm/processor-flags.h>
20	#include <asm/segment.h>	16	#include <asm/segment.h>
21		17
22	#include <xen/interface/xen.h>	18	#include <xen/interface/xen.h>
23		19
24	#define RELOC(x, v) .globl x##_reloc; x##_reloc=v	20	#include "xen-asm.h"
25	#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27	/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28	#define XEN_EFLAGS_NMI 0x80000000
29
30	/*
31	Enable events. This clears the event mask and tests the pending
32	event status with one and operation. If there are pending
33	events, then enter the hypervisor to get them handled.
34	*/
35	ENTRY(xen_irq_enable_direct)
36	/* Unmask events */
37	movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
38
39	/* Preempt here doesn't matter because that will deal with
40	any pending interrupts. The pending check may end up being
41	run on the wrong CPU, but that doesn't hurt. */
42
43	/* Test for pending */
44	testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
45	jz 1f
46
47	2: call check_events
48	1:
49	ENDPATCH(xen_irq_enable_direct)
50	ret
51	ENDPROC(xen_irq_enable_direct)
52	RELOC(xen_irq_enable_direct, 2b+1)
53
54
55	/*
56	Disabling events is simply a matter of making the event mask
57	non-zero.
58	*/
59	ENTRY(xen_irq_disable_direct)
60	movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
61	ENDPATCH(xen_irq_disable_direct)
62	ret
63	ENDPROC(xen_irq_disable_direct)
64	RELOC(xen_irq_disable_direct, 0)
65		21
66	/*	22	/*
67	(xen_)save_fl is used to get the current interrupt enable status.	23	* Force an event check by making a hypercall, but preserve regs
68	Callers expect the status to be in X86_EFLAGS_IF, and other bits	24	* before making the call.
69	may be set in the return value. We take advantage of this by
70	making sure that X86_EFLAGS_IF has the right value (and other bits
71	in that byte are 0), but other bits in the return value are
72	undefined. We need to toggle the state of the bit, because
73	Xen and x86 use opposite senses (mask vs enable).
74	*/	25	*/
75	ENTRY(xen_save_fl_direct)	26	check_events:
76	testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask	27	push %eax
77	setz %ah	28	push %ecx
78	addb %ah,%ah	29	push %edx
79	ENDPATCH(xen_save_fl_direct)	30	call xen_force_evtchn_callback
80	ret	31	pop %edx
81	ENDPROC(xen_save_fl_direct)	32	pop %ecx
82	RELOC(xen_save_fl_direct, 0)	33	pop %eax
83
84
85	/*
86	In principle the caller should be passing us a value return
87	from xen_save_fl_direct, but for robustness sake we test only
88	the X86_EFLAGS_IF flag rather than the whole byte. After
89	setting the interrupt mask state, it checks for unmasked
90	pending events and enters the hypervisor to get them delivered
91	if so.
92	*/
93	ENTRY(xen_restore_fl_direct)
94	testb $X86_EFLAGS_IF>>8, %ah
95	setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
96	/* Preempt here doesn't matter because that will deal with
97	any pending interrupts. The pending check may end up being
98	run on the wrong CPU, but that doesn't hurt. */
99
100	/* check for unmasked and pending */
101	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
102	jz 1f
103	2: call check_events
104	1:
105	ENDPATCH(xen_restore_fl_direct)
106	ret	34	ret
107	ENDPROC(xen_restore_fl_direct)
108	RELOC(xen_restore_fl_direct, 2b+1)
109		35
110	/*	36	/*
111	We can't use sysexit directly, because we're not running in ring0.	37	* We can't use sysexit directly, because we're not running in ring0.
112	But we can easily fake it up using iret. Assuming xen_sysexit	38	* But we can easily fake it up using iret. Assuming xen_sysexit is
113	is jumped to with a standard stack frame, we can just strip it	39	* jumped to with a standard stack frame, we can just strip it back to
114	back to a standard iret frame and use iret.	40	* a standard iret frame and use iret.
115	*/	41	*/
116	ENTRY(xen_sysexit)	42	ENTRY(xen_sysexit)
117	movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */	43	movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
@@ -122,33 +48,31 @@ ENTRY(xen_sysexit)
122	ENDPROC(xen_sysexit)	48	ENDPROC(xen_sysexit)
123		49
124	/*	50	/*
125	This is run where a normal iret would be run, with the same stack setup:	51	* This is run where a normal iret would be run, with the same stack setup:
126	8: eflags	52	* 8: eflags
127	4: cs	53	* 4: cs
128	esp-> 0: eip	54	* esp-> 0: eip
129		55	*
130	This attempts to make sure that any pending events are dealt	56	* This attempts to make sure that any pending events are dealt with
131	with on return to usermode, but there is a small window in	57	* on return to usermode, but there is a small window in which an
132	which an event can happen just before entering usermode. If	58	* event can happen just before entering usermode. If the nested
133	the nested interrupt ends up setting one of the TIF_WORK_MASK	59	* interrupt ends up setting one of the TIF_WORK_MASK pending work
134	pending work flags, they will not be tested again before	60	* flags, they will not be tested again before returning to
135	returning to usermode. This means that a process can end up	61	* usermode. This means that a process can end up with pending work,
136	with pending work, which will be unprocessed until the process	62	* which will be unprocessed until the process enters and leaves the
137	enters and leaves the kernel again, which could be an	63	* kernel again, which could be an unbounded amount of time. This
138	unbounded amount of time. This means that a pending signal or	64	* means that a pending signal or reschedule event could be
139	reschedule event could be indefinitely delayed.	65	* indefinitely delayed.
140		66	*
141	The fix is to notice a nested interrupt in the critical	67	* The fix is to notice a nested interrupt in the critical window, and
142	window, and if one occurs, then fold the nested interrupt into	68	* if one occurs, then fold the nested interrupt into the current
143	the current interrupt stack frame, and re-process it	69	* interrupt stack frame, and re-process it iteratively rather than
144	iteratively rather than recursively. This means that it will	70	* recursively. This means that it will exit via the normal path, and
145	exit via the normal path, and all pending work will be dealt	71	* all pending work will be dealt with appropriately.
146	with appropriately.	72	*
147		73	* Because the nested interrupt handler needs to deal with the current
148	Because the nested interrupt handler needs to deal with the	74	* stack state in whatever form it's in, we keep things simple by only
149	current stack state in whatever form its in, we keep things	75	* using a single register which is pushed/popped on the stack.
150	simple by only using a single register which is pushed/popped
151	on the stack.
152	*/	76	*/
153	ENTRY(xen_iret)	77	ENTRY(xen_iret)
154	/* test eflags for special cases */	78	/* test eflags for special cases */
@@ -158,13 +82,15 @@ ENTRY(xen_iret)
158	push %eax	82	push %eax
159	ESP_OFFSET=4 # bytes pushed onto stack	83	ESP_OFFSET=4 # bytes pushed onto stack
160		84
161	/* Store vcpu_info pointer for easy access. Do it this	85	/*
162	way to avoid having to reload %fs */	86	* Store vcpu_info pointer for easy access. Do it this way to
		87	* avoid having to reload %fs
		88	*/
163	#ifdef CONFIG_SMP	89	#ifdef CONFIG_SMP
164	GET_THREAD_INFO(%eax)	90	GET_THREAD_INFO(%eax)
165	movl TI_cpu(%eax),%eax	91	movl TI_cpu(%eax), %eax
166	movl __per_cpu_offset(,%eax,4),%eax	92	movl __per_cpu_offset(,%eax,4), %eax
167	mov per_cpu__xen_vcpu(%eax),%eax	93	mov per_cpu__xen_vcpu(%eax), %eax
168	#else	94	#else
169	movl per_cpu__xen_vcpu, %eax	95	movl per_cpu__xen_vcpu, %eax
170	#endif	96	#endif
@@ -172,37 +98,46 @@ ENTRY(xen_iret)
172	/* check IF state we're restoring */	98	/* check IF state we're restoring */
173	testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)	99	testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
174		100
175	/* Maybe enable events. Once this happens we could get a	101	/*
176	recursive event, so the critical region starts immediately	102	* Maybe enable events. Once this happens we could get a
177	afterwards. However, if that happens we don't end up	103	* recursive event, so the critical region starts immediately
178	resuming the code, so we don't have to be worried about	104	* afterwards. However, if that happens we don't end up
179	being preempted to another CPU. */	105	* resuming the code, so we don't have to be worried about
		106	* being preempted to another CPU.
		107	*/
180	setz XEN_vcpu_info_mask(%eax)	108	setz XEN_vcpu_info_mask(%eax)
181	xen_iret_start_crit:	109	xen_iret_start_crit:
182		110
183	/* check for unmasked and pending */	111	/* check for unmasked and pending */
184	cmpw $0x0001, XEN_vcpu_info_pending(%eax)	112	cmpw $0x0001, XEN_vcpu_info_pending(%eax)
185		113
186	/* If there's something pending, mask events again so we	114	/*
187	can jump back into xen_hypervisor_callback */	115	* If there's something pending, mask events again so we can
		116	* jump back into xen_hypervisor_callback
		117	*/
188	sete XEN_vcpu_info_mask(%eax)	118	sete XEN_vcpu_info_mask(%eax)
189		119
190	popl %eax	120	popl %eax
191		121
192	/* From this point on the registers are restored and the stack	122	/*
193	updated, so we don't need to worry about it if we're preempted */	123	* From this point on the registers are restored and the stack
		124	* updated, so we don't need to worry about it if we're
		125	* preempted
		126	*/
194	iret_restore_end:	127	iret_restore_end:
195		128
196	/* Jump to hypervisor_callback after fixing up the stack.	129	/*
197	Events are masked, so jumping out of the critical	130	* Jump to hypervisor_callback after fixing up the stack.
198	region is OK. */	131	* Events are masked, so jumping out of the critical region is
		132	* OK.
		133	*/
199	je xen_hypervisor_callback	134	je xen_hypervisor_callback
200		135
201	1: iret	136	1: iret
202	xen_iret_end_crit:	137	xen_iret_end_crit:
203	.section __ex_table,"a"	138	.section __ex_table, "a"
204	.align 4	139	.align 4
205	.long 1b,iret_exc	140	.long 1b, iret_exc
206	.previous	141	.previous
207		142
208	hyper_iret:	143	hyper_iret:
@@ -212,55 +147,55 @@ hyper_iret:
212	.globl xen_iret_start_crit, xen_iret_end_crit	147	.globl xen_iret_start_crit, xen_iret_end_crit
213		148
214	/*	149	/*
215	This is called by xen_hypervisor_callback in entry.S when it sees	150	* This is called by xen_hypervisor_callback in entry.S when it sees
216	that the EIP at the time of interrupt was between xen_iret_start_crit	151	* that the EIP at the time of interrupt was between
217	and xen_iret_end_crit. We're passed the EIP in %eax so we can do	152	* xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
218	a more refined determination of what to do.	153	* %eax so we can do a more refined determination of what to do.
219		154	*
220	The stack format at this point is:	155	* The stack format at this point is:
221	----------------	156	* ----------------
222	ss : (ss/esp may be present if we came from usermode)	157	* ss : (ss/esp may be present if we came from usermode)
223	esp :	158	* esp :
224	eflags } outer exception info	159	* eflags } outer exception info
225	cs }	160	* cs }
226	eip }	161	* eip }
227	---------------- <- edi (copy dest)	162	* ---------------- <- edi (copy dest)
228	eax : outer eax if it hasn't been restored	163	* eax : outer eax if it hasn't been restored
229	----------------	164	* ----------------
230	eflags } nested exception info	165	* eflags } nested exception info
231	cs } (no ss/esp because we're nested	166	* cs } (no ss/esp because we're nested
232	eip } from the same ring)	167	* eip } from the same ring)
233	orig_eax }<- esi (copy src)	168	* orig_eax }<- esi (copy src)
234	- - - - - - - -	169	* - - - - - - - -
235	fs }	170	* fs }
236	es }	171	* es }
237	ds } SAVE_ALL state	172	* ds } SAVE_ALL state
238	eax }	173	* eax }
239	: :	174	* : :
240	ebx }<- esp	175	* ebx }<- esp
241	----------------	176	* ----------------
242		177	*
243	In order to deliver the nested exception properly, we need to shift	178	* In order to deliver the nested exception properly, we need to shift
244	everything from the return addr up to the error code so it	179	* everything from the return addr up to the error code so it sits
245	sits just under the outer exception info. This means that when we	180	* just under the outer exception info. This means that when we
246	handle the exception, we do it in the context of the outer exception	181	* handle the exception, we do it in the context of the outer
247	rather than starting a new one.	182	* exception rather than starting a new one.
248		183	*
249	The only caveat is that if the outer eax hasn't been	184	* The only caveat is that if the outer eax hasn't been restored yet
250	restored yet (ie, it's still on stack), we need to insert	185	* (ie, it's still on stack), we need to insert its value into the
251	its value into the SAVE_ALL state before going on, since	186	* SAVE_ALL state before going on, since it's usermode state which we
252	it's usermode state which we eventually need to restore.	187	* eventually need to restore.
253	*/	188	*/
254	ENTRY(xen_iret_crit_fixup)	189	ENTRY(xen_iret_crit_fixup)
255	/*	190	/*
256	Paranoia: Make sure we're really coming from kernel space.	191	* Paranoia: Make sure we're really coming from kernel space.
257	One could imagine a case where userspace jumps into the	192	* One could imagine a case where userspace jumps into the
258	critical range address, but just before the CPU delivers a GP,	193	* critical range address, but just before the CPU delivers a
259	it decides to deliver an interrupt instead. Unlikely?	194	* GP, it decides to deliver an interrupt instead. Unlikely?
260	Definitely. Easy to avoid? Yes. The Intel documents	195	* Definitely. Easy to avoid? Yes. The Intel documents
261	explicitly say that the reported EIP for a bad jump is the	196	* explicitly say that the reported EIP for a bad jump is the
262	jump instruction itself, not the destination, but some virtual	197	* jump instruction itself, not the destination, but some
263	environments get this wrong.	198	* virtual environments get this wrong.
264	*/	199	*/
265	movl PT_CS(%esp), %ecx	200	movl PT_CS(%esp), %ecx
266	andl $SEGMENT_RPL_MASK, %ecx	201	andl $SEGMENT_RPL_MASK, %ecx
@@ -270,15 +205,17 @@ ENTRY(xen_iret_crit_fixup)
270	lea PT_ORIG_EAX(%esp), %esi	205	lea PT_ORIG_EAX(%esp), %esi
271	lea PT_EFLAGS(%esp), %edi	206	lea PT_EFLAGS(%esp), %edi
272		207
273	/* If eip is before iret_restore_end then stack	208	/*
274	hasn't been restored yet. */	209	* If eip is before iret_restore_end then stack
		210	* hasn't been restored yet.
		211	*/
275	cmp $iret_restore_end, %eax	212	cmp $iret_restore_end, %eax
276	jae 1f	213	jae 1f
277		214
278	movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */	215	movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */
279	movl %eax, PT_EAX(%esp)	216	movl %eax, PT_EAX(%esp)
280		217
281	lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */	218	lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */
282		219
283	/* set up the copy */	220	/* set up the copy */
284	1: std	221	1: std
@@ -286,20 +223,6 @@ ENTRY(xen_iret_crit_fixup)
286	rep movsl	223	rep movsl
287	cld	224	cld
288		225
289	lea 4(%edi),%esp /* point esp to new frame */	226	lea 4(%edi), %esp /* point esp to new frame */
290	2: jmp xen_do_upcall	227	2: jmp xen_do_upcall
291		228
292
293	/*
294	Force an event check by making a hypercall,
295	but preserve regs before making the call.
296	*/
297	check_events:
298	push %eax
299	push %ecx
300	push %edx
301	call xen_force_evtchn_callback
302	pop %edx
303	pop %ecx
304	pop %eax
305	ret