diff options
Diffstat (limited to 'arch/i386/xen/xen-asm.S')
-rw-r--r-- | arch/i386/xen/xen-asm.S | 291 |
1 files changed, 291 insertions, 0 deletions
diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S new file mode 100644 index 000000000000..1a43b60c0c62 --- /dev/null +++ b/arch/i386/xen/xen-asm.S | |||
@@ -0,0 +1,291 @@ | |||
1 | /* | ||
2 | Asm versions of Xen pv-ops, suitable for either direct use or inlining. | ||
3 | The inline versions are the same as the direct-use versions, with the | ||
4 | pre- and post-amble chopped off. | ||
5 | |||
6 | This code is encoded for size rather than absolute efficiency, | ||
7 | with a view to being able to inline as much as possible. | ||
8 | |||
9 | We only bother with direct forms (ie, vcpu in pda) of the operations | ||
10 | here; the indirect forms are better handled in C, since they're | ||
11 | generally too large to inline anyway. | ||
12 | */ | ||
13 | |||
14 | #include <linux/linkage.h> | ||
15 | |||
16 | #include <asm/asm-offsets.h> | ||
17 | #include <asm/thread_info.h> | ||
18 | #include <asm/percpu.h> | ||
19 | #include <asm/processor-flags.h> | ||
20 | #include <asm/segment.h> | ||
21 | |||
22 | #include <xen/interface/xen.h> | ||
23 | |||
24 | #define RELOC(x, v) .globl x##_reloc; x##_reloc=v | ||
25 | #define ENDPATCH(x) .globl x##_end; x##_end=. | ||
26 | |||
27 | /* Pseudo-flag used for virtual NMI, which we don't implement yet */ | ||
28 | #define XEN_EFLAGS_NMI 0x80000000 | ||
29 | |||
30 | /* | ||
31 | Enable events. This clears the event mask and tests the pending | ||
32 | event status with one and operation. If there are pending | ||
33 | events, then enter the hypervisor to get them handled. | ||
34 | */ | ||
35 | ENTRY(xen_irq_enable_direct) | ||
36 | /* Clear mask and test pending */ | ||
37 | andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending | ||
38 | /* Preempt here doesn't matter because that will deal with | ||
39 | any pending interrupts. The pending check may end up being | ||
40 | run on the wrong CPU, but that doesn't hurt. */ | ||
41 | jz 1f | ||
42 | 2: call check_events | ||
43 | 1: | ||
44 | ENDPATCH(xen_irq_enable_direct) | ||
45 | ret | ||
46 | ENDPROC(xen_irq_enable_direct) | ||
47 | RELOC(xen_irq_enable_direct, 2b+1) | ||
48 | |||
49 | |||
50 | /* | ||
51 | Disabling events is simply a matter of making the event mask | ||
52 | non-zero. | ||
53 | */ | ||
54 | ENTRY(xen_irq_disable_direct) | ||
55 | movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask | ||
56 | ENDPATCH(xen_irq_disable_direct) | ||
57 | ret | ||
58 | ENDPROC(xen_irq_disable_direct) | ||
59 | RELOC(xen_irq_disable_direct, 0) | ||
60 | |||
61 | /* | ||
62 | (xen_)save_fl is used to get the current interrupt enable status. | ||
63 | Callers expect the status to be in X86_EFLAGS_IF, and other bits | ||
64 | may be set in the return value. We take advantage of this by | ||
65 | making sure that X86_EFLAGS_IF has the right value (and other bits | ||
66 | in that byte are 0), but other bits in the return value are | ||
67 | undefined. We need to toggle the state of the bit, because | ||
68 | Xen and x86 use opposite senses (mask vs enable). | ||
69 | */ | ||
70 | ENTRY(xen_save_fl_direct) | ||
71 | testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask | ||
72 | setz %ah | ||
73 | addb %ah,%ah | ||
74 | ENDPATCH(xen_save_fl_direct) | ||
75 | ret | ||
76 | ENDPROC(xen_save_fl_direct) | ||
77 | RELOC(xen_save_fl_direct, 0) | ||
78 | |||
79 | |||
80 | /* | ||
81 | In principle the caller should be passing us a value return | ||
82 | from xen_save_fl_direct, but for robustness sake we test only | ||
83 | the X86_EFLAGS_IF flag rather than the whole byte. After | ||
84 | setting the interrupt mask state, it checks for unmasked | ||
85 | pending events and enters the hypervisor to get them delivered | ||
86 | if so. | ||
87 | */ | ||
88 | ENTRY(xen_restore_fl_direct) | ||
89 | testb $X86_EFLAGS_IF>>8, %ah | ||
90 | setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask | ||
91 | /* Preempt here doesn't matter because that will deal with | ||
92 | any pending interrupts. The pending check may end up being | ||
93 | run on the wrong CPU, but that doesn't hurt. */ | ||
94 | |||
95 | /* check for unmasked and pending */ | ||
96 | cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending | ||
97 | jz 1f | ||
98 | 2: call check_events | ||
99 | 1: | ||
100 | ENDPATCH(xen_restore_fl_direct) | ||
101 | ret | ||
102 | ENDPROC(xen_restore_fl_direct) | ||
103 | RELOC(xen_restore_fl_direct, 2b+1) | ||
104 | |||
105 | /* | ||
106 | This is run where a normal iret would be run, with the same stack setup: | ||
107 | 8: eflags | ||
108 | 4: cs | ||
109 | esp-> 0: eip | ||
110 | |||
111 | This attempts to make sure that any pending events are dealt | ||
112 | with on return to usermode, but there is a small window in | ||
113 | which an event can happen just before entering usermode. If | ||
114 | the nested interrupt ends up setting one of the TIF_WORK_MASK | ||
115 | pending work flags, they will not be tested again before | ||
116 | returning to usermode. This means that a process can end up | ||
117 | with pending work, which will be unprocessed until the process | ||
118 | enters and leaves the kernel again, which could be an | ||
119 | unbounded amount of time. This means that a pending signal or | ||
120 | reschedule event could be indefinitely delayed. | ||
121 | |||
122 | The fix is to notice a nested interrupt in the critical | ||
123 | window, and if one occurs, then fold the nested interrupt into | ||
124 | the current interrupt stack frame, and re-process it | ||
125 | iteratively rather than recursively. This means that it will | ||
126 | exit via the normal path, and all pending work will be dealt | ||
127 | with appropriately. | ||
128 | |||
129 | Because the nested interrupt handler needs to deal with the | ||
130 | current stack state in whatever form its in, we keep things | ||
131 | simple by only using a single register which is pushed/popped | ||
132 | on the stack. | ||
133 | |||
134 | Non-direct iret could be done in the same way, but it would | ||
135 | require an annoying amount of code duplication. We'll assume | ||
136 | that direct mode will be the common case once the hypervisor | ||
137 | support becomes commonplace. | ||
138 | */ | ||
139 | ENTRY(xen_iret_direct) | ||
140 | /* test eflags for special cases */ | ||
141 | testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) | ||
142 | jnz hyper_iret | ||
143 | |||
144 | push %eax | ||
145 | ESP_OFFSET=4 # bytes pushed onto stack | ||
146 | |||
147 | /* Store vcpu_info pointer for easy access. Do it this | ||
148 | way to avoid having to reload %fs */ | ||
149 | #ifdef CONFIG_SMP | ||
150 | GET_THREAD_INFO(%eax) | ||
151 | movl TI_cpu(%eax),%eax | ||
152 | movl __per_cpu_offset(,%eax,4),%eax | ||
153 | lea per_cpu__xen_vcpu_info(%eax),%eax | ||
154 | #else | ||
155 | movl $per_cpu__xen_vcpu_info, %eax | ||
156 | #endif | ||
157 | |||
158 | /* check IF state we're restoring */ | ||
159 | testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) | ||
160 | |||
161 | /* Maybe enable events. Once this happens we could get a | ||
162 | recursive event, so the critical region starts immediately | ||
163 | afterwards. However, if that happens we don't end up | ||
164 | resuming the code, so we don't have to be worried about | ||
165 | being preempted to another CPU. */ | ||
166 | setz XEN_vcpu_info_mask(%eax) | ||
167 | xen_iret_start_crit: | ||
168 | |||
169 | /* check for unmasked and pending */ | ||
170 | cmpw $0x0001, XEN_vcpu_info_pending(%eax) | ||
171 | |||
172 | /* If there's something pending, mask events again so we | ||
173 | can jump back into xen_hypervisor_callback */ | ||
174 | sete XEN_vcpu_info_mask(%eax) | ||
175 | |||
176 | popl %eax | ||
177 | |||
178 | /* From this point on the registers are restored and the stack | ||
179 | updated, so we don't need to worry about it if we're preempted */ | ||
180 | iret_restore_end: | ||
181 | |||
182 | /* Jump to hypervisor_callback after fixing up the stack. | ||
183 | Events are masked, so jumping out of the critical | ||
184 | region is OK. */ | ||
185 | je xen_hypervisor_callback | ||
186 | |||
187 | iret | ||
188 | xen_iret_end_crit: | ||
189 | |||
190 | hyper_iret: | ||
191 | /* put this out of line since its very rarely used */ | ||
192 | jmp hypercall_page + __HYPERVISOR_iret * 32 | ||
193 | |||
194 | .globl xen_iret_start_crit, xen_iret_end_crit | ||
195 | |||
196 | /* | ||
197 | This is called by xen_hypervisor_callback in entry.S when it sees | ||
198 | that the EIP at the time of interrupt was between xen_iret_start_crit | ||
199 | and xen_iret_end_crit. We're passed the EIP in %eax so we can do | ||
200 | a more refined determination of what to do. | ||
201 | |||
202 | The stack format at this point is: | ||
203 | ---------------- | ||
204 | ss : (ss/esp may be present if we came from usermode) | ||
205 | esp : | ||
206 | eflags } outer exception info | ||
207 | cs } | ||
208 | eip } | ||
209 | ---------------- <- edi (copy dest) | ||
210 | eax : outer eax if it hasn't been restored | ||
211 | ---------------- | ||
212 | eflags } nested exception info | ||
213 | cs } (no ss/esp because we're nested | ||
214 | eip } from the same ring) | ||
215 | orig_eax }<- esi (copy src) | ||
216 | - - - - - - - - | ||
217 | fs } | ||
218 | es } | ||
219 | ds } SAVE_ALL state | ||
220 | eax } | ||
221 | : : | ||
222 | ebx } | ||
223 | ---------------- | ||
224 | return addr <- esp | ||
225 | ---------------- | ||
226 | |||
227 | In order to deliver the nested exception properly, we need to shift | ||
228 | everything from the return addr up to the error code so it | ||
229 | sits just under the outer exception info. This means that when we | ||
230 | handle the exception, we do it in the context of the outer exception | ||
231 | rather than starting a new one. | ||
232 | |||
233 | The only caveat is that if the outer eax hasn't been | ||
234 | restored yet (ie, it's still on stack), we need to insert | ||
235 | its value into the SAVE_ALL state before going on, since | ||
236 | it's usermode state which we eventually need to restore. | ||
237 | */ | ||
238 | ENTRY(xen_iret_crit_fixup) | ||
239 | /* offsets +4 for return address */ | ||
240 | |||
241 | /* | ||
242 | Paranoia: Make sure we're really coming from userspace. | ||
243 | One could imagine a case where userspace jumps into the | ||
244 | critical range address, but just before the CPU delivers a GP, | ||
245 | it decides to deliver an interrupt instead. Unlikely? | ||
246 | Definitely. Easy to avoid? Yes. The Intel documents | ||
247 | explicitly say that the reported EIP for a bad jump is the | ||
248 | jump instruction itself, not the destination, but some virtual | ||
249 | environments get this wrong. | ||
250 | */ | ||
251 | movl PT_CS+4(%esp), %ecx | ||
252 | andl $SEGMENT_RPL_MASK, %ecx | ||
253 | cmpl $USER_RPL, %ecx | ||
254 | je 2f | ||
255 | |||
256 | lea PT_ORIG_EAX+4(%esp), %esi | ||
257 | lea PT_EFLAGS+4(%esp), %edi | ||
258 | |||
259 | /* If eip is before iret_restore_end then stack | ||
260 | hasn't been restored yet. */ | ||
261 | cmp $iret_restore_end, %eax | ||
262 | jae 1f | ||
263 | |||
264 | movl 0+4(%edi),%eax /* copy EAX */ | ||
265 | movl %eax, PT_EAX+4(%esp) | ||
266 | |||
267 | lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ | ||
268 | |||
269 | /* set up the copy */ | ||
270 | 1: std | ||
271 | mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ | ||
272 | rep movsl | ||
273 | cld | ||
274 | |||
275 | lea 4(%edi),%esp /* point esp to new frame */ | ||
276 | 2: ret | ||
277 | |||
278 | |||
279 | /* | ||
280 | Force an event check by making a hypercall, | ||
281 | but preserve regs before making the call. | ||
282 | */ | ||
283 | check_events: | ||
284 | push %eax | ||
285 | push %ecx | ||
286 | push %edx | ||
287 | call force_evtchn_callback | ||
288 | pop %edx | ||
289 | pop %ecx | ||
290 | pop %eax | ||
291 | ret | ||