diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-07-21 13:34:25 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-07-21 13:34:25 -0400 |
commit | 72a73693aac5ae82850cedc69fa5d264ca977c13 (patch) | |
tree | 80ab4bad93a2204ff264c0b07b63449a91410585 /arch/x86/xen/xen-asm_32.S | |
parent | b7e6f62fe259187f2578d00960ef1b0e6ff6afd5 (diff) | |
parent | 2e2dcc7631e331cf2e8396ce452e7f01e35f1182 (diff) |
Merge branch 'x86/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (160 commits)
x86: remove extra calling to get ext cpuid level
x86: use setup_clear_cpu_cap() when disabling the lapic
KVM: fix exception entry / build bug, on 64-bit
x86: add unknown_nmi_panic kernel parameter
x86, VisWS: turn into generic arch, eliminate leftover files
x86: add ->pre_time_init to x86_quirks
x86: extend and use x86_quirks to clean up NUMAQ code
x86: introduce x86_quirks
x86: improve debug printout: add target bootmem range in early_res_to_bootmem()
Subject: devmem, x86: fix rename of CONFIG_NONPROMISC_DEVMEM
x86: remove arch_get_ram_range
x86: Add a debugfs interface to dump PAT memtype
x86: Add a arch directory for x86 under debugfs
x86: i386: reduce boot fixmap space
i386/xen: add proper unwind annotations to xen_sysenter_target
x86: reduce force_mwait visibility
x86: reduce forbid_dac's visibility
x86: fix two modpost warnings
x86: check function status in EDD boot code
x86_64: ia32_signal.c: remove signal number conversion
...
Diffstat (limited to 'arch/x86/xen/xen-asm_32.S')
-rw-r--r-- | arch/x86/xen/xen-asm_32.S | 305 |
1 files changed, 305 insertions, 0 deletions
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S new file mode 100644 index 000000000000..2497a30f41de --- /dev/null +++ b/arch/x86/xen/xen-asm_32.S | |||
@@ -0,0 +1,305 @@ | |||
1 | /* | ||
2 | Asm versions of Xen pv-ops, suitable for either direct use or inlining. | ||
3 | The inline versions are the same as the direct-use versions, with the | ||
4 | pre- and post-amble chopped off. | ||
5 | |||
6 | This code is encoded for size rather than absolute efficiency, | ||
7 | with a view to being able to inline as much as possible. | ||
8 | |||
9 | We only bother with direct forms (ie, vcpu in pda) of the operations | ||
10 | here; the indirect forms are better handled in C, since they're | ||
11 | generally too large to inline anyway. | ||
12 | */ | ||
13 | |||
14 | #include <linux/linkage.h> | ||
15 | |||
16 | #include <asm/asm-offsets.h> | ||
17 | #include <asm/thread_info.h> | ||
18 | #include <asm/percpu.h> | ||
19 | #include <asm/processor-flags.h> | ||
20 | #include <asm/segment.h> | ||
21 | |||
22 | #include <xen/interface/xen.h> | ||
23 | |||
24 | #define RELOC(x, v) .globl x##_reloc; x##_reloc=v /* export symbol x_reloc with value v */ | ||
25 | #define ENDPATCH(x) .globl x##_end; x##_end=. /* export symbol x_end = current location */ | ||
26 | |||
27 | /* Pseudo-flag used for virtual NMI, which we don't implement yet */ | ||
28 | #define XEN_EFLAGS_NMI 0x80000000 | ||
29 | |||
30 | /* | ||
31 | Enable events. This clears the event mask and then tests the pending | ||
32 | event status. If there are pending events, call check_events to | ||
33 | enter the hypervisor and get them handled. | ||
34 | */ | ||
35 | ENTRY(xen_irq_enable_direct) | ||
36 | /* Unmask events */ | ||
37 | movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask | ||
38 | |||
39 | /* Preempt here doesn't matter because that will deal with | ||
40 | any pending interrupts. The pending check may end up being | ||
41 | run on the wrong CPU, but that doesn't hurt. */ | ||
42 | |||
43 | /* Test for pending */ | ||
44 | testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending | ||
45 | jz 1f /* pending byte is zero: nothing to deliver, skip the call */ | ||
46 | |||
47 | 2: call check_events | ||
48 | 1: | ||
49 | ENDPATCH(xen_irq_enable_direct) /* end marker sits before the ret, so the ret is outside the marked body */ | ||
50 | ret | ||
51 | ENDPROC(xen_irq_enable_direct) | ||
52 | RELOC(xen_irq_enable_direct, 2b+1) /* one byte past label 2: presumably the call's displacement field */ | ||
53 | |||
54 | |||
55 | /* | ||
56 | Disabling events is simply a matter of making the event mask | ||
57 | non-zero. No pending check is needed and nothing is called. | ||
58 | */ | ||
59 | ENTRY(xen_irq_disable_direct) | ||
60 | movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask | ||
61 | ENDPATCH(xen_irq_disable_direct) /* end marker sits before the ret */ | ||
62 | ret | ||
63 | ENDPROC(xen_irq_disable_direct) | ||
64 | RELOC(xen_irq_disable_direct, 0) /* 0: presumably "nothing to relocate" (this body has no call) */ | ||
65 | |||
66 | /* | ||
67 | (xen_)save_fl is used to get the current interrupt enable status. | ||
68 | Callers expect the status to be in X86_EFLAGS_IF, and other bits | ||
69 | may be set in the return value. We take advantage of this by | ||
70 | making sure that X86_EFLAGS_IF has the right value (and other bits | ||
71 | in that byte are 0), but other bits in the return value are | ||
72 | undefined. We need to toggle the state of the bit, because | ||
73 | Xen and x86 use opposite senses (mask vs enable). | ||
74 | */ | ||
75 | ENTRY(xen_save_fl_direct) | ||
76 | testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask | ||
77 | setz %ah /* ah = 1 if the mask byte is zero (events enabled), else 0 */ | ||
78 | addb %ah,%ah /* ah = 2 or 0, i.e. bit 9 of eax; the same bit restore_fl tests as X86_EFLAGS_IF>>8 */ | ||
79 | ENDPATCH(xen_save_fl_direct) /* end marker sits before the ret */ | ||
80 | ret | ||
81 | ENDPROC(xen_save_fl_direct) | ||
82 | RELOC(xen_save_fl_direct, 0) /* 0: presumably "nothing to relocate" (this body has no call) */ | ||
83 | |||
84 | |||
85 | /* | ||
86 | In principle the caller should be passing us a value returned | ||
87 | from xen_save_fl_direct, but for robustness' sake we test only | ||
88 | the X86_EFLAGS_IF flag rather than the whole byte. After | ||
89 | setting the interrupt mask state, it checks for unmasked | ||
90 | pending events and, only if there are any, calls check_events | ||
91 | so that they get delivered. | ||
92 | */ | ||
93 | ENTRY(xen_restore_fl_direct) | ||
94 | testb $X86_EFLAGS_IF>>8, %ah | ||
95 | setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask /* mask = 1 if IF is clear in the passed flags, else 0 */ | ||
96 | /* Preempt here doesn't matter because that will deal with | ||
97 | any pending interrupts. The pending check may end up being | ||
98 | run on the wrong CPU, but that doesn't hurt. */ | ||
99 | |||
100 | /* check for unmasked and pending: the word equals 0x0001 only when pending == 1 and the next byte (assumed to be the mask) == 0 */ | ||
101 | cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending | ||
102 | jnz 1f /* not (pending and unmasked): nothing to deliver, skip the call */ | ||
103 | 2: call check_events | ||
104 | 1: | ||
105 | ENDPATCH(xen_restore_fl_direct) /* end marker sits before the ret */ | ||
106 | ret | ||
107 | ENDPROC(xen_restore_fl_direct) | ||
108 | RELOC(xen_restore_fl_direct, 2b+1) /* one byte past label 2: presumably the call's displacement field */ | ||
109 | |||
110 | /* | ||
111 | We can't use sysexit directly, because we're not running in ring0. | ||
112 | But we can easily fake it up using iret. Assuming xen_sysexit | ||
113 | is jumped to with a standard stack frame, we can just strip it | ||
114 | back to a standard iret frame and use iret. | ||
115 | */ | ||
116 | ENTRY(xen_sysexit) | ||
117 | movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ | ||
118 | orl $X86_EFLAGS_IF, PT_EFLAGS(%esp) /* set IF in the saved eflags that xen_iret will restore */ | ||
119 | lea PT_EIP(%esp), %esp /* drop the frame below eip: esp now points at eip/cs/eflags */ | ||
120 | |||
121 | jmp xen_iret /* finish through the common iret path */ | ||
122 | ENDPROC(xen_sysexit) | ||
123 | |||
124 | /* | ||
125 | This is run where a normal iret would be run, with the same stack setup: | ||
126 | 8: eflags | ||
127 | 4: cs | ||
128 | esp-> 0: eip | ||
129 | |||
130 | This attempts to make sure that any pending events are dealt | ||
131 | with on return to usermode, but there is a small window in | ||
132 | which an event can happen just before entering usermode. If | ||
133 | the nested interrupt ends up setting one of the TIF_WORK_MASK | ||
134 | pending work flags, they will not be tested again before | ||
135 | returning to usermode. This means that a process can end up | ||
136 | with pending work, which will be unprocessed until the process | ||
137 | enters and leaves the kernel again, which could be an | ||
138 | unbounded amount of time. This means that a pending signal or | ||
139 | reschedule event could be indefinitely delayed. | ||
140 | |||
141 | The fix is to notice a nested interrupt in the critical | ||
142 | window, and if one occurs, then fold the nested interrupt into | ||
143 | the current interrupt stack frame, and re-process it | ||
144 | iteratively rather than recursively. This means that it will | ||
145 | exit via the normal path, and all pending work will be dealt | ||
146 | with appropriately. | ||
147 | |||
148 | Because the nested interrupt handler needs to deal with the | ||
149 | current stack state in whatever form it's in, we keep things | ||
150 | simple by only using a single register which is pushed/popped | ||
151 | on the stack. | ||
152 | */ | ||
153 | ENTRY(xen_iret) | ||
154 | /* test eflags for special cases (8(%esp) is the saved eflags, see layout above) */ | ||
155 | testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) | ||
156 | jnz hyper_iret | ||
157 | |||
158 | push %eax | ||
159 | ESP_OFFSET=4 # bytes pushed onto stack | ||
160 | |||
161 | /* Store vcpu_info pointer for easy access. Do it this | ||
162 | way to avoid having to reload %fs */ | ||
163 | #ifdef CONFIG_SMP | ||
164 | GET_THREAD_INFO(%eax) | ||
165 | movl TI_cpu(%eax),%eax | ||
166 | movl __per_cpu_offset(,%eax,4),%eax | ||
167 | mov per_cpu__xen_vcpu(%eax),%eax | ||
168 | #else | ||
169 | movl per_cpu__xen_vcpu, %eax | ||
170 | #endif | ||
171 | |||
172 | /* check IF state we're restoring (second byte of the saved eflags, above the pushed eax) */ | ||
173 | testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) | ||
174 | |||
175 | /* Maybe enable events. Once this happens we could get a | ||
176 | recursive event, so the critical region starts immediately | ||
177 | afterwards. However, if that happens we don't end up | ||
178 | resuming the code, so we don't have to be worried about | ||
179 | being preempted to another CPU. */ | ||
180 | setz XEN_vcpu_info_mask(%eax) | ||
181 | xen_iret_start_crit: | ||
182 | |||
183 | /* check for unmasked and pending */ | ||
184 | cmpw $0x0001, XEN_vcpu_info_pending(%eax) | ||
185 | |||
186 | /* If there's something pending, mask events again so we can jump back into | ||
187 | xen_hypervisor_callback. Otherwise leave the mask as set above, so a frame with IF clear stays masked. */ | ||
188 | jne 3f /* jne/movb/popl leave the flags alone, so ZF from the cmpw still reaches the je below */ | ||
189 | movb $1, XEN_vcpu_info_mask(%eax) | ||
190 | 3: popl %eax | ||
191 | |||
192 | /* From this point on the registers are restored and the stack | ||
193 | updated, so we don't need to worry about it if we're preempted */ | ||
194 | iret_restore_end: | ||
195 | |||
196 | /* Jump to hypervisor_callback after fixing up the stack. | ||
197 | Events are masked, so jumping out of the critical | ||
198 | region is OK. */ | ||
199 | je xen_hypervisor_callback | ||
200 | |||
201 | 1: iret | ||
202 | xen_iret_end_crit: | ||
203 | .section __ex_table,"a" | ||
204 | .align 4 | ||
205 | .long 1b,iret_exc | ||
206 | .previous | ||
207 | |||
208 | hyper_iret: | ||
209 | /* put this out of line since it's very rarely used */ | ||
210 | jmp hypercall_page + __HYPERVISOR_iret * 32 | ||
211 | |||
212 | .globl xen_iret_start_crit, xen_iret_end_crit | ||
213 | |||
214 | /* | ||
215 | This is called by xen_hypervisor_callback in entry.S when it sees | ||
216 | that the EIP at the time of interrupt was between xen_iret_start_crit | ||
217 | and xen_iret_end_crit. We're passed the EIP in %eax so we can do | ||
218 | a more refined determination of what to do. | ||
219 | |||
220 | The stack format at this point is: | ||
221 | ---------------- | ||
222 | ss : (ss/esp may be present if we came from usermode) | ||
223 | esp : | ||
224 | eflags } outer exception info | ||
225 | cs } | ||
226 | eip } | ||
227 | ---------------- <- edi (copy dest) | ||
228 | eax : outer eax if it hasn't been restored | ||
229 | ---------------- | ||
230 | eflags } nested exception info | ||
231 | cs } (no ss/esp because we're nested | ||
232 | eip } from the same ring) | ||
233 | orig_eax }<- esi (copy src) | ||
234 | - - - - - - - - | ||
235 | fs } | ||
236 | es } | ||
237 | ds } SAVE_ALL state | ||
238 | eax } | ||
239 | : : | ||
240 | ebx }<- esp | ||
241 | ---------------- | ||
242 | |||
243 | In order to deliver the nested exception properly, we need to shift | ||
244 | everything from the return addr up to the error code so it | ||
245 | sits just under the outer exception info. This means that when we | ||
246 | handle the exception, we do it in the context of the outer exception | ||
247 | rather than starting a new one. | ||
248 | |||
249 | The only caveat is that if the outer eax hasn't been | ||
250 | restored yet (ie, it's still on stack), we need to insert | ||
251 | its value into the SAVE_ALL state before going on, since | ||
252 | it's usermode state which we eventually need to restore. | ||
253 | */ | ||
254 | ENTRY(xen_iret_crit_fixup) | ||
255 | /* | ||
256 | Paranoia: Make sure we're really coming from kernel space. | ||
257 | One could imagine a case where userspace jumps into the | ||
258 | critical range address, but just before the CPU delivers a GP, | ||
259 | it decides to deliver an interrupt instead. Unlikely? | ||
260 | Definitely. Easy to avoid? Yes. The Intel documents | ||
261 | explicitly say that the reported EIP for a bad jump is the | ||
262 | jump instruction itself, not the destination, but some virtual | ||
263 | environments get this wrong. | ||
264 | */ | ||
265 | movl PT_CS(%esp), %ecx | ||
266 | andl $SEGMENT_RPL_MASK, %ecx | ||
267 | cmpl $USER_RPL, %ecx | ||
268 | je 2f /* saved cs has user RPL: skip the fixup and go straight to the upcall */ | ||
269 | |||
270 | lea PT_ORIG_EAX(%esp), %esi /* copy source: the orig_eax slot of the nested frame */ | ||
271 | lea PT_EFLAGS(%esp), %edi /* copy dest: the nested eflags slot (adjusted below if eax is still stacked) */ | ||
272 | |||
273 | /* If eip is before iret_restore_end then stack | ||
274 | hasn't been restored yet. */ | ||
275 | cmp $iret_restore_end, %eax | ||
276 | jae 1f /* eip at or past iret_restore_end: outer eax already popped */ | ||
277 | |||
278 | movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */ | ||
279 | movl %eax, PT_EAX(%esp) /* store it in the eax slot of the saved-register frame */ | ||
280 | |||
281 | lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ | ||
282 | |||
283 | /* set up the copy */ | ||
284 | 1: std /* direction flag set: movsl steps esi/edi downwards */ | ||
285 | mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */ | ||
286 | rep movsl | ||
287 | cld /* clear the direction flag again */ | ||
288 | |||
289 | lea 4(%edi),%esp /* point esp to new frame */ | ||
290 | 2: jmp xen_do_upcall | ||
291 | |||
292 | |||
293 | /* | ||
294 | Force an event check by making a hypercall, but preserve regs | ||
295 | (eax, ecx and edx are saved and restored) around the call. | ||
296 | */ | ||
297 | check_events: | ||
298 | push %eax | ||
299 | push %ecx | ||
300 | push %edx | ||
301 | call force_evtchn_callback | ||
302 | pop %edx /* restore in reverse order of the pushes */ | ||
303 | pop %ecx | ||
304 | pop %eax | ||
305 | ret | ||