diff options
Diffstat (limited to 'arch/i386')
-rw-r--r-- | arch/i386/kernel/asm-offsets.c | 1 | ||||
-rw-r--r-- | arch/i386/kernel/entry.S | 16 | ||||
-rw-r--r-- | arch/i386/xen/enlighten.c | 1 | ||||
-rw-r--r-- | arch/i386/xen/xen-asm.S | 185 | ||||
-rw-r--r-- | arch/i386/xen/xen-ops.h | 1 |
5 files changed, 199 insertions, 5 deletions
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index a7c2947b3966..25f7eb513928 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c | |||
@@ -61,6 +61,7 @@ void foo(void) | |||
61 | OFFSET(TI_addr_limit, thread_info, addr_limit); | 61 | OFFSET(TI_addr_limit, thread_info, addr_limit); |
62 | OFFSET(TI_restart_block, thread_info, restart_block); | 62 | OFFSET(TI_restart_block, thread_info, restart_block); |
63 | OFFSET(TI_sysenter_return, thread_info, sysenter_return); | 63 | OFFSET(TI_sysenter_return, thread_info, sysenter_return); |
64 | OFFSET(TI_cpu, thread_info, cpu); | ||
64 | BLANK(); | 65 | BLANK(); |
65 | 66 | ||
66 | OFFSET(GDS_size, Xgt_desc_struct, size); | 67 | OFFSET(GDS_size, Xgt_desc_struct, size); |
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index ffb236544270..32980b834935 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S | |||
@@ -1030,7 +1030,21 @@ ENTRY(xen_hypervisor_callback) | |||
1030 | CFI_ADJUST_CFA_OFFSET 4 | 1030 | CFI_ADJUST_CFA_OFFSET 4 |
1031 | SAVE_ALL | 1031 | SAVE_ALL |
1032 | TRACE_IRQS_OFF | 1032 | TRACE_IRQS_OFF |
1033 | mov %esp, %eax | 1033 | |
1034 | /* Check to see if we got the event in the critical | ||
1035 | region in xen_iret_direct, after we've reenabled | ||
1036 | events and checked for pending events. This simulates | ||
1037 | iret instruction's behaviour where it delivers a | ||
1038 | pending interrupt when enabling interrupts. */ | ||
1039 | movl PT_EIP(%esp),%eax | ||
1040 | cmpl $xen_iret_start_crit,%eax | ||
1041 | jb 1f | ||
1042 | cmpl $xen_iret_end_crit,%eax | ||
1043 | jae 1f | ||
1044 | |||
1045 | call xen_iret_crit_fixup | ||
1046 | |||
1047 | 1: mov %esp, %eax | ||
1034 | call xen_evtchn_do_upcall | 1048 | call xen_evtchn_do_upcall |
1035 | jmp ret_from_intr | 1049 | jmp ret_from_intr |
1036 | CFI_ENDPROC | 1050 | CFI_ENDPROC |
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index 4fa62a4cb7cc..9a8c1181c001 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c | |||
@@ -838,6 +838,7 @@ void __init xen_setup_vcpu_info_placement(void) | |||
838 | paravirt_ops.irq_disable = xen_irq_disable_direct; | 838 | paravirt_ops.irq_disable = xen_irq_disable_direct; |
839 | paravirt_ops.irq_enable = xen_irq_enable_direct; | 839 | paravirt_ops.irq_enable = xen_irq_enable_direct; |
840 | paravirt_ops.read_cr2 = xen_read_cr2_direct; | 840 | paravirt_ops.read_cr2 = xen_read_cr2_direct; |
841 | paravirt_ops.iret = xen_iret_direct; | ||
841 | } | 842 | } |
842 | } | 843 | } |
843 | 844 | ||
diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S index dc4d36d51bc1..1a43b60c0c62 100644 --- a/arch/i386/xen/xen-asm.S +++ b/arch/i386/xen/xen-asm.S | |||
@@ -12,15 +12,21 @@ | |||
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/linkage.h> | 14 | #include <linux/linkage.h> |
15 | |||
15 | #include <asm/asm-offsets.h> | 16 | #include <asm/asm-offsets.h> |
16 | #include <asm/thread_info.h> | 17 | #include <asm/thread_info.h> |
17 | #include <asm/percpu.h> | 18 | #include <asm/percpu.h> |
18 | #include <asm/asm-offsets.h> | ||
19 | #include <asm/processor-flags.h> | 19 | #include <asm/processor-flags.h> |
20 | #include <asm/segment.h> | ||
21 | |||
22 | #include <xen/interface/xen.h> | ||
20 | 23 | ||
21 | #define RELOC(x, v) .globl x##_reloc; x##_reloc=v | 24 | #define RELOC(x, v) .globl x##_reloc; x##_reloc=v |
22 | #define ENDPATCH(x) .globl x##_end; x##_end=. | 25 | #define ENDPATCH(x) .globl x##_end; x##_end=. |
23 | 26 | ||
27 | /* Pseudo-flag used for virtual NMI, which we don't implement yet */ | ||
28 | #define XEN_EFLAGS_NMI 0x80000000 | ||
29 | |||
24 | /* | 30 | /* |
25 | Enable events. This clears the event mask and tests the pending | 31 | Enable events. This clears the event mask and tests the pending |
26 | event status with a single 'and' operation. If there are pending | 32 | event status with a single 'and' operation. If there are pending |
@@ -81,13 +87,12 @@ ENDPATCH(xen_save_fl_direct) | |||
81 | */ | 87 | */ |
82 | ENTRY(xen_restore_fl_direct) | 88 | ENTRY(xen_restore_fl_direct) |
83 | testb $X86_EFLAGS_IF>>8, %ah | 89 | testb $X86_EFLAGS_IF>>8, %ah |
84 | setz %al | 90 | setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask |
85 | movb %al, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask | ||
86 | /* Preempt here doesn't matter because that will deal with | 91 | /* Preempt here doesn't matter because that will deal with |
87 | any pending interrupts. The pending check may end up being | 92 | any pending interrupts. The pending check may end up being |
88 | run on the wrong CPU, but that doesn't hurt. */ | 93 | run on the wrong CPU, but that doesn't hurt. */ |
89 | 94 | ||
90 | /* check for pending but unmasked */ | 95 | /* check for unmasked and pending */ |
91 | cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending | 96 | cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending |
92 | jz 1f | 97 | jz 1f |
93 | 2: call check_events | 98 | 2: call check_events |
@@ -97,6 +102,178 @@ ENDPATCH(xen_restore_fl_direct) | |||
97 | ENDPROC(xen_restore_fl_direct) | 102 | ENDPROC(xen_restore_fl_direct) |
98 | RELOC(xen_restore_fl_direct, 2b+1) | 103 | RELOC(xen_restore_fl_direct, 2b+1) |
99 | 104 | ||
105 | /* | ||
106 | This is run where a normal iret would be run, with the same stack setup: | ||
107 | 8: eflags | ||
108 | 4: cs | ||
109 | esp-> 0: eip | ||
110 | |||
111 | This attempts to make sure that any pending events are dealt | ||
112 | with on return to usermode, but there is a small window in | ||
113 | which an event can happen just before entering usermode. If | ||
114 | the nested interrupt ends up setting one of the TIF_WORK_MASK | ||
115 | pending work flags, they will not be tested again before | ||
116 | returning to usermode. This means that a process can end up | ||
117 | with pending work, which will be unprocessed until the process | ||
118 | enters and leaves the kernel again, which could be an | ||
119 | unbounded amount of time. This means that a pending signal or | ||
120 | reschedule event could be indefinitely delayed. | ||
121 | |||
122 | The fix is to notice a nested interrupt in the critical | ||
123 | window, and if one occurs, then fold the nested interrupt into | ||
124 | the current interrupt stack frame, and re-process it | ||
125 | iteratively rather than recursively. This means that it will | ||
126 | exit via the normal path, and all pending work will be dealt | ||
127 | with appropriately. | ||
128 | |||
129 | Because the nested interrupt handler needs to deal with the | ||
130 | current stack state in whatever form it's in, we keep things | ||
131 | simple by only using a single register which is pushed/popped | ||
132 | on the stack. | ||
133 | |||
134 | Non-direct iret could be done in the same way, but it would | ||
135 | require an annoying amount of code duplication. We'll assume | ||
136 | that direct mode will be the common case once the hypervisor | ||
137 | support becomes commonplace. | ||
138 | */ | ||
139 | ENTRY(xen_iret_direct) | ||
140 | /* test eflags for special cases */ | ||
141 | testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) | ||
142 | jnz hyper_iret | ||
143 | |||
144 | push %eax | ||
145 | ESP_OFFSET=4 # bytes pushed onto stack | ||
146 | |||
147 | /* Store vcpu_info pointer for easy access. Do it this | ||
148 | way to avoid having to reload %fs */ | ||
149 | #ifdef CONFIG_SMP | ||
150 | GET_THREAD_INFO(%eax) | ||
151 | movl TI_cpu(%eax),%eax | ||
152 | movl __per_cpu_offset(,%eax,4),%eax | ||
153 | lea per_cpu__xen_vcpu_info(%eax),%eax | ||
154 | #else | ||
155 | movl $per_cpu__xen_vcpu_info, %eax | ||
156 | #endif | ||
157 | |||
158 | /* check IF state we're restoring */ | ||
159 | testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) | ||
160 | |||
161 | /* Maybe enable events. Once this happens we could get a | ||
162 | recursive event, so the critical region starts immediately | ||
163 | afterwards. However, if that happens we don't end up | ||
164 | resuming the code, so we don't have to be worried about | ||
165 | being preempted to another CPU. */ | ||
166 | setz XEN_vcpu_info_mask(%eax) | ||
167 | xen_iret_start_crit: | ||
168 | |||
169 | /* check for unmasked and pending */ | ||
170 | cmpw $0x0001, XEN_vcpu_info_pending(%eax) | ||
171 | |||
172 | /* If there's something pending, mask events again so we | ||
173 | can jump back into xen_hypervisor_callback */ | ||
174 | sete XEN_vcpu_info_mask(%eax) | ||
175 | |||
176 | popl %eax | ||
177 | |||
178 | /* From this point on the registers are restored and the stack | ||
179 | updated, so we don't need to worry about it if we're preempted */ | ||
180 | iret_restore_end: | ||
181 | |||
182 | /* Jump to hypervisor_callback after fixing up the stack. | ||
183 | Events are masked, so jumping out of the critical | ||
184 | region is OK. */ | ||
185 | je xen_hypervisor_callback | ||
186 | |||
187 | iret | ||
188 | xen_iret_end_crit: | ||
189 | |||
190 | hyper_iret: | ||
191 | /* put this out of line since it's very rarely used */ | ||
192 | jmp hypercall_page + __HYPERVISOR_iret * 32 | ||
193 | |||
194 | .globl xen_iret_start_crit, xen_iret_end_crit | ||
195 | |||
196 | /* | ||
197 | This is called by xen_hypervisor_callback in entry.S when it sees | ||
198 | that the EIP at the time of interrupt was between xen_iret_start_crit | ||
199 | and xen_iret_end_crit. We're passed the EIP in %eax so we can do | ||
200 | a more refined determination of what to do. | ||
201 | |||
202 | The stack format at this point is: | ||
203 | ---------------- | ||
204 | ss : (ss/esp may be present if we came from usermode) | ||
205 | esp : | ||
206 | eflags } outer exception info | ||
207 | cs } | ||
208 | eip } | ||
209 | ---------------- <- edi (copy dest) | ||
210 | eax : outer eax if it hasn't been restored | ||
211 | ---------------- | ||
212 | eflags } nested exception info | ||
213 | cs } (no ss/esp because we're nested | ||
214 | eip } from the same ring) | ||
215 | orig_eax }<- esi (copy src) | ||
216 | - - - - - - - - | ||
217 | fs } | ||
218 | es } | ||
219 | ds } SAVE_ALL state | ||
220 | eax } | ||
221 | : : | ||
222 | ebx } | ||
223 | ---------------- | ||
224 | return addr <- esp | ||
225 | ---------------- | ||
226 | |||
227 | In order to deliver the nested exception properly, we need to shift | ||
228 | everything from the return addr up to the error code so it | ||
229 | sits just under the outer exception info. This means that when we | ||
230 | handle the exception, we do it in the context of the outer exception | ||
231 | rather than starting a new one. | ||
232 | |||
233 | The only caveat is that if the outer eax hasn't been | ||
234 | restored yet (ie, it's still on stack), we need to insert | ||
235 | its value into the SAVE_ALL state before going on, since | ||
236 | it's usermode state which we eventually need to restore. | ||
237 | */ | ||
238 | ENTRY(xen_iret_crit_fixup) | ||
239 | /* offsets +4 for return address */ | ||
240 | |||
241 | /* | ||
242 | Paranoia: Make sure we're really coming from kernel space. | ||
243 | One could imagine a case where userspace jumps into the | ||
244 | critical range address, but just before the CPU delivers a GP, | ||
245 | it decides to deliver an interrupt instead. Unlikely? | ||
246 | Definitely. Easy to avoid? Yes. The Intel documents | ||
247 | explicitly say that the reported EIP for a bad jump is the | ||
248 | jump instruction itself, not the destination, but some virtual | ||
249 | environments get this wrong. | ||
250 | */ | ||
251 | movl PT_CS+4(%esp), %ecx | ||
252 | andl $SEGMENT_RPL_MASK, %ecx | ||
253 | cmpl $USER_RPL, %ecx | ||
254 | je 2f | ||
255 | |||
256 | lea PT_ORIG_EAX+4(%esp), %esi | ||
257 | lea PT_EFLAGS+4(%esp), %edi | ||
258 | |||
259 | /* If eip is before iret_restore_end then stack | ||
260 | hasn't been restored yet. */ | ||
261 | cmp $iret_restore_end, %eax | ||
262 | jae 1f | ||
263 | |||
264 | movl 0+4(%edi),%eax /* copy EAX */ | ||
265 | movl %eax, PT_EAX+4(%esp) | ||
266 | |||
267 | lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ | ||
268 | |||
269 | /* set up the copy */ | ||
270 | 1: std | ||
271 | mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ | ||
272 | rep movsl | ||
273 | cld | ||
274 | |||
275 | lea 4(%edi),%esp /* point esp to new frame */ | ||
276 | 2: ret | ||
100 | 277 | ||
101 | 278 | ||
102 | /* | 279 | /* |
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h index 33e4c8a16289..b9aaea45f07f 100644 --- a/arch/i386/xen/xen-ops.h +++ b/arch/i386/xen/xen-ops.h | |||
@@ -67,4 +67,5 @@ DECL_ASM(void, xen_irq_disable_direct, void); | |||
67 | DECL_ASM(unsigned long, xen_save_fl_direct, void); | 67 | DECL_ASM(unsigned long, xen_save_fl_direct, void); |
68 | DECL_ASM(void, xen_restore_fl_direct, unsigned long); | 68 | DECL_ASM(void, xen_restore_fl_direct, unsigned long); |
69 | 69 | ||
70 | void xen_iret_direct(void); | ||
70 | #endif /* XEN_OPS_H */ | 71 | #endif /* XEN_OPS_H */ |