author    Andy Lutomirski <luto@amacapital.net>  2014-12-08 16:55:20 -0500
committer Ingo Molnar <mingo@kernel.org>         2014-12-11 05:40:08 -0500
commit    f647d7c155f069c1a068030255c300663516420e (patch)
tree      7232523350a4f0b97e85d67b7e11176f622cbc82 /arch/x86/kernel/process_64.c
parent    29258cf49eb794f00989fc47da8700759a42778b (diff)
x86_64, switch_to(): Load TLS descriptors before switching DS and ES
Otherwise, if buggy user code points DS or ES into the TLS
array, they would be corrupted after a context switch.

This also significantly improves the comments and documents some
gotchas in the code.

Before this patch, both tests below failed.  With this patch, the
es test passes, although the gsbase test still fails.

 ----- begin es test -----

/*
 * Copyright (c) 2014 Andy Lutomirski
 * GPL v2
 */

#include <stdio.h>
#include <unistd.h>
#include <err.h>
#include <sys/syscall.h>
#include <asm/ldt.h>

static unsigned short GDT3(int idx)
{
	return (idx << 3) | 3;
}

static int create_tls(int idx, unsigned int base)
{
	struct user_desc desc = {
		.entry_number    = idx,
		.base_addr       = base,
		.limit           = 0xfffff,
		.seg_32bit       = 1,
		.contents        = 0,		/* Data, grow-up */
		.read_exec_only  = 0,
		.limit_in_pages  = 1,
		.seg_not_present = 0,
		.useable         = 0,
	};

	if (syscall(SYS_set_thread_area, &desc) != 0)
		err(1, "set_thread_area");

	return desc.entry_number;
}

int main()
{
	int idx = create_tls(-1, 0);
	printf("Allocated GDT index %d\n", idx);

	unsigned short orig_es;
	asm volatile ("mov %%es,%0" : "=rm" (orig_es));

	int errors = 0;
	int total = 1000;
	for (int i = 0; i < total; i++) {
		asm volatile ("mov %0,%%es" : : "rm" (GDT3(idx)));
		usleep(100);

		unsigned short es;
		asm volatile ("mov %%es,%0" : "=rm" (es));
		asm volatile ("mov %0,%%es" : : "rm" (orig_es));
		if (es != GDT3(idx)) {
			if (errors == 0)
				printf("[FAIL]\tES changed from 0x%hx to 0x%hx\n",
				       GDT3(idx), es);
			errors++;
		}
	}

	if (errors) {
		printf("[FAIL]\tES was corrupted %d/%d times\n", errors, total);
		return 1;
	} else {
		printf("[OK]\tES was preserved\n");
		return 0;
	}
}

 ----- end es test -----

 ----- begin gsbase test -----

/*
 * gsbase.c, a gsbase test
 * Copyright (c) 2014 Andy Lutomirski
 * GPL v2
 */

#include <stdio.h>
#include <unistd.h>
#include <err.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <asm/prctl.h>

static unsigned char *testptr, *testptr2;

static unsigned char read_gs_testvals(void)
{
	unsigned char ret;
	asm volatile ("movb %%gs:%1, %0" : "=r" (ret) : "m" (*testptr));
	return ret;
}

int main()
{
	int errors = 0;

	testptr = mmap((void *)0x200000000UL, 1, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
	if (testptr == MAP_FAILED)
		err(1, "mmap");

	testptr2 = mmap((void *)0x300000000UL, 1, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
	if (testptr2 == MAP_FAILED)
		err(1, "mmap");

	*testptr = 0;
	*testptr2 = 1;

	if (syscall(SYS_arch_prctl, ARCH_SET_GS,
		    (unsigned long)testptr2 - (unsigned long)testptr) != 0)
		err(1, "ARCH_SET_GS");

	usleep(100);

	if (read_gs_testvals() == 1) {
		printf("[OK]\tARCH_SET_GS worked\n");
	} else {
		printf("[FAIL]\tARCH_SET_GS failed\n");
		errors++;
	}

	asm volatile ("mov %0,%%gs" : : "r" (0));

	if (read_gs_testvals() == 0) {
		printf("[OK]\tWriting 0 to gs worked\n");
	} else {
		printf("[FAIL]\tWriting 0 to gs failed\n");
		errors++;
	}

	usleep(100);

	if (read_gs_testvals() == 0) {
		printf("[OK]\tgsbase is still zero\n");
	} else {
		printf("[FAIL]\tgsbase was corrupted\n");
		errors++;
	}

	return errors == 0 ? 0 : 1;
}

 ----- end gsbase test -----

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: <stable@vger.kernel.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/509d27c9fec78217691c3dad91cec87e1006b34a.1418075657.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
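For readers unfamiliar with the selector arithmetic in GDT3() above, here is a
standalone sketch (mine, not part of the patch) of how an x86 segment selector
is assembled from a GDT index, the table-indicator bit, and the requested
privilege level.  The TLS slot numbers are the x86_64 values of
GDT_ENTRY_TLS_MIN..GDT_ENTRY_TLS_MAX; the tests themselves build as ordinary
user programs, e.g. "gcc -O2 -o gsbase gsbase.c".

/*
 * Sketch (not part of the patch): layout of the selectors that GDT3()
 * builds in the es test above.
 *
 *   15          3   2   1 0
 *   +------------+----+-----+
 *   |   index    | TI | RPL |
 *   +------------+----+-----+
 *
 * TI = 0 selects the GDT, RPL = 3 requests user privilege.
 */
#include <stdio.h>

static unsigned short selector(int index, int ti, int rpl)
{
	return (unsigned short)((index << 3) | (ti << 2) | rpl);
}

int main(void)
{
	/* On x86_64, set_thread_area() hands out GDT entries 12..14. */
	for (int idx = 12; idx <= 14; idx++)
		printf("TLS GDT entry %d -> user selector %#hx\n",
		       idx, selector(idx, 0, 3));
	return 0;
}

It prints the selectors 0x63, 0x6b, and 0x73 for the three TLS slots, which is
the range of values the es test loads into ES.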
Diffstat (limited to 'arch/x86/kernel/process_64.c')
-rw-r--r--  arch/x86/kernel/process_64.c | 101
1 file changed, 73 insertions(+), 28 deletions(-)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3ed4a68d4013..5a2c02913af3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -283,24 +283,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
 
-	/*
-	 * Reload esp0, LDT and the page table pointer:
-	 */
+	/* Reload esp0 and ss1. */
 	load_sp0(tss, next);
 
-	/*
-	 * Switch DS and ES.
-	 * This won't pick up thread selector changes, but I guess that is ok.
-	 */
-	savesegment(es, prev->es);
-	if (unlikely(next->es | prev->es))
-		loadsegment(es, next->es);
-
-	savesegment(ds, prev->ds);
-	if (unlikely(next->ds | prev->ds))
-		loadsegment(ds, next->ds);
-
-
 	/* We must save %fs and %gs before load_TLS() because
 	 * %fs and %gs may be cleared by load_TLS().
 	 *
@@ -309,41 +294,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	savesegment(fs, fsindex);
 	savesegment(gs, gsindex);
 
+	/*
+	 * Load TLS before restoring any segments so that segment loads
+	 * reference the correct GDT entries.
+	 */
 	load_TLS(next, cpu);
 
 	/*
-	 * Leave lazy mode, flushing any hypercalls made here.
-	 * This must be done before restoring TLS segments so
-	 * the GDT and LDT are properly updated, and must be
-	 * done before math_state_restore, so the TS bit is up
-	 * to date.
+	 * Leave lazy mode, flushing any hypercalls made here.  This
+	 * must be done after loading TLS entries in the GDT but before
+	 * loading segments that might reference them, and it must
+	 * be done before math_state_restore, so the TS bit is up to
+	 * date.
 	 */
 	arch_end_context_switch(next_p);
 
+	/* Switch DS and ES.
+	 *
+	 * Reading them only returns the selectors, but writing them (if
+	 * nonzero) loads the full descriptor from the GDT or LDT.  The
+	 * LDT for next is loaded in switch_mm, and the GDT is loaded
+	 * above.
+	 *
+	 * We therefore need to write new values to the segment
+	 * registers on every context switch unless both the new and old
+	 * values are zero.
+	 *
+	 * Note that we don't need to do anything for CS and SS, as
+	 * those are saved and restored as part of pt_regs.
+	 */
+	savesegment(es, prev->es);
+	if (unlikely(next->es | prev->es))
+		loadsegment(es, next->es);
+
+	savesegment(ds, prev->ds);
+	if (unlikely(next->ds | prev->ds))
+		loadsegment(ds, next->ds);
+
 	/*
 	 * Switch FS and GS.
 	 *
-	 * Segment register != 0 always requires a reload.  Also
-	 * reload when it has changed.  When prev process used 64bit
-	 * base always reload to avoid an information leak.
+	 * These are even more complicated than DS and ES: they have
+	 * 64-bit bases that are controlled by arch_prctl.  Those bases
+	 * only differ from the values in the GDT or LDT if the selector
+	 * is 0.
+	 *
+	 * Loading the segment register resets the hidden base part of
+	 * the register to 0 or the value from the GDT / LDT.  If the
+	 * next base address is zero, writing 0 to the segment register
+	 * is much faster than using wrmsr to explicitly zero the base.
+	 *
+	 * The thread_struct.fs and thread_struct.gs values are 0
+	 * if the fs and gs bases respectively are not overridden
+	 * from the values implied by fsindex and gsindex.  They
+	 * are nonzero, and store the nonzero base addresses, if
+	 * the bases are overridden.
+	 *
+	 * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
+	 * be impossible.
+	 *
+	 * Therefore we need to reload the segment registers if either
+	 * the old or new selector is nonzero, and we need to override
+	 * the base address if the next thread expects it to be overridden.
+	 *
+	 * This code is unnecessarily slow in the case where the old and
+	 * new indexes are zero and the new base is nonzero -- it will
+	 * unnecessarily write 0 to the selector before writing the new
+	 * base address.
+	 *
+	 * Note: This all depends on arch_prctl being the only way that
+	 * user code can override the segment base.  Once wrfsbase and
+	 * wrgsbase are enabled, most of this code will need to change.
 	 */
 	if (unlikely(fsindex | next->fsindex | prev->fs)) {
 		loadsegment(fs, next->fsindex);
+
 		/*
-		 * Check if the user used a selector != 0; if yes
-		 * clear 64bit base, since overloaded base is always
-		 * mapped to the Null selector
+		 * If user code wrote a nonzero value to FS, then it also
+		 * cleared the overridden base address.
+		 *
+		 * XXX: if user code wrote 0 to FS and cleared the base
+		 * address itself, we won't notice and we'll incorrectly
+		 * restore the prior base address next time we reschedule
+		 * the process.
 		 */
 		if (fsindex)
 			prev->fs = 0;
 	}
-	/* when next process has a 64bit base use it */
 	if (next->fs)
 		wrmsrl(MSR_FS_BASE, next->fs);
 	prev->fsindex = fsindex;
 
 	if (unlikely(gsindex | next->gsindex | prev->gs)) {
 		load_gs_index(next->gsindex);
+
+		/* This works (and fails) the same way as fsindex above. */
 		if (gsindex)
 			prev->gs = 0;
 	}
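To make the reload conditions in the new FS comment concrete, here is a minimal
user-space sketch (mine, not kernel code) of the decision logic the patched
__switch_to() applies to FS.  The struct and names are invented; the privileged
loadsegment()/wrmsrl() steps are only modeled as prints.

/*
 * Sketch of the FS reload decision in the patched __switch_to().
 * "sel" stands in for fsindex and "base" for thread_struct.fs
 * (nonzero only when the base was overridden via arch_prctl).
 */
#include <stdio.h>

struct seg_state {
	unsigned short sel;	/* saved selector (fsindex) */
	unsigned long base;	/* arch_prctl-overridden base, or 0 */
};

static void switch_fs(struct seg_state *prev, const struct seg_state *next)
{
	/* Reload the selector unless old selector, new selector,
	 * and old overridden base are all zero. */
	if (prev->sel | next->sel | prev->base) {
		printf("loadsegment(fs, %#hx)\n", next->sel);
		/* A nonzero old selector implies user code already
		 * discarded the old overridden base. */
		if (prev->sel)
			prev->base = 0;
	}
	/* Restore an arch_prctl-set base, if the next thread has one. */
	if (next->base)
		printf("wrmsrl(MSR_FS_BASE, %#lx)\n", next->base);
}

int main(void)
{
	struct seg_state prev = { .sel = 0, .base = 0x12345000 };
	struct seg_state next = { .sel = 0, .base = 0x67890000 };

	switch_fs(&prev, &next);
	return 0;
}

Run as-is, it prints a loadsegment-then-wrmsrl pair: both selectors are zero
but the old base is overridden, so the selector is written to 0 before the new
base is installed -- the "unnecessarily slow" case the comment calls out.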