diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-10 17:24:20 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-10 17:24:20 -0500 |
commit | 3100e448e7d74489a96cb7b45d88fe6962774eaa (patch) | |
tree | 53e46a702bd191ca43639b560d2bb1d3b0ad18c8 /arch/x86/kernel | |
parent | c9f861c77269bc9950c16c6404a9476062241671 (diff) | |
parent | 26893107aa717cd11010f0c278d02535defa1ac9 (diff) |
Merge branch 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 vdso updates from Ingo Molnar:
"Various vDSO updates from Andy Lutomirski, mostly cleanups and
reorganization to improve maintainability, but also some
micro-optimizations and robustization changes"
* 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86_64/vsyscall: Restore orig_ax after vsyscall seccomp
x86_64: Add a comment explaining the TASK_SIZE_MAX guard page
x86_64,vsyscall: Make vsyscall emulation configurable
x86_64, vsyscall: Rewrite comment and clean up headers in vsyscall code
x86_64, vsyscall: Turn vsyscalls all the way off when vsyscall==none
x86,vdso: Use LSL unconditionally for vgetcpu
x86: vdso: Fix build with older gcc
x86_64/vdso: Clean up vgetcpu init and merge the vdso initcalls
x86_64/vdso: Remove jiffies from the vvar page
x86/vdso: Make the PER_CPU segment 32 bits
x86/vdso: Make the PER_CPU segment start out accessed
x86/vdso: Change the PER_CPU segment to use struct desc_struct
x86_64/vdso: Move getcpu code from vsyscall_64.c to vdso/vma.c
x86_64/vsyscall: Move all of the gate_area code to vsyscall_64.c
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/Makefile | 3 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 10 | ||||
-rw-r--r-- | arch/x86/kernel/setup.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/time.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/vsyscall_64.c | 147 |
5 files changed, 66 insertions, 98 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8f1e77440b2b..5d4502c8b983 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -28,8 +28,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o | |||
28 | obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o | 28 | obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o |
29 | obj-$(CONFIG_X86_64) += mcount_64.o | 29 | obj-$(CONFIG_X86_64) += mcount_64.o |
30 | obj-y += syscall_$(BITS).o vsyscall_gtod.o | 30 | obj-y += syscall_$(BITS).o vsyscall_gtod.o |
31 | obj-$(CONFIG_X86_64) += vsyscall_64.o | 31 | obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o |
32 | obj-$(CONFIG_X86_64) += vsyscall_emu_64.o | ||
33 | obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o | 32 | obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o |
34 | obj-$(CONFIG_SYSFS) += ksysfs.o | 33 | obj-$(CONFIG_SYSFS) += ksysfs.o |
35 | obj-y += bootflag.o e820.o | 34 | obj-y += bootflag.o e820.o |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cfa9b5b2c27a..c6049650c093 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -958,14 +958,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) | |||
958 | } | 958 | } |
959 | 959 | ||
960 | #ifdef CONFIG_X86_64 | 960 | #ifdef CONFIG_X86_64 |
961 | static void vgetcpu_set_mode(void) | ||
962 | { | ||
963 | if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) | ||
964 | vgetcpu_mode = VGETCPU_RDTSCP; | ||
965 | else | ||
966 | vgetcpu_mode = VGETCPU_LSL; | ||
967 | } | ||
968 | |||
969 | #ifdef CONFIG_IA32_EMULATION | 961 | #ifdef CONFIG_IA32_EMULATION |
970 | /* May not be __init: called during resume */ | 962 | /* May not be __init: called during resume */ |
971 | static void syscall32_cpu_init(void) | 963 | static void syscall32_cpu_init(void) |
@@ -1008,8 +1000,6 @@ void __init identify_boot_cpu(void) | |||
1008 | #ifdef CONFIG_X86_32 | 1000 | #ifdef CONFIG_X86_32 |
1009 | sysenter_setup(); | 1001 | sysenter_setup(); |
1010 | enable_sep_cpu(); | 1002 | enable_sep_cpu(); |
1011 | #else | ||
1012 | vgetcpu_set_mode(); | ||
1013 | #endif | 1003 | #endif |
1014 | cpu_detect_tlb(&boot_cpu_data); | 1004 | cpu_detect_tlb(&boot_cpu_data); |
1015 | } | 1005 | } |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 214245d6b996..ab4734e5411d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1192,9 +1192,7 @@ void __init setup_arch(char **cmdline_p) | |||
1192 | 1192 | ||
1193 | tboot_probe(); | 1193 | tboot_probe(); |
1194 | 1194 | ||
1195 | #ifdef CONFIG_X86_64 | ||
1196 | map_vsyscall(); | 1195 | map_vsyscall(); |
1197 | #endif | ||
1198 | 1196 | ||
1199 | generic_apic_probe(); | 1197 | generic_apic_probe(); |
1200 | 1198 | ||
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 0fa29609b2c4..25adc0e16eaa 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -23,7 +23,7 @@ | |||
23 | #include <asm/time.h> | 23 | #include <asm/time.h> |
24 | 24 | ||
25 | #ifdef CONFIG_X86_64 | 25 | #ifdef CONFIG_X86_64 |
26 | __visible DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES; | 26 | __visible volatile unsigned long jiffies __cacheline_aligned = INITIAL_JIFFIES; |
27 | #endif | 27 | #endif |
28 | 28 | ||
29 | unsigned long profile_pc(struct pt_regs *regs) | 29 | unsigned long profile_pc(struct pt_regs *regs) |
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 957779f4eb40..2dcc6ff6fdcc 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -1,59 +1,43 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net> | ||
3 | * | ||
4 | * Based on the original implementation which is: | ||
2 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE | 5 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE |
3 | * Copyright 2003 Andi Kleen, SuSE Labs. | 6 | * Copyright 2003 Andi Kleen, SuSE Labs. |
4 | * | 7 | * |
5 | * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ] | 8 | * Parts of the original code have been moved to arch/x86/vdso/vma.c |
9 | * | ||
10 | * This file implements vsyscall emulation. vsyscalls are a legacy ABI: | ||
11 | * Userspace can request certain kernel services by calling fixed | ||
12 | * addresses. This concept is problematic: | ||
6 | * | 13 | * |
7 | * Thanks to hpa@transmeta.com for some useful hint. | 14 | * - It interferes with ASLR. |
8 | * Special thanks to Ingo Molnar for his early experience with | 15 | * - It's awkward to write code that lives in kernel addresses but is |
9 | * a different vsyscall implementation for Linux/IA32 and for the name. | 16 | * callable by userspace at fixed addresses. |
17 | * - The whole concept is impossible for 32-bit compat userspace. | ||
18 | * - UML cannot easily virtualize a vsyscall. | ||
10 | * | 19 | * |
11 | * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located | 20 | * As of mid-2014, I believe that there is no new userspace code that |
12 | * at virtual address -10Mbyte+1024bytes etc... There are at max 4 | 21 | * will use a vsyscall if the vDSO is present. I hope that there will |
13 | * vsyscalls. One vsyscall can reserve more than 1 slot to avoid | 22 | * soon be no new userspace code that will ever use a vsyscall. |
14 | * jumping out of line if necessary. We cannot add more with this | ||
15 | * mechanism because older kernels won't return -ENOSYS. | ||
16 | * | 23 | * |
17 | * Note: the concept clashes with user mode linux. UML users should | 24 | * The code in this file emulates vsyscalls when notified of a page |
18 | * use the vDSO. | 25 | * fault to a vsyscall address. |
19 | */ | 26 | */ |
20 | 27 | ||
21 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
22 | |||
23 | #include <linux/time.h> | ||
24 | #include <linux/init.h> | ||
25 | #include <linux/kernel.h> | 28 | #include <linux/kernel.h> |
26 | #include <linux/timer.h> | 29 | #include <linux/timer.h> |
27 | #include <linux/seqlock.h> | ||
28 | #include <linux/jiffies.h> | ||
29 | #include <linux/sysctl.h> | ||
30 | #include <linux/topology.h> | ||
31 | #include <linux/timekeeper_internal.h> | ||
32 | #include <linux/getcpu.h> | ||
33 | #include <linux/cpu.h> | ||
34 | #include <linux/smp.h> | ||
35 | #include <linux/notifier.h> | ||
36 | #include <linux/syscalls.h> | 30 | #include <linux/syscalls.h> |
37 | #include <linux/ratelimit.h> | 31 | #include <linux/ratelimit.h> |
38 | 32 | ||
39 | #include <asm/vsyscall.h> | 33 | #include <asm/vsyscall.h> |
40 | #include <asm/pgtable.h> | ||
41 | #include <asm/compat.h> | ||
42 | #include <asm/page.h> | ||
43 | #include <asm/unistd.h> | 34 | #include <asm/unistd.h> |
44 | #include <asm/fixmap.h> | 35 | #include <asm/fixmap.h> |
45 | #include <asm/errno.h> | ||
46 | #include <asm/io.h> | ||
47 | #include <asm/segment.h> | ||
48 | #include <asm/desc.h> | ||
49 | #include <asm/topology.h> | ||
50 | #include <asm/traps.h> | 36 | #include <asm/traps.h> |
51 | 37 | ||
52 | #define CREATE_TRACE_POINTS | 38 | #define CREATE_TRACE_POINTS |
53 | #include "vsyscall_trace.h" | 39 | #include "vsyscall_trace.h" |
54 | 40 | ||
55 | DEFINE_VVAR(int, vgetcpu_mode); | ||
56 | |||
57 | static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; | 41 | static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; |
58 | 42 | ||
59 | static int __init vsyscall_setup(char *str) | 43 | static int __init vsyscall_setup(char *str) |
@@ -222,6 +206,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) | |||
222 | "seccomp tried to change syscall nr or ip"); | 206 | "seccomp tried to change syscall nr or ip"); |
223 | do_exit(SIGSYS); | 207 | do_exit(SIGSYS); |
224 | } | 208 | } |
209 | regs->orig_ax = -1; | ||
225 | if (tmp) | 210 | if (tmp) |
226 | goto do_ret; /* skip requested */ | 211 | goto do_ret; /* skip requested */ |
227 | 212 | ||
@@ -284,46 +269,54 @@ sigsegv: | |||
284 | } | 269 | } |
285 | 270 | ||
286 | /* | 271 | /* |
287 | * Assume __initcall executes before all user space. Hopefully kmod | 272 | * A pseudo VMA to allow ptrace access for the vsyscall page. This only |
288 | * doesn't violate that. We'll find out if it does. | 273 | * covers the 64bit vsyscall page now. 32bit has a real VMA now and does |
274 | * not need special handling anymore: | ||
289 | */ | 275 | */ |
290 | static void vsyscall_set_cpu(int cpu) | 276 | static const char *gate_vma_name(struct vm_area_struct *vma) |
291 | { | 277 | { |
292 | unsigned long d; | 278 | return "[vsyscall]"; |
293 | unsigned long node = 0; | ||
294 | #ifdef CONFIG_NUMA | ||
295 | node = cpu_to_node(cpu); | ||
296 | #endif | ||
297 | if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) | ||
298 | write_rdtscp_aux((node << 12) | cpu); | ||
299 | |||
300 | /* | ||
301 | * Store cpu number in limit so that it can be loaded quickly | ||
302 | * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node) | ||
303 | */ | ||
304 | d = 0x0f40000000000ULL; | ||
305 | d |= cpu; | ||
306 | d |= (node & 0xf) << 12; | ||
307 | d |= (node >> 4) << 48; | ||
308 | |||
309 | write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); | ||
310 | } | 279 | } |
311 | 280 | static struct vm_operations_struct gate_vma_ops = { | |
312 | static void cpu_vsyscall_init(void *arg) | 281 | .name = gate_vma_name, |
282 | }; | ||
283 | static struct vm_area_struct gate_vma = { | ||
284 | .vm_start = VSYSCALL_ADDR, | ||
285 | .vm_end = VSYSCALL_ADDR + PAGE_SIZE, | ||
286 | .vm_page_prot = PAGE_READONLY_EXEC, | ||
287 | .vm_flags = VM_READ | VM_EXEC, | ||
288 | .vm_ops = &gate_vma_ops, | ||
289 | }; | ||
290 | |||
291 | struct vm_area_struct *get_gate_vma(struct mm_struct *mm) | ||
313 | { | 292 | { |
314 | /* preemption should be already off */ | 293 | #ifdef CONFIG_IA32_EMULATION |
315 | vsyscall_set_cpu(raw_smp_processor_id()); | 294 | if (!mm || mm->context.ia32_compat) |
295 | return NULL; | ||
296 | #endif | ||
297 | if (vsyscall_mode == NONE) | ||
298 | return NULL; | ||
299 | return &gate_vma; | ||
316 | } | 300 | } |
317 | 301 | ||
318 | static int | 302 | int in_gate_area(struct mm_struct *mm, unsigned long addr) |
319 | cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) | ||
320 | { | 303 | { |
321 | long cpu = (long)arg; | 304 | struct vm_area_struct *vma = get_gate_vma(mm); |
305 | |||
306 | if (!vma) | ||
307 | return 0; | ||
322 | 308 | ||
323 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) | 309 | return (addr >= vma->vm_start) && (addr < vma->vm_end); |
324 | smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); | 310 | } |
325 | 311 | ||
326 | return NOTIFY_DONE; | 312 | /* |
313 | * Use this when you have no reliable mm, typically from interrupt | ||
314 | * context. It is less reliable than using a task's mm and may give | ||
315 | * false positives. | ||
316 | */ | ||
317 | int in_gate_area_no_mm(unsigned long addr) | ||
318 | { | ||
319 | return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; | ||
327 | } | 320 | } |
328 | 321 | ||
329 | void __init map_vsyscall(void) | 322 | void __init map_vsyscall(void) |
@@ -331,24 +324,12 @@ void __init map_vsyscall(void) | |||
331 | extern char __vsyscall_page; | 324 | extern char __vsyscall_page; |
332 | unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); | 325 | unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); |
333 | 326 | ||
334 | __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, | 327 | if (vsyscall_mode != NONE) |
335 | vsyscall_mode == NATIVE | 328 | __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, |
336 | ? PAGE_KERNEL_VSYSCALL | 329 | vsyscall_mode == NATIVE |
337 | : PAGE_KERNEL_VVAR); | 330 | ? PAGE_KERNEL_VSYSCALL |
331 | : PAGE_KERNEL_VVAR); | ||
332 | |||
338 | BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != | 333 | BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != |
339 | (unsigned long)VSYSCALL_ADDR); | 334 | (unsigned long)VSYSCALL_ADDR); |
340 | } | 335 | } |
341 | |||
342 | static int __init vsyscall_init(void) | ||
343 | { | ||
344 | cpu_notifier_register_begin(); | ||
345 | |||
346 | on_each_cpu(cpu_vsyscall_init, NULL, 1); | ||
347 | /* notifier priority > KVM */ | ||
348 | __hotcpu_notifier(cpu_vsyscall_notifier, 30); | ||
349 | |||
350 | cpu_notifier_register_done(); | ||
351 | |||
352 | return 0; | ||
353 | } | ||
354 | __initcall(vsyscall_init); | ||