aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndi Kleen <ak@suse.de>2007-07-21 11:10:01 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-21 21:37:08 -0400
commit2aae950b21e4bc789d1fc6668faf67e8748300b7 (patch)
tree5777768cc2493695ec9f4000c14f3584b3db28fd
parenta586df067afe0580bb02b7a6312ca2afe49bba03 (diff)
x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu
This implements a new vDSO for x86-64. The concept is similar to the existing vDSOs on i386 and PPC. x86-64 has had static vsyscalls before, but these are not flexible enough anymore. A vDSO is an ELF shared library supplied by the kernel that is mapped into user address space. The vDSO mapping is randomized for each process for security reasons. Doing this was needed for clock_gettime, because clock_gettime always needs a syscall fallback and having one at a fixed address would have made buffer overflow exploits too easy to write. The vdso can be disabled with vdso=0. It currently includes a new gettimeofday implementation and optimized clock_gettime(). The gettimeofday implementation is slightly faster than the one in the old vsyscall. clock_gettime is significantly faster than the syscall for CLOCK_MONOTONIC and CLOCK_REALTIME. The new calls are generally faster than the old vsyscall. Advantages over the old x86-64 vsyscalls: - Extensible - Randomized - Cleaner - Easier to virtualize (the old static address range previously caused overhead, e.g. for Xen, because it has to create special page tables for it) Weak points: - glibc support still to be written. The VM interface is partly based on Ingo Molnar's i386 version. Includes compile fix from Joachim Deguara. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/kernel-parameters.txt2
-rw-r--r--arch/x86_64/Makefile3
-rw-r--r--arch/x86_64/ia32/ia32_binfmt.c1
-rw-r--r--arch/x86_64/kernel/time.c1
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S9
-rw-r--r--arch/x86_64/kernel/vsyscall.c22
-rw-r--r--arch/x86_64/mm/init.c9
-rw-r--r--arch/x86_64/vdso/Makefile49
-rw-r--r--arch/x86_64/vdso/vclock_gettime.c120
-rw-r--r--arch/x86_64/vdso/vdso-note.S12
-rw-r--r--arch/x86_64/vdso/vdso-start.S2
-rw-r--r--arch/x86_64/vdso/vdso.S2
-rw-r--r--arch/x86_64/vdso/vdso.lds.S77
-rw-r--r--arch/x86_64/vdso/vextern.h16
-rw-r--r--arch/x86_64/vdso/vgetcpu.c50
-rw-r--r--arch/x86_64/vdso/vma.c139
-rw-r--r--arch/x86_64/vdso/voffset.h1
-rw-r--r--arch/x86_64/vdso/vvar.c12
-rw-r--r--include/asm-x86_64/auxvec.h2
-rw-r--r--include/asm-x86_64/elf.h13
-rw-r--r--include/asm-x86_64/mmu.h1
-rw-r--r--include/asm-x86_64/vgtod.h29
-rw-r--r--include/asm-x86_64/vsyscall.h3
23 files changed, 554 insertions, 21 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 5fbe07706ae9..fb80e9ffea68 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1882,7 +1882,7 @@ and is between 256 and 4096 characters. It is defined in the file
1882 usbhid.mousepoll= 1882 usbhid.mousepoll=
1883 [USBHID] The interval which mice are to be polled at. 1883 [USBHID] The interval which mice are to be polled at.
1884 1884
1885 vdso= [IA-32,SH] 1885 vdso= [IA-32,SH,x86-64]
1886 vdso=2: enable compat VDSO (default with COMPAT_VDSO) 1886 vdso=2: enable compat VDSO (default with COMPAT_VDSO)
1887 vdso=1: enable VDSO (default) 1887 vdso=1: enable VDSO (default)
1888 vdso=0: disable VDSO mapping 1888 vdso=0: disable VDSO mapping
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 29617ae3926d..128561d3e876 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -76,7 +76,8 @@ head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kern
76libs-y += arch/x86_64/lib/ 76libs-y += arch/x86_64/lib/
77core-y += arch/x86_64/kernel/ \ 77core-y += arch/x86_64/kernel/ \
78 arch/x86_64/mm/ \ 78 arch/x86_64/mm/ \
79 arch/x86_64/crypto/ 79 arch/x86_64/crypto/ \
80 arch/x86_64/vdso/
80core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/ 81core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/
81drivers-$(CONFIG_PCI) += arch/x86_64/pci/ 82drivers-$(CONFIG_PCI) += arch/x86_64/pci/
82drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/ 83drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
index ed56a8806eab..b70f3e7cf06c 100644
--- a/arch/x86_64/ia32/ia32_binfmt.c
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -38,6 +38,7 @@
38 38
39int sysctl_vsyscall32 = 1; 39int sysctl_vsyscall32 = 1;
40 40
41#undef ARCH_DLINFO
41#define ARCH_DLINFO do { \ 42#define ARCH_DLINFO do { \
42 if (sysctl_vsyscall32) { \ 43 if (sysctl_vsyscall32) { \
43 NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \ 44 NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 4a0895bacf51..5405a69a1f72 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -44,6 +44,7 @@
44#include <asm/hpet.h> 44#include <asm/hpet.h>
45#include <asm/mpspec.h> 45#include <asm/mpspec.h>
46#include <asm/nmi.h> 46#include <asm/nmi.h>
47#include <asm/vgtod.h>
47 48
48static char *timename = NULL; 49static char *timename = NULL;
49 50
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 5c57ea4591c1..c2d5a840cb1a 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -93,6 +93,9 @@ SECTIONS
93 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) 93 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
94 { *(.vsyscall_gtod_data) } 94 { *(.vsyscall_gtod_data) }
95 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); 95 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
96 .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
97 { *(.vsyscall_clock) }
98 vsyscall_clock = VVIRT(.vsyscall_clock);
96 99
97 100
98 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) 101 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
@@ -189,6 +192,12 @@ SECTIONS
189 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } 192 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
190 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } 193 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
191 194
195/* vdso blob that is mapped into user space */
196 vdso_start = . ;
197 .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) }
198 . = ALIGN(4096);
199 vdso_end = .;
200
192#ifdef CONFIG_BLK_DEV_INITRD 201#ifdef CONFIG_BLK_DEV_INITRD
193 . = ALIGN(4096); 202 . = ALIGN(4096);
194 __initramfs_start = .; 203 __initramfs_start = .;
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 57660d58d500..06c34949bfdc 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -42,6 +42,7 @@
42#include <asm/segment.h> 42#include <asm/segment.h>
43#include <asm/desc.h> 43#include <asm/desc.h>
44#include <asm/topology.h> 44#include <asm/topology.h>
45#include <asm/vgtod.h>
45 46
46#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) 47#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
47#define __syscall_clobber "r11","rcx","memory" 48#define __syscall_clobber "r11","rcx","memory"
@@ -57,26 +58,9 @@
57 * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) 58 * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
58 * Try to keep this structure as small as possible to avoid cache line ping pongs 59 * Try to keep this structure as small as possible to avoid cache line ping pongs
59 */ 60 */
60struct vsyscall_gtod_data_t {
61 seqlock_t lock;
62
63 /* open coded 'struct timespec' */
64 time_t wall_time_sec;
65 u32 wall_time_nsec;
66
67 int sysctl_enabled;
68 struct timezone sys_tz;
69 struct { /* extract of a clocksource struct */
70 cycle_t (*vread)(void);
71 cycle_t cycle_last;
72 cycle_t mask;
73 u32 mult;
74 u32 shift;
75 } clock;
76};
77int __vgetcpu_mode __section_vgetcpu_mode; 61int __vgetcpu_mode __section_vgetcpu_mode;
78 62
79struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data = 63struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
80{ 64{
81 .lock = SEQLOCK_UNLOCKED, 65 .lock = SEQLOCK_UNLOCKED,
82 .sysctl_enabled = 1, 66 .sysctl_enabled = 1,
@@ -96,6 +80,8 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
96 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 80 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
97 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 81 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
98 vsyscall_gtod_data.sys_tz = sys_tz; 82 vsyscall_gtod_data.sys_tz = sys_tz;
83 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
84 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
99 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 85 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
100} 86}
101 87
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 9a0e98accf04..2f673225a51f 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -774,3 +774,12 @@ void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
774 return __alloc_bootmem_core(pgdat->bdata, size, 774 return __alloc_bootmem_core(pgdat->bdata, size,
775 SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); 775 SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
776} 776}
777
778const char *arch_vma_name(struct vm_area_struct *vma)
779{
780 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
781 return "[vdso]";
782 if (vma == &gate_vma)
783 return "[vsyscall]";
784 return NULL;
785}
diff --git a/arch/x86_64/vdso/Makefile b/arch/x86_64/vdso/Makefile
new file mode 100644
index 000000000000..faaa72fb250c
--- /dev/null
+++ b/arch/x86_64/vdso/Makefile
@@ -0,0 +1,49 @@
1#
2# x86-64 vDSO.
3#
4
5# files to link into the vdso
6# vdso-start.o has to be first
7vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
8
9# files to link into kernel
10obj-y := vma.o vdso.o vdso-syms.o
11
12vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
13
14$(obj)/vdso.o: $(obj)/vdso.so
15
16targets += vdso.so vdso.lds $(vobjs-y) vdso-syms.o
17
18# The DSO images are built using a special linker script.
19quiet_cmd_syscall = SYSCALL $@
20 cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \
21 -Wl,-T,$(filter-out FORCE,$^) -o $@
22
23export CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
24
25vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \
26 $(call ld-option, -Wl$(comma)--hash-style=sysv) \
27 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
28SYSCFLAGS_vdso.so = $(vdso-flags)
29
30$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
31
32$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE
33 $(call if_changed,syscall)
34
35CF := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64
36
37$(obj)/vclock_gettime.o: CFLAGS = $(CF)
38$(obj)/vgetcpu.o: CFLAGS = $(CF)
39
40# We also create a special relocatable object that should mirror the symbol
41# table and layout of the linked DSO. With ld -R we can then refer to
42# these symbols in the kernel code rather than hand-coded addresses.
43extra-y += vdso-syms.o
44$(obj)/built-in.o: $(obj)/vdso-syms.o
45$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o
46
47SYSCFLAGS_vdso-syms.o = -r -d
48$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE
49 $(call if_changed,syscall)
diff --git a/arch/x86_64/vdso/vclock_gettime.c b/arch/x86_64/vdso/vclock_gettime.c
new file mode 100644
index 000000000000..17f6a00de712
--- /dev/null
+++ b/arch/x86_64/vdso/vclock_gettime.c
@@ -0,0 +1,120 @@
1/*
2 * Copyright 2006 Andi Kleen, SUSE Labs.
3 * Subject to the GNU Public License, v.2
4 *
5 * Fast user context implementation of clock_gettime and gettimeofday.
6 *
7 * The code should have no internal unresolved relocations.
8 * Check with readelf after changing.
9 * Also alternative() doesn't work.
10 */
11
12#include <linux/kernel.h>
13#include <linux/posix-timers.h>
14#include <linux/time.h>
15#include <linux/string.h>
16#include <asm/vsyscall.h>
17#include <asm/vgtod.h>
18#include <asm/timex.h>
19#include <asm/hpet.h>
20#include <asm/unistd.h>
21#include <asm/io.h>
22#include <asm/vgtod.h>
23#include "vextern.h"
24
25#define gtod vdso_vsyscall_gtod_data
26
27static long vdso_fallback_gettime(long clock, struct timespec *ts)
28{
29 long ret;
30 asm("syscall" : "=a" (ret) :
31 "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
32 return ret;
33}
34
35static inline long vgetns(void)
36{
37 cycles_t (*vread)(void);
38 vread = gtod->clock.vread;
39 return ((vread() - gtod->clock.cycle_last) * gtod->clock.mult) >>
40 gtod->clock.shift;
41}
42
43static noinline int do_realtime(struct timespec *ts)
44{
45 unsigned long seq, ns;
46 do {
47 seq = read_seqbegin(&gtod->lock);
48 ts->tv_sec = gtod->wall_time_sec;
49 ts->tv_nsec = gtod->wall_time_nsec;
50 ns = vgetns();
51 } while (unlikely(read_seqretry(&gtod->lock, seq)));
52 timespec_add_ns(ts, ns);
53 return 0;
54}
55
56/* Copy of the version in kernel/time.c which we cannot directly access */
57static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
58{
59 while (nsec >= NSEC_PER_SEC) {
60 nsec -= NSEC_PER_SEC;
61 ++sec;
62 }
63 while (nsec < 0) {
64 nsec += NSEC_PER_SEC;
65 --sec;
66 }
67 ts->tv_sec = sec;
68 ts->tv_nsec = nsec;
69}
70
71static noinline int do_monotonic(struct timespec *ts)
72{
73 unsigned long seq, ns, secs;
74 do {
75 seq = read_seqbegin(&gtod->lock);
76 secs = gtod->wall_time_sec;
77 ns = gtod->wall_time_nsec + vgetns();
78 secs += gtod->wall_to_monotonic.tv_sec;
79 ns += gtod->wall_to_monotonic.tv_nsec;
80 } while (unlikely(read_seqretry(&gtod->lock, seq)));
81 vset_normalized_timespec(ts, secs, ns);
82 return 0;
83}
84
85int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
86{
87 if (likely(gtod->sysctl_enabled && gtod->clock.vread))
88 switch (clock) {
89 case CLOCK_REALTIME:
90 return do_realtime(ts);
91 case CLOCK_MONOTONIC:
92 return do_monotonic(ts);
93 }
94 return vdso_fallback_gettime(clock, ts);
95}
96int clock_gettime(clockid_t, struct timespec *)
97 __attribute__((weak, alias("__vdso_clock_gettime")));
98
99int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
100{
101 long ret;
102 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
103 BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
104 offsetof(struct timespec, tv_nsec) ||
105 sizeof(*tv) != sizeof(struct timespec));
106 do_realtime((struct timespec *)tv);
107 tv->tv_usec /= 1000;
108 if (unlikely(tz != NULL)) {
109 /* This relies on gcc inlining the memcpy. We'll notice
110 if it ever fails to do so. */
111 memcpy(tz, &gtod->sys_tz, sizeof(struct timezone));
112 }
113 return 0;
114 }
115 asm("syscall" : "=a" (ret) :
116 "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
117 return ret;
118}
119int gettimeofday(struct timeval *, struct timezone *)
120 __attribute__((weak, alias("__vdso_gettimeofday")));
diff --git a/arch/x86_64/vdso/vdso-note.S b/arch/x86_64/vdso/vdso-note.S
new file mode 100644
index 000000000000..79a071e4357e
--- /dev/null
+++ b/arch/x86_64/vdso/vdso-note.S
@@ -0,0 +1,12 @@
1/*
2 * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
3 * Here we can supply some information useful to userland.
4 */
5
6#include <linux/uts.h>
7#include <linux/version.h>
8#include <linux/elfnote.h>
9
10ELFNOTE_START(Linux, 0, "a")
11 .long LINUX_VERSION_CODE
12ELFNOTE_END
diff --git a/arch/x86_64/vdso/vdso-start.S b/arch/x86_64/vdso/vdso-start.S
new file mode 100644
index 000000000000..2dc2cdb84d67
--- /dev/null
+++ b/arch/x86_64/vdso/vdso-start.S
@@ -0,0 +1,2 @@
1 .globl vdso_kernel_start
2vdso_kernel_start:
diff --git a/arch/x86_64/vdso/vdso.S b/arch/x86_64/vdso/vdso.S
new file mode 100644
index 000000000000..92e80c1972a7
--- /dev/null
+++ b/arch/x86_64/vdso/vdso.S
@@ -0,0 +1,2 @@
1 .section ".vdso","a"
2 .incbin "arch/x86_64/vdso/vdso.so"
diff --git a/arch/x86_64/vdso/vdso.lds.S b/arch/x86_64/vdso/vdso.lds.S
new file mode 100644
index 000000000000..b9a60e665d08
--- /dev/null
+++ b/arch/x86_64/vdso/vdso.lds.S
@@ -0,0 +1,77 @@
1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address, and with only one read-only
4 * segment (that fits in one page). This script controls its layout.
5 */
6#include <asm/asm-offsets.h>
7#include "voffset.h"
8
9#define VDSO_PRELINK 0xffffffffff700000
10
11SECTIONS
12{
13 . = VDSO_PRELINK + SIZEOF_HEADERS;
14
15 .hash : { *(.hash) } :text
16 .gnu.hash : { *(.gnu.hash) }
17 .dynsym : { *(.dynsym) }
18 .dynstr : { *(.dynstr) }
19 .gnu.version : { *(.gnu.version) }
20 .gnu.version_d : { *(.gnu.version_d) }
21 .gnu.version_r : { *(.gnu.version_r) }
22
23 /* This linker script is used both with -r and with -shared.
24 For the layouts to match, we need to skip more than enough
25 space for the dynamic symbol table et al. If this amount
26 is insufficient, ld -shared will barf. Just increase it here. */
27 . = VDSO_PRELINK + VDSO_TEXT_OFFSET;
28
29 .text : { *(.text) } :text
30 .text.ptr : { *(.text.ptr) } :text
31 . = VDSO_PRELINK + 0x900;
32 .data : { *(.data) } :text
33 .bss : { *(.bss) } :text
34
35 .altinstructions : { *(.altinstructions) } :text
36 .altinstr_replacement : { *(.altinstr_replacement) } :text
37
38 .note : { *(.note.*) } :text :note
39 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
40 .eh_frame : { KEEP (*(.eh_frame)) } :text
41 .dynamic : { *(.dynamic) } :text :dynamic
42 .useless : {
43 *(.got.plt) *(.got)
44 *(.gnu.linkonce.d.*)
45 *(.dynbss)
46 *(.gnu.linkonce.b.*)
47 } :text
48}
49
50/*
51 * We must supply the ELF program headers explicitly to get just one
52 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
53 */
54PHDRS
55{
56 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
57 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
58 note PT_NOTE FLAGS(4); /* PF_R */
59 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
60}
61
62/*
63 * This controls what symbols we export from the DSO.
64 */
65VERSION
66{
67 LINUX_2.6 {
68 global:
69 clock_gettime;
70 __vdso_clock_gettime;
71 gettimeofday;
72 __vdso_gettimeofday;
73 getcpu;
74 __vdso_getcpu;
75 local: *;
76 };
77}
diff --git a/arch/x86_64/vdso/vextern.h b/arch/x86_64/vdso/vextern.h
new file mode 100644
index 000000000000..1683ba2ae3e8
--- /dev/null
+++ b/arch/x86_64/vdso/vextern.h
@@ -0,0 +1,16 @@
1#ifndef VEXTERN
2#include <asm/vsyscall.h>
3#define VEXTERN(x) \
4 extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden")));
5#endif
6
7#define VMAGIC 0xfeedbabeabcdefabUL
8
9/* Any kernel variables used in the vDSO must be exported in the main
10 kernel's vmlinux.lds.S/vsyscall.h/proper __section and
11 put into vextern.h and be referenced as a pointer with vdso prefix.
12 The main kernel later fills in the values. */
13
14VEXTERN(jiffies)
15VEXTERN(vgetcpu_mode)
16VEXTERN(vsyscall_gtod_data)
diff --git a/arch/x86_64/vdso/vgetcpu.c b/arch/x86_64/vdso/vgetcpu.c
new file mode 100644
index 000000000000..91f6e85d0fc2
--- /dev/null
+++ b/arch/x86_64/vdso/vgetcpu.c
@@ -0,0 +1,50 @@
1/*
2 * Copyright 2006 Andi Kleen, SUSE Labs.
3 * Subject to the GNU Public License, v.2
4 *
5 * Fast user context implementation of getcpu()
6 */
7
8#include <linux/kernel.h>
9#include <linux/getcpu.h>
10#include <linux/jiffies.h>
11#include <linux/time.h>
12#include <asm/vsyscall.h>
13#include <asm/vgtod.h>
14#include "vextern.h"
15
16long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
17{
18 unsigned int dummy, p;
19 unsigned long j = 0;
20
21 /* Fast cache - only recompute value once per jiffies and avoid
22 relatively costly rdtscp/cpuid otherwise.
23 This works because the scheduler usually keeps the process
24 on the same CPU and this syscall doesn't guarantee its
25 results anyways.
26 We do this here because otherwise user space would do it on
27 its own in a likely inferior way (no access to jiffies).
28 If you don't like it pass NULL. */
29 if (tcache && tcache->blob[0] == (j = *vdso_jiffies)) {
30 p = tcache->blob[1];
31 } else if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) {
32 /* Load per CPU data from RDTSCP */
33 rdtscp(dummy, dummy, p);
34 } else {
35 /* Load per CPU data from GDT */
36 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
37 }
38 if (tcache) {
39 tcache->blob[0] = j;
40 tcache->blob[1] = p;
41 }
42 if (cpu)
43 *cpu = p & 0xfff;
44 if (node)
45 *node = p >> 12;
46 return 0;
47}
48
49long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
50 __attribute__((weak, alias("__vdso_getcpu")));
diff --git a/arch/x86_64/vdso/vma.c b/arch/x86_64/vdso/vma.c
new file mode 100644
index 000000000000..d4cb83a6c066
--- /dev/null
+++ b/arch/x86_64/vdso/vma.c
@@ -0,0 +1,139 @@
1/*
2 * Set up the VMAs to tell the VM about the vDSO.
3 * Copyright 2007 Andi Kleen, SUSE Labs.
4 * Subject to the GPL, v.2
5 */
6#include <linux/mm.h>
7#include <linux/sched.h>
8#include <linux/init.h>
9#include <linux/random.h>
10#include <asm/vsyscall.h>
11#include <asm/vgtod.h>
12#include <asm/proto.h>
13#include "voffset.h"
14
15int vdso_enabled = 1;
16
17#define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x;
18#include "vextern.h"
19#undef VEXTERN
20
21extern char vdso_kernel_start[], vdso_start[], vdso_end[];
22extern unsigned short vdso_sync_cpuid;
23
24struct page **vdso_pages;
25
26static inline void *var_ref(void *vbase, char *var, char *name)
27{
28 unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET;
29 void *p = vbase + offset;
30 if (*(void **)p != (void *)VMAGIC) {
31 printk("VDSO: variable %s broken\n", name);
32 vdso_enabled = 0;
33 }
34 return p;
35}
36
37static int __init init_vdso_vars(void)
38{
39 int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
40 int i;
41 char *vbase;
42
43 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
44 if (!vdso_pages)
45 goto oom;
46 for (i = 0; i < npages; i++) {
47 struct page *p;
48 p = alloc_page(GFP_KERNEL);
49 if (!p)
50 goto oom;
51 vdso_pages[i] = p;
52 copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
53 }
54
55 vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL);
56 if (!vbase)
57 goto oom;
58
59 if (memcmp(vbase, "\177ELF", 4)) {
60 printk("VDSO: I'm broken; not ELF\n");
61 vdso_enabled = 0;
62 }
63
64#define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x)
65#define VEXTERN(x) \
66 V(vdso_ ## x) = &__ ## x;
67#include "vextern.h"
68#undef VEXTERN
69 return 0;
70
71 oom:
72 printk("Cannot allocate vdso\n");
73 vdso_enabled = 0;
74 return -ENOMEM;
75}
76__initcall(init_vdso_vars);
77
78struct linux_binprm;
79
80/* Put the vdso above the (randomized) stack with another randomized offset.
81 This way there is no hole in the middle of address space.
82 To save memory make sure it is still in the same PTE as the stack top.
83 This doesn't give that many random bits */
84static unsigned long vdso_addr(unsigned long start, unsigned len)
85{
86 unsigned long addr, end;
87 unsigned offset;
88 end = (start + PMD_SIZE - 1) & PMD_MASK;
89 if (end >= TASK_SIZE64)
90 end = TASK_SIZE64;
91 end -= len;
92 /* This loses some more bits than a modulo, but is cheaper */
93 offset = get_random_int() & (PTRS_PER_PTE - 1);
94 addr = start + (offset << PAGE_SHIFT);
95 if (addr >= end)
96 addr = end;
97 return addr;
98}
99
100/* Setup a VMA at program startup for the vsyscall page.
101 Not called for compat tasks */
102int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
103{
104 struct mm_struct *mm = current->mm;
105 unsigned long addr;
106 int ret;
107 unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
108
109 if (!vdso_enabled)
110 return 0;
111
112 down_write(&mm->mmap_sem);
113 addr = vdso_addr(mm->start_stack, len);
114 addr = get_unmapped_area(NULL, addr, len, 0, 0);
115 if (IS_ERR_VALUE(addr)) {
116 ret = addr;
117 goto up_fail;
118 }
119
120 ret = install_special_mapping(mm, addr, len,
121 VM_READ|VM_EXEC|
122 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
123 VM_ALWAYSDUMP,
124 vdso_pages);
125 if (ret)
126 goto up_fail;
127
128 current->mm->context.vdso = (void *)addr;
129up_fail:
130 up_write(&mm->mmap_sem);
131 return ret;
132}
133
134static __init int vdso_setup(char *s)
135{
136 vdso_enabled = simple_strtoul(s, NULL, 0);
137 return 0;
138}
139__setup("vdso=", vdso_setup);
diff --git a/arch/x86_64/vdso/voffset.h b/arch/x86_64/vdso/voffset.h
new file mode 100644
index 000000000000..5304204911f2
--- /dev/null
+++ b/arch/x86_64/vdso/voffset.h
@@ -0,0 +1 @@
#define VDSO_TEXT_OFFSET 0x500
diff --git a/arch/x86_64/vdso/vvar.c b/arch/x86_64/vdso/vvar.c
new file mode 100644
index 000000000000..6fc22219a472
--- /dev/null
+++ b/arch/x86_64/vdso/vvar.c
@@ -0,0 +1,12 @@
1/* Define pointer to external vDSO variables.
2 These are part of the vDSO. The kernel fills in the real addresses
3 at boot time. This is done because when the vdso is linked the
4 kernel isn't yet and we don't know the final addresses. */
5#include <linux/kernel.h>
6#include <linux/time.h>
7#include <asm/vsyscall.h>
8#include <asm/timex.h>
9#include <asm/vgtod.h>
10
11#define VEXTERN(x) typeof (__ ## x) *vdso_ ## x = (void *)VMAGIC;
12#include "vextern.h"
diff --git a/include/asm-x86_64/auxvec.h b/include/asm-x86_64/auxvec.h
index 2403c4cfced2..1d5ab0d03950 100644
--- a/include/asm-x86_64/auxvec.h
+++ b/include/asm-x86_64/auxvec.h
@@ -1,4 +1,6 @@
1#ifndef __ASM_X86_64_AUXVEC_H 1#ifndef __ASM_X86_64_AUXVEC_H
2#define __ASM_X86_64_AUXVEC_H 2#define __ASM_X86_64_AUXVEC_H
3 3
4#define AT_SYSINFO_EHDR 33
5
4#endif 6#endif
diff --git a/include/asm-x86_64/elf.h b/include/asm-x86_64/elf.h
index 6d24ea7c4d9d..b4fbe47f6ccd 100644
--- a/include/asm-x86_64/elf.h
+++ b/include/asm-x86_64/elf.h
@@ -162,6 +162,19 @@ extern int dump_task_fpu (struct task_struct *, elf_fpregset_t *);
162/* 1GB for 64bit, 8MB for 32bit */ 162/* 1GB for 64bit, 8MB for 32bit */
163#define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff) 163#define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff)
164 164
165
166#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
167struct linux_binprm;
168extern int arch_setup_additional_pages(struct linux_binprm *bprm,
169 int executable_stack);
170
171extern int vdso_enabled;
172
173#define ARCH_DLINFO \
174do if (vdso_enabled) { \
175 NEW_AUX_ENT(AT_SYSINFO_EHDR,(unsigned long)current->mm->context.vdso);\
176} while (0)
177
165#endif 178#endif
166 179
167#endif 180#endif
diff --git a/include/asm-x86_64/mmu.h b/include/asm-x86_64/mmu.h
index 5dc6ed79859a..d2cd4a9d984d 100644
--- a/include/asm-x86_64/mmu.h
+++ b/include/asm-x86_64/mmu.h
@@ -15,6 +15,7 @@ typedef struct {
15 rwlock_t ldtlock; 15 rwlock_t ldtlock;
16 int size; 16 int size;
17 struct semaphore sem; 17 struct semaphore sem;
18 void *vdso;
18} mm_context_t; 19} mm_context_t;
19 20
20#endif 21#endif
diff --git a/include/asm-x86_64/vgtod.h b/include/asm-x86_64/vgtod.h
new file mode 100644
index 000000000000..3301f0929342
--- /dev/null
+++ b/include/asm-x86_64/vgtod.h
@@ -0,0 +1,29 @@
1#ifndef _ASM_VGTOD_H
2#define _ASM_VGTOD_H 1
3
4#include <asm/vsyscall.h>
5#include <linux/clocksource.h>
6
7struct vsyscall_gtod_data {
8 seqlock_t lock;
9
10 /* open coded 'struct timespec' */
11 time_t wall_time_sec;
12 u32 wall_time_nsec;
13
14 int sysctl_enabled;
15 struct timezone sys_tz;
16 struct { /* extract of a clocksource struct */
17 cycle_t (*vread)(void);
18 cycle_t cycle_last;
19 cycle_t mask;
20 u32 mult;
21 u32 shift;
22 } clock;
23 struct timespec wall_to_monotonic;
24};
25extern struct vsyscall_gtod_data __vsyscall_gtod_data
26__section_vsyscall_gtod_data;
27extern struct vsyscall_gtod_data vsyscall_gtod_data;
28
29#endif
diff --git a/include/asm-x86_64/vsyscall.h b/include/asm-x86_64/vsyscall.h
index 82b4afe65c91..3b8ceb4af2cf 100644
--- a/include/asm-x86_64/vsyscall.h
+++ b/include/asm-x86_64/vsyscall.h
@@ -22,6 +22,8 @@ enum vsyscall_num {
22/* Definitions for CONFIG_GENERIC_TIME definitions */ 22/* Definitions for CONFIG_GENERIC_TIME definitions */
23#define __section_vsyscall_gtod_data __attribute__ \ 23#define __section_vsyscall_gtod_data __attribute__ \
24 ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) 24 ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
25#define __section_vsyscall_clock __attribute__ \
26 ((unused, __section__ (".vsyscall_clock"),aligned(16)))
25#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) 27#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn")))
26 28
27#define VGETCPU_RDTSCP 1 29#define VGETCPU_RDTSCP 1
@@ -36,7 +38,6 @@ extern volatile unsigned long __jiffies;
36/* kernel space (writeable) */ 38/* kernel space (writeable) */
37extern int vgetcpu_mode; 39extern int vgetcpu_mode;
38extern struct timezone sys_tz; 40extern struct timezone sys_tz;
39extern struct vsyscall_gtod_data_t vsyscall_gtod_data;
40 41
41#endif /* __KERNEL__ */ 42#endif /* __KERNEL__ */
42 43