author		Linus Torvalds <torvalds@linux-foundation.org>	2011-07-22 20:05:15 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-07-22 20:05:15 -0400
commit		8e204874db000928e37199c2db82b7eb8966cc3c (patch)
tree		eae66035cb761c3c5a79e98b92280b5156bc01ef /Documentation
parent		3e0b8df79ddb8955d2cce5e858972a9cfe763384 (diff)
parent		aafade242ff24fac3aabf61c7861dfa44a3c2445 (diff)
Merge branch 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  x86-64, vdso: Do not allocate memory for the vDSO
  clocksource: Change __ARCH_HAS_CLOCKSOURCE_DATA to a CONFIG option
  x86, vdso: Drop now wrong comment
  Document the vDSO and add a reference parser
  ia64: Replace clocksource.fsys_mmio with generic arch data
  x86-64: Move vread_tsc and vread_hpet into the vDSO
  clocksource: Replace vread with generic arch data
  x86-64: Add --no-undefined to vDSO build
  x86-64: Allow alternative patching in the vDSO
  x86: Make alternative instruction pointers relative
  x86-64: Improve vsyscall emulation CS and RIP handling
  x86-64: Emulate legacy vsyscalls
  x86-64: Fill unused parts of the vsyscall page with 0xcc
  x86-64: Remove vsyscall number 3 (venosys)
  x86-64: Map the HPET NX
  x86-64: Remove kernel.vsyscall64 sysctl
  x86-64: Give vvars their own page
  x86-64: Document some of entry_64.S
  x86-64: Fix alignment of jiffies variable
Diffstat (limited to 'Documentation')
-rw-r--r--	Documentation/ABI/stable/vdso	27
-rw-r--r--	Documentation/vDSO/parse_vdso.c	256
-rw-r--r--	Documentation/vDSO/vdso_test.c	111
-rw-r--r--	Documentation/x86/entry_64.txt	98
4 files changed, 492 insertions, 0 deletions
diff --git a/Documentation/ABI/stable/vdso b/Documentation/ABI/stable/vdso
new file mode 100644
index 000000000000..8a1cbb594497
--- /dev/null
+++ b/Documentation/ABI/stable/vdso
@@ -0,0 +1,27 @@
On some architectures, when the kernel loads any userspace program it
maps an ELF DSO into that program's address space. This DSO is called
the vDSO and it often contains useful and highly-optimized alternatives
to real syscalls.

These functions are called just like ordinary C functions according to
your platform's ABI. Call them from a sensible context. (For example,
if you set CS on x86 to something strange, the vDSO functions are
within their rights to crash.) In addition, if you pass a bad
pointer to a vDSO function, you might get SIGSEGV instead of -EFAULT.

To find the DSO, parse the auxiliary vector passed to the program's
entry point. The AT_SYSINFO_EHDR entry will point to the vDSO.

The vDSO uses symbol versioning; whenever you request a symbol from the
vDSO, specify the version you are expecting.

Programs that dynamically link to glibc will use the vDSO automatically.
Otherwise, you can use the reference parser in Documentation/vDSO/parse_vdso.c.

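For example, the first step - finding the vDSO in the auxiliary vector -
can look like this (a minimal sketch for a 64-bit platform; find_vdso is
a hypothetical helper, and the walk relies on auxv following the
environment block at process startup):

	#include <elf.h>

	void *find_vdso(char **envp)
	{
		Elf64_auxv_t *av;

		while (*envp)		/* skip past the environment */
			envp++;
		for (av = (Elf64_auxv_t *)(envp + 1);
		     av->a_type != AT_NULL; av++)
			if (av->a_type == AT_SYSINFO_EHDR)
				return (void *)av->a_un.a_val;
		return 0;
	}
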
Unless otherwise noted, the set of symbols with any given version and the
ABI of those symbols is considered stable. It may vary across architectures,
though.

(As of this writing, this ABI documentation has been confirmed for x86_64.
 The maintainers of the other vDSO-using architectures should confirm
 that it is correct for their architecture.)
\ No newline at end of file
diff --git a/Documentation/vDSO/parse_vdso.c b/Documentation/vDSO/parse_vdso.c
new file mode 100644
index 000000000000..85870208edcf
--- /dev/null
+++ b/Documentation/vDSO/parse_vdso.c
@@ -0,0 +1,256 @@
/*
 * parse_vdso.c: Linux reference vDSO parser
 * Written by Andrew Lutomirski, 2011.
 *
 * This code is meant to be linked in to various programs that run on Linux.
 * As such, it is available with as few restrictions as possible. This file
 * is licensed under the Creative Commons Zero License, version 1.0,
 * available at http://creativecommons.org/publicdomain/zero/1.0/legalcode
 *
 * The vDSO is a regular ELF DSO that the kernel maps into user space when
 * it starts a program. It works equally well in statically and dynamically
 * linked binaries.
 *
 * This code is tested on x86_64. In principle it should work on any 64-bit
 * architecture that has a vDSO.
 */

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <elf.h>

/*
 * To use this vDSO parser, first call one of the vdso_init_* functions.
 * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR
 * to vdso_init_from_sysinfo_ehdr. Otherwise pass auxv to vdso_init_from_auxv.
 * Then call vdso_sym for each symbol you want. For example, to look up
 * gettimeofday on x86_64, use:
 *
 *	<some pointer> = vdso_sym("LINUX_2.6", "gettimeofday");
 * or
 *	<some pointer> = vdso_sym("LINUX_2.6", "__vdso_gettimeofday");
 *
 * vdso_sym will return 0 if the symbol doesn't exist or if the init function
 * failed or was not called. vdso_sym is a little slow, so its return value
 * should be cached.
 *
 * vdso_sym is threadsafe; the init functions are not.
 *
 * These are the prototypes:
 */
extern void vdso_init_from_auxv(void *auxv);
extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
extern void *vdso_sym(const char *version, const char *name);


/* And here's the code. */

#ifndef __x86_64__
# error Not yet ported to non-x86_64 architectures
#endif

static struct vdso_info
{
	bool valid;

	/* Load information */
	uintptr_t load_addr;
	uintptr_t load_offset;  /* load_addr - recorded vaddr */

	/* Symbol table */
	Elf64_Sym *symtab;
	const char *symstrings;
	Elf64_Word *bucket, *chain;
	Elf64_Word nbucket, nchain;

	/* Version table */
	Elf64_Versym *versym;
	Elf64_Verdef *verdef;
} vdso_info;

/* Straight from the ELF specification. */
static unsigned long elf_hash(const unsigned char *name)
{
	unsigned long h = 0, g;
	while (*name)
	{
		h = (h << 4) + *name++;
		g = h & 0xf0000000;
		if (g)
			h ^= g >> 24;
		h &= ~g;
	}
	return h;
}
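
/*
 * For example, the version string "LINUX_2.6" used by the x86_64 vDSO
 * hashes to 0x3ae75f6 under this function.
 */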

void vdso_init_from_sysinfo_ehdr(uintptr_t base)
{
	size_t i;
	bool found_vaddr = false;

	vdso_info.valid = false;

	vdso_info.load_addr = base;

	Elf64_Ehdr *hdr = (Elf64_Ehdr*)base;
	Elf64_Phdr *pt = (Elf64_Phdr*)(vdso_info.load_addr + hdr->e_phoff);
	Elf64_Dyn *dyn = 0;

	/*
	 * We need two things from the segment table: the load offset
	 * and the dynamic table.
	 */
	for (i = 0; i < hdr->e_phnum; i++)
	{
		if (pt[i].p_type == PT_LOAD && !found_vaddr) {
			found_vaddr = true;
			vdso_info.load_offset = base
				+ (uintptr_t)pt[i].p_offset
				- (uintptr_t)pt[i].p_vaddr;
		} else if (pt[i].p_type == PT_DYNAMIC) {
			dyn = (Elf64_Dyn*)(base + pt[i].p_offset);
		}
	}

	if (!found_vaddr || !dyn)
		return;  /* Failed */

	/*
	 * Fish out the useful bits of the dynamic table.
	 */
	Elf64_Word *hash = 0;
	vdso_info.symstrings = 0;
	vdso_info.symtab = 0;
	vdso_info.versym = 0;
	vdso_info.verdef = 0;
	for (i = 0; dyn[i].d_tag != DT_NULL; i++) {
		switch (dyn[i].d_tag) {
		case DT_STRTAB:
			vdso_info.symstrings = (const char *)
				((uintptr_t)dyn[i].d_un.d_ptr
				 + vdso_info.load_offset);
			break;
		case DT_SYMTAB:
			vdso_info.symtab = (Elf64_Sym *)
				((uintptr_t)dyn[i].d_un.d_ptr
				 + vdso_info.load_offset);
			break;
		case DT_HASH:
			hash = (Elf64_Word *)
				((uintptr_t)dyn[i].d_un.d_ptr
				 + vdso_info.load_offset);
			break;
		case DT_VERSYM:
			vdso_info.versym = (Elf64_Versym *)
				((uintptr_t)dyn[i].d_un.d_ptr
				 + vdso_info.load_offset);
			break;
		case DT_VERDEF:
			vdso_info.verdef = (Elf64_Verdef *)
				((uintptr_t)dyn[i].d_un.d_ptr
				 + vdso_info.load_offset);
			break;
		}
	}
	if (!vdso_info.symstrings || !vdso_info.symtab || !hash)
		return;  /* Failed */

	if (!vdso_info.verdef)
		vdso_info.versym = 0;

	/* Parse the hash table header. */
	vdso_info.nbucket = hash[0];
	vdso_info.nchain = hash[1];
	vdso_info.bucket = &hash[2];
	vdso_info.chain = &hash[vdso_info.nbucket + 2];

	/* That's all we need. */
	vdso_info.valid = true;
}

static bool vdso_match_version(Elf64_Versym ver,
			       const char *name, Elf64_Word hash)
{
	/*
	 * This is a helper function to check if the version indexed by
	 * ver matches name (which hashes to hash).
	 *
	 * The version definition table is a mess, and I don't know how
	 * to do this in better than linear time without allocating memory
	 * to build an index. I also don't know why the table has
	 * variable size entries in the first place.
	 *
	 * For added fun, I can't find a comprehensible specification of how
	 * to parse all the weird flags in the table.
	 *
	 * So I just parse the whole table every time.
	 */

	/* First step: find the version definition */
	ver &= 0x7fff;  /* Apparently bit 15 means "hidden" */
	Elf64_Verdef *def = vdso_info.verdef;
	while (true) {
		if ((def->vd_flags & VER_FLG_BASE) == 0
		    && (def->vd_ndx & 0x7fff) == ver)
			break;

		if (def->vd_next == 0)
			return false;  /* No definition. */

		def = (Elf64_Verdef *)((char *)def + def->vd_next);
	}

	/* Now figure out whether it matches. */
	Elf64_Verdaux *aux = (Elf64_Verdaux*)((char *)def + def->vd_aux);
	return def->vd_hash == hash
	       && !strcmp(name, vdso_info.symstrings + aux->vda_name);
}

void *vdso_sym(const char *version, const char *name)
{
	unsigned long ver_hash;
	if (!vdso_info.valid)
		return 0;

	ver_hash = elf_hash((const unsigned char *)version);
	Elf64_Word chain = vdso_info.bucket[
		elf_hash((const unsigned char *)name) % vdso_info.nbucket];

	for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) {
		Elf64_Sym *sym = &vdso_info.symtab[chain];

		/* Check for a defined global or weak function w/ right name. */
		if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC)
			continue;
		if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL &&
		    ELF64_ST_BIND(sym->st_info) != STB_WEAK)
			continue;
		if (sym->st_shndx == SHN_UNDEF)
			continue;
		if (strcmp(name, vdso_info.symstrings + sym->st_name))
			continue;

		/* Check symbol version. */
		if (vdso_info.versym
		    && !vdso_match_version(vdso_info.versym[chain],
					   version, ver_hash))
			continue;

		return (void *)(vdso_info.load_offset + sym->st_value);
	}

	return 0;
}

void vdso_init_from_auxv(void *auxv)
{
	Elf64_auxv_t *elf_auxv = auxv;
	for (int i = 0; elf_auxv[i].a_type != AT_NULL; i++)
	{
		if (elf_auxv[i].a_type == AT_SYSINFO_EHDR) {
			vdso_init_from_sysinfo_ehdr(elf_auxv[i].a_un.a_val);
			return;
		}
	}

	vdso_info.valid = false;
}
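
/*
 * A short usage sketch (assuming the AT_SYSINFO_EHDR value, here `ehdr`,
 * was pulled from auxv as described in Documentation/ABI/stable/vdso;
 * cache the returned pointer, since vdso_sym is a little slow):
 *
 *	vdso_init_from_sysinfo_ehdr(ehdr);
 *
 *	typedef long (*gtod_t)(struct timeval *, struct timezone *);
 *	gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday");
 */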
diff --git a/Documentation/vDSO/vdso_test.c b/Documentation/vDSO/vdso_test.c
new file mode 100644
index 000000000000..fff633432dff
--- /dev/null
+++ b/Documentation/vDSO/vdso_test.c
@@ -0,0 +1,111 @@
/*
 * vdso_test.c: Sample code to test parse_vdso.c on x86_64
 * Copyright (c) 2011 Andy Lutomirski
 * Subject to the GNU General Public License, version 2
 *
 * You can amuse yourself by compiling with:
 * gcc -std=gnu99 -nostdlib
 *     -Os -fno-asynchronous-unwind-tables -flto
 *     vdso_test.c parse_vdso.c -o vdso_test
 * to generate a small binary with no dependencies at all.
 */

#include <sys/syscall.h>
#include <sys/time.h>
#include <unistd.h>
#include <stdint.h>

extern void *vdso_sym(const char *version, const char *name);
extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
extern void vdso_init_from_auxv(void *auxv);

/* We need a libc function... */
int strcmp(const char *a, const char *b)
{
	/* Returns <0, 0, or >0, as the C standard requires. */
	while (*a && *a == *b) {
		a++;
		b++;
	}

	return (unsigned char)*a - (unsigned char)*b;
}

/* ...and two syscalls. This is x86_64-specific. */
static inline long linux_write(int fd, const void *data, size_t len)
{
	long ret;
	asm volatile ("syscall" : "=a" (ret) : "a" (__NR_write),
		      "D" (fd), "S" (data), "d" (len) :
		      "cc", "memory", "rcx",
		      "r8", "r9", "r10", "r11" );
	return ret;
}

static inline void linux_exit(int code)
{
	asm volatile ("syscall" : : "a" (__NR_exit), "D" (code));
}

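/*
 * Writes n in decimal, least significant digit at lastdig and earlier
 * digits growing leftward; e.g. to_base10(buf + 5, 42) stores '4' at
 * buf[4] and '2' at buf[5]. The caller's template supplies any leading
 * padding (n == 0 writes nothing).
 */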
void to_base10(char *lastdig, uint64_t n)
{
	while (n) {
		*lastdig = (n % 10) + '0';
		n /= 10;
		lastdig--;
	}
}

__attribute__((externally_visible)) void c_main(void **stack)
{
	/* Parse the stack */
	long argc = (long)*stack;
	stack += argc + 2;	/* skip argc, the argv entries, and the NULL */

	/* Now we're pointing at the environment. Skip it. */
	while (*stack)
		stack++;
	stack++;

	/* Now we're pointing at auxv. Initialize the vDSO parser. */
	vdso_init_from_auxv((void *)stack);

	/* Find gettimeofday. */
	typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
	gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday");

	if (!gtod)
		linux_exit(1);

	struct timeval tv;
	long ret = gtod(&tv, 0);

	if (ret == 0) {
		char buf[] = "The time is                     .000000\n";
		to_base10(buf + 31, tv.tv_sec);
		to_base10(buf + 38, tv.tv_usec);
		linux_write(1, buf, sizeof(buf) - 1);
	} else {
		linux_exit(ret);
	}

	linux_exit(0);
}

/*
 * This is the real entry point. It passes the initial stack into
 * the C entry point.
 */
asm (
	".text\n"
	".global _start\n"
	".type _start,@function\n"
	"_start:\n\t"
	"mov %rsp,%rdi\n\t"
	"jmp c_main"
	);
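
If linking against libc is acceptable, the same lookup fits in a few
lines. A sketch, assuming glibc 2.16 or newer for getauxval(3) (newer
than this patch; everything else comes from parse_vdso.c above, and you
build the two files together, e.g. gcc -std=gnu99 test.c parse_vdso.c):

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/auxv.h>
	#include <sys/time.h>

	extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
	extern void *vdso_sym(const char *version, const char *name);

	int main(void)
	{
		vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR));

		typedef long (*gtod_t)(struct timeval *, struct timezone *);
		gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6",
					       "__vdso_gettimeofday");
		if (!gtod)
			return 1;

		struct timeval tv;
		if (gtod(&tv, 0) != 0)
			return 1;
		printf("The time is %ld.%06ld\n",
		       (long)tv.tv_sec, (long)tv.tv_usec);
		return 0;
	}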
diff --git a/Documentation/x86/entry_64.txt b/Documentation/x86/entry_64.txt
new file mode 100644
index 000000000000..7869f14d055c
--- /dev/null
+++ b/Documentation/x86/entry_64.txt
@@ -0,0 +1,98 @@
This file documents some of the kernel entries in
arch/x86/kernel/entry_64.S. A lot of this explanation is adapted from
an email from Ingo Molnar:

http://lkml.kernel.org/r/20110529191055.GC9835@elte.hu

The x86 architecture has quite a few different ways to jump into
kernel code. Most of these entry points are registered in
arch/x86/kernel/traps.c and implemented in arch/x86/kernel/entry_64.S
and arch/x86/ia32/ia32entry.S.

The IDT vector assignments are listed in arch/x86/include/asm/irq_vectors.h.

Some of these entries are:

 - system_call: syscall instruction from 64-bit code.

 - ia32_syscall: int 0x80 from 32-bit or 64-bit code; compat syscall
   either way.

 - ia32_cstar_target, ia32_sysenter_target: syscall and sysenter from
   32-bit code.

 - interrupt: An array of entries. Every IDT vector that doesn't
   explicitly point somewhere else gets set to the corresponding
   value in the interrupt array. These point to a whole array of
   magically-generated functions that make their way to do_IRQ with
   the interrupt number as a parameter.

 - emulate_vsyscall: int 0xcc, a special non-ABI entry used by
   vsyscall emulation.

 - APIC interrupts: Various special-purpose interrupts for things
   like TLB shootdown.

 - Architecturally-defined exceptions like divide_error.

There are a few complexities here. The different x86-64 entries
have different calling conventions. The syscall and sysenter
instructions have their own peculiar calling conventions. Some of
the IDT entries push an error code onto the stack; others don't.
IDT entries using the IST alternative stack mechanism need their own
magic to get the stack frames right. (You can find some
documentation in the AMD APM, Volume 2, Chapter 8 and the Intel SDM,
Volume 3, Chapter 6.)

Dealing with the swapgs instruction is especially tricky. Swapgs
toggles whether gs is the kernel gs or the user gs. The swapgs
instruction is rather fragile: it must nest perfectly, to exactly one
level of depth; it should be executed exactly once on entry from user
mode to kernel mode and exactly once on the matching return to user
space. If we mess that up even slightly, we crash.

So when we have a secondary entry, already in kernel mode, we *must
not* use SWAPGS blindly - nor must we forget doing a SWAPGS when it's
not switched/swapped yet.

Now, there's a secondary complication: there's a cheap way to test
which mode the CPU is in and an expensive way.

The cheap way is to pick this info off the entry frame on the kernel
stack, from the CS of the ptregs area of the kernel stack:

	xorl %ebx,%ebx
	testl $3,CS+8(%rsp)
	je error_kernelspace
	SWAPGS

The expensive (paranoid) way is to read back the MSR_GS_BASE value
(which is what SWAPGS modifies):

	movl $1,%ebx
	movl $MSR_GS_BASE,%ecx
	rdmsr
	testl %edx,%edx
	js 1f	/* negative -> in kernel */
	SWAPGS
	xorl %ebx,%ebx
1:	ret

and the whole paranoid non-paranoid macro complexity is about whether
to suffer that RDMSR cost.

If we are at an interrupt or user-trap/gate-alike boundary then we can
use the faster check: the stack will be a reliable indicator of
whether SWAPGS was already done: if we see that we are a secondary
entry interrupting kernel mode execution, then we know that the GS
base has already been switched. If it says that we interrupted
user-space execution then we must do the SWAPGS.

But if we are in an NMI/MCE/DEBUG/whatever super-atomic entry context,
which might have triggered right after a normal entry wrote CS to the
stack but before we executed SWAPGS, then the only safe way to check
for GS is the slower method: the RDMSR.

So we try only to mark those entry methods 'paranoid' that absolutely
need the more expensive check for the GS base - and we generate all
'normal' entry points with the regular (faster) entry macros.