aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/stable/vdso27
-rw-r--r--Documentation/vDSO/parse_vdso.c256
-rw-r--r--Documentation/vDSO/vdso_test.c111
3 files changed, 394 insertions, 0 deletions
diff --git a/Documentation/ABI/stable/vdso b/Documentation/ABI/stable/vdso
new file mode 100644
index 000000000000..8a1cbb594497
--- /dev/null
+++ b/Documentation/ABI/stable/vdso
@@ -0,0 +1,27 @@
1On some architectures, when the kernel loads any userspace program it
2maps an ELF DSO into that program's address space. This DSO is called
3the vDSO and it often contains useful and highly-optimized alternatives
4to real syscalls.
5
6These functions are called just like ordinary C functions according to
7your platform's ABI. Call them from a sensible context. (For example,
8if you set CS on x86 to something strange, the vDSO functions are
9within their rights to crash.) In addition, if you pass a bad
10pointer to a vDSO function, you might get SIGSEGV instead of -EFAULT.
11
12To find the DSO, parse the auxiliary vector passed to the program's
13entry point. The AT_SYSINFO_EHDR entry will point to the vDSO.
14
15The vDSO uses symbol versioning; whenever you request a symbol from the
16vDSO, specify the version you are expecting.
17
18Programs that dynamically link to glibc will use the vDSO automatically.
19Otherwise, you can use the reference parser in Documentation/vDSO/parse_vdso.c.
20
21Unless otherwise noted, the set of symbols with any given version and the
22ABI of those symbols is considered stable. It may vary across architectures,
23though.
24
25(As of this writing, this ABI documentation has been confirmed for x86_64.
26 The maintainers of the other vDSO-using architectures should confirm
27 that it is correct for their architecture.) \ No newline at end of file
diff --git a/Documentation/vDSO/parse_vdso.c b/Documentation/vDSO/parse_vdso.c
new file mode 100644
index 000000000000..85870208edcf
--- /dev/null
+++ b/Documentation/vDSO/parse_vdso.c
@@ -0,0 +1,256 @@
1/*
2 * parse_vdso.c: Linux reference vDSO parser
3 * Written by Andrew Lutomirski, 2011.
4 *
5 * This code is meant to be linked in to various programs that run on Linux.
6 * As such, it is available with as few restrictions as possible. This file
7 * is licensed under the Creative Commons Zero License, version 1.0,
8 * available at http://creativecommons.org/publicdomain/zero/1.0/legalcode
9 *
10 * The vDSO is a regular ELF DSO that the kernel maps into user space when
11 * it starts a program. It works equally well in statically and dynamically
12 * linked binaries.
13 *
14 * This code is tested on x86_64. In principle it should work on any 64-bit
15 * architecture that has a vDSO.
16 */
17
18#include <stdbool.h>
19#include <stdint.h>
20#include <string.h>
21#include <elf.h>
22
23/*
24 * To use this vDSO parser, first call one of the vdso_init_* functions.
25 * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR
26 * to vdso_init_from_sysinfo_ehdr. Otherwise pass auxv to vdso_init_from_auxv.
27 * Then call vdso_sym for each symbol you want. For example, to look up
28 * gettimeofday on x86_64, use:
29 *
30 * <some pointer> = vdso_sym("LINUX_2.6", "gettimeofday");
31 * or
32 * <some pointer> = vdso_sym("LINUX_2.6", "__vdso_gettimeofday");
33 *
34 * vdso_sym will return 0 if the symbol doesn't exist or if the init function
35 * failed or was not called. vdso_sym is a little slow, so its return value
36 * should be cached.
37 *
38 * vdso_sym is threadsafe; the init functions are not.
39 *
40 * These are the prototypes:
41 */
42extern void vdso_init_from_auxv(void *auxv);
43extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
44extern void *vdso_sym(const char *version, const char *name);
45
46
47/* And here's the code. */
48
49#ifndef __x86_64__
50# error Not yet ported to non-x86_64 architectures
51#endif
52
/*
 * Parser state.  It is filled in by vdso_init_from_sysinfo_ehdr and read
 * by vdso_sym, which refuses to look anything up unless `valid` is true.
 */
static struct vdso_info
{
	/* True only after an init call has parsed everything it needs. */
	bool valid;

	/* Load information */
	uintptr_t load_addr;	/* address the vDSO's ELF header was found at */
	uintptr_t load_offset;	/* load_addr - recorded vaddr */

	/* Symbol table */
	Elf64_Sym *symtab;
	const char *symstrings;
	Elf64_Word *bucket;	/* hash buckets: first symbol index per bucket */
	Elf64_Word *chain;	/* hash chains: next symbol index per symbol */
	Elf64_Word nbucket;
	Elf64_Word nchain;

	/* Version table */
	Elf64_Versym *versym;
	Elf64_Verdef *verdef;
} vdso_info;
71
/*
 * ELF symbol-name hash, as given in the ELF specification, with one
 * correction for platforms where unsigned long is wider than 32 bits.
 *
 * The reference code clears the top nibble with "h &= ~g", which only
 * removes bits 28-31.  When unsigned long has 64 bits, the addition in
 * "(h << 4) + *name" can carry into bit 32 and that bit is never cleared,
 * so the result no longer matches the 32-bit value the linker stored in
 * the hash table and in the version definitions.  Masking to the low 28
 * bits on every step gives the same result for every width of
 * unsigned long.
 *
 * name: NUL-terminated byte string to hash.
 * Returns the hash value, always in the range 0 .. 0x0fffffff.
 */
static unsigned long elf_hash(const unsigned char *name)
{
	unsigned long h = 0;

	while (*name) {
		unsigned long g;

		h = (h << 4) + *name++;

		/* Fold the top nibble (bits 28-31) back into bits 4-7. */
		g = h & 0xf0000000UL;
		if (g)
			h ^= g >> 24;

		/* Drop the top nibble and any carry beyond bit 31. */
		h &= 0x0fffffffUL;
	}
	return h;
}
85
86void vdso_init_from_sysinfo_ehdr(uintptr_t base)
87{
88 size_t i;
89 bool found_vaddr = false;
90
91 vdso_info.valid = false;
92
93 vdso_info.load_addr = base;
94
95 Elf64_Ehdr *hdr = (Elf64_Ehdr*)base;
96 Elf64_Phdr *pt = (Elf64_Phdr*)(vdso_info.load_addr + hdr->e_phoff);
97 Elf64_Dyn *dyn = 0;
98
99 /*
100 * We need two things from the segment table: the load offset
101 * and the dynamic table.
102 */
103 for (i = 0; i < hdr->e_phnum; i++)
104 {
105 if (pt[i].p_type == PT_LOAD && !found_vaddr) {
106 found_vaddr = true;
107 vdso_info.load_offset = base
108 + (uintptr_t)pt[i].p_offset
109 - (uintptr_t)pt[i].p_vaddr;
110 } else if (pt[i].p_type == PT_DYNAMIC) {
111 dyn = (Elf64_Dyn*)(base + pt[i].p_offset);
112 }
113 }
114
115 if (!found_vaddr || !dyn)
116 return; /* Failed */
117
118 /*
119 * Fish out the useful bits of the dynamic table.
120 */
121 Elf64_Word *hash = 0;
122 vdso_info.symstrings = 0;
123 vdso_info.symtab = 0;
124 vdso_info.versym = 0;
125 vdso_info.verdef = 0;
126 for (i = 0; dyn[i].d_tag != DT_NULL; i++) {
127 switch (dyn[i].d_tag) {
128 case DT_STRTAB:
129 vdso_info.symstrings = (const char *)
130 ((uintptr_t)dyn[i].d_un.d_ptr
131 + vdso_info.load_offset);
132 break;
133 case DT_SYMTAB:
134 vdso_info.symtab = (Elf64_Sym *)
135 ((uintptr_t)dyn[i].d_un.d_ptr
136 + vdso_info.load_offset);
137 break;
138 case DT_HASH:
139 hash = (Elf64_Word *)
140 ((uintptr_t)dyn[i].d_un.d_ptr
141 + vdso_info.load_offset);
142 break;
143 case DT_VERSYM:
144 vdso_info.versym = (Elf64_Versym *)
145 ((uintptr_t)dyn[i].d_un.d_ptr
146 + vdso_info.load_offset);
147 break;
148 case DT_VERDEF:
149 vdso_info.verdef = (Elf64_Verdef *)
150 ((uintptr_t)dyn[i].d_un.d_ptr
151 + vdso_info.load_offset);
152 break;
153 }
154 }
155 if (!vdso_info.symstrings || !vdso_info.symtab || !hash)
156 return; /* Failed */
157
158 if (!vdso_info.verdef)
159 vdso_info.versym = 0;
160
161 /* Parse the hash table header. */
162 vdso_info.nbucket = hash[0];
163 vdso_info.nchain = hash[1];
164 vdso_info.bucket = &hash[2];
165 vdso_info.chain = &hash[vdso_info.nbucket + 2];
166
167 /* That's all we need. */
168 vdso_info.valid = true;
169}
170
171static bool vdso_match_version(Elf64_Versym ver,
172 const char *name, Elf64_Word hash)
173{
174 /*
175 * This is a helper function to check if the version indexed by
176 * ver matches name (which hashes to hash).
177 *
178 * The version definition table is a mess, and I don't know how
179 * to do this in better than linear time without allocating memory
180 * to build an index. I also don't know why the table has
181 * variable size entries in the first place.
182 *
183 * For added fun, I can't find a comprehensible specification of how
184 * to parse all the weird flags in the table.
185 *
186 * So I just parse the whole table every time.
187 */
188
189 /* First step: find the version definition */
190 ver &= 0x7fff; /* Apparently bit 15 means "hidden" */
191 Elf64_Verdef *def = vdso_info.verdef;
192 while(true) {
193 if ((def->vd_flags & VER_FLG_BASE) == 0
194 && (def->vd_ndx & 0x7fff) == ver)
195 break;
196
197 if (def->vd_next == 0)
198 return false; /* No definition. */
199
200 def = (Elf64_Verdef *)((char *)def + def->vd_next);
201 }
202
203 /* Now figure out whether it matches. */
204 Elf64_Verdaux *aux = (Elf64_Verdaux*)((char *)def + def->vd_aux);
205 return def->vd_hash == hash
206 && !strcmp(name, vdso_info.symstrings + aux->vda_name);
207}
208
209void *vdso_sym(const char *version, const char *name)
210{
211 unsigned long ver_hash;
212 if (!vdso_info.valid)
213 return 0;
214
215 ver_hash = elf_hash(version);
216 Elf64_Word chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket];
217
218 for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) {
219 Elf64_Sym *sym = &vdso_info.symtab[chain];
220
221 /* Check for a defined global or weak function w/ right name. */
222 if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC)
223 continue;
224 if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL &&
225 ELF64_ST_BIND(sym->st_info) != STB_WEAK)
226 continue;
227 if (sym->st_shndx == SHN_UNDEF)
228 continue;
229 if (strcmp(name, vdso_info.symstrings + sym->st_name))
230 continue;
231
232 /* Check symbol version. */
233 if (vdso_info.versym
234 && !vdso_match_version(vdso_info.versym[chain],
235 version, ver_hash))
236 continue;
237
238 return (void *)(vdso_info.load_offset + sym->st_value);
239 }
240
241 return 0;
242}
243
244void vdso_init_from_auxv(void *auxv)
245{
246 Elf64_auxv_t *elf_auxv = auxv;
247 for (int i = 0; elf_auxv[i].a_type != AT_NULL; i++)
248 {
249 if (elf_auxv[i].a_type == AT_SYSINFO_EHDR) {
250 vdso_init_from_sysinfo_ehdr(elf_auxv[i].a_un.a_val);
251 return;
252 }
253 }
254
255 vdso_info.valid = false;
256}
diff --git a/Documentation/vDSO/vdso_test.c b/Documentation/vDSO/vdso_test.c
new file mode 100644
index 000000000000..fff633432dff
--- /dev/null
+++ b/Documentation/vDSO/vdso_test.c
@@ -0,0 +1,111 @@
1/*
2 * vdso_test.c: Sample code to test parse_vdso.c on x86_64
3 * Copyright (c) 2011 Andy Lutomirski
4 * Subject to the GNU General Public License, version 2
5 *
6 * You can amuse yourself by compiling with:
7 * gcc -std=gnu99 -nostdlib
8 * -Os -fno-asynchronous-unwind-tables -flto
9 * vdso_test.c parse_vdso.c -o vdso_test
10 * to generate a small binary with no dependencies at all.
11 */
12
13#include <sys/syscall.h>
14#include <sys/time.h>
15#include <unistd.h>
16#include <stdint.h>
17
18extern void *vdso_sym(const char *version, const char *name);
19extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
20extern void vdso_init_from_auxv(void *auxv);
21
22/* We need a libc function... */
/*
 * Minimal strcmp for building without libc.
 *
 * Compares the two NUL-terminated strings byte by byte as unsigned
 * characters.  Returns 0 if they are equal, a negative value if `a`
 * sorts before `b`, and a positive value if `a` sorts after `b`.
 */
int strcmp(const char *a, const char *b)
{
	const unsigned char *ua = (const unsigned char *)a;
	const unsigned char *ub = (const unsigned char *)b;

	/* Advance while the bytes agree and the end has not been reached. */
	while (*ua != 0 && *ua == *ub) {
		ua++;
		ub++;
	}

	/* Difference of the first differing bytes (0 if both are NUL). */
	return (int)*ua - (int)*ub;
}
37
38/* ...and two syscalls. This is x86_64-specific. */
/*
 * Raw write(2) system call for x86_64, issued without libc.
 *
 * fd:   file descriptor to write to.
 * data: buffer to write from.
 * len:  number of bytes to write.
 *
 * Returns whatever the kernel leaves in rax after the syscall.
 */
static inline long linux_write(int fd, const void *data, size_t len)
{
	long result;

	__asm__ __volatile__ (
		"syscall"
		: [res] "=a" (result)
		: [nr] "a" (__NR_write),
		  [fd] "D" (fd),
		  [buf] "S" (data),
		  [cnt] "d" (len)
		: "rcx", "r8", "r9", "r10", "r11", "cc", "memory");

	return result;
}
49
/*
 * Raw exit(2) system call for x86_64, issued without libc.
 *
 * code: exit status handed to the kernel.
 *
 * The clobber list matches linux_write: the syscall instruction
 * overwrites rcx and r11, and the compiler must be told so even though
 * a successful exit does not come back.
 */
static inline void linux_exit(int code)
{
	__asm__ __volatile__ (
		"syscall"
		:
		: "a" (__NR_exit), "D" (code)
		: "rcx", "r11", "cc", "memory");
}
54
/*
 * Write the decimal digits of n right-aligned, with the least
 * significant digit stored at *lastdig and more significant digits at
 * decreasing addresses.  Nothing is written when n is zero, so the
 * caller's buffer keeps whatever it already held in that case.
 */
void to_base10(char *lastdig, uint64_t n)
{
	char *out = lastdig;

	for (; n != 0; n /= 10)
		*out-- = (n % 10) + '0';
}
63
/*
 * C entry point, reached from _start with a pointer to the initial
 * process stack (argc, argv[], NULL, envp[], NULL, auxv[]).
 *
 * Locates the auxiliary vector, initialises the vDSO parser, looks up
 * __vdso_gettimeofday, calls it and prints the result.  Exits with
 * status 1 if the symbol cannot be found, with gettimeofday's return
 * value if the call fails, and with 0 on success.
 */
__attribute__((externally_visible)) void c_main(void **stack)
{
	/* Parse the stack: skip argc, argv[0..argc-1] and the NULL after them. */
	long argc = (long)*stack;
	stack += argc + 2;

	/* Now we're pointing at the environment.  Skip it. */
	while(*stack)
		stack++;
	stack++;

	/* Now we're pointing at auxv.  Initialize the vDSO parser. */
	vdso_init_from_auxv((void *)stack);

	/* Find gettimeofday. */
	typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
	gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday");

	if (!gtod)
		linux_exit(1);

	struct timeval tv;
	long ret = gtod(&tv, 0);

	if (ret == 0) {
		/*
		 * Output template: a 12-character label, 20 blank columns
		 * for the right-aligned seconds (enough for any uint64_t),
		 * then '.', six pre-zeroed microsecond digits and a newline.
		 * The padding must be present: without it the digit
		 * positions below fall outside the array.
		 */
		char buf[] = "The time is "
			     "          " "          "
			     ".000000\n";

		/*
		 * Positions are taken from the end of the buffer so they
		 * always track the template: the last microsecond digit
		 * precedes "\n" and the terminating NUL, and the last
		 * seconds digit precedes ".000000".
		 */
		char *usec_last = buf + sizeof(buf) - 3;
		char *sec_last = usec_last - 7;

		to_base10(sec_last, tv.tv_sec);
		to_base10(usec_last, tv.tv_usec);
		linux_write(1, buf, sizeof(buf) - 1);
	} else {
		linux_exit(ret);
	}

	linux_exit(0);
}
99
/*
 * This is the real entry point.  It passes the initial stack into
 * the C entry point.
 *
 * The original stack pointer is saved in rdi (first argument) before
 * rsp is adjusted.  c_main is entered with a jmp rather than a call, so
 * no return address is pushed; the x86_64 ABI expects rsp + 8 to be
 * 16-byte aligned at function entry, so rsp is aligned down to 16 and
 * then lowered by 8 to look as if a call had happened.
 */
__asm__ (
	".text\n"
	".global _start\n"
	".type _start,@function\n"
	"_start:\n\t"
	"mov %rsp,%rdi\n\t"
	"and $-16,%rsp\n\t"
	"sub $8,%rsp\n\t"
	"jmp c_main"
	);