aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-07-22 20:05:15 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-07-22 20:05:15 -0400
commit8e204874db000928e37199c2db82b7eb8966cc3c (patch)
treeeae66035cb761c3c5a79e98b92280b5156bc01ef
parent3e0b8df79ddb8955d2cce5e858972a9cfe763384 (diff)
parentaafade242ff24fac3aabf61c7861dfa44a3c2445 (diff)
Merge branch 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: x86-64, vdso: Do not allocate memory for the vDSO clocksource: Change __ARCH_HAS_CLOCKSOURCE_DATA to a CONFIG option x86, vdso: Drop now wrong comment Document the vDSO and add a reference parser ia64: Replace clocksource.fsys_mmio with generic arch data x86-64: Move vread_tsc and vread_hpet into the vDSO clocksource: Replace vread with generic arch data x86-64: Add --no-undefined to vDSO build x86-64: Allow alternative patching in the vDSO x86: Make alternative instruction pointers relative x86-64: Improve vsyscall emulation CS and RIP handling x86-64: Emulate legacy vsyscalls x86-64: Fill unused parts of the vsyscall page with 0xcc x86-64: Remove vsyscall number 3 (venosys) x86-64: Map the HPET NX x86-64: Remove kernel.vsyscall64 sysctl x86-64: Give vvars their own page x86-64: Document some of entry_64.S x86-64: Fix alignment of jiffies variable
-rw-r--r--Documentation/ABI/stable/vdso27
-rw-r--r--Documentation/vDSO/parse_vdso.c256
-rw-r--r--Documentation/vDSO/vdso_test.c111
-rw-r--r--Documentation/x86/entry_64.txt98
-rw-r--r--arch/ia64/Kconfig3
-rw-r--r--arch/ia64/include/asm/clocksource.h10
-rw-r--r--arch/ia64/kernel/cyclone.c2
-rw-r--r--arch/ia64/kernel/time.c2
-rw-r--r--arch/ia64/sn/kernel/sn2/timer.c2
-rw-r--r--arch/x86/Kconfig4
-rw-r--r--arch/x86/include/asm/alternative-asm.h4
-rw-r--r--arch/x86/include/asm/alternative.h8
-rw-r--r--arch/x86/include/asm/clocksource.h18
-rw-r--r--arch/x86/include/asm/cpufeature.h8
-rw-r--r--arch/x86/include/asm/fixmap.h1
-rw-r--r--arch/x86/include/asm/irq_vectors.h6
-rw-r--r--arch/x86/include/asm/pgtable_types.h6
-rw-r--r--arch/x86/include/asm/traps.h4
-rw-r--r--arch/x86/include/asm/tsc.h4
-rw-r--r--arch/x86/include/asm/vgtod.h3
-rw-r--r--arch/x86/include/asm/vsyscall.h4
-rw-r--r--arch/x86/include/asm/vvar.h24
-rw-r--r--arch/x86/kernel/Makefile8
-rw-r--r--arch/x86/kernel/alternative.c23
-rw-r--r--arch/x86/kernel/entry_64.S4
-rw-r--r--arch/x86/kernel/hpet.c11
-rw-r--r--arch/x86/kernel/traps.c6
-rw-r--r--arch/x86/kernel/tsc.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S49
-rw-r--r--arch/x86/kernel/vread_tsc_64.c36
-rw-r--r--arch/x86/kernel/vsyscall_64.c310
-rw-r--r--arch/x86/kernel/vsyscall_emu_64.S27
-rw-r--r--arch/x86/lib/copy_page_64.S9
-rw-r--r--arch/x86/lib/memmove_64.S11
-rw-r--r--arch/x86/vdso/Makefile1
-rw-r--r--arch/x86/vdso/vclock_gettime.c103
-rw-r--r--arch/x86/vdso/vdso.S15
-rw-r--r--arch/x86/vdso/vma.c58
-rw-r--r--drivers/char/hpet.c2
-rw-r--r--include/linux/clocksource.h15
-rw-r--r--include/linux/seccomp.h10
41 files changed, 927 insertions, 378 deletions
diff --git a/Documentation/ABI/stable/vdso b/Documentation/ABI/stable/vdso
new file mode 100644
index 000000000000..8a1cbb594497
--- /dev/null
+++ b/Documentation/ABI/stable/vdso
@@ -0,0 +1,27 @@
1On some architectures, when the kernel loads any userspace program it
2maps an ELF DSO into that program's address space. This DSO is called
3the vDSO and it often contains useful and highly-optimized alternatives
4to real syscalls.
5
6These functions are called just like ordinary C functions according to
7your platform's ABI. Call them from a sensible context. (For example,
8if you set CS on x86 to something strange, the vDSO functions are
9within their rights to crash.) In addition, if you pass a bad
10pointer to a vDSO function, you might get SIGSEGV instead of -EFAULT.
11
12To find the DSO, parse the auxiliary vector passed to the program's
13entry point. The AT_SYSINFO_EHDR entry will point to the vDSO.
14
15The vDSO uses symbol versioning; whenever you request a symbol from the
16vDSO, specify the version you are expecting.
17
18Programs that dynamically link to glibc will use the vDSO automatically.
19Otherwise, you can use the reference parser in Documentation/vDSO/parse_vdso.c.
20
21Unless otherwise noted, the set of symbols with any given version and the
22ABI of those symbols is considered stable. It may vary across architectures,
23though.
24
25(As of this writing, this ABI documentation has been confirmed for x86_64.
26 The maintainers of the other vDSO-using architectures should confirm
27 that it is correct for their architecture.) \ No newline at end of file
diff --git a/Documentation/vDSO/parse_vdso.c b/Documentation/vDSO/parse_vdso.c
new file mode 100644
index 000000000000..85870208edcf
--- /dev/null
+++ b/Documentation/vDSO/parse_vdso.c
@@ -0,0 +1,256 @@
1/*
2 * parse_vdso.c: Linux reference vDSO parser
3 * Written by Andrew Lutomirski, 2011.
4 *
5 * This code is meant to be linked in to various programs that run on Linux.
6 * As such, it is available with as few restrictions as possible. This file
7 * is licensed under the Creative Commons Zero License, version 1.0,
8 * available at http://creativecommons.org/publicdomain/zero/1.0/legalcode
9 *
10 * The vDSO is a regular ELF DSO that the kernel maps into user space when
11 * it starts a program. It works equally well in statically and dynamically
12 * linked binaries.
13 *
14 * This code is tested on x86_64. In principle it should work on any 64-bit
15 * architecture that has a vDSO.
16 */
17
18#include <stdbool.h>
19#include <stdint.h>
20#include <string.h>
21#include <elf.h>
22
23/*
24 * To use this vDSO parser, first call one of the vdso_init_* functions.
25 * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR
26 * to vdso_init_from_sysinfo_ehdr. Otherwise pass auxv to vdso_init_from_auxv.
27 * Then call vdso_sym for each symbol you want. For example, to look up
28 * gettimeofday on x86_64, use:
29 *
30 * <some pointer> = vdso_sym("LINUX_2.6", "gettimeofday");
31 * or
32 * <some pointer> = vdso_sym("LINUX_2.6", "__vdso_gettimeofday");
33 *
34 * vdso_sym will return 0 if the symbol doesn't exist or if the init function
35 * failed or was not called. vdso_sym is a little slow, so its return value
36 * should be cached.
37 *
38 * vdso_sym is threadsafe; the init functions are not.
39 *
40 * These are the prototypes:
41 */
42extern void vdso_init_from_auxv(void *auxv);
43extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
44extern void *vdso_sym(const char *version, const char *name);
45
46
47/* And here's the code. */
48
49#ifndef __x86_64__
50# error Not yet ported to non-x86_64 architectures
51#endif
52
/*
 * State of the vDSO parser.  There is a single, file-local instance; its
 * fields are only meaningful while 'valid' is true.
 */
static struct vdso_info
{
	bool valid;

	/* Load information */
	uintptr_t load_addr;
	uintptr_t load_offset;  /* load_addr - recorded vaddr */

	/* Symbol table */
	Elf64_Sym *symtab;
	const char *symstrings;
	Elf64_Word *bucket;     /* hash buckets: first chain index per bucket */
	Elf64_Word *chain;      /* hash chains: next symbol index */
	Elf64_Word nbucket;
	Elf64_Word nchain;

	/* Version table */
	Elf64_Versym *versym;
	Elf64_Verdef *verdef;
} vdso_info;
71
/*
 * The System V ELF symbol hash.
 *
 * The ELF specification's reference code assumes a 32-bit 'unsigned long'.
 * With a 64-bit 'unsigned long' the addition can carry into bit 32, and
 * those stray high bits are never cleared, so the result would not match
 * the 32-bit values stored in the hash and version tables.  Doing the
 * arithmetic in uint32_t keeps the result within 32 bits everywhere.
 *
 * name: NUL-terminated byte string to hash.
 * Returns the 32-bit hash value, widened to unsigned long.
 */
static unsigned long elf_hash(const unsigned char *name)
{
	uint32_t h = 0;

	while (*name) {
		uint32_t g;

		h = (h << 4) + *name++;
		g = h & 0xf0000000u;
		if (g)
			h ^= g >> 24;
		h &= ~g;
	}
	return h;
}
85
86void vdso_init_from_sysinfo_ehdr(uintptr_t base)
87{
88 size_t i;
89 bool found_vaddr = false;
90
91 vdso_info.valid = false;
92
93 vdso_info.load_addr = base;
94
95 Elf64_Ehdr *hdr = (Elf64_Ehdr*)base;
96 Elf64_Phdr *pt = (Elf64_Phdr*)(vdso_info.load_addr + hdr->e_phoff);
97 Elf64_Dyn *dyn = 0;
98
99 /*
100 * We need two things from the segment table: the load offset
101 * and the dynamic table.
102 */
103 for (i = 0; i < hdr->e_phnum; i++)
104 {
105 if (pt[i].p_type == PT_LOAD && !found_vaddr) {
106 found_vaddr = true;
107 vdso_info.load_offset = base
108 + (uintptr_t)pt[i].p_offset
109 - (uintptr_t)pt[i].p_vaddr;
110 } else if (pt[i].p_type == PT_DYNAMIC) {
111 dyn = (Elf64_Dyn*)(base + pt[i].p_offset);
112 }
113 }
114
115 if (!found_vaddr || !dyn)
116 return; /* Failed */
117
118 /*
119 * Fish out the useful bits of the dynamic table.
120 */
121 Elf64_Word *hash = 0;
122 vdso_info.symstrings = 0;
123 vdso_info.symtab = 0;
124 vdso_info.versym = 0;
125 vdso_info.verdef = 0;
126 for (i = 0; dyn[i].d_tag != DT_NULL; i++) {
127 switch (dyn[i].d_tag) {
128 case DT_STRTAB:
129 vdso_info.symstrings = (const char *)
130 ((uintptr_t)dyn[i].d_un.d_ptr
131 + vdso_info.load_offset);
132 break;
133 case DT_SYMTAB:
134 vdso_info.symtab = (Elf64_Sym *)
135 ((uintptr_t)dyn[i].d_un.d_ptr
136 + vdso_info.load_offset);
137 break;
138 case DT_HASH:
139 hash = (Elf64_Word *)
140 ((uintptr_t)dyn[i].d_un.d_ptr
141 + vdso_info.load_offset);
142 break;
143 case DT_VERSYM:
144 vdso_info.versym = (Elf64_Versym *)
145 ((uintptr_t)dyn[i].d_un.d_ptr
146 + vdso_info.load_offset);
147 break;
148 case DT_VERDEF:
149 vdso_info.verdef = (Elf64_Verdef *)
150 ((uintptr_t)dyn[i].d_un.d_ptr
151 + vdso_info.load_offset);
152 break;
153 }
154 }
155 if (!vdso_info.symstrings || !vdso_info.symtab || !hash)
156 return; /* Failed */
157
158 if (!vdso_info.verdef)
159 vdso_info.versym = 0;
160
161 /* Parse the hash table header. */
162 vdso_info.nbucket = hash[0];
163 vdso_info.nchain = hash[1];
164 vdso_info.bucket = &hash[2];
165 vdso_info.chain = &hash[vdso_info.nbucket + 2];
166
167 /* That's all we need. */
168 vdso_info.valid = true;
169}
170
171static bool vdso_match_version(Elf64_Versym ver,
172 const char *name, Elf64_Word hash)
173{
174 /*
175 * This is a helper function to check if the version indexed by
176 * ver matches name (which hashes to hash).
177 *
178 * The version definition table is a mess, and I don't know how
179 * to do this in better than linear time without allocating memory
180 * to build an index. I also don't know why the table has
181 * variable size entries in the first place.
182 *
183 * For added fun, I can't find a comprehensible specification of how
184 * to parse all the weird flags in the table.
185 *
186 * So I just parse the whole table every time.
187 */
188
189 /* First step: find the version definition */
190 ver &= 0x7fff; /* Apparently bit 15 means "hidden" */
191 Elf64_Verdef *def = vdso_info.verdef;
192 while(true) {
193 if ((def->vd_flags & VER_FLG_BASE) == 0
194 && (def->vd_ndx & 0x7fff) == ver)
195 break;
196
197 if (def->vd_next == 0)
198 return false; /* No definition. */
199
200 def = (Elf64_Verdef *)((char *)def + def->vd_next);
201 }
202
203 /* Now figure out whether it matches. */
204 Elf64_Verdaux *aux = (Elf64_Verdaux*)((char *)def + def->vd_aux);
205 return def->vd_hash == hash
206 && !strcmp(name, vdso_info.symstrings + aux->vda_name);
207}
208
209void *vdso_sym(const char *version, const char *name)
210{
211 unsigned long ver_hash;
212 if (!vdso_info.valid)
213 return 0;
214
215 ver_hash = elf_hash(version);
216 Elf64_Word chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket];
217
218 for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) {
219 Elf64_Sym *sym = &vdso_info.symtab[chain];
220
221 /* Check for a defined global or weak function w/ right name. */
222 if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC)
223 continue;
224 if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL &&
225 ELF64_ST_BIND(sym->st_info) != STB_WEAK)
226 continue;
227 if (sym->st_shndx == SHN_UNDEF)
228 continue;
229 if (strcmp(name, vdso_info.symstrings + sym->st_name))
230 continue;
231
232 /* Check symbol version. */
233 if (vdso_info.versym
234 && !vdso_match_version(vdso_info.versym[chain],
235 version, ver_hash))
236 continue;
237
238 return (void *)(vdso_info.load_offset + sym->st_value);
239 }
240
241 return 0;
242}
243
244void vdso_init_from_auxv(void *auxv)
245{
246 Elf64_auxv_t *elf_auxv = auxv;
247 for (int i = 0; elf_auxv[i].a_type != AT_NULL; i++)
248 {
249 if (elf_auxv[i].a_type == AT_SYSINFO_EHDR) {
250 vdso_init_from_sysinfo_ehdr(elf_auxv[i].a_un.a_val);
251 return;
252 }
253 }
254
255 vdso_info.valid = false;
256}
diff --git a/Documentation/vDSO/vdso_test.c b/Documentation/vDSO/vdso_test.c
new file mode 100644
index 000000000000..fff633432dff
--- /dev/null
+++ b/Documentation/vDSO/vdso_test.c
@@ -0,0 +1,111 @@
1/*
2 * vdso_test.c: Sample code to test parse_vdso.c on x86_64
3 * Copyright (c) 2011 Andy Lutomirski
4 * Subject to the GNU General Public License, version 2
5 *
6 * You can amuse yourself by compiling with:
7 * gcc -std=gnu99 -nostdlib
8 * -Os -fno-asynchronous-unwind-tables -flto
9 * vdso_test.c parse_vdso.c -o vdso_test
10 * to generate a small binary with no dependencies at all.
11 */
12
13#include <sys/syscall.h>
14#include <sys/time.h>
15#include <unistd.h>
16#include <stdint.h>
17
18extern void *vdso_sym(const char *version, const char *name);
19extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
20extern void vdso_init_from_auxv(void *auxv);
21
22/* We need a libc function... */
/*
 * Minimal strcmp for the -nostdlib build.
 *
 * Compares the two NUL-terminated strings byte by byte as unsigned chars.
 * Returns 0 if they are equal, a negative value if a sorts before b, and
 * a positive value if a sorts after b.
 */
int strcmp(const char *a, const char *b)
{
	const unsigned char *pa = (const unsigned char *)a;
	const unsigned char *pb = (const unsigned char *)b;

	/* Advance while both strings agree and neither has ended. */
	while (*pa && *pa == *pb) {
		pa++;
		pb++;
	}

	return (*pa > *pb) - (*pa < *pb);
}
37
38/* ...and two syscalls. This is x86_64-specific. */
/*
 * Raw write(2) for x86_64: issues the 'syscall' instruction directly.
 *
 * fd, data, len: passed in rdi, rsi and rdx; the syscall number goes in rax.
 * Returns whatever the kernel leaves in rax.
 *
 * __asm__ is used instead of asm so that the file also builds in strict
 * ISO C modes.
 */
static inline long linux_write(int fd, const void *data, size_t len)
{
	long ret;

	__asm__ volatile ("syscall"
			  : "=a" (ret)
			  : "a" (__NR_write),
			    "D" (fd), "S" (data), "d" (len)
			  : "cc", "memory", "rcx",
			    "r8", "r9", "r10", "r11");
	return ret;
}
49
/*
 * Raw exit(2) for x86_64: issues the 'syscall' instruction directly with
 * the syscall number in rax and the exit code in rdi.
 *
 * The 'syscall' instruction itself overwrites rcx and r11, so they are
 * listed as clobbers.
 */
static inline void linux_exit(int code)
{
	__asm__ volatile ("syscall"
			  :
			  : "a" (__NR_exit), "D" (code)
			  : "cc", "memory", "rcx", "r11");
}
54
/*
 * Write the decimal digits of n right-aligned, ending at *lastdig.
 *
 * lastdig: position of the least significant digit; more significant
 *          digits are stored at decreasing addresses.
 * n:       value to print.  For n == 0 nothing is written, so the caller's
 *          pre-filled buffer contents remain.
 *
 * No terminator is written and no bounds are checked; the caller must
 * provide enough room before lastdig.
 */
void to_base10(char *lastdig, uint64_t n)
{
	for (; n != 0; n /= 10)
		*lastdig-- = (char)('0' + n % 10);
}
63
/*
 * C entry point, entered from _start with a pointer to the initial stack
 * (argc, argv[], NULL, envp[], NULL, auxv[]).
 *
 * Locates __vdso_gettimeofday through the vDSO parser, calls it, and
 * prints the result.  Exits with status 1 if the symbol is not found, with
 * the call's return value if it fails, and with 0 otherwise.
 */
__attribute__((externally_visible)) void c_main(void **stack)
{
	/* Parse the stack: skip argc, argv[0..argc-1] and argv's NULL. */
	long argc = (long)*stack;
	stack += argc + 2;

	/* Now we're pointing at the environment.  Skip it. */
	while (*stack)
		stack++;
	stack++;

	/* Now we're pointing at auxv.  Initialize the vDSO parser. */
	vdso_init_from_auxv((void *)stack);

	/* Find gettimeofday. */
	typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
	gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday");

	if (!gtod)
		linux_exit(1);

	struct timeval tv;
	long ret = gtod(&tv, 0);

	if (ret == 0) {
		/*
		 * Output template: label, a 21-column blank field that the
		 * seconds are right-aligned into (a 64-bit value needs at
		 * most 20 digits), '.', six microsecond digits, newline.
		 */
		char buf[] = "The time is"
			     "          " "          " " "
			     ".000000\n";

		/* Last microsecond digit sits just before "\n" and the NUL. */
		char *usec_last = buf + sizeof(buf) - 3;
		/* Step back over the six microsecond digits and the '.'. */
		char *sec_last = usec_last - 7;

		to_base10(sec_last, tv.tv_sec);
		to_base10(usec_last, tv.tv_usec);
		linux_write(1, buf, sizeof(buf) - 1);
	} else {
		linux_exit(ret);
	}

	linux_exit(0);
}
99
100/*
101 * This is the real entry point. It passes the initial stack into
102 * the C entry point.
103 */
/*
 * rdi receives the initial stack pointer as c_main's argument.  The stack
 * is then realigned so that c_main sees the alignment the x86-64 ABI
 * guarantees at function entry (rsp + 8 is a multiple of 16), as if it had
 * been reached by a 'call'.
 */
__asm__ (
	".text\n"
	".global _start\n"
	".type _start,@function\n"
	"_start:\n\t"
	"mov %rsp,%rdi\n\t"
	"and $-16,%rsp\n\t"
	"sub $8,%rsp\n\t"
	"jmp c_main"
	);
diff --git a/Documentation/x86/entry_64.txt b/Documentation/x86/entry_64.txt
new file mode 100644
index 000000000000..7869f14d055c
--- /dev/null
+++ b/Documentation/x86/entry_64.txt
@@ -0,0 +1,98 @@
1This file documents some of the kernel entries in
2arch/x86/kernel/entry_64.S. A lot of this explanation is adapted from
3an email from Ingo Molnar:
4
5http://lkml.kernel.org/r/<20110529191055.GC9835%40elte.hu>
6
7The x86 architecture has quite a few different ways to jump into
8kernel code. Most of these entry points are registered in
9arch/x86/kernel/traps.c and implemented in arch/x86/kernel/entry_64.S
10and arch/x86/ia32/ia32entry.S.
11
12The IDT vector assignments are listed in arch/x86/include/asm/irq_vectors.h.
13
14Some of these entries are:
15
16 - system_call: syscall instruction from 64-bit code.
17
18 - ia32_syscall: int 0x80 from 32-bit or 64-bit code; compat syscall
19 either way.
20
21 - ia32_syscall, ia32_sysenter: syscall and sysenter from 32-bit
22 code
23
24 - interrupt: An array of entries. Every IDT vector that doesn't
25 explicitly point somewhere else gets set to the corresponding
26 value in interrupts. These point to a whole array of
27 magically-generated functions that make their way to do_IRQ with
28 the interrupt number as a parameter.
29
30 - emulate_vsyscall: int 0xcc, a special non-ABI entry used by
31 vsyscall emulation.
32
33 - APIC interrupts: Various special-purpose interrupts for things
34 like TLB shootdown.
35
36 - Architecturally-defined exceptions like divide_error.
37
38There are a few complexities here. The different x86-64 entries
39have different calling conventions. The syscall and sysenter
40instructions have their own peculiar calling conventions. Some of
41the IDT entries push an error code onto the stack; others don't.
42IDT entries using the IST alternative stack mechanism need their own
43magic to get the stack frames right. (You can find some
44documentation in the AMD APM, Volume 2, Chapter 8 and the Intel SDM,
45Volume 3, Chapter 6.)
46
47Dealing with the swapgs instruction is especially tricky. Swapgs
48toggles whether gs is the kernel gs or the user gs. The swapgs
49instruction is rather fragile: it must nest perfectly and only in
50single depth, it should only be used if entering from user mode to
51kernel mode and then when returning to user-space, and precisely
52so. If we mess that up even slightly, we crash.
53
54So when we have a secondary entry, already in kernel mode, we *must
55not* use SWAPGS blindly - nor must we forget doing a SWAPGS when it's
56not switched/swapped yet.
57
58Now, there's a secondary complication: there's a cheap way to test
59which mode the CPU is in and an expensive way.
60
61The cheap way is to pick this info off the entry frame on the kernel
62stack, from the CS of the ptregs area of the kernel stack:
63
64 xorl %ebx,%ebx
65 testl $3,CS+8(%rsp)
66 je error_kernelspace
67 SWAPGS
68
69The expensive (paranoid) way is to read back the MSR_GS_BASE value
70(which is what SWAPGS modifies):
71
72 movl $1,%ebx
73 movl $MSR_GS_BASE,%ecx
74 rdmsr
75 testl %edx,%edx
76 js 1f /* negative -> in kernel */
77 SWAPGS
78 xorl %ebx,%ebx
791: ret
80
81and the whole paranoid non-paranoid macro complexity is about whether
82to suffer that RDMSR cost.
83
84If we are at an interrupt or user-trap/gate-alike boundary then we can
85use the faster check: the stack will be a reliable indicator of
86whether SWAPGS was already done: if we see that we are a secondary
87entry interrupting kernel mode execution, then we know that the GS
88base has already been switched. If it says that we interrupted
89user-space execution then we must do the SWAPGS.
90
91But if we are in an NMI/MCE/DEBUG/whatever super-atomic entry context,
92which might have triggered right after a normal entry wrote CS to the
93stack but before we executed SWAPGS, then the only safe way to check
94for GS is the slower method: the RDMSR.
95
96So we try only to mark those entry methods 'paranoid' that absolutely
97need the more expensive check for the GS base - and we generate all
98'normal' entry points with the regular (faster) entry macros.
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 7336ba653b8f..137b277f7e56 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -101,6 +101,9 @@ config GENERIC_IOMAP
101 bool 101 bool
102 default y 102 default y
103 103
104config ARCH_CLOCKSOURCE_DATA
105 def_bool y
106
104config SCHED_OMIT_FRAME_POINTER 107config SCHED_OMIT_FRAME_POINTER
105 bool 108 bool
106 default y 109 default y
diff --git a/arch/ia64/include/asm/clocksource.h b/arch/ia64/include/asm/clocksource.h
new file mode 100644
index 000000000000..5c8596e4cb02
--- /dev/null
+++ b/arch/ia64/include/asm/clocksource.h
@@ -0,0 +1,10 @@
1/* IA64-specific clocksource additions */
2
3#ifndef _ASM_IA64_CLOCKSOURCE_H
4#define _ASM_IA64_CLOCKSOURCE_H
5
/* ia64-specific data carried by a clocksource. */
struct arch_clocksource_data {
	void *fsys_mmio;        /* used by fsyscall asm code */
};
9
10#endif /* _ASM_IA64_CLOCKSOURCE_H */
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c
index f64097b5118a..4826ff957a3d 100644
--- a/arch/ia64/kernel/cyclone.c
+++ b/arch/ia64/kernel/cyclone.c
@@ -115,7 +115,7 @@ int __init init_cyclone_clock(void)
115 } 115 }
116 /* initialize last tick */ 116 /* initialize last tick */
117 cyclone_mc = cyclone_timer; 117 cyclone_mc = cyclone_timer;
118 clocksource_cyclone.fsys_mmio = cyclone_timer; 118 clocksource_cyclone.archdata.fsys_mmio = cyclone_timer;
119 clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ); 119 clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ);
120 120
121 return 0; 121 return 0;
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 85118dfe9bb5..43920de425f1 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -468,7 +468,7 @@ void update_vsyscall(struct timespec *wall, struct timespec *wtm,
468 fsyscall_gtod_data.clk_mask = c->mask; 468 fsyscall_gtod_data.clk_mask = c->mask;
469 fsyscall_gtod_data.clk_mult = mult; 469 fsyscall_gtod_data.clk_mult = mult;
470 fsyscall_gtod_data.clk_shift = c->shift; 470 fsyscall_gtod_data.clk_shift = c->shift;
471 fsyscall_gtod_data.clk_fsys_mmio = c->fsys_mmio; 471 fsyscall_gtod_data.clk_fsys_mmio = c->archdata.fsys_mmio;
472 fsyscall_gtod_data.clk_cycle_last = c->cycle_last; 472 fsyscall_gtod_data.clk_cycle_last = c->cycle_last;
473 473
474 /* copy kernel time structures */ 474 /* copy kernel time structures */
diff --git a/arch/ia64/sn/kernel/sn2/timer.c b/arch/ia64/sn/kernel/sn2/timer.c
index c34efda122e1..0f8844e49363 100644
--- a/arch/ia64/sn/kernel/sn2/timer.c
+++ b/arch/ia64/sn/kernel/sn2/timer.c
@@ -54,7 +54,7 @@ ia64_sn_udelay (unsigned long usecs)
54 54
55void __init sn_timer_init(void) 55void __init sn_timer_init(void)
56{ 56{
57 clocksource_sn2.fsys_mmio = RTC_COUNTER_ADDR; 57 clocksource_sn2.archdata.fsys_mmio = RTC_COUNTER_ADDR;
58 clocksource_register_hz(&clocksource_sn2, sn_rtc_cycles_per_second); 58 clocksource_register_hz(&clocksource_sn2, sn_rtc_cycles_per_second);
59 59
60 ia64_udelay = &ia64_sn_udelay; 60 ia64_udelay = &ia64_sn_udelay;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fc76e4209003..5f60ea190d5b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -95,6 +95,10 @@ config CLOCKSOURCE_WATCHDOG
95config GENERIC_CLOCKEVENTS 95config GENERIC_CLOCKEVENTS
96 def_bool y 96 def_bool y
97 97
98config ARCH_CLOCKSOURCE_DATA
99 def_bool y
100 depends on X86_64
101
98config GENERIC_CLOCKEVENTS_BROADCAST 102config GENERIC_CLOCKEVENTS_BROADCAST
99 def_bool y 103 def_bool y
100 depends on X86_64 || (X86_32 && X86_LOCAL_APIC) 104 depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 94d420b360d1..4554cc6fb96a 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -17,8 +17,8 @@
17 17
18.macro altinstruction_entry orig alt feature orig_len alt_len 18.macro altinstruction_entry orig alt feature orig_len alt_len
19 .align 8 19 .align 8
20 .quad \orig 20 .long \orig - .
21 .quad \alt 21 .long \alt - .
22 .word \feature 22 .word \feature
23 .byte \orig_len 23 .byte \orig_len
24 .byte \alt_len 24 .byte \alt_len
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index bf535f947e8c..23fb6d79f209 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -43,8 +43,8 @@
43#endif 43#endif
44 44
45struct alt_instr { 45struct alt_instr {
46 u8 *instr; /* original instruction */ 46 s32 instr_offset; /* original instruction */
47 u8 *replacement; 47 s32 repl_offset; /* offset to replacement instruction */
48 u16 cpuid; /* cpuid bit set for replacement */ 48 u16 cpuid; /* cpuid bit set for replacement */
49 u8 instrlen; /* length of original instruction */ 49 u8 instrlen; /* length of original instruction */
50 u8 replacementlen; /* length of new instruction, <= instrlen */ 50 u8 replacementlen; /* length of new instruction, <= instrlen */
@@ -84,8 +84,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
84 "661:\n\t" oldinstr "\n662:\n" \ 84 "661:\n\t" oldinstr "\n662:\n" \
85 ".section .altinstructions,\"a\"\n" \ 85 ".section .altinstructions,\"a\"\n" \
86 _ASM_ALIGN "\n" \ 86 _ASM_ALIGN "\n" \
87 _ASM_PTR "661b\n" /* label */ \ 87 " .long 661b - .\n" /* label */ \
88 _ASM_PTR "663f\n" /* new instruction */ \ 88 " .long 663f - .\n" /* new instruction */ \
89 " .word " __stringify(feature) "\n" /* feature bit */ \ 89 " .word " __stringify(feature) "\n" /* feature bit */ \
90 " .byte 662b-661b\n" /* sourcelen */ \ 90 " .byte 662b-661b\n" /* sourcelen */ \
91 " .byte 664f-663f\n" /* replacementlen */ \ 91 " .byte 664f-663f\n" /* replacementlen */ \
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
new file mode 100644
index 000000000000..0bdbbb3b9ce7
--- /dev/null
+++ b/arch/x86/include/asm/clocksource.h
@@ -0,0 +1,18 @@
1/* x86-specific clocksource additions */
2
3#ifndef _ASM_X86_CLOCKSOURCE_H
4#define _ASM_X86_CLOCKSOURCE_H
5
6#ifdef CONFIG_X86_64
7
/* Values for arch_clocksource_data.vclock_mode. */
#define VCLOCK_NONE 0  /* No vDSO clock available.	*/
#define VCLOCK_TSC  1  /* vDSO should use vread_tsc.	*/
#define VCLOCK_HPET 2  /* vDSO should use vread_hpet.	*/

/* x86-64-specific data carried by a clocksource. */
struct arch_clocksource_data {
	int vclock_mode;       /* one of the VCLOCK_* values above */
};
15
16#endif /* CONFIG_X86_64 */
17
18#endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 71cc3800712c..9929b35929ff 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -331,8 +331,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
331 "2:\n" 331 "2:\n"
332 ".section .altinstructions,\"a\"\n" 332 ".section .altinstructions,\"a\"\n"
333 _ASM_ALIGN "\n" 333 _ASM_ALIGN "\n"
334 _ASM_PTR "1b\n" 334 " .long 1b - .\n"
335 _ASM_PTR "0\n" /* no replacement */ 335 " .long 0\n" /* no replacement */
336 " .word %P0\n" /* feature bit */ 336 " .word %P0\n" /* feature bit */
337 " .byte 2b - 1b\n" /* source len */ 337 " .byte 2b - 1b\n" /* source len */
338 " .byte 0\n" /* replacement len */ 338 " .byte 0\n" /* replacement len */
@@ -349,8 +349,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
349 "2:\n" 349 "2:\n"
350 ".section .altinstructions,\"a\"\n" 350 ".section .altinstructions,\"a\"\n"
351 _ASM_ALIGN "\n" 351 _ASM_ALIGN "\n"
352 _ASM_PTR "1b\n" 352 " .long 1b - .\n"
353 _ASM_PTR "3f\n" 353 " .long 3f - .\n"
354 " .word %P1\n" /* feature bit */ 354 " .word %P1\n" /* feature bit */
355 " .byte 2b - 1b\n" /* source len */ 355 " .byte 2b - 1b\n" /* source len */
356 " .byte 4f - 3f\n" /* replacement len */ 356 " .byte 4f - 3f\n" /* replacement len */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4729b2b63117..460c74e4852c 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -78,6 +78,7 @@ enum fixed_addresses {
78 VSYSCALL_LAST_PAGE, 78 VSYSCALL_LAST_PAGE,
79 VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE 79 VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
80 + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, 80 + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
81 VVAR_PAGE,
81 VSYSCALL_HPET, 82 VSYSCALL_HPET,
82#endif 83#endif
83 FIX_DBGP_BASE, 84 FIX_DBGP_BASE,
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6665026ea3ea..f9a320984a10 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -17,7 +17,8 @@
17 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events 17 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events
18 * Vectors 32 ... 127 : device interrupts 18 * Vectors 32 ... 127 : device interrupts
19 * Vector 128 : legacy int80 syscall interface 19 * Vector 128 : legacy int80 syscall interface
20 * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts 20 * Vector 204 : legacy x86_64 vsyscall emulation
21 * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts
21 * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts 22 * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
22 * 23 *
23 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. 24 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
@@ -50,6 +51,9 @@
50#ifdef CONFIG_X86_32 51#ifdef CONFIG_X86_32
51# define SYSCALL_VECTOR 0x80 52# define SYSCALL_VECTOR 0x80
52#endif 53#endif
54#ifdef CONFIG_X86_64
55# define VSYSCALL_EMU_VECTOR 0xcc
56#endif
53 57
54/* 58/*
55 * Vectors 0x30-0x3f are used for ISA interrupts. 59 * Vectors 0x30-0x3f are used for ISA interrupts.
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d56187c6b838..013286a10c2c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -107,7 +107,8 @@
107#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) 107#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
108#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) 108#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
109#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) 109#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
110#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) 110#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
111#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT)
111#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) 112#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
112#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) 113#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
113#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) 114#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
@@ -129,7 +130,8 @@
129#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) 130#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
130#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) 131#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
131#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) 132#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
132#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) 133#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR)
134#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE)
133 135
134#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) 136#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
135#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) 137#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0310da67307f..2bae0a513b40 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_TRAPS_H 1#ifndef _ASM_X86_TRAPS_H
2#define _ASM_X86_TRAPS_H 2#define _ASM_X86_TRAPS_H
3 3
4#include <linux/kprobes.h>
5
4#include <asm/debugreg.h> 6#include <asm/debugreg.h>
5#include <asm/siginfo.h> /* TRAP_TRACE, ... */ 7#include <asm/siginfo.h> /* TRAP_TRACE, ... */
6 8
@@ -38,6 +40,7 @@ asmlinkage void alignment_check(void);
38asmlinkage void machine_check(void); 40asmlinkage void machine_check(void);
39#endif /* CONFIG_X86_MCE */ 41#endif /* CONFIG_X86_MCE */
40asmlinkage void simd_coprocessor_error(void); 42asmlinkage void simd_coprocessor_error(void);
43asmlinkage void emulate_vsyscall(void);
41 44
42dotraplinkage void do_divide_error(struct pt_regs *, long); 45dotraplinkage void do_divide_error(struct pt_regs *, long);
43dotraplinkage void do_debug(struct pt_regs *, long); 46dotraplinkage void do_debug(struct pt_regs *, long);
@@ -64,6 +67,7 @@ dotraplinkage void do_alignment_check(struct pt_regs *, long);
64dotraplinkage void do_machine_check(struct pt_regs *, long); 67dotraplinkage void do_machine_check(struct pt_regs *, long);
65#endif 68#endif
66dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); 69dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
70dotraplinkage void do_emulate_vsyscall(struct pt_regs *, long);
67#ifdef CONFIG_X86_32 71#ifdef CONFIG_X86_32
68dotraplinkage void do_iret_error(struct pt_regs *, long); 72dotraplinkage void do_iret_error(struct pt_regs *, long);
69#endif 73#endif
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 9db5583b6d38..83e2efd181e2 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,10 +51,6 @@ extern int unsynchronized_tsc(void);
51extern int check_tsc_unstable(void); 51extern int check_tsc_unstable(void);
52extern unsigned long native_calibrate_tsc(void); 52extern unsigned long native_calibrate_tsc(void);
53 53
54#ifdef CONFIG_X86_64
55extern cycles_t vread_tsc(void);
56#endif
57
58/* 54/*
59 * Boot-time check whether the TSCs are synchronized across 55 * Boot-time check whether the TSCs are synchronized across
60 * all CPUs/cores: 56 * all CPUs/cores:
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 646b4c1ca695..815285bcaceb 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -11,10 +11,9 @@ struct vsyscall_gtod_data {
11 time_t wall_time_sec; 11 time_t wall_time_sec;
12 u32 wall_time_nsec; 12 u32 wall_time_nsec;
13 13
14 int sysctl_enabled;
15 struct timezone sys_tz; 14 struct timezone sys_tz;
16 struct { /* extract of a clocksource struct */ 15 struct { /* extract of a clocksource struct */
17 cycle_t (*vread)(void); 16 int vclock_mode;
18 cycle_t cycle_last; 17 cycle_t cycle_last;
19 cycle_t mask; 18 cycle_t mask;
20 u32 mult; 19 u32 mult;
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d55597351f6a..60107072c28b 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -16,10 +16,6 @@ enum vsyscall_num {
16#ifdef __KERNEL__ 16#ifdef __KERNEL__
17#include <linux/seqlock.h> 17#include <linux/seqlock.h>
18 18
19/* Definitions for CONFIG_GENERIC_TIME definitions */
20#define __vsyscall_fn \
21 __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
22
23#define VGETCPU_RDTSCP 1 19#define VGETCPU_RDTSCP 1
24#define VGETCPU_LSL 2 20#define VGETCPU_LSL 2
25 21
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 341b3559452b..de656ac2af41 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -10,15 +10,14 @@
10 * In normal kernel code, they are used like any other variable. 10 * In normal kernel code, they are used like any other variable.
11 * In user code, they are accessed through the VVAR macro. 11 * In user code, they are accessed through the VVAR macro.
12 * 12 *
13 * Each of these variables lives in the vsyscall page, and each 13 * These variables live in a page of kernel data that has an extra RO
14 * one needs a unique offset within the little piece of the page 14 * mapping for userspace. Each variable needs a unique offset within
15 * reserved for vvars. Specify that offset in DECLARE_VVAR. 15 * that page; specify that offset with the DECLARE_VVAR macro. (If
16 * (There are 896 bytes available. If you mess up, the linker will 16 * you mess up, the linker will catch it.)
17 * catch it.)
18 */ 17 */
19 18
20/* Offset of vars within vsyscall page */ 19/* Base address of vvars. This is not ABI. */
21#define VSYSCALL_VARS_OFFSET (3072 + 128) 20#define VVAR_ADDRESS (-10*1024*1024 - 4096)
22 21
23#if defined(__VVAR_KERNEL_LDS) 22#if defined(__VVAR_KERNEL_LDS)
24 23
@@ -26,17 +25,17 @@
26 * right place. 25 * right place.
27 */ 26 */
28#define DECLARE_VVAR(offset, type, name) \ 27#define DECLARE_VVAR(offset, type, name) \
29 EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset) 28 EMIT_VVAR(name, offset)
30 29
31#else 30#else
32 31
33#define DECLARE_VVAR(offset, type, name) \ 32#define DECLARE_VVAR(offset, type, name) \
34 static type const * const vvaraddr_ ## name = \ 33 static type const * const vvaraddr_ ## name = \
35 (void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset)); 34 (void *)(VVAR_ADDRESS + (offset));
36 35
37#define DEFINE_VVAR(type, name) \ 36#define DEFINE_VVAR(type, name) \
38 type __vvar_ ## name \ 37 type name \
39 __attribute__((section(".vsyscall_var_" #name), aligned(16))) 38 __attribute__((section(".vvar_" #name), aligned(16)))
40 39
41#define VVAR(name) (*vvaraddr_ ## name) 40#define VVAR(name) (*vvaraddr_ ## name)
42 41
@@ -45,8 +44,7 @@
45/* DECLARE_VVAR(offset, type, name) */ 44/* DECLARE_VVAR(offset, type, name) */
46 45
47DECLARE_VVAR(0, volatile unsigned long, jiffies) 46DECLARE_VVAR(0, volatile unsigned long, jiffies)
48DECLARE_VVAR(8, int, vgetcpu_mode) 47DECLARE_VVAR(16, int, vgetcpu_mode)
49DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) 48DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
50 49
51#undef DECLARE_VVAR 50#undef DECLARE_VVAR
52#undef VSYSCALL_VARS_OFFSET
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 11817ff85399..04105574c8e9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,17 +24,12 @@ endif
24nostackp := $(call cc-option, -fno-stack-protector) 24nostackp := $(call cc-option, -fno-stack-protector)
25CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) 25CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
26CFLAGS_hpet.o := $(nostackp) 26CFLAGS_hpet.o := $(nostackp)
27CFLAGS_vread_tsc_64.o := $(nostackp)
28CFLAGS_paravirt.o := $(nostackp) 27CFLAGS_paravirt.o := $(nostackp)
29GCOV_PROFILE_vsyscall_64.o := n 28GCOV_PROFILE_vsyscall_64.o := n
30GCOV_PROFILE_hpet.o := n 29GCOV_PROFILE_hpet.o := n
31GCOV_PROFILE_tsc.o := n 30GCOV_PROFILE_tsc.o := n
32GCOV_PROFILE_vread_tsc_64.o := n
33GCOV_PROFILE_paravirt.o := n 31GCOV_PROFILE_paravirt.o := n
34 32
35# vread_tsc_64 is hot and should be fully optimized:
36CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls
37
38obj-y := process_$(BITS).o signal.o entry_$(BITS).o 33obj-y := process_$(BITS).o signal.o entry_$(BITS).o
39obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 34obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
40obj-y += time.o ioport.o ldt.o dumpstack.o 35obj-y += time.o ioport.o ldt.o dumpstack.o
@@ -43,7 +38,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
43obj-y += probe_roms.o 38obj-y += probe_roms.o
44obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 39obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
45obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 40obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
46obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o 41obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
42obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
47obj-y += bootflag.o e820.o 43obj-y += bootflag.o e820.o
48obj-y += pci-dma.o quirks.o topology.o kdebugfs.o 44obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
49obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a81f2d52f869..c63822816249 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -14,7 +14,6 @@
14#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include <asm/mce.h> 15#include <asm/mce.h>
16#include <asm/nmi.h> 16#include <asm/nmi.h>
17#include <asm/vsyscall.h>
18#include <asm/cacheflush.h> 17#include <asm/cacheflush.h>
19#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
20#include <asm/io.h> 19#include <asm/io.h>
@@ -250,7 +249,6 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
250 249
251extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 250extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
252extern s32 __smp_locks[], __smp_locks_end[]; 251extern s32 __smp_locks[], __smp_locks_end[];
253extern char __vsyscall_0;
254void *text_poke_early(void *addr, const void *opcode, size_t len); 252void *text_poke_early(void *addr, const void *opcode, size_t len);
255 253
256/* Replace instructions with better alternatives for this CPU type. 254/* Replace instructions with better alternatives for this CPU type.
@@ -263,6 +261,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
263 struct alt_instr *end) 261 struct alt_instr *end)
264{ 262{
265 struct alt_instr *a; 263 struct alt_instr *a;
264 u8 *instr, *replacement;
266 u8 insnbuf[MAX_PATCH_LEN]; 265 u8 insnbuf[MAX_PATCH_LEN];
267 266
268 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 267 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
@@ -276,25 +275,23 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
276 * order. 275 * order.
277 */ 276 */
278 for (a = start; a < end; a++) { 277 for (a = start; a < end; a++) {
279 u8 *instr = a->instr; 278 instr = (u8 *)&a->instr_offset + a->instr_offset;
279 replacement = (u8 *)&a->repl_offset + a->repl_offset;
280 BUG_ON(a->replacementlen > a->instrlen); 280 BUG_ON(a->replacementlen > a->instrlen);
281 BUG_ON(a->instrlen > sizeof(insnbuf)); 281 BUG_ON(a->instrlen > sizeof(insnbuf));
282 BUG_ON(a->cpuid >= NCAPINTS*32); 282 BUG_ON(a->cpuid >= NCAPINTS*32);
283 if (!boot_cpu_has(a->cpuid)) 283 if (!boot_cpu_has(a->cpuid))
284 continue; 284 continue;
285#ifdef CONFIG_X86_64 285
286 /* vsyscall code is not mapped yet. resolve it manually. */ 286 memcpy(insnbuf, replacement, a->replacementlen);
287 if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) { 287
288 instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); 288 /* 0xe8 is a relative jump; fix the offset. */
289 DPRINTK("%s: vsyscall fixup: %p => %p\n",
290 __func__, a->instr, instr);
291 }
292#endif
293 memcpy(insnbuf, a->replacement, a->replacementlen);
294 if (*insnbuf == 0xe8 && a->replacementlen == 5) 289 if (*insnbuf == 0xe8 && a->replacementlen == 5)
295 *(s32 *)(insnbuf + 1) += a->replacement - a->instr; 290 *(s32 *)(insnbuf + 1) += replacement - instr;
291
296 add_nops(insnbuf + a->replacementlen, 292 add_nops(insnbuf + a->replacementlen,
297 a->instrlen - a->replacementlen); 293 a->instrlen - a->replacementlen);
294
298 text_poke_early(instr, insnbuf, a->instrlen); 295 text_poke_early(instr, insnbuf, a->instrlen);
299 } 296 }
300} 297}
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 37e895a1c74d..e13329d800c8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -9,6 +9,8 @@
9/* 9/*
10 * entry.S contains the system-call and fault low-level handling routines. 10 * entry.S contains the system-call and fault low-level handling routines.
11 * 11 *
12 * Some of this is documented in Documentation/x86/entry_64.txt
13 *
12 * NOTE: This code handles signal-recognition, which happens every time 14 * NOTE: This code handles signal-recognition, which happens every time
13 * after an interrupt and after each system call. 15 * after an interrupt and after each system call.
14 * 16 *
@@ -1109,6 +1111,8 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
1109zeroentry coprocessor_error do_coprocessor_error 1111zeroentry coprocessor_error do_coprocessor_error
1110errorentry alignment_check do_alignment_check 1112errorentry alignment_check do_alignment_check
1111zeroentry simd_coprocessor_error do_simd_coprocessor_error 1113zeroentry simd_coprocessor_error do_simd_coprocessor_error
1114zeroentry emulate_vsyscall do_emulate_vsyscall
1115
1112 1116
1113 /* Reload gs selector with exception handling */ 1117 /* Reload gs selector with exception handling */
1114 /* edi: new selector */ 1118 /* edi: new selector */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 0f4b0651cd3f..4aecc54236a9 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -72,7 +72,7 @@ static inline void hpet_set_mapping(void)
72{ 72{
73 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); 73 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
74#ifdef CONFIG_X86_64 74#ifdef CONFIG_X86_64
75 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); 75 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
76#endif 76#endif
77} 77}
78 78
@@ -739,13 +739,6 @@ static cycle_t read_hpet(struct clocksource *cs)
739 return (cycle_t)hpet_readl(HPET_COUNTER); 739 return (cycle_t)hpet_readl(HPET_COUNTER);
740} 740}
741 741
742#ifdef CONFIG_X86_64
743static cycle_t __vsyscall_fn vread_hpet(void)
744{
745 return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
746}
747#endif
748
749static struct clocksource clocksource_hpet = { 742static struct clocksource clocksource_hpet = {
750 .name = "hpet", 743 .name = "hpet",
751 .rating = 250, 744 .rating = 250,
@@ -754,7 +747,7 @@ static struct clocksource clocksource_hpet = {
754 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 747 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
755 .resume = hpet_resume_counter, 748 .resume = hpet_resume_counter,
756#ifdef CONFIG_X86_64 749#ifdef CONFIG_X86_64
757 .vread = vread_hpet, 750 .archdata = { .vclock_mode = VCLOCK_HPET },
758#endif 751#endif
759}; 752};
760 753
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b9b67166f9de..fbc097a085ca 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -872,6 +872,12 @@ void __init trap_init(void)
872 set_bit(SYSCALL_VECTOR, used_vectors); 872 set_bit(SYSCALL_VECTOR, used_vectors);
873#endif 873#endif
874 874
875#ifdef CONFIG_X86_64
876 BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors));
877 set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall);
878 set_bit(VSYSCALL_EMU_VECTOR, used_vectors);
879#endif
880
875 /* 881 /*
876 * Should be a barrier for any external CPU state: 882 * Should be a barrier for any external CPU state:
877 */ 883 */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6cc6922262af..56c633a5db72 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -777,7 +777,7 @@ static struct clocksource clocksource_tsc = {
777 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 777 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
778 CLOCK_SOURCE_MUST_VERIFY, 778 CLOCK_SOURCE_MUST_VERIFY,
779#ifdef CONFIG_X86_64 779#ifdef CONFIG_X86_64
780 .vread = vread_tsc, 780 .archdata = { .vclock_mode = VCLOCK_TSC },
781#endif 781#endif
782}; 782};
783 783
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 89aed99aafce..4aa9c54a9b76 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -161,50 +161,47 @@ SECTIONS
161 161
162#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) 162#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
163#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) 163#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
164#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \
165 ADDR(.vsyscall_0) + offset \
166 : AT(VLOAD(.vsyscall_var_ ## x)) { \
167 *(.vsyscall_var_ ## x) \
168 } \
169 x = VVIRT(.vsyscall_var_ ## x);
170 164
171 . = ALIGN(4096); 165 . = ALIGN(4096);
172 __vsyscall_0 = .; 166 __vsyscall_0 = .;
173 167
174 . = VSYSCALL_ADDR; 168 . = VSYSCALL_ADDR;
175 .vsyscall_0 : AT(VLOAD(.vsyscall_0)) { 169 .vsyscall : AT(VLOAD(.vsyscall)) {
176 *(.vsyscall_0) 170 *(.vsyscall_0)
177 } :user
178 171
179 . = ALIGN(L1_CACHE_BYTES); 172 . = 1024;
180 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
181 *(.vsyscall_fn)
182 }
183
184 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
185 *(.vsyscall_1) 173 *(.vsyscall_1)
186 }
187 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
188 *(.vsyscall_2)
189 }
190 174
191 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { 175 . = 2048;
192 *(.vsyscall_3) 176 *(.vsyscall_2)
193 }
194
195#define __VVAR_KERNEL_LDS
196#include <asm/vvar.h>
197#undef __VVAR_KERNEL_LDS
198 177
199 . = __vsyscall_0 + PAGE_SIZE; 178 . = 4096; /* Pad the whole page. */
179 } :user =0xcc
180 . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE);
200 181
201#undef VSYSCALL_ADDR 182#undef VSYSCALL_ADDR
202#undef VLOAD_OFFSET 183#undef VLOAD_OFFSET
203#undef VLOAD 184#undef VLOAD
204#undef VVIRT_OFFSET 185#undef VVIRT_OFFSET
205#undef VVIRT 186#undef VVIRT
187
188 __vvar_page = .;
189
190 .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
191
192 /* Place all vvars at the offsets in asm/vvar.h. */
193#define EMIT_VVAR(name, offset) \
194 . = offset; \
195 *(.vvar_ ## name)
196#define __VVAR_KERNEL_LDS
197#include <asm/vvar.h>
198#undef __VVAR_KERNEL_LDS
206#undef EMIT_VVAR 199#undef EMIT_VVAR
207 200
201 } :data
202
203 . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
204
208#endif /* CONFIG_X86_64 */ 205#endif /* CONFIG_X86_64 */
209 206
210 /* Init code and data - will be freed after init */ 207 /* Init code and data - will be freed after init */
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
deleted file mode 100644
index a81aa9e9894c..000000000000
--- a/arch/x86/kernel/vread_tsc_64.c
+++ /dev/null
@@ -1,36 +0,0 @@
1/* This code runs in userspace. */
2
3#define DISABLE_BRANCH_PROFILING
4#include <asm/vgtod.h>
5
6notrace cycle_t __vsyscall_fn vread_tsc(void)
7{
8 cycle_t ret;
9 u64 last;
10
11 /*
12 * Empirically, a fence (of type that depends on the CPU)
13 * before rdtsc is enough to ensure that rdtsc is ordered
14 * with respect to loads. The various CPU manuals are unclear
15 * as to whether rdtsc can be reordered with later loads,
16 * but no one has ever seen it happen.
17 */
18 rdtsc_barrier();
19 ret = (cycle_t)vget_cycles();
20
21 last = VVAR(vsyscall_gtod_data).clock.cycle_last;
22
23 if (likely(ret >= last))
24 return ret;
25
26 /*
27 * GCC likes to generate cmov here, but this branch is extremely
28 * predictable (it's just a funciton of time and the likely is
29 * very likely) and there's a data dependence, so force GCC
30 * to generate a branch instead. I don't barrier() because
31 * we don't actually need a barrier, and if this function
32 * ever gets inlined it will generate worse code.
33 */
34 asm volatile ("");
35 return last;
36}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3e682184d76c..dda7dff9cef7 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -2,6 +2,8 @@
2 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 2 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
3 * Copyright 2003 Andi Kleen, SuSE Labs. 3 * Copyright 2003 Andi Kleen, SuSE Labs.
4 * 4 *
5 * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
6 *
5 * Thanks to hpa@transmeta.com for some useful hint. 7 * Thanks to hpa@transmeta.com for some useful hint.
6 * Special thanks to Ingo Molnar for his early experience with 8 * Special thanks to Ingo Molnar for his early experience with
7 * a different vsyscall implementation for Linux/IA32 and for the name. 9 * a different vsyscall implementation for Linux/IA32 and for the name.
@@ -11,10 +13,9 @@
11 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid 13 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
12 * jumping out of line if necessary. We cannot add more with this 14 * jumping out of line if necessary. We cannot add more with this
13 * mechanism because older kernels won't return -ENOSYS. 15 * mechanism because older kernels won't return -ENOSYS.
14 * If we want more than four we need a vDSO.
15 * 16 *
16 * Note: the concept clashes with user mode linux. If you use UML and 17 * Note: the concept clashes with user mode linux. UML users should
17 * want per guest time just set the kernel.vsyscall64 sysctl to 0. 18 * use the vDSO.
18 */ 19 */
19 20
20/* Disable profiling for userspace code: */ 21/* Disable profiling for userspace code: */
@@ -32,9 +33,12 @@
32#include <linux/cpu.h> 33#include <linux/cpu.h>
33#include <linux/smp.h> 34#include <linux/smp.h>
34#include <linux/notifier.h> 35#include <linux/notifier.h>
36#include <linux/syscalls.h>
37#include <linux/ratelimit.h>
35 38
36#include <asm/vsyscall.h> 39#include <asm/vsyscall.h>
37#include <asm/pgtable.h> 40#include <asm/pgtable.h>
41#include <asm/compat.h>
38#include <asm/page.h> 42#include <asm/page.h>
39#include <asm/unistd.h> 43#include <asm/unistd.h>
40#include <asm/fixmap.h> 44#include <asm/fixmap.h>
@@ -44,16 +48,12 @@
44#include <asm/desc.h> 48#include <asm/desc.h>
45#include <asm/topology.h> 49#include <asm/topology.h>
46#include <asm/vgtod.h> 50#include <asm/vgtod.h>
47 51#include <asm/traps.h>
48#define __vsyscall(nr) \
49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
50#define __syscall_clobber "r11","cx","memory"
51 52
52DEFINE_VVAR(int, vgetcpu_mode); 53DEFINE_VVAR(int, vgetcpu_mode);
53DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = 54DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
54{ 55{
55 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), 56 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
56 .sysctl_enabled = 1,
57}; 57};
58 58
59void update_vsyscall_tz(void) 59void update_vsyscall_tz(void)
@@ -72,179 +72,149 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
72 unsigned long flags; 72 unsigned long flags;
73 73
74 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); 74 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
75
75 /* copy vsyscall data */ 76 /* copy vsyscall data */
76 vsyscall_gtod_data.clock.vread = clock->vread; 77 vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
77 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 78 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
78 vsyscall_gtod_data.clock.mask = clock->mask; 79 vsyscall_gtod_data.clock.mask = clock->mask;
79 vsyscall_gtod_data.clock.mult = mult; 80 vsyscall_gtod_data.clock.mult = mult;
80 vsyscall_gtod_data.clock.shift = clock->shift; 81 vsyscall_gtod_data.clock.shift = clock->shift;
81 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 82 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
82 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 83 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
83 vsyscall_gtod_data.wall_to_monotonic = *wtm; 84 vsyscall_gtod_data.wall_to_monotonic = *wtm;
84 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); 85 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
86
85 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 87 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
86} 88}
87 89
88/* RED-PEN may want to readd seq locking, but then the variable should be 90static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
89 * write-once. 91 const char *message)
90 */
91static __always_inline void do_get_tz(struct timezone * tz)
92{ 92{
93 *tz = VVAR(vsyscall_gtod_data).sys_tz; 93 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
94} 94 struct task_struct *tsk;
95 95
96static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) 96 if (!show_unhandled_signals || !__ratelimit(&rs))
97{ 97 return;
98 int ret;
99 asm volatile("syscall"
100 : "=a" (ret)
101 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
102 : __syscall_clobber );
103 return ret;
104}
105 98
106static __always_inline long time_syscall(long *t) 99 tsk = current;
107{
108 long secs;
109 asm volatile("syscall"
110 : "=a" (secs)
111 : "0" (__NR_time),"D" (t) : __syscall_clobber);
112 return secs;
113}
114 100
115static __always_inline void do_vgettimeofday(struct timeval * tv) 101 printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
116{ 102 level, tsk->comm, task_pid_nr(tsk),
117 cycle_t now, base, mask, cycle_delta; 103 message, regs->ip - 2, regs->cs,
118 unsigned seq; 104 regs->sp, regs->ax, regs->si, regs->di);
119 unsigned long mult, shift, nsec;
120 cycle_t (*vread)(void);
121 do {
122 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
123
124 vread = VVAR(vsyscall_gtod_data).clock.vread;
125 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
126 !vread)) {
127 gettimeofday(tv,NULL);
128 return;
129 }
130
131 now = vread();
132 base = VVAR(vsyscall_gtod_data).clock.cycle_last;
133 mask = VVAR(vsyscall_gtod_data).clock.mask;
134 mult = VVAR(vsyscall_gtod_data).clock.mult;
135 shift = VVAR(vsyscall_gtod_data).clock.shift;
136
137 tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
138 nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
139 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
140
141 /* calculate interval: */
142 cycle_delta = (now - base) & mask;
143 /* convert to nsecs: */
144 nsec += (cycle_delta * mult) >> shift;
145
146 while (nsec >= NSEC_PER_SEC) {
147 tv->tv_sec += 1;
148 nsec -= NSEC_PER_SEC;
149 }
150 tv->tv_usec = nsec / NSEC_PER_USEC;
151} 105}
152 106
153int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) 107static int addr_to_vsyscall_nr(unsigned long addr)
154{ 108{
155 if (tv) 109 int nr;
156 do_vgettimeofday(tv);
157 if (tz)
158 do_get_tz(tz);
159 return 0;
160}
161 110
162/* This will break when the xtime seconds get inaccurate, but that is 111 if ((addr & ~0xC00UL) != VSYSCALL_START)
163 * unlikely */ 112 return -EINVAL;
164time_t __vsyscall(1) vtime(time_t *t)
165{
166 unsigned seq;
167 time_t result;
168 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
169 return time_syscall(t);
170 113
171 do { 114 nr = (addr & 0xC00UL) >> 10;
172 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); 115 if (nr >= 3)
116 return -EINVAL;
173 117
174 result = VVAR(vsyscall_gtod_data).wall_time_sec; 118 return nr;
119}
175 120
176 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); 121void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
122{
123 struct task_struct *tsk;
124 unsigned long caller;
125 int vsyscall_nr;
126 long ret;
127
128 local_irq_enable();
129
130 /*
131 * Real 64-bit user mode code has cs == __USER_CS. Anything else
132 * is bogus.
133 */
134 if (regs->cs != __USER_CS) {
135 /*
136 * If we trapped from kernel mode, we might as well OOPS now
137 * instead of returning to some random address and OOPSing
138 * then.
139 */
140 BUG_ON(!user_mode(regs));
141
142 /* Compat mode and non-compat 32-bit CS should both segfault. */
143 warn_bad_vsyscall(KERN_WARNING, regs,
144 "illegal int 0xcc from 32-bit mode");
145 goto sigsegv;
146 }
177 147
178 if (t) 148 /*
179 *t = result; 149 * x86-ism here: regs->ip points to the instruction after the int 0xcc,
180 return result; 150 * and int 0xcc is two bytes long.
181} 151 */
152 vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2);
153 if (vsyscall_nr < 0) {
154 warn_bad_vsyscall(KERN_WARNING, regs,
155 "illegal int 0xcc (exploit attempt?)");
156 goto sigsegv;
157 }
182 158
183/* Fast way to get current CPU and node. 159 if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
184 This helps to do per node and per CPU caches in user space. 160 warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)");
185 The result is not guaranteed without CPU affinity, but usually 161 goto sigsegv;
186 works out because the scheduler tries to keep a thread on the same 162 }
187 CPU.
188 163
189 tcache must point to a two element sized long array. 164 tsk = current;
190 All arguments can be NULL. */ 165 if (seccomp_mode(&tsk->seccomp))
191long __vsyscall(2) 166 do_exit(SIGKILL);
192vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) 167
193{ 168 switch (vsyscall_nr) {
194 unsigned int p; 169 case 0:
195 unsigned long j = 0; 170 ret = sys_gettimeofday(
196 171 (struct timeval __user *)regs->di,
197 /* Fast cache - only recompute value once per jiffies and avoid 172 (struct timezone __user *)regs->si);
198 relatively costly rdtscp/cpuid otherwise. 173 break;
199 This works because the scheduler usually keeps the process 174
200 on the same CPU and this syscall doesn't guarantee its 175 case 1:
201 results anyways. 176 ret = sys_time((time_t __user *)regs->di);
202 We do this here because otherwise user space would do it on 177 break;
203 its own in a likely inferior way (no access to jiffies). 178
204 If you don't like it pass NULL. */ 179 case 2:
205 if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) { 180 ret = sys_getcpu((unsigned __user *)regs->di,
206 p = tcache->blob[1]; 181 (unsigned __user *)regs->si,
207 } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { 182 0);
208 /* Load per CPU data from RDTSCP */ 183 break;
209 native_read_tscp(&p);
210 } else {
211 /* Load per CPU data from GDT */
212 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
213 } 184 }
214 if (tcache) { 185
215 tcache->blob[0] = j; 186 if (ret == -EFAULT) {
216 tcache->blob[1] = p; 187 /*
188 * Bad news -- userspace fed a bad pointer to a vsyscall.
189 *
190 * With a real vsyscall, that would have caused SIGSEGV.
191 * To make writing reliable exploits using the emulated
192 * vsyscalls harder, generate SIGSEGV here as well.
193 */
194 warn_bad_vsyscall(KERN_INFO, regs,
195 "vsyscall fault (exploit attempt?)");
196 goto sigsegv;
217 } 197 }
218 if (cpu)
219 *cpu = p & 0xfff;
220 if (node)
221 *node = p >> 12;
222 return 0;
223}
224 198
225static long __vsyscall(3) venosys_1(void) 199 regs->ax = ret;
226{
227 return -ENOSYS;
228}
229 200
230#ifdef CONFIG_SYSCTL 201 /* Emulate a ret instruction. */
231static ctl_table kernel_table2[] = { 202 regs->ip = caller;
232 { .procname = "vsyscall64", 203 regs->sp += 8;
233 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
234 .mode = 0644,
235 .proc_handler = proc_dointvec },
236 {}
237};
238 204
239static ctl_table kernel_root_table2[] = { 205 local_irq_disable();
240 { .procname = "kernel", .mode = 0555, 206 return;
241 .child = kernel_table2 }, 207
242 {} 208sigsegv:
243}; 209 regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */
244#endif 210 force_sig(SIGSEGV, current);
211 local_irq_disable();
212}
245 213
246/* Assume __initcall executes before all user space. Hopefully kmod 214/*
247 doesn't violate that. We'll find out if it does. */ 215 * Assume __initcall executes before all user space. Hopefully kmod
216 * doesn't violate that. We'll find out if it does.
217 */
248static void __cpuinit vsyscall_set_cpu(int cpu) 218static void __cpuinit vsyscall_set_cpu(int cpu)
249{ 219{
250 unsigned long d; 220 unsigned long d;
@@ -255,13 +225,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
255 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) 225 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
256 write_rdtscp_aux((node << 12) | cpu); 226 write_rdtscp_aux((node << 12) | cpu);
257 227
258 /* Store cpu number in limit so that it can be loaded quickly 228 /*
259 in user space in vgetcpu. 229 * Store cpu number in limit so that it can be loaded quickly
260 12 bits for the CPU and 8 bits for the node. */ 230 * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
231 */
261 d = 0x0f40000000000ULL; 232 d = 0x0f40000000000ULL;
262 d |= cpu; 233 d |= cpu;
263 d |= (node & 0xf) << 12; 234 d |= (node & 0xf) << 12;
264 d |= (node >> 4) << 48; 235 d |= (node >> 4) << 48;
236
265 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); 237 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
266} 238}
267 239
@@ -275,8 +247,10 @@ static int __cpuinit
275cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) 247cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
276{ 248{
277 long cpu = (long)arg; 249 long cpu = (long)arg;
250
278 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 251 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
279 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); 252 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
253
280 return NOTIFY_DONE; 254 return NOTIFY_DONE;
281} 255}
282 256
@@ -284,25 +258,23 @@ void __init map_vsyscall(void)
284{ 258{
285 extern char __vsyscall_0; 259 extern char __vsyscall_0;
286 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); 260 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
261 extern char __vvar_page;
262 unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
287 263
288 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ 264 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
289 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); 265 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
266 __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
267 BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS);
290} 268}
291 269
292static int __init vsyscall_init(void) 270static int __init vsyscall_init(void)
293{ 271{
294 BUG_ON(((unsigned long) &vgettimeofday != 272 BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
295 VSYSCALL_ADDR(__NR_vgettimeofday))); 273
296 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
297 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
298 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
299#ifdef CONFIG_SYSCTL
300 register_sysctl_table(kernel_root_table2);
301#endif
302 on_each_cpu(cpu_vsyscall_init, NULL, 1); 274 on_each_cpu(cpu_vsyscall_init, NULL, 1);
303 /* notifier priority > KVM */ 275 /* notifier priority > KVM */
304 hotcpu_notifier(cpu_vsyscall_notifier, 30); 276 hotcpu_notifier(cpu_vsyscall_notifier, 30);
277
305 return 0; 278 return 0;
306} 279}
307
308__initcall(vsyscall_init); 280__initcall(vsyscall_init);
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
new file mode 100644
index 000000000000..ffa845eae5ca
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -0,0 +1,27 @@
1/*
2 * vsyscall_emu_64.S: Vsyscall emulation page
3 *
4 * Copyright (c) 2011 Andy Lutomirski
5 *
6 * Subject to the GNU General Public License, version 2
7 */
8
9#include <linux/linkage.h>
10#include <asm/irq_vectors.h>
11
12/* The unused parts of the page are filled with 0xcc by the linker script. */
13
14.section .vsyscall_0, "a"
15ENTRY(vsyscall_0)
16 int $VSYSCALL_EMU_VECTOR
17END(vsyscall_0)
18
19.section .vsyscall_1, "a"
20ENTRY(vsyscall_1)
21 int $VSYSCALL_EMU_VECTOR
22END(vsyscall_1)
23
24.section .vsyscall_2, "a"
25ENTRY(vsyscall_2)
26 int $VSYSCALL_EMU_VECTOR
27END(vsyscall_2)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 6fec2d1cebe1..01c805ba5359 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,6 +2,7 @@
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/dwarf2.h> 4#include <asm/dwarf2.h>
5#include <asm/alternative-asm.h>
5 6
6 ALIGN 7 ALIGN
7copy_page_c: 8copy_page_c:
@@ -110,10 +111,6 @@ ENDPROC(copy_page)
1102: 1112:
111 .previous 112 .previous
112 .section .altinstructions,"a" 113 .section .altinstructions,"a"
113 .align 8 114 altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
114 .quad copy_page 115 .Lcopy_page_end-copy_page, 2b-1b
115 .quad 1b
116 .word X86_FEATURE_REP_GOOD
117 .byte .Lcopy_page_end - copy_page
118 .byte 2b - 1b
119 .previous 116 .previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index d0ec9c2936d7..ee164610ec46 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -9,6 +9,7 @@
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/dwarf2.h> 10#include <asm/dwarf2.h>
11#include <asm/cpufeature.h> 11#include <asm/cpufeature.h>
12#include <asm/alternative-asm.h>
12 13
13#undef memmove 14#undef memmove
14 15
@@ -214,11 +215,9 @@ ENTRY(memmove)
214 .previous 215 .previous
215 216
216 .section .altinstructions,"a" 217 .section .altinstructions,"a"
217 .align 8 218 altinstruction_entry .Lmemmove_begin_forward, \
218 .quad .Lmemmove_begin_forward 219 .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
219 .quad .Lmemmove_begin_forward_efs 220 .Lmemmove_end_forward-.Lmemmove_begin_forward, \
220 .word X86_FEATURE_ERMS 221 .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
221 .byte .Lmemmove_end_forward-.Lmemmove_begin_forward
222 .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
223 .previous 222 .previous
224ENDPROC(memmove) 223ENDPROC(memmove)
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index bef0bc962400..5d179502a52c 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -26,6 +26,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
26export CPPFLAGS_vdso.lds += -P -C 26export CPPFLAGS_vdso.lds += -P -C
27 27
28VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ 28VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
29 -Wl,--no-undefined \
29 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 30 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
30 31
31$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so 32$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index a724905fdae7..6bc0e723b6e8 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -6,7 +6,6 @@
6 * 6 *
7 * The code should have no internal unresolved relocations. 7 * The code should have no internal unresolved relocations.
8 * Check with readelf after changing. 8 * Check with readelf after changing.
9 * Also alternative() doesn't work.
10 */ 9 */
11 10
12/* Disable profiling for userspace code: */ 11/* Disable profiling for userspace code: */
@@ -17,6 +16,7 @@
17#include <linux/time.h> 16#include <linux/time.h>
18#include <linux/string.h> 17#include <linux/string.h>
19#include <asm/vsyscall.h> 18#include <asm/vsyscall.h>
19#include <asm/fixmap.h>
20#include <asm/vgtod.h> 20#include <asm/vgtod.h>
21#include <asm/timex.h> 21#include <asm/timex.h>
22#include <asm/hpet.h> 22#include <asm/hpet.h>
@@ -25,6 +25,43 @@
25 25
26#define gtod (&VVAR(vsyscall_gtod_data)) 26#define gtod (&VVAR(vsyscall_gtod_data))
27 27
28notrace static cycle_t vread_tsc(void)
29{
30 cycle_t ret;
31 u64 last;
32
33 /*
34 * Empirically, a fence (of type that depends on the CPU)
35 * before rdtsc is enough to ensure that rdtsc is ordered
36 * with respect to loads. The various CPU manuals are unclear
37 * as to whether rdtsc can be reordered with later loads,
38 * but no one has ever seen it happen.
39 */
40 rdtsc_barrier();
41 ret = (cycle_t)vget_cycles();
42
43 last = VVAR(vsyscall_gtod_data).clock.cycle_last;
44
45 if (likely(ret >= last))
46 return ret;
47
48 /*
49 * GCC likes to generate cmov here, but this branch is extremely
50 * predictable (it's just a function of time and the likely is
51 * very likely) and there's a data dependence, so force GCC
52 * to generate a branch instead. I don't barrier() because
53 * we don't actually need a barrier, and if this function
54 * ever gets inlined it will generate worse code.
55 */
56 asm volatile ("");
57 return last;
58}
59
60static notrace cycle_t vread_hpet(void)
61{
62 return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
63}
64
28notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) 65notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
29{ 66{
30 long ret; 67 long ret;
@@ -36,9 +73,12 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
36notrace static inline long vgetns(void) 73notrace static inline long vgetns(void)
37{ 74{
38 long v; 75 long v;
39 cycles_t (*vread)(void); 76 cycles_t cycles;
40 vread = gtod->clock.vread; 77 if (gtod->clock.vclock_mode == VCLOCK_TSC)
41 v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask; 78 cycles = vread_tsc();
79 else
80 cycles = vread_hpet();
81 v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
42 return (v * gtod->clock.mult) >> gtod->clock.shift; 82 return (v * gtod->clock.mult) >> gtod->clock.shift;
43} 83}
44 84
@@ -116,21 +156,21 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
116 156
117notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) 157notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
118{ 158{
119 if (likely(gtod->sysctl_enabled)) 159 switch (clock) {
120 switch (clock) { 160 case CLOCK_REALTIME:
121 case CLOCK_REALTIME: 161 if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
122 if (likely(gtod->clock.vread)) 162 return do_realtime(ts);
123 return do_realtime(ts); 163 break;
124 break; 164 case CLOCK_MONOTONIC:
125 case CLOCK_MONOTONIC: 165 if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
126 if (likely(gtod->clock.vread)) 166 return do_monotonic(ts);
127 return do_monotonic(ts); 167 break;
128 break; 168 case CLOCK_REALTIME_COARSE:
129 case CLOCK_REALTIME_COARSE: 169 return do_realtime_coarse(ts);
130 return do_realtime_coarse(ts); 170 case CLOCK_MONOTONIC_COARSE:
131 case CLOCK_MONOTONIC_COARSE: 171 return do_monotonic_coarse(ts);
132 return do_monotonic_coarse(ts); 172 }
133 } 173
134 return vdso_fallback_gettime(clock, ts); 174 return vdso_fallback_gettime(clock, ts);
135} 175}
136int clock_gettime(clockid_t, struct timespec *) 176int clock_gettime(clockid_t, struct timespec *)
@@ -139,7 +179,7 @@ int clock_gettime(clockid_t, struct timespec *)
139notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) 179notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
140{ 180{
141 long ret; 181 long ret;
142 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { 182 if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) {
143 if (likely(tv != NULL)) { 183 if (likely(tv != NULL)) {
144 BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != 184 BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
145 offsetof(struct timespec, tv_nsec) || 185 offsetof(struct timespec, tv_nsec) ||
@@ -161,27 +201,14 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
161int gettimeofday(struct timeval *, struct timezone *) 201int gettimeofday(struct timeval *, struct timezone *)
162 __attribute__((weak, alias("__vdso_gettimeofday"))); 202 __attribute__((weak, alias("__vdso_gettimeofday")));
163 203
164/* This will break when the xtime seconds get inaccurate, but that is 204/*
165 * unlikely */ 205 * This will break when the xtime seconds get inaccurate, but that is
166 206 * unlikely
167static __always_inline long time_syscall(long *t) 207 */
168{
169 long secs;
170 asm volatile("syscall"
171 : "=a" (secs)
172 : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory");
173 return secs;
174}
175
176notrace time_t __vdso_time(time_t *t) 208notrace time_t __vdso_time(time_t *t)
177{ 209{
178 time_t result;
179
180 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
181 return time_syscall(t);
182
183 /* This is atomic on x86_64 so we don't need any locks. */ 210 /* This is atomic on x86_64 so we don't need any locks. */
184 result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec); 211 time_t result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec);
185 212
186 if (t) 213 if (t)
187 *t = result; 214 *t = result;
diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S
index 1d3aa6b87181..1b979c12ba85 100644
--- a/arch/x86/vdso/vdso.S
+++ b/arch/x86/vdso/vdso.S
@@ -1,10 +1,21 @@
1#include <asm/page_types.h>
2#include <linux/linkage.h>
1#include <linux/init.h> 3#include <linux/init.h>
2 4
3__INITDATA 5__PAGE_ALIGNED_DATA
4 6
5 .globl vdso_start, vdso_end 7 .globl vdso_start, vdso_end
8 .align PAGE_SIZE
6vdso_start: 9vdso_start:
7 .incbin "arch/x86/vdso/vdso.so" 10 .incbin "arch/x86/vdso/vdso.so"
8vdso_end: 11vdso_end:
9 12
10__FINIT 13.previous
14
15 .globl vdso_pages
16 .bss
17 .align 8
18 .type vdso_pages, @object
19vdso_pages:
20 .zero (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE * 8
21 .size vdso_pages, .-vdso_pages
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7abd2be0f9b9..316fbca3490e 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -14,41 +14,61 @@
14#include <asm/vgtod.h> 14#include <asm/vgtod.h>
15#include <asm/proto.h> 15#include <asm/proto.h>
16#include <asm/vdso.h> 16#include <asm/vdso.h>
17#include <asm/page.h>
17 18
18unsigned int __read_mostly vdso_enabled = 1; 19unsigned int __read_mostly vdso_enabled = 1;
19 20
20extern char vdso_start[], vdso_end[]; 21extern char vdso_start[], vdso_end[];
21extern unsigned short vdso_sync_cpuid; 22extern unsigned short vdso_sync_cpuid;
22 23
23static struct page **vdso_pages; 24extern struct page *vdso_pages[];
24static unsigned vdso_size; 25static unsigned vdso_size;
25 26
26static int __init init_vdso_vars(void) 27static void __init patch_vdso(void *vdso, size_t len)
28{
29 Elf64_Ehdr *hdr = vdso;
30 Elf64_Shdr *sechdrs, *alt_sec = 0;
31 char *secstrings;
32 void *alt_data;
33 int i;
34
35 BUG_ON(len < sizeof(Elf64_Ehdr));
36 BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0);
37
38 sechdrs = (void *)hdr + hdr->e_shoff;
39 secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
40
41 for (i = 1; i < hdr->e_shnum; i++) {
42 Elf64_Shdr *shdr = &sechdrs[i];
43 if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) {
44 alt_sec = shdr;
45 goto found;
46 }
47 }
48
49 /* If we get here, it's probably a bug. */
50 pr_warning("patch_vdso: .altinstructions not found\n");
51 return; /* nothing to patch */
52
53found:
54 alt_data = (void *)hdr + alt_sec->sh_offset;
55 apply_alternatives(alt_data, alt_data + alt_sec->sh_size);
56}
57
58static int __init init_vdso(void)
27{ 59{
28 int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; 60 int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
29 int i; 61 int i;
30 62
63 patch_vdso(vdso_start, vdso_end - vdso_start);
64
31 vdso_size = npages << PAGE_SHIFT; 65 vdso_size = npages << PAGE_SHIFT;
32 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); 66 for (i = 0; i < npages; i++)
33 if (!vdso_pages) 67 vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE);
34 goto oom;
35 for (i = 0; i < npages; i++) {
36 struct page *p;
37 p = alloc_page(GFP_KERNEL);
38 if (!p)
39 goto oom;
40 vdso_pages[i] = p;
41 copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
42 }
43 68
44 return 0; 69 return 0;
45
46 oom:
47 printk("Cannot allocate vdso\n");
48 vdso_enabled = 0;
49 return -ENOMEM;
50} 70}
51subsys_initcall(init_vdso_vars); 71subsys_initcall(init_vdso);
52 72
53struct linux_binprm; 73struct linux_binprm;
54 74
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 34d6a1cab8de..0833896cf6f2 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -952,7 +952,7 @@ int hpet_alloc(struct hpet_data *hdp)
952#ifdef CONFIG_IA64 952#ifdef CONFIG_IA64
953 if (!hpet_clocksource) { 953 if (!hpet_clocksource) {
954 hpet_mctr = (void __iomem *)&hpetp->hp_hpet->hpet_mc; 954 hpet_mctr = (void __iomem *)&hpetp->hp_hpet->hpet_mc;
955 CLKSRC_FSYS_MMIO_SET(clocksource_hpet.fsys_mmio, hpet_mctr); 955 clocksource_hpet.archdata.fsys_mmio = hpet_mctr;
956 clocksource_register_hz(&clocksource_hpet, hpetp->hp_tick_freq); 956 clocksource_register_hz(&clocksource_hpet, hpetp->hp_tick_freq);
957 hpetp->hp_clocksource = &clocksource_hpet; 957 hpetp->hp_clocksource = &clocksource_hpet;
958 hpet_clocksource = &clocksource_hpet; 958 hpet_clocksource = &clocksource_hpet;
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 18a1baf31f2d..139c4db55f17 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -22,6 +22,10 @@
22typedef u64 cycle_t; 22typedef u64 cycle_t;
23struct clocksource; 23struct clocksource;
24 24
25#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
26#include <asm/clocksource.h>
27#endif
28
25/** 29/**
26 * struct cyclecounter - hardware abstraction for a free running counter 30 * struct cyclecounter - hardware abstraction for a free running counter
27 * Provides completely state-free accessors to the underlying hardware. 31 * Provides completely state-free accessors to the underlying hardware.
@@ -153,7 +157,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc,
153 * @shift: cycle to nanosecond divisor (power of two) 157 * @shift: cycle to nanosecond divisor (power of two)
154 * @max_idle_ns: max idle time permitted by the clocksource (nsecs) 158 * @max_idle_ns: max idle time permitted by the clocksource (nsecs)
155 * @flags: flags describing special properties 159 * @flags: flags describing special properties
156 * @vread: vsyscall based read 160 * @archdata: arch-specific data
157 * @suspend: suspend function for the clocksource, if necessary 161 * @suspend: suspend function for the clocksource, if necessary
158 * @resume: resume function for the clocksource, if necessary 162 * @resume: resume function for the clocksource, if necessary
159 */ 163 */
@@ -169,16 +173,13 @@ struct clocksource {
169 u32 shift; 173 u32 shift;
170 u64 max_idle_ns; 174 u64 max_idle_ns;
171 175
172#ifdef CONFIG_IA64 176#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
173 void *fsys_mmio; /* used by fsyscall asm code */ 177 struct arch_clocksource_data archdata;
174#define CLKSRC_FSYS_MMIO_SET(mmio, addr) ((mmio) = (addr))
175#else
176#define CLKSRC_FSYS_MMIO_SET(mmio, addr) do { } while (0)
177#endif 178#endif
179
178 const char *name; 180 const char *name;
179 struct list_head list; 181 struct list_head list;
180 int rating; 182 int rating;
181 cycle_t (*vread)(void);
182 int (*enable)(struct clocksource *cs); 183 int (*enable)(struct clocksource *cs);
183 void (*disable)(struct clocksource *cs); 184 void (*disable)(struct clocksource *cs);
184 unsigned long flags; 185 unsigned long flags;
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 167c33361d9c..cc7a4e9cc7ad 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -19,6 +19,11 @@ static inline void secure_computing(int this_syscall)
19extern long prctl_get_seccomp(void); 19extern long prctl_get_seccomp(void);
20extern long prctl_set_seccomp(unsigned long); 20extern long prctl_set_seccomp(unsigned long);
21 21
22static inline int seccomp_mode(seccomp_t *s)
23{
24 return s->mode;
25}
26
22#else /* CONFIG_SECCOMP */ 27#else /* CONFIG_SECCOMP */
23 28
24#include <linux/errno.h> 29#include <linux/errno.h>
@@ -37,6 +42,11 @@ static inline long prctl_set_seccomp(unsigned long arg2)
37 return -EINVAL; 42 return -EINVAL;
38} 43}
39 44
45static inline int seccomp_mode(seccomp_t *s)
46{
47 return 0;
48}
49
40#endif /* CONFIG_SECCOMP */ 50#endif /* CONFIG_SECCOMP */
41 51
42#endif /* _LINUX_SECCOMP_H */ 52#endif /* _LINUX_SECCOMP_H */