about | summary | refs | log | tree | commit | diff | stats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- arch/x86/kernel/Makefile 1
-rw-r--r-- arch/x86/kernel/entry_64.S 2
-rw-r--r-- arch/x86/kernel/traps.c 6
-rw-r--r-- arch/x86/kernel/vsyscall_64.c 261
-rw-r--r-- arch/x86/kernel/vsyscall_emu_64.S 27
5 files changed, 158 insertions, 139 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 90b06d4daee2..cc0469a65120 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -44,6 +44,7 @@ obj-y += probe_roms.o
44obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 44obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
45obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 45obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
46obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o 46obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o
47obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
47obj-y += bootflag.o e820.o 48obj-y += bootflag.o e820.o
48obj-y += pci-dma.o quirks.o topology.o kdebugfs.o 49obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
49obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 50obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 72c4a777bb91..e949793d6b93 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1123,6 +1123,8 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
1123zeroentry coprocessor_error do_coprocessor_error 1123zeroentry coprocessor_error do_coprocessor_error
1124errorentry alignment_check do_alignment_check 1124errorentry alignment_check do_alignment_check
1125zeroentry simd_coprocessor_error do_simd_coprocessor_error 1125zeroentry simd_coprocessor_error do_simd_coprocessor_error
1126zeroentry emulate_vsyscall do_emulate_vsyscall
1127
1126 1128
1127 /* Reload gs selector with exception handling */ 1129 /* Reload gs selector with exception handling */
1128 /* edi: new selector */ 1130 /* edi: new selector */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b9b67166f9de..fbc097a085ca 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -872,6 +872,12 @@ void __init trap_init(void)
872 set_bit(SYSCALL_VECTOR, used_vectors); 872 set_bit(SYSCALL_VECTOR, used_vectors);
873#endif 873#endif
874 874
875#ifdef CONFIG_X86_64
876 BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors));
877 set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall);
878 set_bit(VSYSCALL_EMU_VECTOR, used_vectors);
879#endif
880
875 /* 881 /*
876 * Should be a barrier for any external CPU state: 882 * Should be a barrier for any external CPU state:
877 */ 883 */
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 70a5f6eebd6c..10cd8ac3395a 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -2,6 +2,8 @@
2 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 2 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
3 * Copyright 2003 Andi Kleen, SuSE Labs. 3 * Copyright 2003 Andi Kleen, SuSE Labs.
4 * 4 *
5 * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
6 *
5 * Thanks to hpa@transmeta.com for some useful hint. 7 * Thanks to hpa@transmeta.com for some useful hint.
6 * Special thanks to Ingo Molnar for his early experience with 8 * Special thanks to Ingo Molnar for his early experience with
7 * a different vsyscall implementation for Linux/IA32 and for the name. 9 * a different vsyscall implementation for Linux/IA32 and for the name.
@@ -11,10 +13,9 @@
11 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid 13 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
12 * jumping out of line if necessary. We cannot add more with this 14 * jumping out of line if necessary. We cannot add more with this
13 * mechanism because older kernels won't return -ENOSYS. 15 * mechanism because older kernels won't return -ENOSYS.
14 * If we want more than four we need a vDSO.
15 * 16 *
16 * Note: the concept clashes with user mode linux. If you use UML and 17 * Note: the concept clashes with user mode linux. UML users should
17 * want per guest time just set the kernel.vsyscall64 sysctl to 0. 18 * use the vDSO.
18 */ 19 */
19 20
20/* Disable profiling for userspace code: */ 21/* Disable profiling for userspace code: */
@@ -32,6 +33,8 @@
32#include <linux/cpu.h> 33#include <linux/cpu.h>
33#include <linux/smp.h> 34#include <linux/smp.h>
34#include <linux/notifier.h> 35#include <linux/notifier.h>
36#include <linux/syscalls.h>
37#include <linux/ratelimit.h>
35 38
36#include <asm/vsyscall.h> 39#include <asm/vsyscall.h>
37#include <asm/pgtable.h> 40#include <asm/pgtable.h>
@@ -44,10 +47,7 @@
44#include <asm/desc.h> 47#include <asm/desc.h>
45#include <asm/topology.h> 48#include <asm/topology.h>
46#include <asm/vgtod.h> 49#include <asm/vgtod.h>
47 50#include <asm/traps.h>
48#define __vsyscall(nr) \
49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
50#define __syscall_clobber "r11","cx","memory"
51 51
52DEFINE_VVAR(int, vgetcpu_mode); 52DEFINE_VVAR(int, vgetcpu_mode);
53DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = 53DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
@@ -71,146 +71,129 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
71 unsigned long flags; 71 unsigned long flags;
72 72
73 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); 73 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
74
74 /* copy vsyscall data */ 75 /* copy vsyscall data */
75 vsyscall_gtod_data.clock.vread = clock->vread; 76 vsyscall_gtod_data.clock.vread = clock->vread;
76 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 77 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
77 vsyscall_gtod_data.clock.mask = clock->mask; 78 vsyscall_gtod_data.clock.mask = clock->mask;
78 vsyscall_gtod_data.clock.mult = mult; 79 vsyscall_gtod_data.clock.mult = mult;
79 vsyscall_gtod_data.clock.shift = clock->shift; 80 vsyscall_gtod_data.clock.shift = clock->shift;
80 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 81 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
81 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 82 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
82 vsyscall_gtod_data.wall_to_monotonic = *wtm; 83 vsyscall_gtod_data.wall_to_monotonic = *wtm;
83 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); 84 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
85
84 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 86 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
85} 87}
86 88
87/* RED-PEN may want to readd seq locking, but then the variable should be 89static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
88 * write-once. 90 const char *message)
89 */
90static __always_inline void do_get_tz(struct timezone * tz)
91{ 91{
92 *tz = VVAR(vsyscall_gtod_data).sys_tz; 92 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
93} 93 struct task_struct *tsk;
94 94
95static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) 95 if (!show_unhandled_signals || !__ratelimit(&rs))
96{ 96 return;
97 int ret;
98 asm volatile("syscall"
99 : "=a" (ret)
100 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
101 : __syscall_clobber );
102 return ret;
103}
104 97
105static __always_inline void do_vgettimeofday(struct timeval * tv) 98 tsk = current;
106{
107 cycle_t now, base, mask, cycle_delta;
108 unsigned seq;
109 unsigned long mult, shift, nsec;
110 cycle_t (*vread)(void);
111 do {
112 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
113
114 vread = VVAR(vsyscall_gtod_data).clock.vread;
115 if (unlikely(!vread)) {
116 gettimeofday(tv,NULL);
117 return;
118 }
119
120 now = vread();
121 base = VVAR(vsyscall_gtod_data).clock.cycle_last;
122 mask = VVAR(vsyscall_gtod_data).clock.mask;
123 mult = VVAR(vsyscall_gtod_data).clock.mult;
124 shift = VVAR(vsyscall_gtod_data).clock.shift;
125
126 tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
127 nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
128 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
129
130 /* calculate interval: */
131 cycle_delta = (now - base) & mask;
132 /* convert to nsecs: */
133 nsec += (cycle_delta * mult) >> shift;
134
135 while (nsec >= NSEC_PER_SEC) {
136 tv->tv_sec += 1;
137 nsec -= NSEC_PER_SEC;
138 }
139 tv->tv_usec = nsec / NSEC_PER_USEC;
140}
141 99
142int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) 100 printk("%s%s[%d] %s ip:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
143{ 101 level, tsk->comm, task_pid_nr(tsk),
144 if (tv) 102 message, regs->ip - 2, regs->sp, regs->ax, regs->si, regs->di);
145 do_vgettimeofday(tv);
146 if (tz)
147 do_get_tz(tz);
148 return 0;
149} 103}
150 104
151/* This will break when the xtime seconds get inaccurate, but that is 105void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
152 * unlikely */
153time_t __vsyscall(1) vtime(time_t *t)
154{ 106{
155 unsigned seq; 107 const char *vsyscall_name;
156 time_t result; 108 struct task_struct *tsk;
109 unsigned long caller;
110 int vsyscall_nr;
111 long ret;
112
113 /* Kernel code must never get here. */
114 BUG_ON(!user_mode(regs));
115
116 local_irq_enable();
117
118 /*
119 * x86-ism here: regs->ip points to the instruction after the int 0xcc,
120 * and int 0xcc is two bytes long.
121 */
122 if (!is_vsyscall_entry(regs->ip - 2)) {
123 warn_bad_vsyscall(KERN_WARNING, regs, "illegal int 0xcc (exploit attempt?)");
124 goto sigsegv;
125 }
126 vsyscall_nr = vsyscall_entry_nr(regs->ip - 2);
157 127
158 do { 128 if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
159 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); 129 warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)");
130 goto sigsegv;
131 }
160 132
161 result = VVAR(vsyscall_gtod_data).wall_time_sec; 133 tsk = current;
134 if (seccomp_mode(&tsk->seccomp))
135 do_exit(SIGKILL);
136
137 switch (vsyscall_nr) {
138 case 0:
139 vsyscall_name = "gettimeofday";
140 ret = sys_gettimeofday(
141 (struct timeval __user *)regs->di,
142 (struct timezone __user *)regs->si);
143 break;
144
145 case 1:
146 vsyscall_name = "time";
147 ret = sys_time((time_t __user *)regs->di);
148 break;
149
150 case 2:
151 vsyscall_name = "getcpu";
152 ret = sys_getcpu((unsigned __user *)regs->di,
153 (unsigned __user *)regs->si,
154 0);
155 break;
156
157 default:
158 /*
159 * If we get here, then vsyscall_nr indicates that int 0xcc
160 * happened at an address in the vsyscall page that doesn't
161 * contain int 0xcc. That can't happen.
162 */
163 BUG();
164 }
162 165
163 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); 166 if (ret == -EFAULT) {
167 /*
168 * Bad news -- userspace fed a bad pointer to a vsyscall.
169 *
170 * With a real vsyscall, that would have caused SIGSEGV.
171 * To make writing reliable exploits using the emulated
172 * vsyscalls harder, generate SIGSEGV here as well.
173 */
174 warn_bad_vsyscall(KERN_INFO, regs,
175 "vsyscall fault (exploit attempt?)");
176 goto sigsegv;
177 }
164 178
165 if (t) 179 regs->ax = ret;
166 *t = result;
167 return result;
168}
169 180
170/* Fast way to get current CPU and node. 181 /* Emulate a ret instruction. */
171 This helps to do per node and per CPU caches in user space. 182 regs->ip = caller;
172 The result is not guaranteed without CPU affinity, but usually 183 regs->sp += 8;
173 works out because the scheduler tries to keep a thread on the same
174 CPU.
175 184
176 tcache must point to a two element sized long array. 185 local_irq_disable();
177 All arguments can be NULL. */ 186 return;
178long __vsyscall(2) 187
179vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) 188sigsegv:
180{ 189 regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */
181 unsigned int p; 190 force_sig(SIGSEGV, current);
182 unsigned long j = 0;
183
184 /* Fast cache - only recompute value once per jiffies and avoid
185 relatively costly rdtscp/cpuid otherwise.
186 This works because the scheduler usually keeps the process
187 on the same CPU and this syscall doesn't guarantee its
188 results anyways.
189 We do this here because otherwise user space would do it on
190 its own in a likely inferior way (no access to jiffies).
191 If you don't like it pass NULL. */
192 if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) {
193 p = tcache->blob[1];
194 } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
195 /* Load per CPU data from RDTSCP */
196 native_read_tscp(&p);
197 } else {
198 /* Load per CPU data from GDT */
199 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
200 }
201 if (tcache) {
202 tcache->blob[0] = j;
203 tcache->blob[1] = p;
204 }
205 if (cpu)
206 *cpu = p & 0xfff;
207 if (node)
208 *node = p >> 12;
209 return 0;
210} 191}
211 192
212/* Assume __initcall executes before all user space. Hopefully kmod 193/*
213 doesn't violate that. We'll find out if it does. */ 194 * Assume __initcall executes before all user space. Hopefully kmod
195 * doesn't violate that. We'll find out if it does.
196 */
214static void __cpuinit vsyscall_set_cpu(int cpu) 197static void __cpuinit vsyscall_set_cpu(int cpu)
215{ 198{
216 unsigned long d; 199 unsigned long d;
@@ -221,13 +204,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
221 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) 204 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
222 write_rdtscp_aux((node << 12) | cpu); 205 write_rdtscp_aux((node << 12) | cpu);
223 206
224 /* Store cpu number in limit so that it can be loaded quickly 207 /*
225 in user space in vgetcpu. 208 * Store cpu number in limit so that it can be loaded quickly
226 12 bits for the CPU and 8 bits for the node. */ 209 * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
210 */
227 d = 0x0f40000000000ULL; 211 d = 0x0f40000000000ULL;
228 d |= cpu; 212 d |= cpu;
229 d |= (node & 0xf) << 12; 213 d |= (node & 0xf) << 12;
230 d |= (node >> 4) << 48; 214 d |= (node >> 4) << 48;
215
231 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); 216 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
232} 217}
233 218
@@ -241,8 +226,10 @@ static int __cpuinit
241cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) 226cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
242{ 227{
243 long cpu = (long)arg; 228 long cpu = (long)arg;
229
244 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 230 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
245 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); 231 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
232
246 return NOTIFY_DONE; 233 return NOTIFY_DONE;
247} 234}
248 235
@@ -256,21 +243,17 @@ void __init map_vsyscall(void)
256 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ 243 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
257 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); 244 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
258 __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); 245 __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
259 BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != 246 BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS);
260 (unsigned long)VVAR_ADDRESS);
261} 247}
262 248
263static int __init vsyscall_init(void) 249static int __init vsyscall_init(void)
264{ 250{
265 BUG_ON(((unsigned long) &vgettimeofday != 251 BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
266 VSYSCALL_ADDR(__NR_vgettimeofday))); 252
267 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
268 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
269 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
270 on_each_cpu(cpu_vsyscall_init, NULL, 1); 253 on_each_cpu(cpu_vsyscall_init, NULL, 1);
271 /* notifier priority > KVM */ 254 /* notifier priority > KVM */
272 hotcpu_notifier(cpu_vsyscall_notifier, 30); 255 hotcpu_notifier(cpu_vsyscall_notifier, 30);
256
273 return 0; 257 return 0;
274} 258}
275
276__initcall(vsyscall_init); 259__initcall(vsyscall_init);
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
new file mode 100644
index 000000000000..ffa845eae5ca
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -0,0 +1,27 @@
1/*
2 * vsyscall_emu_64.S: Vsyscall emulation page
3 *
4 * Copyright (c) 2011 Andy Lutomirski
5 *
6 * Subject to the GNU General Public License, version 2
7 */
8
9#include <linux/linkage.h>
10#include <asm/irq_vectors.h>
11
12/* The unused parts of the page are filled with 0xcc by the linker script. */
13
14.section .vsyscall_0, "a"
15ENTRY(vsyscall_0)
16 int $VSYSCALL_EMU_VECTOR
17END(vsyscall_0)
18
19.section .vsyscall_1, "a"
20ENTRY(vsyscall_1)
21 int $VSYSCALL_EMU_VECTOR
22END(vsyscall_1)
23
24.section .vsyscall_2, "a"
25ENTRY(vsyscall_2)
26 int $VSYSCALL_EMU_VECTOR
27END(vsyscall_2)