author		Andy Lutomirski <luto@amacapital.net>	2012-10-01 14:40:45 -0400
committer	James Morris <james.l.morris@oracle.com>	2012-10-02 07:14:29 -0400
commit		87b526d349b04c31d7b3a40b434eb3f825d22305 (patch)
tree		2aeec0465901c9623ef7f5b3eb451ea6ccce6ecc /arch/x86/kernel
parent		bf5308344527d015ac9a6d2bda4ad4d40fd7d943 (diff)
seccomp: Make syscall skipping and nr changes more consistent
This fixes two issues that could cause incompatibility between kernel versions:

 - If a tracer uses SECCOMP_RET_TRACE to select a syscall number higher
   than the largest known syscall, emulate the unknown vsyscall by
   returning -ENOSYS.  (This is unlikely to make a noticeable difference
   on x86-64 due to the way the system call entry works.)

 - On x86-64 with vsyscall=emulate, skipped vsyscalls were buggy.

This updates the documentation accordingly.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: Will Drewry <wad@chromium.org>
Signed-off-by: James Morris <james.l.morris@oracle.com>
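For readers unfamiliar with the filter-return values the message refers to: a seccomp-bpf program can return SECCOMP_RET_TRACE, which hands the syscall to an attached ptrace tracer that may rewrite the syscall number (including to a value above the largest known syscall, the case the first fix covers). The following is a minimal userspace sketch of installing such a filter; it is illustrative context only, not part of the patch, and assumes a tracer attached with PTRACE_O_TRACESECCOMP. A production filter would also validate seccomp_data->arch before trusting the syscall number.

/* Illustrative only (not from the patch): install a seccomp filter that
 * defers gettimeofday to a ptrace tracer via SECCOMP_RET_TRACE and
 * allows everything else.  A real filter must also check
 * seccomp_data->arch before interpreting nr. */
#include <stddef.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter filter[] = {
		/* Load the syscall number from seccomp_data. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* Defer gettimeofday to the tracer; allow everything else. */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettimeofday, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	/* Required so an unprivileged process may install a filter. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		perror("PR_SET_NO_NEW_PRIVS");
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		perror("PR_SET_SECCOMP");
	return 0;
}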
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--	arch/x86/kernel/vsyscall_64.c | 110
1 file changed, 59 insertions(+), 51 deletions(-)
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8d141b309046..b2e58a248b3b 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -136,19 +136,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)
 	return nr;
 }
 
-#ifdef CONFIG_SECCOMP
-static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
-{
-	if (!seccomp_mode(&tsk->seccomp))
-		return 0;
-	task_pt_regs(tsk)->orig_ax = syscall_nr;
-	task_pt_regs(tsk)->ax = syscall_nr;
-	return __secure_computing(syscall_nr);
-}
-#else
-#define vsyscall_seccomp(_tsk, _nr) 0
-#endif
-
 static bool write_ok_or_segv(unsigned long ptr, size_t size)
 {
 	/*
@@ -181,10 +168,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 {
 	struct task_struct *tsk;
 	unsigned long caller;
-	int vsyscall_nr;
+	int vsyscall_nr, syscall_nr, tmp;
 	int prev_sig_on_uaccess_error;
 	long ret;
-	int skip;
 
 	/*
 	 * No point in checking CS -- the only way to get here is a user mode
@@ -216,56 +202,84 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	}
 
 	tsk = current;
-	/*
-	 * With a real vsyscall, page faults cause SIGSEGV.  We want to
-	 * preserve that behavior to make writing exploits harder.
-	 */
-	prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
-	current_thread_info()->sig_on_uaccess_error = 1;
 
 	/*
+	 * Check for access_ok violations and find the syscall nr.
+	 *
 	 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
 	 * 64-bit, so we don't need to special-case it here.  For all the
 	 * vsyscalls, NULL means "don't write anything" not "write it at
 	 * address 0".
 	 */
-	ret = -EFAULT;
-	skip = 0;
 	switch (vsyscall_nr) {
 	case 0:
-		skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
-		if (skip)
-			break;
-
 		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
-		    !write_ok_or_segv(regs->si, sizeof(struct timezone)))
-			break;
+		    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
+			ret = -EFAULT;
+			goto check_fault;
+		}
+
+		syscall_nr = __NR_gettimeofday;
+		break;
+
+	case 1:
+		if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
+			ret = -EFAULT;
+			goto check_fault;
+		}
+
+		syscall_nr = __NR_time;
+		break;
+
+	case 2:
+		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+		    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
+			ret = -EFAULT;
+			goto check_fault;
+		}
+
+		syscall_nr = __NR_getcpu;
+		break;
+	}
+
+	/*
+	 * Handle seccomp.  regs->ip must be the original value.
+	 * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
+	 *
+	 * We could optimize the seccomp disabled case, but performance
+	 * here doesn't matter.
+	 */
+	regs->orig_ax = syscall_nr;
+	regs->ax = -ENOSYS;
+	tmp = secure_computing(syscall_nr);
+	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
+		warn_bad_vsyscall(KERN_DEBUG, regs,
+				  "seccomp tried to change syscall nr or ip");
+		do_exit(SIGSYS);
+	}
+	if (tmp)
+		goto do_ret;  /* skip requested */
 
+	/*
+	 * With a real vsyscall, page faults cause SIGSEGV.  We want to
+	 * preserve that behavior to make writing exploits harder.
+	 */
+	prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+	current_thread_info()->sig_on_uaccess_error = 1;
+
+	ret = -EFAULT;
+	switch (vsyscall_nr) {
+	case 0:
 		ret = sys_gettimeofday(
 			(struct timeval __user *)regs->di,
 			(struct timezone __user *)regs->si);
 		break;
 
 	case 1:
-		skip = vsyscall_seccomp(tsk, __NR_time);
-		if (skip)
-			break;
-
-		if (!write_ok_or_segv(regs->di, sizeof(time_t)))
-			break;
-
 		ret = sys_time((time_t __user *)regs->di);
 		break;
 
 	case 2:
-		skip = vsyscall_seccomp(tsk, __NR_getcpu);
-		if (skip)
-			break;
-
-		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
-		    !write_ok_or_segv(regs->si, sizeof(unsigned)))
-			break;
-
 		ret = sys_getcpu((unsigned __user *)regs->di,
 				 (unsigned __user *)regs->si,
 				 NULL);
@@ -274,12 +288,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
 	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
 
-	if (skip) {
-		if ((long)regs->ax <= 0L) /* seccomp errno emulation */
-			goto do_ret;
-		goto done;  /* seccomp trace/trap */
-	}
-
+check_fault:
 	if (ret == -EFAULT) {
 		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
 		warn_bad_vsyscall(KERN_INFO, regs,
@@ -302,7 +311,6 @@ do_ret:
 	/* Emulate a ret instruction. */
 	regs->ip = caller;
 	regs->sp += 8;
-done:
 	return true;
 
 sigsegv:
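For the second fix (skipped vsyscalls under vsyscall=emulate), the tracer-side convention that this patch makes well-defined is: at a PTRACE_EVENT_SECCOMP stop, set the syscall number to -1 to skip the call and place the desired return value in ax, which the patched emulate_vsyscall() then honors via the do_ret path. A hedged, x86-64-only sketch follows; the helper name and error handling are illustrative, not from the patch.

/* Illustrative tracer-side helper (not from the patch): skip the
 * trapped (v)syscall and make it return 'retval' in the tracee.
 * Assumes pid is stopped at a PTRACE_EVENT_SECCOMP stop on x86-64. */
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

static void skip_trapped_syscall(pid_t pid, long retval)
{
	struct user_regs_struct regs;

	if (ptrace(PTRACE_GETREGS, pid, 0, &regs) != 0)
		return;
	regs.orig_rax = -1;	/* nr = -1 means "skip the syscall" */
	regs.rax = retval;	/* value the tracee observes on return */
	ptrace(PTRACE_SETREGS, pid, 0, &regs);
	ptrace(PTRACE_CONT, pid, 0, 0);
}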