aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndy Lutomirski <luto@amacapital.net>2012-10-01 14:40:45 -0400
committerJames Morris <james.l.morris@oracle.com>2012-10-02 07:14:29 -0400
commit87b526d349b04c31d7b3a40b434eb3f825d22305 (patch)
tree2aeec0465901c9623ef7f5b3eb451ea6ccce6ecc
parentbf5308344527d015ac9a6d2bda4ad4d40fd7d943 (diff)
seccomp: Make syscall skipping and nr changes more consistent
This fixes two issues that could cause incompatibility between kernel versions: - If a tracer uses SECCOMP_RET_TRACE to select a syscall number higher than the largest known syscall, emulate the unknown vsyscall by returning -ENOSYS. (This is unlikely to make a noticeable difference on x86-64 due to the way the system call entry works.) - On x86-64 with vsyscall=emulate, skipped vsyscalls were buggy. This updates the documentation accordingly. Signed-off-by: Andy Lutomirski <luto@amacapital.net> Acked-by: Will Drewry <wad@chromium.org> Signed-off-by: James Morris <james.l.morris@oracle.com>
-rw-r--r--Documentation/prctl/seccomp_filter.txt74
-rw-r--r--arch/x86/kernel/vsyscall_64.c110
-rw-r--r--kernel/seccomp.c13
3 files changed, 137 insertions, 60 deletions
diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt
index 597c3c581375..1e469ef75778 100644
--- a/Documentation/prctl/seccomp_filter.txt
+++ b/Documentation/prctl/seccomp_filter.txt
@@ -95,12 +95,15 @@ SECCOMP_RET_KILL:
95 95
96SECCOMP_RET_TRAP: 96SECCOMP_RET_TRAP:
97 Results in the kernel sending a SIGSYS signal to the triggering 97 Results in the kernel sending a SIGSYS signal to the triggering
98 task without executing the system call. The kernel will 98 task without executing the system call. siginfo->si_call_addr
99 rollback the register state to just before the system call 99 will show the address of the system call instruction, and
100 entry such that a signal handler in the task will be able to 100 siginfo->si_syscall and siginfo->si_arch will indicate which
101 inspect the ucontext_t->uc_mcontext registers and emulate 101 syscall was attempted. The program counter will be as though
102 system call success or failure upon return from the signal 102 the syscall happened (i.e. it will not point to the syscall
103 handler. 103 instruction). The return value register will contain an arch-
104 dependent value -- if resuming execution, set it to something
105 sensible. (The architecture dependency is because replacing
106 it with -ENOSYS could overwrite some useful information.)
104 107
105 The SECCOMP_RET_DATA portion of the return value will be passed 108 The SECCOMP_RET_DATA portion of the return value will be passed
106 as si_errno. 109 as si_errno.
@@ -123,6 +126,18 @@ SECCOMP_RET_TRACE:
123 the BPF program return value will be available to the tracer 126 the BPF program return value will be available to the tracer
124 via PTRACE_GETEVENTMSG. 127 via PTRACE_GETEVENTMSG.
125 128
129 The tracer can skip the system call by changing the syscall number
130 to -1. Alternatively, the tracer can change the system call
131 requested by changing the syscall number to that of a valid syscall. If
132 the tracer asks to skip the system call, then the system call will
133 appear to return the value that the tracer puts in the return value
134 register.
135
136 The seccomp check will not be run again after the tracer is
137 notified. (This means that seccomp-based sandboxes MUST NOT
138 allow use of ptrace, even of other sandboxed processes, without
139 extreme care; ptracers can use this mechanism to escape.)
140
126SECCOMP_RET_ALLOW: 141SECCOMP_RET_ALLOW:
127 Results in the system call being executed. 142 Results in the system call being executed.
128 143
@@ -161,3 +176,50 @@ architecture supports both ptrace_event and seccomp, it will be able to
161support seccomp filter with minor fixup: SIGSYS support and seccomp return 176support seccomp filter with minor fixup: SIGSYS support and seccomp return
162value checking. Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER 177value checking. Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER
163to its arch-specific Kconfig. 178to its arch-specific Kconfig.
179
180
181
182Caveats
183-------
184
185The vDSO can cause some system calls to run entirely in userspace,
186leading to surprises when you run programs on different machines that
187fall back to real syscalls. To minimize these surprises on x86, make
188sure you test with
189/sys/devices/system/clocksource/clocksource0/current_clocksource set to
190something like acpi_pm.
191
192On x86-64, vsyscall emulation is enabled by default. (vsyscalls are
193legacy variants on vDSO calls.) Currently, emulated vsyscalls will honor seccomp, with a few oddities:
194
195- A return value of SECCOMP_RET_TRAP will set a si_call_addr pointing to
196 the vsyscall entry for the given call and not the address after the
197 'syscall' instruction. Any code which wants to restart the call
198 should be aware that (a) a ret instruction has been emulated and (b)
199 trying to resume the syscall will again trigger the standard vsyscall
200 emulation security checks, making resuming the syscall mostly
201 pointless.
202
203- A return value of SECCOMP_RET_TRACE will signal the tracer as usual,
204 but the syscall may not be changed to another system call using the
205 orig_rax register. It may only be changed to -1 in order to skip the
206 currently emulated call. Any other change MAY terminate the process.
207 The rip value seen by the tracer will be the syscall entry address;
208 this is different from normal behavior. The tracer MUST NOT modify
209 rip or rsp. (Do not rely on other changes terminating the process.
210 They might work. For example, on some kernels, choosing a syscall
211 that only exists in future kernels will be correctly emulated (by
212 returning -ENOSYS).)
213
214To detect this quirky behavior, check for (addr & ~0x0C00) ==
2150xFFFFFFFFFF600000. (For SECCOMP_RET_TRACE, use rip. For
216SECCOMP_RET_TRAP, use siginfo->si_call_addr.) Do not check any other
217condition: future kernels may improve vsyscall emulation and current
218kernels in vsyscall=native mode will behave differently, but the
219instructions at 0xF...F600{0,4,8,C}00 will not be system calls in these
220cases.
221
222Note that modern systems are unlikely to use vsyscalls at all -- they
223are a legacy feature and they are considerably slower than standard
224syscalls. New code will use the vDSO, and vDSO-issued system calls
225are indistinguishable from normal system calls.
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8d141b309046..b2e58a248b3b 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -136,19 +136,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)
136 return nr; 136 return nr;
137} 137}
138 138
139#ifdef CONFIG_SECCOMP
140static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
141{
142 if (!seccomp_mode(&tsk->seccomp))
143 return 0;
144 task_pt_regs(tsk)->orig_ax = syscall_nr;
145 task_pt_regs(tsk)->ax = syscall_nr;
146 return __secure_computing(syscall_nr);
147}
148#else
149#define vsyscall_seccomp(_tsk, _nr) 0
150#endif
151
152static bool write_ok_or_segv(unsigned long ptr, size_t size) 139static bool write_ok_or_segv(unsigned long ptr, size_t size)
153{ 140{
154 /* 141 /*
@@ -181,10 +168,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
181{ 168{
182 struct task_struct *tsk; 169 struct task_struct *tsk;
183 unsigned long caller; 170 unsigned long caller;
184 int vsyscall_nr; 171 int vsyscall_nr, syscall_nr, tmp;
185 int prev_sig_on_uaccess_error; 172 int prev_sig_on_uaccess_error;
186 long ret; 173 long ret;
187 int skip;
188 174
189 /* 175 /*
190 * No point in checking CS -- the only way to get here is a user mode 176 * No point in checking CS -- the only way to get here is a user mode
@@ -216,56 +202,84 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
216 } 202 }
217 203
218 tsk = current; 204 tsk = current;
219 /*
220 * With a real vsyscall, page faults cause SIGSEGV. We want to
221 * preserve that behavior to make writing exploits harder.
222 */
223 prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
224 current_thread_info()->sig_on_uaccess_error = 1;
225 205
226 /* 206 /*
207 * Check for access_ok violations and find the syscall nr.
208 *
227 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and 209 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
228 * 64-bit, so we don't need to special-case it here. For all the 210 * 64-bit, so we don't need to special-case it here. For all the
229 * vsyscalls, NULL means "don't write anything" not "write it at 211 * vsyscalls, NULL means "don't write anything" not "write it at
230 * address 0". 212 * address 0".
231 */ 213 */
232 ret = -EFAULT;
233 skip = 0;
234 switch (vsyscall_nr) { 214 switch (vsyscall_nr) {
235 case 0: 215 case 0:
236 skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
237 if (skip)
238 break;
239
240 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || 216 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
241 !write_ok_or_segv(regs->si, sizeof(struct timezone))) 217 !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
242 break; 218 ret = -EFAULT;
219 goto check_fault;
220 }
221
222 syscall_nr = __NR_gettimeofday;
223 break;
224
225 case 1:
226 if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
227 ret = -EFAULT;
228 goto check_fault;
229 }
230
231 syscall_nr = __NR_time;
232 break;
233
234 case 2:
235 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
236 !write_ok_or_segv(regs->si, sizeof(unsigned))) {
237 ret = -EFAULT;
238 goto check_fault;
239 }
240
241 syscall_nr = __NR_getcpu;
242 break;
243 }
244
245 /*
246 * Handle seccomp. regs->ip must be the original value.
247 * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
248 *
249 * We could optimize the seccomp disabled case, but performance
250 * here doesn't matter.
251 */
252 regs->orig_ax = syscall_nr;
253 regs->ax = -ENOSYS;
254 tmp = secure_computing(syscall_nr);
255 if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
256 warn_bad_vsyscall(KERN_DEBUG, regs,
257 "seccomp tried to change syscall nr or ip");
258 do_exit(SIGSYS);
259 }
260 if (tmp)
261 goto do_ret; /* skip requested */
243 262
263 /*
264 * With a real vsyscall, page faults cause SIGSEGV. We want to
265 * preserve that behavior to make writing exploits harder.
266 */
267 prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
268 current_thread_info()->sig_on_uaccess_error = 1;
269
270 ret = -EFAULT;
271 switch (vsyscall_nr) {
272 case 0:
244 ret = sys_gettimeofday( 273 ret = sys_gettimeofday(
245 (struct timeval __user *)regs->di, 274 (struct timeval __user *)regs->di,
246 (struct timezone __user *)regs->si); 275 (struct timezone __user *)regs->si);
247 break; 276 break;
248 277
249 case 1: 278 case 1:
250 skip = vsyscall_seccomp(tsk, __NR_time);
251 if (skip)
252 break;
253
254 if (!write_ok_or_segv(regs->di, sizeof(time_t)))
255 break;
256
257 ret = sys_time((time_t __user *)regs->di); 279 ret = sys_time((time_t __user *)regs->di);
258 break; 280 break;
259 281
260 case 2: 282 case 2:
261 skip = vsyscall_seccomp(tsk, __NR_getcpu);
262 if (skip)
263 break;
264
265 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
266 !write_ok_or_segv(regs->si, sizeof(unsigned)))
267 break;
268
269 ret = sys_getcpu((unsigned __user *)regs->di, 283 ret = sys_getcpu((unsigned __user *)regs->di,
270 (unsigned __user *)regs->si, 284 (unsigned __user *)regs->si,
271 NULL); 285 NULL);
@@ -274,12 +288,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
274 288
275 current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; 289 current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
276 290
277 if (skip) { 291check_fault:
278 if ((long)regs->ax <= 0L) /* seccomp errno emulation */
279 goto do_ret;
280 goto done; /* seccomp trace/trap */
281 }
282
283 if (ret == -EFAULT) { 292 if (ret == -EFAULT) {
284 /* Bad news -- userspace fed a bad pointer to a vsyscall. */ 293 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
285 warn_bad_vsyscall(KERN_INFO, regs, 294 warn_bad_vsyscall(KERN_INFO, regs,
@@ -302,7 +311,6 @@ do_ret:
302 /* Emulate a ret instruction. */ 311 /* Emulate a ret instruction. */
303 regs->ip = caller; 312 regs->ip = caller;
304 regs->sp += 8; 313 regs->sp += 8;
305done:
306 return true; 314 return true;
307 315
308sigsegv: 316sigsegv:
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ee376beedaf9..5af44b593770 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -396,25 +396,29 @@ int __secure_computing(int this_syscall)
396#ifdef CONFIG_SECCOMP_FILTER 396#ifdef CONFIG_SECCOMP_FILTER
397 case SECCOMP_MODE_FILTER: { 397 case SECCOMP_MODE_FILTER: {
398 int data; 398 int data;
399 struct pt_regs *regs = task_pt_regs(current);
399 ret = seccomp_run_filters(this_syscall); 400 ret = seccomp_run_filters(this_syscall);
400 data = ret & SECCOMP_RET_DATA; 401 data = ret & SECCOMP_RET_DATA;
401 ret &= SECCOMP_RET_ACTION; 402 ret &= SECCOMP_RET_ACTION;
402 switch (ret) { 403 switch (ret) {
403 case SECCOMP_RET_ERRNO: 404 case SECCOMP_RET_ERRNO:
404 /* Set the low-order 16-bits as a errno. */ 405 /* Set the low-order 16-bits as a errno. */
405 syscall_set_return_value(current, task_pt_regs(current), 406 syscall_set_return_value(current, regs,
406 -data, 0); 407 -data, 0);
407 goto skip; 408 goto skip;
408 case SECCOMP_RET_TRAP: 409 case SECCOMP_RET_TRAP:
409 /* Show the handler the original registers. */ 410 /* Show the handler the original registers. */
410 syscall_rollback(current, task_pt_regs(current)); 411 syscall_rollback(current, regs);
411 /* Let the filter pass back 16 bits of data. */ 412 /* Let the filter pass back 16 bits of data. */
412 seccomp_send_sigsys(this_syscall, data); 413 seccomp_send_sigsys(this_syscall, data);
413 goto skip; 414 goto skip;
414 case SECCOMP_RET_TRACE: 415 case SECCOMP_RET_TRACE:
415 /* Skip these calls if there is no tracer. */ 416 /* Skip these calls if there is no tracer. */
416 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) 417 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
418 syscall_set_return_value(current, regs,
419 -ENOSYS, 0);
417 goto skip; 420 goto skip;
421 }
418 /* Allow the BPF to provide the event message */ 422 /* Allow the BPF to provide the event message */
419 ptrace_event(PTRACE_EVENT_SECCOMP, data); 423 ptrace_event(PTRACE_EVENT_SECCOMP, data);
420 /* 424 /*
@@ -425,6 +429,9 @@ int __secure_computing(int this_syscall)
425 */ 429 */
426 if (fatal_signal_pending(current)) 430 if (fatal_signal_pending(current))
427 break; 431 break;
432 if (syscall_get_nr(current, regs) < 0)
433 goto skip; /* Explicit request to skip. */
434
428 return 0; 435 return 0;
429 case SECCOMP_RET_ALLOW: 436 case SECCOMP_RET_ALLOW:
430 return 0; 437 return 0;