author     Andy Lutomirski <luto@amacapital.net>       2012-10-01 14:40:45 -0400
committer  James Morris <james.l.morris@oracle.com>    2012-10-02 07:14:29 -0400
commit     87b526d349b04c31d7b3a40b434eb3f825d22305
tree       2aeec0465901c9623ef7f5b3eb451ea6ccce6ecc
parent     bf5308344527d015ac9a6d2bda4ad4d40fd7d943
seccomp: Make syscall skipping and nr changes more consistent
This fixes two issues that could cause incompatibility between
kernel versions:
- If a tracer uses SECCOMP_RET_TRACE to select a syscall number
higher than the largest known syscall, emulate the unknown
vsyscall by returning -ENOSYS. (This is unlikely to make a
noticeable difference on x86-64 due to the way the system call
entry works.)
- On x86-64 with vsyscall=emulate, skipped vsyscalls were buggy.
This updates the documentation accordingly.
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: Will Drewry <wad@chromium.org>
Signed-off-by: James Morris <james.l.morris@oracle.com>
 Documentation/prctl/seccomp_filter.txt |  74 +-
 arch/x86/kernel/vsyscall_64.c          | 110 +-
 kernel/seccomp.c                       |  13 +-
 3 files changed, 137 insertions(+), 60 deletions(-)
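
Note (not part of the patch): as context for the skipping behavior described in the
commit message and the documentation below, here is a minimal, hypothetical sketch
of the tracer side on x86-64. At a PTRACE_EVENT_SECCOMP stop, a tracer can ask the
kernel to skip the traced call by setting the syscall number to -1 and choosing the
value the skipped call should appear to return. The helper name and error handling
are illustrative only.

/*
 * Hypothetical tracer-side sketch (x86-64): skip the child's pending
 * syscall and make it appear to return fake_return.
 */
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <stddef.h>

static int skip_syscall(pid_t child, long fake_return)
{
        struct user_regs_struct regs;

        if (ptrace(PTRACE_GETREGS, child, NULL, &regs) != 0)
                return -1;

        regs.orig_rax = -1;             /* -1 asks the kernel to skip the call */
        regs.rax = fake_return;         /* value the skipped call appears to return */

        return ptrace(PTRACE_SETREGS, child, NULL, &regs) ? -1 : 0;
}

Treat this as a sketch of the documented "-1 means skip" convention, not a drop-in
tracer; a real tracer also has to attach with PTRACE_O_TRACESECCOMP and handle the
event loop.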
diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt
index 597c3c581375..1e469ef75778 100644
--- a/Documentation/prctl/seccomp_filter.txt
+++ b/Documentation/prctl/seccomp_filter.txt
@@ -95,12 +95,15 @@ SECCOMP_RET_KILL:
 
 SECCOMP_RET_TRAP:
         Results in the kernel sending a SIGSYS signal to the triggering
-        task without executing the system call.  The kernel will
-        rollback the register state to just before the system call
-        entry such that a signal handler in the task will be able to
-        inspect the ucontext_t->uc_mcontext registers and emulate
-        system call success or failure upon return from the signal
-        handler.
+        task without executing the system call.  siginfo->si_call_addr
+        will show the address of the system call instruction, and
+        siginfo->si_syscall and siginfo->si_arch will indicate which
+        syscall was attempted.  The program counter will be as though
+        the syscall happened (i.e. it will not point to the syscall
+        instruction).  The return value register will contain an arch-
+        dependent value -- if resuming execution, set it to something
+        sensible.  (The architecture dependency is because replacing
+        it with -ENOSYS could overwrite some useful information.)
 
         The SECCOMP_RET_DATA portion of the return value will be passed
         as si_errno.
@@ -123,6 +126,18 @@ SECCOMP_RET_TRACE:
         the BPF program return value will be available to the tracer
         via PTRACE_GETEVENTMSG.
 
+        The tracer can skip the system call by changing the syscall number
+        to -1.  Alternatively, the tracer can change the system call
+        requested by changing the system call to a valid syscall number.  If
+        the tracer asks to skip the system call, then the system call will
+        appear to return the value that the tracer puts in the return value
+        register.
+
+        The seccomp check will not be run again after the tracer is
+        notified.  (This means that seccomp-based sandboxes MUST NOT
+        allow use of ptrace, even of other sandboxed processes, without
+        extreme care; ptracers can use this mechanism to escape.)
+
 SECCOMP_RET_ALLOW:
         Results in the system call being executed.
 
@@ -161,3 +176,50 @@ architecture supports both ptrace_event and seccomp, it will be able to
 support seccomp filter with minor fixup: SIGSYS support and seccomp return
 value checking.  Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER
 to its arch-specific Kconfig.
+
+
+
+Caveats
+-------
+
+The vDSO can cause some system calls to run entirely in userspace,
+leading to surprises when you run programs on different machines that
+fall back to real syscalls.  To minimize these surprises on x86, make
+sure you test with
+/sys/devices/system/clocksource/clocksource0/current_clocksource set to
+something like acpi_pm.
+
+On x86-64, vsyscall emulation is enabled by default.  (vsyscalls are
+legacy variants on vDSO calls.)  Currently, emulated vsyscalls will honor seccomp, with a few oddities:
+
+- A return value of SECCOMP_RET_TRAP will set a si_call_addr pointing to
+  the vsyscall entry for the given call and not the address after the
+  'syscall' instruction.  Any code which wants to restart the call
+  should be aware that (a) a ret instruction has been emulated and (b)
+  trying to resume the syscall will again trigger the standard vsyscall
+  emulation security checks, making resuming the syscall mostly
+  pointless.
+
+- A return value of SECCOMP_RET_TRACE will signal the tracer as usual,
+  but the syscall may not be changed to another system call using the
+  orig_rax register.  It may only be changed to -1 in order to skip the
+  currently emulated call.  Any other change MAY terminate the process.
+  The rip value seen by the tracer will be the syscall entry address;
+  this is different from normal behavior.  The tracer MUST NOT modify
+  rip or rsp.  (Do not rely on other changes terminating the process.
+  They might work.  For example, on some kernels, choosing a syscall
+  that only exists in future kernels will be correctly emulated (by
+  returning -ENOSYS).)
+
+To detect this quirky behavior, check for addr & ~0x0C00 ==
+0xFFFFFFFFFF600000.  (For SECCOMP_RET_TRACE, use rip.  For
+SECCOMP_RET_TRAP, use siginfo->si_call_addr.)  Do not check any other
+condition: future kernels may improve vsyscall emulation and current
+kernels in vsyscall=native mode will behave differently, but the
+instructions at 0xF...F600{0,4,8,C}00 will not be system calls in these
+cases.
+
+Note that modern systems are unlikely to use vsyscalls at all -- they
+are a legacy feature and they are considerably slower than standard
+syscalls.  New code will use the vDSO, and vDSO-issued system calls
+are indistinguishable from normal system calls.
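
Note (not part of the patch): to make the SECCOMP_RET_TRAP wording above concrete,
here is a hedged userspace sketch of a SIGSYS handler that reads the documented
siginfo fields and emulates a successful return. It assumes x86-64 and that the C
library exposes si_syscall/si_call_addr/si_arch and REG_RAX (recent glibc with
_GNU_SOURCE); on older libcs the raw siginfo layout may have to be used instead.

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <ucontext.h>

/* Sketch of a SIGSYS handler for SECCOMP_RET_TRAP (x86-64 assumed). */
static void sigsys_handler(int sig, siginfo_t *info, void *void_ctx)
{
        ucontext_t *ctx = void_ctx;

        /* Fields documented above; fprintf is not async-signal-safe, demo only. */
        fprintf(stderr, "SIGSYS: syscall %d, arch 0x%x, insn at %p, data %d\n",
                info->si_syscall, info->si_arch, info->si_call_addr,
                info->si_errno);

        /* Emulate success: put a sensible value in the return-value register. */
        ctx->uc_mcontext.gregs[REG_RAX] = 0;
}

static void install_sigsys_handler(void)
{
        struct sigaction act = { 0 };

        act.sa_sigaction = sigsys_handler;
        act.sa_flags = SA_SIGINFO;
        sigaction(SIGSYS, &act, NULL);
}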
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8d141b309046..b2e58a248b3b 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -136,19 +136,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)
         return nr;
 }
 
-#ifdef CONFIG_SECCOMP
-static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
-{
-        if (!seccomp_mode(&tsk->seccomp))
-                return 0;
-        task_pt_regs(tsk)->orig_ax = syscall_nr;
-        task_pt_regs(tsk)->ax = syscall_nr;
-        return __secure_computing(syscall_nr);
-}
-#else
-#define vsyscall_seccomp(_tsk, _nr) 0
-#endif
-
 static bool write_ok_or_segv(unsigned long ptr, size_t size)
 {
         /*
@@ -181,10 +168,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 {
         struct task_struct *tsk;
         unsigned long caller;
-        int vsyscall_nr;
+        int vsyscall_nr, syscall_nr, tmp;
         int prev_sig_on_uaccess_error;
         long ret;
-        int skip;
 
         /*
          * No point in checking CS -- the only way to get here is a user mode
@@ -216,56 +202,84 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
         }
 
         tsk = current;
-        /*
-         * With a real vsyscall, page faults cause SIGSEGV.  We want to
-         * preserve that behavior to make writing exploits harder.
-         */
-        prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
-        current_thread_info()->sig_on_uaccess_error = 1;
 
         /*
+         * Check for access_ok violations and find the syscall nr.
+         *
          * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
          * 64-bit, so we don't need to special-case it here.  For all the
          * vsyscalls, NULL means "don't write anything" not "write it at
          * address 0".
          */
-        ret = -EFAULT;
-        skip = 0;
         switch (vsyscall_nr) {
         case 0:
-                skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
-                if (skip)
-                        break;
-
                 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
-                    !write_ok_or_segv(regs->si, sizeof(struct timezone)))
-                        break;
+                    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
+                        ret = -EFAULT;
+                        goto check_fault;
+                }
+
+                syscall_nr = __NR_gettimeofday;
+                break;
+
+        case 1:
+                if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
+                        ret = -EFAULT;
+                        goto check_fault;
+                }
+
+                syscall_nr = __NR_time;
+                break;
+
+        case 2:
+                if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+                    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
+                        ret = -EFAULT;
+                        goto check_fault;
+                }
+
+                syscall_nr = __NR_getcpu;
+                break;
+        }
+
+        /*
+         * Handle seccomp.  regs->ip must be the original value.
+         * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
+         *
+         * We could optimize the seccomp disabled case, but performance
+         * here doesn't matter.
+         */
+        regs->orig_ax = syscall_nr;
+        regs->ax = -ENOSYS;
+        tmp = secure_computing(syscall_nr);
+        if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
+                warn_bad_vsyscall(KERN_DEBUG, regs,
+                                  "seccomp tried to change syscall nr or ip");
+                do_exit(SIGSYS);
+        }
+        if (tmp)
+                goto do_ret;  /* skip requested */
 
+        /*
+         * With a real vsyscall, page faults cause SIGSEGV.  We want to
+         * preserve that behavior to make writing exploits harder.
+         */
+        prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+        current_thread_info()->sig_on_uaccess_error = 1;
+
+        ret = -EFAULT;
+        switch (vsyscall_nr) {
+        case 0:
                 ret = sys_gettimeofday(
                         (struct timeval __user *)regs->di,
                         (struct timezone __user *)regs->si);
                 break;
 
         case 1:
-                skip = vsyscall_seccomp(tsk, __NR_time);
-                if (skip)
-                        break;
-
-                if (!write_ok_or_segv(regs->di, sizeof(time_t)))
-                        break;
-
                 ret = sys_time((time_t __user *)regs->di);
                 break;
 
         case 2:
-                skip = vsyscall_seccomp(tsk, __NR_getcpu);
-                if (skip)
-                        break;
-
-                if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
-                    !write_ok_or_segv(regs->si, sizeof(unsigned)))
-                        break;
-
                 ret = sys_getcpu((unsigned __user *)regs->di,
                                  (unsigned __user *)regs->si,
                                  NULL);
@@ -274,12 +288,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
         current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
 
-        if (skip) {
-                if ((long)regs->ax <= 0L) /* seccomp errno emulation */
-                        goto do_ret;
-                goto done;  /* seccomp trace/trap */
-        }
-
+check_fault:
         if (ret == -EFAULT) {
                 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
                 warn_bad_vsyscall(KERN_INFO, regs,
@@ -302,7 +311,6 @@ do_ret:
         /* Emulate a ret instruction. */
         regs->ip = caller;
         regs->sp += 8;
-done:
         return true;
 
 sigsegv:
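
Note (not part of the patch): the "addr & ~0x0C00 == 0xFFFFFFFFFF600000" test
suggested in the Caveats text above amounts to a one-line predicate. A sketch
follows; the helper name is made up for illustration.

#include <stdbool.h>
#include <stdint.h>

/*
 * True if addr is one of the x86-64 vsyscall entry points
 * (0xffffffffff600000, +0x400, +0x800, +0xc00), per the check suggested
 * in the documentation.  Use rip for SECCOMP_RET_TRACE and
 * siginfo->si_call_addr for SECCOMP_RET_TRAP.
 */
static bool is_vsyscall_addr(uint64_t addr)
{
        return (addr & ~UINT64_C(0xC00)) == UINT64_C(0xFFFFFFFFFF600000);
}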
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ee376beedaf9..5af44b593770 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -396,25 +396,29 @@ int __secure_computing(int this_syscall)
 #ifdef CONFIG_SECCOMP_FILTER
         case SECCOMP_MODE_FILTER: {
                 int data;
+                struct pt_regs *regs = task_pt_regs(current);
                 ret = seccomp_run_filters(this_syscall);
                 data = ret & SECCOMP_RET_DATA;
                 ret &= SECCOMP_RET_ACTION;
                 switch (ret) {
                 case SECCOMP_RET_ERRNO:
                         /* Set the low-order 16-bits as a errno. */
-                        syscall_set_return_value(current, task_pt_regs(current),
+                        syscall_set_return_value(current, regs,
                                                  -data, 0);
                         goto skip;
                 case SECCOMP_RET_TRAP:
                         /* Show the handler the original registers. */
-                        syscall_rollback(current, task_pt_regs(current));
+                        syscall_rollback(current, regs);
                         /* Let the filter pass back 16 bits of data. */
                         seccomp_send_sigsys(this_syscall, data);
                         goto skip;
                 case SECCOMP_RET_TRACE:
                         /* Skip these calls if there is no tracer. */
-                        if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP))
+                        if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+                                syscall_set_return_value(current, regs,
+                                                         -ENOSYS, 0);
                                 goto skip;
+                        }
                         /* Allow the BPF to provide the event message */
                         ptrace_event(PTRACE_EVENT_SECCOMP, data);
                         /*
@@ -425,6 +429,9 @@ int __secure_computing(int this_syscall)
                          */
                         if (fatal_signal_pending(current))
                                 break;
+                        if (syscall_get_nr(current, regs) < 0)
+                                goto skip;  /* Explicit request to skip. */
+
                         return 0;
                 case SECCOMP_RET_ALLOW:
                         return 0;
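
Note (not part of the patch): as an illustration of the seccomp.c change above (a
SECCOMP_RET_TRACE result with no attached tracer now skips the call and makes it
return -ENOSYS instead of leaving a stale return value), here is a hedged sketch of
installing a filter that traces one syscall and allows the rest. The choice of
__NR_getpid is arbitrary and purely illustrative.

#include <stddef.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

/* Sketch: return SECCOMP_RET_TRACE for getpid, SECCOMP_RET_ALLOW otherwise. */
static int install_trace_filter(void)
{
        struct sock_filter filter[] = {
                BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                         offsetof(struct seccomp_data, nr)),
                BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog prog = {
                .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])),
                .filter = filter,
        };

        /* Required so an unprivileged task may install a filter. */
        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0)
                return -1;

        /*
         * With no PTRACE_EVENT_SECCOMP tracer attached, a getpid() made by
         * this task is now skipped and appears to fail with ENOSYS.
         */
        return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}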