diff options
| -rw-r--r-- | Documentation/prctl/seccomp_filter.txt | 74 | ||||
| -rw-r--r-- | arch/x86/kernel/vsyscall_64.c | 110 | ||||
| -rw-r--r-- | kernel/seccomp.c | 13 |
3 files changed, 137 insertions, 60 deletions
diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt index 597c3c581375..1e469ef75778 100644 --- a/Documentation/prctl/seccomp_filter.txt +++ b/Documentation/prctl/seccomp_filter.txt | |||
| @@ -95,12 +95,15 @@ SECCOMP_RET_KILL: | |||
| 95 | 95 | ||
| 96 | SECCOMP_RET_TRAP: | 96 | SECCOMP_RET_TRAP: |
| 97 | Results in the kernel sending a SIGSYS signal to the triggering | 97 | Results in the kernel sending a SIGSYS signal to the triggering |
| 98 | task without executing the system call. The kernel will | 98 | task without executing the system call. siginfo->si_call_addr |
| 99 | rollback the register state to just before the system call | 99 | will show the address of the system call instruction, and |
| 100 | entry such that a signal handler in the task will be able to | 100 | siginfo->si_syscall and siginfo->si_arch will indicate which |
| 101 | inspect the ucontext_t->uc_mcontext registers and emulate | 101 | syscall was attempted. The program counter will be as though |
| 102 | system call success or failure upon return from the signal | 102 | the syscall happened (i.e. it will not point to the syscall |
| 103 | handler. | 103 | instruction). The return value register will contain an arch- |
| 104 | dependent value -- if resuming execution, set it to something | ||
| 105 | sensible. (The architecture dependency is because replacing | ||
| 106 | it with -ENOSYS could overwrite some useful information.) | ||
| 104 | 107 | ||
| 105 | The SECCOMP_RET_DATA portion of the return value will be passed | 108 | The SECCOMP_RET_DATA portion of the return value will be passed |
| 106 | as si_errno. | 109 | as si_errno. |
| @@ -123,6 +126,18 @@ SECCOMP_RET_TRACE: | |||
| 123 | the BPF program return value will be available to the tracer | 126 | the BPF program return value will be available to the tracer |
| 124 | via PTRACE_GETEVENTMSG. | 127 | via PTRACE_GETEVENTMSG. |
| 125 | 128 | ||
| 129 | The tracer can skip the system call by changing the syscall number | ||
| 130 | to -1. Alternatively, the tracer can change the system call | ||
| 131 | requested by changing the system call to a valid syscall number. If | ||
| 132 | the tracer asks to skip the system call, then the system call will | ||
| 133 | appear to return the value that the tracer puts in the return value | ||
| 134 | register. | ||
| 135 | |||
| 136 | The seccomp check will not be run again after the tracer is | ||
| 137 | notified. (This means that seccomp-based sandboxes MUST NOT | ||
| 138 | allow use of ptrace, even of other sandboxed processes, without | ||
| 139 | extreme care; ptracers can use this mechanism to escape.) | ||
| 140 | |||
| 126 | SECCOMP_RET_ALLOW: | 141 | SECCOMP_RET_ALLOW: |
| 127 | Results in the system call being executed. | 142 | Results in the system call being executed. |
| 128 | 143 | ||
| @@ -161,3 +176,50 @@ architecture supports both ptrace_event and seccomp, it will be able to | |||
| 161 | support seccomp filter with minor fixup: SIGSYS support and seccomp return | 176 | support seccomp filter with minor fixup: SIGSYS support and seccomp return |
| 162 | value checking. Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER | 177 | value checking. Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER |
| 163 | to its arch-specific Kconfig. | 178 | to its arch-specific Kconfig. |
| 179 | |||
| 180 | |||
| 181 | |||
| 182 | Caveats | ||
| 183 | ------- | ||
| 184 | |||
| 185 | The vDSO can cause some system calls to run entirely in userspace, | ||
| 186 | leading to surprises when you run programs on different machines that | ||
| 187 | fall back to real syscalls. To minimize these surprises on x86, make | ||
| 188 | sure you test with | ||
| 189 | /sys/devices/system/clocksource/clocksource0/current_clocksource set to | ||
| 190 | something like acpi_pm. | ||
| 191 | |||
| 192 | On x86-64, vsyscall emulation is enabled by default. (vsyscalls are | ||
| 193 | legacy variants on vDSO calls.) Currently, emulated vsyscalls will honor seccomp, with a few oddities: | ||
| 194 | |||
| 195 | - A return value of SECCOMP_RET_TRAP will set a si_call_addr pointing to | ||
| 196 | the vsyscall entry for the given call and not the address after the | ||
| 197 | 'syscall' instruction. Any code which wants to restart the call | ||
| 198 | should be aware that (a) a ret instruction has been emulated and (b) | ||
| 199 | trying to resume the syscall will again trigger the standard vsyscall | ||
| 200 | emulation security checks, making resuming the syscall mostly | ||
| 201 | pointless. | ||
| 202 | |||
| 203 | - A return value of SECCOMP_RET_TRACE will signal the tracer as usual, | ||
| 204 | but the syscall may not be changed to another system call using the | ||
| 205 | orig_rax register. It may only be changed to -1 in order to skip the ||
| 206 | currently emulated call. Any other change MAY terminate the process. | ||
| 207 | The rip value seen by the tracer will be the syscall entry address; | ||
| 208 | this is different from normal behavior. The tracer MUST NOT modify | ||
| 209 | rip or rsp. (Do not rely on other changes terminating the process. | ||
| 210 | They might work. For example, on some kernels, choosing a syscall | ||
| 211 | that only exists in future kernels will be correctly emulated (by | ||
| 212 | returning -ENOSYS).) ||
| 213 | |||
| 214 | To detect this quirky behavior, check for (addr & ~0x0C00) == ||
| 215 | 0xFFFFFFFFFF600000. (For SECCOMP_RET_TRACE, use rip. For | ||
| 216 | SECCOMP_RET_TRAP, use siginfo->si_call_addr.) Do not check any other | ||
| 217 | condition: future kernels may improve vsyscall emulation and current | ||
| 218 | kernels in vsyscall=native mode will behave differently, but the | ||
| 219 | instructions at 0xF...F600{0,4,8,C}00 will not be system calls in these | ||
| 220 | cases. | ||
| 221 | |||
| 222 | Note that modern systems are unlikely to use vsyscalls at all -- they | ||
| 223 | are a legacy feature and they are considerably slower than standard | ||
| 224 | syscalls. New code will use the vDSO, and vDSO-issued system calls | ||
| 225 | are indistinguishable from normal system calls. | ||
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 8d141b309046..b2e58a248b3b 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c | |||
| @@ -136,19 +136,6 @@ static int addr_to_vsyscall_nr(unsigned long addr) | |||
| 136 | return nr; | 136 | return nr; |
| 137 | } | 137 | } |
| 138 | 138 | ||
| 139 | #ifdef CONFIG_SECCOMP | ||
| 140 | static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr) | ||
| 141 | { | ||
| 142 | if (!seccomp_mode(&tsk->seccomp)) | ||
| 143 | return 0; | ||
| 144 | task_pt_regs(tsk)->orig_ax = syscall_nr; | ||
| 145 | task_pt_regs(tsk)->ax = syscall_nr; | ||
| 146 | return __secure_computing(syscall_nr); | ||
| 147 | } | ||
| 148 | #else | ||
| 149 | #define vsyscall_seccomp(_tsk, _nr) 0 | ||
| 150 | #endif | ||
| 151 | |||
| 152 | static bool write_ok_or_segv(unsigned long ptr, size_t size) | 139 | static bool write_ok_or_segv(unsigned long ptr, size_t size) |
| 153 | { | 140 | { |
| 154 | /* | 141 | /* |
| @@ -181,10 +168,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) | |||
| 181 | { | 168 | { |
| 182 | struct task_struct *tsk; | 169 | struct task_struct *tsk; |
| 183 | unsigned long caller; | 170 | unsigned long caller; |
| 184 | int vsyscall_nr; | 171 | int vsyscall_nr, syscall_nr, tmp; |
| 185 | int prev_sig_on_uaccess_error; | 172 | int prev_sig_on_uaccess_error; |
| 186 | long ret; | 173 | long ret; |
| 187 | int skip; | ||
| 188 | 174 | ||
| 189 | /* | 175 | /* |
| 190 | * No point in checking CS -- the only way to get here is a user mode | 176 | * No point in checking CS -- the only way to get here is a user mode |
| @@ -216,56 +202,84 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) | |||
| 216 | } | 202 | } |
| 217 | 203 | ||
| 218 | tsk = current; | 204 | tsk = current; |
| 219 | /* | ||
| 220 | * With a real vsyscall, page faults cause SIGSEGV. We want to | ||
| 221 | * preserve that behavior to make writing exploits harder. | ||
| 222 | */ | ||
| 223 | prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; | ||
| 224 | current_thread_info()->sig_on_uaccess_error = 1; | ||
| 225 | 205 | ||
| 226 | /* | 206 | /* |
| 207 | * Check for access_ok violations and find the syscall nr. | ||
| 208 | * | ||
| 227 | * NULL is a valid user pointer (in the access_ok sense) on 32-bit and | 209 | * NULL is a valid user pointer (in the access_ok sense) on 32-bit and |
| 228 | * 64-bit, so we don't need to special-case it here. For all the | 210 | * 64-bit, so we don't need to special-case it here. For all the |
| 229 | * vsyscalls, NULL means "don't write anything" not "write it at | 211 | * vsyscalls, NULL means "don't write anything" not "write it at |
| 230 | * address 0". | 212 | * address 0". |
| 231 | */ | 213 | */ |
| 232 | ret = -EFAULT; | ||
| 233 | skip = 0; | ||
| 234 | switch (vsyscall_nr) { | 214 | switch (vsyscall_nr) { |
| 235 | case 0: | 215 | case 0: |
| 236 | skip = vsyscall_seccomp(tsk, __NR_gettimeofday); | ||
| 237 | if (skip) | ||
| 238 | break; | ||
| 239 | |||
| 240 | if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || | 216 | if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || |
| 241 | !write_ok_or_segv(regs->si, sizeof(struct timezone))) | 217 | !write_ok_or_segv(regs->si, sizeof(struct timezone))) { |
| 242 | break; | 218 | ret = -EFAULT; |
| 219 | goto check_fault; | ||
| 220 | } | ||
| 221 | |||
| 222 | syscall_nr = __NR_gettimeofday; | ||
| 223 | break; | ||
| 224 | |||
| 225 | case 1: | ||
| 226 | if (!write_ok_or_segv(regs->di, sizeof(time_t))) { | ||
| 227 | ret = -EFAULT; | ||
| 228 | goto check_fault; | ||
| 229 | } | ||
| 230 | |||
| 231 | syscall_nr = __NR_time; | ||
| 232 | break; | ||
| 233 | |||
| 234 | case 2: | ||
| 235 | if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || | ||
| 236 | !write_ok_or_segv(regs->si, sizeof(unsigned))) { | ||
| 237 | ret = -EFAULT; | ||
| 238 | goto check_fault; | ||
| 239 | } | ||
| 240 | |||
| 241 | syscall_nr = __NR_getcpu; | ||
| 242 | break; | ||
| 243 | } | ||
| 244 | |||
| 245 | /* | ||
| 246 | * Handle seccomp. regs->ip must be the original value. | ||
| 247 | * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. | ||
| 248 | * | ||
| 249 | * We could optimize the seccomp disabled case, but performance | ||
| 250 | * here doesn't matter. | ||
| 251 | */ | ||
| 252 | regs->orig_ax = syscall_nr; | ||
| 253 | regs->ax = -ENOSYS; | ||
| 254 | tmp = secure_computing(syscall_nr); | ||
| 255 | if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { | ||
| 256 | warn_bad_vsyscall(KERN_DEBUG, regs, | ||
| 257 | "seccomp tried to change syscall nr or ip"); | ||
| 258 | do_exit(SIGSYS); | ||
| 259 | } | ||
| 260 | if (tmp) | ||
| 261 | goto do_ret; /* skip requested */ | ||
| 243 | 262 | ||
| 263 | /* | ||
| 264 | * With a real vsyscall, page faults cause SIGSEGV. We want to | ||
| 265 | * preserve that behavior to make writing exploits harder. | ||
| 266 | */ | ||
| 267 | prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; | ||
| 268 | current_thread_info()->sig_on_uaccess_error = 1; | ||
| 269 | |||
| 270 | ret = -EFAULT; | ||
| 271 | switch (vsyscall_nr) { | ||
| 272 | case 0: | ||
| 244 | ret = sys_gettimeofday( | 273 | ret = sys_gettimeofday( |
| 245 | (struct timeval __user *)regs->di, | 274 | (struct timeval __user *)regs->di, |
| 246 | (struct timezone __user *)regs->si); | 275 | (struct timezone __user *)regs->si); |
| 247 | break; | 276 | break; |
| 248 | 277 | ||
| 249 | case 1: | 278 | case 1: |
| 250 | skip = vsyscall_seccomp(tsk, __NR_time); | ||
| 251 | if (skip) | ||
| 252 | break; | ||
| 253 | |||
| 254 | if (!write_ok_or_segv(regs->di, sizeof(time_t))) | ||
| 255 | break; | ||
| 256 | |||
| 257 | ret = sys_time((time_t __user *)regs->di); | 279 | ret = sys_time((time_t __user *)regs->di); |
| 258 | break; | 280 | break; |
| 259 | 281 | ||
| 260 | case 2: | 282 | case 2: |
| 261 | skip = vsyscall_seccomp(tsk, __NR_getcpu); | ||
| 262 | if (skip) | ||
| 263 | break; | ||
| 264 | |||
| 265 | if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || | ||
| 266 | !write_ok_or_segv(regs->si, sizeof(unsigned))) | ||
| 267 | break; | ||
| 268 | |||
| 269 | ret = sys_getcpu((unsigned __user *)regs->di, | 283 | ret = sys_getcpu((unsigned __user *)regs->di, |
| 270 | (unsigned __user *)regs->si, | 284 | (unsigned __user *)regs->si, |
| 271 | NULL); | 285 | NULL); |
| @@ -274,12 +288,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) | |||
| 274 | 288 | ||
| 275 | current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; | 289 | current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; |
| 276 | 290 | ||
| 277 | if (skip) { | 291 | check_fault: |
| 278 | if ((long)regs->ax <= 0L) /* seccomp errno emulation */ | ||
| 279 | goto do_ret; | ||
| 280 | goto done; /* seccomp trace/trap */ | ||
| 281 | } | ||
| 282 | |||
| 283 | if (ret == -EFAULT) { | 292 | if (ret == -EFAULT) { |
| 284 | /* Bad news -- userspace fed a bad pointer to a vsyscall. */ | 293 | /* Bad news -- userspace fed a bad pointer to a vsyscall. */ |
| 285 | warn_bad_vsyscall(KERN_INFO, regs, | 294 | warn_bad_vsyscall(KERN_INFO, regs, |
| @@ -302,7 +311,6 @@ do_ret: | |||
| 302 | /* Emulate a ret instruction. */ | 311 | /* Emulate a ret instruction. */ |
| 303 | regs->ip = caller; | 312 | regs->ip = caller; |
| 304 | regs->sp += 8; | 313 | regs->sp += 8; |
| 305 | done: | ||
| 306 | return true; | 314 | return true; |
| 307 | 315 | ||
| 308 | sigsegv: | 316 | sigsegv: |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ee376beedaf9..5af44b593770 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
| @@ -396,25 +396,29 @@ int __secure_computing(int this_syscall) | |||
| 396 | #ifdef CONFIG_SECCOMP_FILTER | 396 | #ifdef CONFIG_SECCOMP_FILTER |
| 397 | case SECCOMP_MODE_FILTER: { | 397 | case SECCOMP_MODE_FILTER: { |
| 398 | int data; | 398 | int data; |
| 399 | struct pt_regs *regs = task_pt_regs(current); | ||
| 399 | ret = seccomp_run_filters(this_syscall); | 400 | ret = seccomp_run_filters(this_syscall); |
| 400 | data = ret & SECCOMP_RET_DATA; | 401 | data = ret & SECCOMP_RET_DATA; |
| 401 | ret &= SECCOMP_RET_ACTION; | 402 | ret &= SECCOMP_RET_ACTION; |
| 402 | switch (ret) { | 403 | switch (ret) { |
| 403 | case SECCOMP_RET_ERRNO: | 404 | case SECCOMP_RET_ERRNO: |
| 404 | /* Set the low-order 16-bits as a errno. */ | 405 | /* Set the low-order 16-bits as a errno. */ |
| 405 | syscall_set_return_value(current, task_pt_regs(current), | 406 | syscall_set_return_value(current, regs, |
| 406 | -data, 0); | 407 | -data, 0); |
| 407 | goto skip; | 408 | goto skip; |
| 408 | case SECCOMP_RET_TRAP: | 409 | case SECCOMP_RET_TRAP: |
| 409 | /* Show the handler the original registers. */ | 410 | /* Show the handler the original registers. */ |
| 410 | syscall_rollback(current, task_pt_regs(current)); | 411 | syscall_rollback(current, regs); |
| 411 | /* Let the filter pass back 16 bits of data. */ | 412 | /* Let the filter pass back 16 bits of data. */ |
| 412 | seccomp_send_sigsys(this_syscall, data); | 413 | seccomp_send_sigsys(this_syscall, data); |
| 413 | goto skip; | 414 | goto skip; |
| 414 | case SECCOMP_RET_TRACE: | 415 | case SECCOMP_RET_TRACE: |
| 415 | /* Skip these calls if there is no tracer. */ | 416 | /* Skip these calls if there is no tracer. */ |
| 416 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) | 417 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { |
| 418 | syscall_set_return_value(current, regs, | ||
| 419 | -ENOSYS, 0); | ||
| 417 | goto skip; | 420 | goto skip; |
| 421 | } | ||
| 418 | /* Allow the BPF to provide the event message */ | 422 | /* Allow the BPF to provide the event message */ |
| 419 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | 423 | ptrace_event(PTRACE_EVENT_SECCOMP, data); |
| 420 | /* | 424 | /* |
| @@ -425,6 +429,9 @@ int __secure_computing(int this_syscall) | |||
| 425 | */ | 429 | */ |
| 426 | if (fatal_signal_pending(current)) | 430 | if (fatal_signal_pending(current)) |
| 427 | break; | 431 | break; |
| 432 | if (syscall_get_nr(current, regs) < 0) | ||
| 433 | goto skip; /* Explicit request to skip. */ | ||
| 434 | |||
| 428 | return 0; | 435 | return 0; |
| 429 | case SECCOMP_RET_ALLOW: | 436 | case SECCOMP_RET_ALLOW: |
| 430 | return 0; | 437 | return 0; |
