 arch/Kconfig                   |  11
 arch/arm/kernel/ptrace.c       |   7
 arch/mips/kernel/ptrace.c      |   2
 arch/s390/kernel/ptrace.c      |   2
 arch/x86/include/asm/calling.h |   6
 arch/x86/include/asm/ptrace.h  |   5
 arch/x86/kernel/entry_64.S     |  51
 arch/x86/kernel/ptrace.c       | 165
 arch/x86/kernel/vsyscall_64.c  |   2
 include/linux/seccomp.h        |  25
 kernel/seccomp.c               | 252
 11 files changed, 371 insertions(+), 157 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 0eae9df35b88..05d7a8a458d5 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -323,6 +323,17 @@ config HAVE_ARCH_SECCOMP_FILTER
 	    results in the system call being skipped immediately.
 	  - seccomp syscall wired up
 
+	  For best performance, an arch should use seccomp_phase1 and
+	  seccomp_phase2 directly.  It should call seccomp_phase1 for all
+	  syscalls if TIF_SECCOMP is set, but seccomp_phase1 does not
+	  need to be called from a ptrace-safe context.  It must then
+	  call seccomp_phase2 if seccomp_phase1 returns anything other
+	  than SECCOMP_PHASE1_OK or SECCOMP_PHASE1_SKIP.
+
+	  As an additional optimization, an arch may provide seccomp_data
+	  directly to seccomp_phase1; this avoids multiple calls
+	  to the syscall_xyz helpers for every syscall.
+
 config SECCOMP_FILTER
 	def_bool y
 	depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
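
The help text above fully specifies the fast-path contract. As a rough illustration (the hook name and its calling convention are invented for this sketch; only seccomp_phase1/seccomp_phase2 and the SECCOMP_PHASE1_* constants come from this patch), an arch entry path might wire the two phases up like this:

	/* Hypothetical arch glue for the two-phase contract above. */
	static long arch_syscall_entry_hook(struct pt_regs *regs)
	{
		if (test_thread_flag(TIF_SECCOMP)) {
			u32 phase1 = seccomp_phase1(NULL);  /* or pass a seccomp_data */

			if (phase1 == SECCOMP_PHASE1_SKIP)
				return -1;  /* don't run the syscall */
			if (phase1 != SECCOMP_PHASE1_OK &&
			    seccomp_phase2(phase1))  /* needs a ptrace-safe context */
				return -1;
		}
		return 0;  /* run the syscall normally */
	}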
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 0c27ed6f3f23..5e772a21ab97 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -933,8 +933,13 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno)
 	current_thread_info()->syscall = scno;
 
 	/* Do the secure computing check first; failures should be fast. */
-	if (secure_computing(scno) == -1)
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+	if (secure_computing() == -1)
 		return -1;
+#else
+	/* XXX: remove this once OABI gets fixed */
+	secure_computing_strict(scno);
+#endif
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 645b3c4fcfba..f7aac5b57b4b 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -770,7 +770,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall)
 	long ret = 0;
 	user_exit();
 
-	if (secure_computing(syscall) == -1)
+	if (secure_computing() == -1)
 		return -1;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 5dc7ad9e2fbf..bebacad48305 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -803,7 +803,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 	long ret = 0;
 
 	/* Do the secure computing check first. */
-	if (secure_computing(regs->gprs[2])) {
+	if (secure_computing()) {
 		/* seccomp failures shouldn't expose any additional code. */
 		ret = -1;
 		goto out;
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index cb4c73bfeb48..76659b67fd11 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -85,7 +85,7 @@ For 32-bit we have the following conventions - kernel is built with
 #define ARGOFFSET	R11
 #define SWFRAME		ORIG_RAX
 
-	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
+	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
 	subq  $9*8+\addskip, %rsp
 	CFI_ADJUST_CFA_OFFSET	9*8+\addskip
 	movq_cfi rdi, 8*8
@@ -96,7 +96,11 @@ For 32-bit we have the following conventions - kernel is built with
 	movq_cfi rcx, 5*8
 	.endif
 
+	.if \rax_enosys
+	movq $-ENOSYS, 4*8(%rsp)
+	.else
 	movq_cfi rax, 4*8
+	.endif
 
 	.if \save_r891011
 	movq_cfi r8, 3*8
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6205f0c434db..86fc2bb82287 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 			 int error_code, int si_code);
 
+
+extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch);
+extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch,
+				       unsigned long phase1_result);
+
 extern long syscall_trace_enter(struct pt_regs *);
 extern void syscall_trace_leave(struct pt_regs *);
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2fac1343a90b..df088bb03fb3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -404,8 +404,8 @@ GLOBAL(system_call_after_swapgs)
 	 * and short:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_ARGS 8,0
-	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
+	SAVE_ARGS 8, 0, rax_enosys=1
+	movq_cfi rax,(ORIG_RAX-ARGOFFSET)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
@@ -417,7 +417,7 @@ system_call_fastpath:
 	andl $__SYSCALL_MASK,%eax
 	cmpl $__NR_syscall_max,%eax
 #endif
-	ja badsys
+	ja ret_from_sys_call	/* and return regs->ax */
 	movq %r10,%rcx
 	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
 	movq %rax,RAX-ARGOFFSET(%rsp)
@@ -476,28 +476,8 @@ sysret_signal:
 	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
 	jmp int_check_syscall_exit_work
 
-badsys:
-	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
-	jmp ret_from_sys_call
-
 #ifdef CONFIG_AUDITSYSCALL
 	/*
-	 * Fast path for syscall audit without full syscall trace.
-	 * We just call __audit_syscall_entry() directly, and then
-	 * jump back to the normal fast path.
-	 */
-auditsys:
-	movq %r10,%r9			/* 6th arg: 4th syscall arg */
-	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
-	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
-	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
-	movq %rax,%rsi			/* 2nd arg: syscall number */
-	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
-	call __audit_syscall_entry
-	LOAD_ARGS 0			/* reload call-clobbered registers */
-	jmp system_call_fastpath
-
-	/*
 	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
 	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
 	 * masked off.
@@ -514,18 +494,25 @@ sysret_audit:
 
 	/* Do syscall tracing */
 tracesys:
-#ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	jz auditsys
-#endif
+	leaq -REST_SKIP(%rsp), %rdi
+	movq $AUDIT_ARCH_X86_64, %rsi
+	call syscall_trace_enter_phase1
+	test %rax, %rax
+	jnz tracesys_phase2		/* if needed, run the slow path */
+	LOAD_ARGS 0			/* else restore clobbered regs */
+	jmp system_call_fastpath	/*      and return to the fast path */
+
+tracesys_phase2:
 	SAVE_REST
-	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
 	FIXUP_TOP_OF_STACK %rdi
-	movq %rsp,%rdi
-	call syscall_trace_enter
+	movq %rsp, %rdi
+	movq $AUDIT_ARCH_X86_64, %rsi
+	movq %rax,%rdx
+	call syscall_trace_enter_phase2
+
 	/*
 	 * Reload arg registers from stack in case ptrace changed them.
-	 * We don't reload %rax because syscall_trace_enter() returned
+	 * We don't reload %rax because syscall_trace_enter_phase2() returned
 	 * the value it wants us to use in the table lookup.
 	 */
 	LOAD_ARGS ARGOFFSET, 1
@@ -536,7 +523,7 @@ tracesys:
 	andl $__SYSCALL_MASK,%eax
 	cmpl $__NR_syscall_max,%eax
 #endif
-	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
+	ja   int_ret_from_sys_call	/* RAX(%rsp) is already set */
 	movq %r10,%rcx	/* fixup for C */
 	call *sys_call_table(,%rax,8)
 	movq %rax,RAX-ARGOFFSET(%rsp)
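
Restated in C for readability (this is not code from the patch; the sketch function and run_syscall() are invented stand-ins for the assembly dispatch above):

	/* C-level sketch of the new tracesys flow. */
	static long tracesys_sketch(struct pt_regs *regs)
	{
		unsigned long phase1;
		long nr;

		phase1 = syscall_trace_enter_phase1(regs, AUDIT_ARCH_X86_64);
		if (phase1 == 0)
			nr = regs->orig_ax;	/* rejoin system_call_fastpath */
		else				/* tracesys_phase2: SAVE_REST first */
			nr = syscall_trace_enter_phase2(regs, AUDIT_ARCH_X86_64,
							phase1);

		/*
		 * "ja int_ret_from_sys_call" is an unsigned compare, so nr == -1
		 * (skip) falls through to the exit path.  Thanks to rax_enosys=1,
		 * regs->ax already holds -ENOSYS there unless seccomp or a tracer
		 * stored a different return value.
		 */
		if ((unsigned long)nr <= __NR_syscall_max)
			regs->ax = run_syscall(nr, regs);  /* sys_call_table dispatch */
		return regs->ax;
	}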
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 678c0ada3b3c..29576c244699 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1441,24 +1441,126 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 	force_sig_info(SIGTRAP, &info, tsk);
 }
 
-
-#ifdef CONFIG_X86_32
-# define IS_IA32	1
-#elif defined CONFIG_IA32_EMULATION
-# define IS_IA32	is_compat_task()
-#else
-# define IS_IA32	0
-#endif
+static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
+{
+#ifdef CONFIG_X86_64
+	if (arch == AUDIT_ARCH_X86_64) {
+		audit_syscall_entry(arch, regs->orig_ax, regs->di,
+				    regs->si, regs->dx, regs->r10);
+	} else
+#endif
+	{
+		audit_syscall_entry(arch, regs->orig_ax, regs->bx,
+				    regs->cx, regs->dx, regs->si);
+	}
+}
 
 /*
- * We must return the syscall number to actually look up in the table.
- * This can be -1L to skip running any syscall at all.
+ * We can return 0 to resume the syscall or anything else to go to phase
+ * 2.  If we resume the syscall, we need to put something appropriate in
+ * regs->orig_ax.
+ *
+ * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
+ * are fully functional.
+ *
+ * For phase 2's benefit, our return value is:
+ * 0:			resume the syscall
+ * 1:			go to phase 2; no seccomp phase 2 needed
+ * anything else:	go to phase 2; pass return value to seccomp
  */
-long syscall_trace_enter(struct pt_regs *regs)
+unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
+{
+	unsigned long ret = 0;
+	u32 work;
+
+	BUG_ON(regs != task_pt_regs(current));
+
+	work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
+
+	/*
+	 * If TIF_NOHZ is set, we are required to call user_exit() before
+	 * doing anything that could touch RCU.
+	 */
+	if (work & _TIF_NOHZ) {
+		user_exit();
+		work &= ~TIF_NOHZ;
+	}
+
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Do seccomp first -- it should minimize exposure of other
+	 * code, and keeping seccomp fast is probably more valuable
+	 * than the rest of this.
+	 */
+	if (work & _TIF_SECCOMP) {
+		struct seccomp_data sd;
+
+		sd.arch = arch;
+		sd.nr = regs->orig_ax;
+		sd.instruction_pointer = regs->ip;
+#ifdef CONFIG_X86_64
+		if (arch == AUDIT_ARCH_X86_64) {
+			sd.args[0] = regs->di;
+			sd.args[1] = regs->si;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->r10;
+			sd.args[4] = regs->r8;
+			sd.args[5] = regs->r9;
+		} else
+#endif
+		{
+			sd.args[0] = regs->bx;
+			sd.args[1] = regs->cx;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->si;
+			sd.args[4] = regs->di;
+			sd.args[5] = regs->bp;
+		}
+
+		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
+		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
+
+		ret = seccomp_phase1(&sd);
+		if (ret == SECCOMP_PHASE1_SKIP) {
+			regs->orig_ax = -1;
+			ret = 0;
+		} else if (ret != SECCOMP_PHASE1_OK) {
+			return ret;  /* Go directly to phase 2 */
+		}
+
+		work &= ~_TIF_SECCOMP;
+	}
+#endif
+
+	/* Do our best to finish without phase 2. */
+	if (work == 0)
+		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (work == _TIF_SYSCALL_AUDIT) {
+		/*
+		 * If there is no more work to be done except auditing,
+		 * then audit in phase 1.  Phase 2 always audits, so, if
+		 * we audit here, then we can't go on to phase 2.
+		 */
+		do_audit_syscall_entry(regs, arch);
+		return 0;
+	}
+#endif
+
+	return 1;  /* Something is enabled that we can't handle in phase 1 */
+}
+
+/* Returns the syscall nr to run (which should match regs->orig_ax). */
+long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
+				unsigned long phase1_result)
 {
 	long ret = 0;
+	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
 
-	user_exit();
+	BUG_ON(regs != task_pt_regs(current));
 
 	/*
 	 * If we stepped into a sysenter/syscall insn, it trapped in
@@ -1467,17 +1569,21 @@ long syscall_trace_enter(struct pt_regs *regs)
 	 * do_debug() and we need to set it again to restore the user
 	 * state.  If we entered on the slow path, TF was already set.
 	 */
-	if (test_thread_flag(TIF_SINGLESTEP))
+	if (work & _TIF_SINGLESTEP)
 		regs->flags |= X86_EFLAGS_TF;
 
-	/* do the secure computing check first */
-	if (secure_computing(regs->orig_ax)) {
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Call seccomp_phase2 before running the other hooks so that
+	 * they can see any changes made by a seccomp tracer.
+	 */
+	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
 		/* seccomp failures shouldn't expose any additional code. */
-		ret = -1L;
-		goto out;
+		return -1;
 	}
+#endif
 
-	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+	if (unlikely(work & _TIF_SYSCALL_EMU))
 		ret = -1L;
 
 	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
@@ -1487,23 +1593,22 @@ long syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->orig_ax);
 
-	if (IS_IA32)
-		audit_syscall_entry(AUDIT_ARCH_I386,
-				    regs->orig_ax,
-				    regs->bx, regs->cx,
-				    regs->dx, regs->si);
-#ifdef CONFIG_X86_64
-	else
-		audit_syscall_entry(AUDIT_ARCH_X86_64,
-				    regs->orig_ax,
-				    regs->di, regs->si,
-				    regs->dx, regs->r10);
-#endif
+	do_audit_syscall_entry(regs, arch);
 
-out:
 	return ret ?: regs->orig_ax;
 }
 
+long syscall_trace_enter(struct pt_regs *regs)
+{
+	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
+
+	if (phase1_result == 0)
+		return regs->orig_ax;
+	else
+		return syscall_trace_enter_phase2(regs, arch, phase1_result);
+}
+
 void syscall_trace_leave(struct pt_regs *regs)
 {
 	bool step;
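
To recap the tri-state protocol between the two phases (the values below are quoted from the code above; only the layout of this summary is new):

	/*
	 * syscall_trace_enter_phase1() return values:
	 *   0              resume the syscall (seccomp/audit/nohz fully handled)
	 *   1              phase 2 needed, but seccomp_phase2() is not
	 *   anything else  a raw filter result; in practice this is
	 *                  SECCOMP_RET_TRACE | data (0x7ff00000 | data), the only
	 *                  action __seccomp_phase1_filter() defers to phase 2
	 *
	 * This is why phase 2 guards with "phase1_result > 1" and why the
	 * BUILD_BUG_ONs pin SECCOMP_PHASE1_OK to 0 and SECCOMP_PHASE1_SKIP to 1.
	 */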
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index e1e1e80fc6a6..957779f4eb40 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -216,7 +216,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	 */
 	regs->orig_ax = syscall_nr;
 	regs->ax = -ENOSYS;
-	tmp = secure_computing(syscall_nr);
+	tmp = secure_computing();
 	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
 		warn_bad_vsyscall(KERN_DEBUG, regs,
 				  "seccomp tried to change syscall nr or ip");
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 5d586a45a319..a19ddacdac30 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -27,19 +27,23 @@ struct seccomp {
 	struct seccomp_filter *filter;
 };
 
-extern int __secure_computing(int);
-static inline int secure_computing(int this_syscall)
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+extern int __secure_computing(void);
+static inline int secure_computing(void)
 {
 	if (unlikely(test_thread_flag(TIF_SECCOMP)))
-		return __secure_computing(this_syscall);
+		return __secure_computing();
 	return 0;
 }
 
-/* A wrapper for architectures supporting only SECCOMP_MODE_STRICT. */
-static inline void secure_computing_strict(int this_syscall)
-{
-	BUG_ON(secure_computing(this_syscall) != 0);
-}
+#define SECCOMP_PHASE1_OK	0
+#define SECCOMP_PHASE1_SKIP	1
+
+extern u32 seccomp_phase1(struct seccomp_data *sd);
+int seccomp_phase2(u32 phase1_result);
+#else
+extern void secure_computing_strict(int this_syscall);
+#endif
 
 extern long prctl_get_seccomp(void);
 extern long prctl_set_seccomp(unsigned long, char __user *);
@@ -56,8 +60,11 @@ static inline int seccomp_mode(struct seccomp *s)
 struct seccomp { };
 struct seccomp_filter { };
 
-static inline int secure_computing(int this_syscall) { return 0; }
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+static inline int secure_computing(void) { return 0; }
+#else
 static inline void secure_computing_strict(int this_syscall) { return; }
+#endif
 
 static inline long prctl_get_seccomp(void)
 {
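
The header split leaves each arch with exactly one entry point, roughly as follows (the surrounding hook is hypothetical; compare the arm hunk above):

	#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
		if (secure_computing() == -1)
			return -1;	/* skip; the return value was already set */
	#else
		/* A strict-mode-only arch cannot report failure to its caller:
		 * secure_computing_strict() kills the task on a violation. */
		secure_computing_strict(scno);
	#endif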
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 84922befea84..4ef9687ac115 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -21,10 +21,11 @@
 #include <linux/slab.h>
 #include <linux/syscalls.h>
 
-/* #define SECCOMP_DEBUG 1 */
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+#include <asm/syscall.h>
+#endif
 
 #ifdef CONFIG_SECCOMP_FILTER
-#include <asm/syscall.h>
 #include <linux/filter.h>
 #include <linux/pid.h>
 #include <linux/ptrace.h>
@@ -172,10 +173,10 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
  *
  * Returns valid seccomp BPF response codes.
  */
-static u32 seccomp_run_filters(int syscall)
+static u32 seccomp_run_filters(struct seccomp_data *sd)
 {
 	struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
-	struct seccomp_data sd;
+	struct seccomp_data sd_local;
 	u32 ret = SECCOMP_RET_ALLOW;
 
 	/* Ensure unexpected behavior doesn't result in failing open. */
@@ -185,14 +186,17 @@ static u32 seccomp_run_filters(int syscall)
 	/* Make sure cross-thread synced filter points somewhere sane. */
 	smp_read_barrier_depends();
 
-	populate_seccomp_data(&sd);
+	if (!sd) {
+		populate_seccomp_data(&sd_local);
+		sd = &sd_local;
+	}
 
 	/*
 	 * All filters in the list are evaluated and the lowest BPF return
 	 * value always takes priority (ignoring the DATA).
 	 */
 	for (; f; f = f->prev) {
-		u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd);
+		u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
 
 		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
 			ret = cur_ret;
@@ -563,11 +567,55 @@ static int mode1_syscalls_32[] = {
 };
 #endif
 
-int __secure_computing(int this_syscall)
+static void __secure_computing_strict(int this_syscall)
+{
+	int *syscall_whitelist = mode1_syscalls;
+#ifdef CONFIG_COMPAT
+	if (is_compat_task())
+		syscall_whitelist = mode1_syscalls_32;
+#endif
+	do {
+		if (*syscall_whitelist == this_syscall)
+			return;
+	} while (*++syscall_whitelist);
+
+#ifdef SECCOMP_DEBUG
+	dump_stack();
+#endif
+	audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
+	do_exit(SIGKILL);
+}
+
+#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+void secure_computing_strict(int this_syscall)
+{
+	int mode = current->seccomp.mode;
+
+	if (mode == 0)
+		return;
+	else if (mode == SECCOMP_MODE_STRICT)
+		__secure_computing_strict(this_syscall);
+	else
+		BUG();
+}
+#else
+int __secure_computing(void)
 {
-	int exit_sig = 0;
-	int *syscall;
-	u32 ret;
+	u32 phase1_result = seccomp_phase1(NULL);
+
+	if (likely(phase1_result == SECCOMP_PHASE1_OK))
+		return 0;
+	else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
+		return -1;
+	else
+		return seccomp_phase2(phase1_result);
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
+{
+	u32 filter_ret, action;
+	int data;
 
 	/*
 	 * Make sure that any changes to mode from another thread have
@@ -575,85 +623,127 @@ int __secure_computing(int this_syscall)
 	 */
 	rmb();
 
-	switch (current->seccomp.mode) {
-	case SECCOMP_MODE_STRICT:
-		syscall = mode1_syscalls;
-#ifdef CONFIG_COMPAT
-		if (is_compat_task())
-			syscall = mode1_syscalls_32;
+	filter_ret = seccomp_run_filters(sd);
+	data = filter_ret & SECCOMP_RET_DATA;
+	action = filter_ret & SECCOMP_RET_ACTION;
+
+	switch (action) {
+	case SECCOMP_RET_ERRNO:
+		/* Set the low-order 16-bits as a errno. */
+		syscall_set_return_value(current, task_pt_regs(current),
+					 -data, 0);
+		goto skip;
+
+	case SECCOMP_RET_TRAP:
+		/* Show the handler the original registers. */
+		syscall_rollback(current, task_pt_regs(current));
+		/* Let the filter pass back 16 bits of data. */
+		seccomp_send_sigsys(this_syscall, data);
+		goto skip;
+
+	case SECCOMP_RET_TRACE:
+		return filter_ret;  /* Save the rest for phase 2. */
+
+	case SECCOMP_RET_ALLOW:
+		return SECCOMP_PHASE1_OK;
+
+	case SECCOMP_RET_KILL:
+	default:
+		audit_seccomp(this_syscall, SIGSYS, action);
+		do_exit(SIGSYS);
+	}
+
+	unreachable();
+
+skip:
+	audit_seccomp(this_syscall, 0, action);
+	return SECCOMP_PHASE1_SKIP;
+}
 #endif
-		do {
-			if (*syscall == this_syscall)
-				return 0;
-		} while (*++syscall);
-		exit_sig = SIGKILL;
-		ret = SECCOMP_RET_KILL;
-		break;
+
+/**
+ * seccomp_phase1() - run fast path seccomp checks on the current syscall
+ * @arg sd: The seccomp_data or NULL
+ *
+ * This only reads pt_regs via the syscall_xyz helpers.  The only change
+ * it will make to pt_regs is via syscall_set_return_value, and it will
+ * only do that if it returns SECCOMP_PHASE1_SKIP.
+ *
+ * If sd is provided, it will not read pt_regs at all.
+ *
+ * It may also call do_exit or force a signal; these actions must be
+ * safe.
+ *
+ * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
+ * be processed normally.
+ *
+ * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
+ * invoked.  In this case, seccomp_phase1 will have set the return value
+ * using syscall_set_return_value.
+ *
+ * If it returns anything else, then the return value should be passed
+ * to seccomp_phase2 from a context in which ptrace hooks are safe.
+ */
+u32 seccomp_phase1(struct seccomp_data *sd)
+{
+	int mode = current->seccomp.mode;
+	int this_syscall = sd ? sd->nr :
+		syscall_get_nr(current, task_pt_regs(current));
+
+	switch (mode) {
+	case SECCOMP_MODE_STRICT:
+		__secure_computing_strict(this_syscall);  /* may call do_exit */
+		return SECCOMP_PHASE1_OK;
 #ifdef CONFIG_SECCOMP_FILTER
-	case SECCOMP_MODE_FILTER: {
-		int data;
-		struct pt_regs *regs = task_pt_regs(current);
-		ret = seccomp_run_filters(this_syscall);
-		data = ret & SECCOMP_RET_DATA;
-		ret &= SECCOMP_RET_ACTION;
-		switch (ret) {
-		case SECCOMP_RET_ERRNO:
-			/* Set the low-order 16-bits as a errno. */
-			syscall_set_return_value(current, regs,
-						 -data, 0);
-			goto skip;
-		case SECCOMP_RET_TRAP:
-			/* Show the handler the original registers. */
-			syscall_rollback(current, regs);
-			/* Let the filter pass back 16 bits of data. */
-			seccomp_send_sigsys(this_syscall, data);
-			goto skip;
-		case SECCOMP_RET_TRACE:
-			/* Skip these calls if there is no tracer. */
-			if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
-				syscall_set_return_value(current, regs,
-							 -ENOSYS, 0);
-				goto skip;
-			}
-			/* Allow the BPF to provide the event message */
-			ptrace_event(PTRACE_EVENT_SECCOMP, data);
-			/*
-			 * The delivery of a fatal signal during event
-			 * notification may silently skip tracer notification.
-			 * Terminating the task now avoids executing a system
-			 * call that may not be intended.
-			 */
-			if (fatal_signal_pending(current))
-				break;
-			if (syscall_get_nr(current, regs) < 0)
-				goto skip;  /* Explicit request to skip. */
-
-			return 0;
-		case SECCOMP_RET_ALLOW:
-			return 0;
-		case SECCOMP_RET_KILL:
-		default:
-			break;
-		}
-		exit_sig = SIGSYS;
-		break;
-	}
+	case SECCOMP_MODE_FILTER:
+		return __seccomp_phase1_filter(this_syscall, sd);
 #endif
 	default:
 		BUG();
 	}
+}
 
-#ifdef SECCOMP_DEBUG
-	dump_stack();
-#endif
-	audit_seccomp(this_syscall, exit_sig, ret);
-	do_exit(exit_sig);
-#ifdef CONFIG_SECCOMP_FILTER
-skip:
-	audit_seccomp(this_syscall, exit_sig, ret);
-#endif
-	return -1;
+/**
+ * seccomp_phase2() - finish slow path seccomp work for the current syscall
+ * @phase1_result: The return value from seccomp_phase1()
+ *
+ * This must be called from a context in which ptrace hooks can be used.
+ *
+ * Returns 0 if the syscall should be processed or -1 to skip the syscall.
+ */
+int seccomp_phase2(u32 phase1_result)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	u32 action = phase1_result & SECCOMP_RET_ACTION;
+	int data = phase1_result & SECCOMP_RET_DATA;
+
+	BUG_ON(action != SECCOMP_RET_TRACE);
+
+	audit_seccomp(syscall_get_nr(current, regs), 0, action);
+
+	/* Skip these calls if there is no tracer. */
+	if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+		syscall_set_return_value(current, regs,
+					 -ENOSYS, 0);
+		return -1;
+	}
+
+	/* Allow the BPF to provide the event message */
+	ptrace_event(PTRACE_EVENT_SECCOMP, data);
+	/*
+	 * The delivery of a fatal signal during event
+	 * notification may silently skip tracer notification.
+	 * Terminating the task now avoids executing a system
+	 * call that may not be intended.
+	 */
+	if (fatal_signal_pending(current))
+		do_exit(SIGSYS);
+	if (syscall_get_nr(current, regs) < 0)
+		return -1;  /* Explicit request to skip. */
+
+	return 0;
 }
+#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
 
 long prctl_get_seccomp(void)
 {
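
For reference, the action/data split that __seccomp_phase1_filter() and seccomp_phase2() rely on decomposes a filter's 32-bit return value like this (a minimal illustration; the mask values are from uapi/linux/seccomp.h of this era: SECCOMP_RET_ACTION is 0x7fff0000, SECCOMP_RET_DATA is 0x0000ffff):

	u32 filter_ret = SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA);

	u32 action = filter_ret & SECCOMP_RET_ACTION;	/* == SECCOMP_RET_ERRNO */
	int data   = filter_ret & SECCOMP_RET_DATA;	/* == EPERM (1) */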