author     Linus Torvalds <torvalds@linux-foundation.org>  2014-10-13 20:27:06 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-10-13 20:27:06 -0400
commit     ba1a96fc7ddcaf0c8d4a6752f6a70f080bc307ac (patch)
tree       c07af88f62df1ab8ed98aab9951dd05dff09d0d2
parent     f1bfbd984b4e2177886507b6a0ec5faeb6d7c217 (diff)
parent     1dcf74f6edfc3a9acd84d83d8865dd9e2a3b1d1e (diff)
Merge branch 'x86-seccomp-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 seccomp changes from Ingo Molnar:
 "This tree includes x86 seccomp filter speedups and related preparatory
  work, which touches core seccomp facilities as well.

  The main idea is to split seccomp into two phases, to be able to enter
  a simple fast path for syscalls with ptrace side effects.

  There's no substantial user-visible (and ABI) effects expected from
  this, except a change in how we emit a better audit record for
  SECCOMP_RET_TRACE events"

* 'x86-seccomp-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86_64, entry: Use split-phase syscall_trace_enter for 64-bit syscalls
  x86_64, entry: Treat regs->ax the same in fastpath and slowpath syscalls
  x86: Split syscall_trace_enter into two phases
  x86, entry: Only call user_exit if TIF_NOHZ
  x86, x32, audit: Fix x32's AUDIT_ARCH wrt audit
  seccomp: Document two-phase seccomp and arch-provided seccomp_data
  seccomp: Allow arch code to provide seccomp_data
  seccomp: Refactor the filter callback and the API
  seccomp,x86,arm,mips,s390: Remove nr parameter from secure_computing
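For orientation, the fast-path/slow-path split described above can be sketched outside the kernel. This is a minimal sketch in plain C, assuming only the SECCOMP_PHASE1_OK/SECCOMP_PHASE1_SKIP convention from this series; the *_example functions are hypothetical stand-ins for the real hooks, and the dispatch mirrors __secure_computing() in the kernel/seccomp.c hunk below:

#include <stdio.h>

#define SECCOMP_PHASE1_OK   0UL
#define SECCOMP_PHASE1_SKIP 1UL

/* Hypothetical stand-ins for the real kernel hooks. */
static unsigned long seccomp_phase1_example(void)
{
        return SECCOMP_PHASE1_OK;   /* pretend the filter said "allow" */
}

static int seccomp_phase2_example(unsigned long phase1_result)
{
        (void)phase1_result;        /* the real phase 2 runs ptrace hooks */
        return 0;
}

/* Returns 0 to run the syscall, -1 to skip it. */
static int secure_computing_example(void)
{
        unsigned long phase1_result = seccomp_phase1_example();

        if (phase1_result == SECCOMP_PHASE1_OK)
                return 0;           /* fast path: run the syscall */
        if (phase1_result == SECCOMP_PHASE1_SKIP)
                return -1;          /* skip; return value was already set */
        /*
         * Anything else carries a SECCOMP_RET_TRACE filter result to
         * phase 2, which must run where ptrace hooks are safe.
         */
        return seccomp_phase2_example(phase1_result);
}

int main(void)
{
        printf("verdict: %d\n", secure_computing_example());
        return 0;
}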
-rw-r--r--  arch/Kconfig                    |  11
-rw-r--r--  arch/arm/kernel/ptrace.c        |   7
-rw-r--r--  arch/mips/kernel/ptrace.c       |   2
-rw-r--r--  arch/s390/kernel/ptrace.c       |   2
-rw-r--r--  arch/x86/include/asm/calling.h  |   6
-rw-r--r--  arch/x86/include/asm/ptrace.h   |   5
-rw-r--r--  arch/x86/kernel/entry_64.S      |  51
-rw-r--r--  arch/x86/kernel/ptrace.c        | 165
-rw-r--r--  arch/x86/kernel/vsyscall_64.c   |   2
-rw-r--r--  include/linux/seccomp.h         |  25
-rw-r--r--  kernel/seccomp.c                | 252
11 files changed, 371 insertions, 157 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index 0eae9df35b88..05d7a8a458d5 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -323,6 +323,17 @@ config HAVE_ARCH_SECCOMP_FILTER
 	    results in the system call being skipped immediately.
 	  - seccomp syscall wired up
 
+	  For best performance, an arch should use seccomp_phase1 and
+	  seccomp_phase2 directly.  It should call seccomp_phase1 for all
+	  syscalls if TIF_SECCOMP is set, but seccomp_phase1 does not
+	  need to be called from a ptrace-safe context.  It must then
+	  call seccomp_phase2 if seccomp_phase1 returns anything other
+	  than SECCOMP_PHASE1_OK or SECCOMP_PHASE1_SKIP.
+
+	  As an additional optimization, an arch may provide seccomp_data
+	  directly to seccomp_phase1; this avoids multiple calls
+	  to the syscall_xyz helpers for every syscall.
+
 config SECCOMP_FILTER
 	def_bool y
 	depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
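The help text above lets an arch hand seccomp_phase1() a pre-populated seccomp_data instead of letting it fall back on the per-field syscall_xyz helpers. A sketch of that fill step, modeled on the x86 phase-1 code later in this series: struct seccomp_data mirrors the uapi layout, while example_regs, fill_seccomp_data, and the *_EXAMPLE constant are hypothetical stand-ins.

#include <stdint.h>

struct seccomp_data {
        int32_t nr;
        uint32_t arch;
        uint64_t instruction_pointer;
        uint64_t args[6];
};

/* Hypothetical snapshot of the registers saved at syscall entry. */
struct example_regs {
        uint64_t orig_ax, ip, di, si, dx, r10, r8, r9;
};

#define AUDIT_ARCH_X86_64_EXAMPLE 0xc000003eu

/*
 * Fill seccomp_data once from the saved registers, so seccomp_phase1()
 * never needs to call the per-field syscall_get_* helpers.
 */
static void fill_seccomp_data(const struct example_regs *regs,
                              struct seccomp_data *sd)
{
        sd->arch = AUDIT_ARCH_X86_64_EXAMPLE;
        sd->nr = (int32_t)regs->orig_ax;
        sd->instruction_pointer = regs->ip;
        sd->args[0] = regs->di;   /* x86-64 syscall argument order */
        sd->args[1] = regs->si;
        sd->args[2] = regs->dx;
        sd->args[3] = regs->r10;
        sd->args[4] = regs->r8;
        sd->args[5] = regs->r9;
}

int main(void)
{
        struct example_regs regs = { .orig_ax = 59, .ip = 0x400000 };
        struct seccomp_data sd;

        fill_seccomp_data(&regs, &sd);
        return sd.nr == 59 ? 0 : 1;
}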
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 0c27ed6f3f23..5e772a21ab97 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -933,8 +933,13 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno)
 	current_thread_info()->syscall = scno;
 
 	/* Do the secure computing check first; failures should be fast. */
-	if (secure_computing(scno) == -1)
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+	if (secure_computing() == -1)
 		return -1;
+#else
+	/* XXX: remove this once OABI gets fixed */
+	secure_computing_strict(scno);
+#endif
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 645b3c4fcfba..f7aac5b57b4b 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -770,7 +770,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall)
 	long ret = 0;
 	user_exit();
 
-	if (secure_computing(syscall) == -1)
+	if (secure_computing() == -1)
 		return -1;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 5dc7ad9e2fbf..bebacad48305 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -803,7 +803,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 	long ret = 0;
 
 	/* Do the secure computing check first. */
-	if (secure_computing(regs->gprs[2])) {
+	if (secure_computing()) {
 		/* seccomp failures shouldn't expose any additional code. */
 		ret = -1;
 		goto out;
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index cb4c73bfeb48..76659b67fd11 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -85,7 +85,7 @@ For 32-bit we have the following conventions - kernel is built with
 #define ARGOFFSET	R11
 #define SWFRAME		ORIG_RAX
 
-	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
+	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
 	subq  $9*8+\addskip, %rsp
 	CFI_ADJUST_CFA_OFFSET	9*8+\addskip
 	movq_cfi rdi, 8*8
@@ -96,7 +96,11 @@ For 32-bit we have the following conventions - kernel is built with
 	movq_cfi rcx, 5*8
 	.endif
 
+	.if \rax_enosys
+	movq $-ENOSYS, 4*8(%rsp)
+	.else
 	movq_cfi rax, 4*8
+	.endif
 
 	.if \save_r891011
 	movq_cfi r8, 3*8
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6205f0c434db..86fc2bb82287 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 			 int error_code, int si_code);
 
+
+extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch);
+extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch,
+				       unsigned long phase1_result);
+
 extern long syscall_trace_enter(struct pt_regs *);
 extern void syscall_trace_leave(struct pt_regs *);
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2fac1343a90b..df088bb03fb3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -404,8 +404,8 @@ GLOBAL(system_call_after_swapgs)
 	 * and short:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_ARGS 8,0
-	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
+	SAVE_ARGS 8, 0, rax_enosys=1
+	movq_cfi rax,(ORIG_RAX-ARGOFFSET)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
@@ -417,7 +417,7 @@ system_call_fastpath:
 	andl $__SYSCALL_MASK,%eax
 	cmpl $__NR_syscall_max,%eax
 #endif
-	ja badsys
+	ja ret_from_sys_call  /* and return regs->ax */
 	movq %r10,%rcx
 	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
 	movq %rax,RAX-ARGOFFSET(%rsp)
@@ -476,28 +476,8 @@ sysret_signal:
 	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
 	jmp int_check_syscall_exit_work
 
-badsys:
-	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
-	jmp ret_from_sys_call
-
 #ifdef CONFIG_AUDITSYSCALL
 	/*
-	 * Fast path for syscall audit without full syscall trace.
-	 * We just call __audit_syscall_entry() directly, and then
-	 * jump back to the normal fast path.
-	 */
-auditsys:
-	movq %r10,%r9			/* 6th arg: 4th syscall arg */
-	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
-	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
-	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
-	movq %rax,%rsi			/* 2nd arg: syscall number */
-	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
-	call __audit_syscall_entry
-	LOAD_ARGS 0			/* reload call-clobbered registers */
-	jmp system_call_fastpath
-
-	/*
 	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
 	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
 	 * masked off.
@@ -514,18 +494,25 @@ sysret_audit:
 
 	/* Do syscall tracing */
 tracesys:
-#ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	jz auditsys
-#endif
+	leaq -REST_SKIP(%rsp), %rdi
+	movq $AUDIT_ARCH_X86_64, %rsi
+	call syscall_trace_enter_phase1
+	test %rax, %rax
+	jnz tracesys_phase2		/* if needed, run the slow path */
+	LOAD_ARGS 0			/* else restore clobbered regs */
+	jmp system_call_fastpath	/*      and return to the fast path */
+
+tracesys_phase2:
 	SAVE_REST
-	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
 	FIXUP_TOP_OF_STACK %rdi
-	movq %rsp,%rdi
-	call syscall_trace_enter
+	movq %rsp, %rdi
+	movq $AUDIT_ARCH_X86_64, %rsi
+	movq %rax,%rdx
+	call syscall_trace_enter_phase2
+
 	/*
 	 * Reload arg registers from stack in case ptrace changed them.
-	 * We don't reload %rax because syscall_trace_enter() returned
+	 * We don't reload %rax because syscall_trace_entry_phase2() returned
 	 * the value it wants us to use in the table lookup.
 	 */
 	LOAD_ARGS ARGOFFSET, 1
@@ -536,7 +523,7 @@ tracesys:
 	andl $__SYSCALL_MASK,%eax
 	cmpl $__NR_syscall_max,%eax
 #endif
-	ja int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
+	ja int_ret_from_sys_call	/* RAX(%rsp) is already set */
 	movq %r10,%rcx	/* fixup for C */
 	call *sys_call_table(,%rax,8)
 	movq %rax,RAX-ARGOFFSET(%rsp)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 678c0ada3b3c..29576c244699 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1441,24 +1441,126 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 	force_sig_info(SIGTRAP, &info, tsk);
 }
 
-
-#ifdef CONFIG_X86_32
-# define IS_IA32	1
-#elif defined CONFIG_IA32_EMULATION
-# define IS_IA32	is_compat_task()
-#else
-# define IS_IA32	0
-#endif
+static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
+{
+#ifdef CONFIG_X86_64
+	if (arch == AUDIT_ARCH_X86_64) {
+		audit_syscall_entry(arch, regs->orig_ax, regs->di,
+				    regs->si, regs->dx, regs->r10);
+	} else
+#endif
+	{
+		audit_syscall_entry(arch, regs->orig_ax, regs->bx,
+				    regs->cx, regs->dx, regs->si);
+	}
+}
 
 /*
- * We must return the syscall number to actually look up in the table.
- * This can be -1L to skip running any syscall at all.
+ * We can return 0 to resume the syscall or anything else to go to phase
+ * 2.  If we resume the syscall, we need to put something appropriate in
+ * regs->orig_ax.
+ *
+ * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
+ * are fully functional.
+ *
+ * For phase 2's benefit, our return value is:
+ * 0:			resume the syscall
+ * 1:			go to phase 2; no seccomp phase 2 needed
+ * anything else:	go to phase 2; pass return value to seccomp
  */
-long syscall_trace_enter(struct pt_regs *regs)
+unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
+{
+	unsigned long ret = 0;
+	u32 work;
+
+	BUG_ON(regs != task_pt_regs(current));
+
+	work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
+
+	/*
+	 * If TIF_NOHZ is set, we are required to call user_exit() before
+	 * doing anything that could touch RCU.
+	 */
+	if (work & _TIF_NOHZ) {
+		user_exit();
+		work &= ~TIF_NOHZ;
+	}
+
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Do seccomp first -- it should minimize exposure of other
+	 * code, and keeping seccomp fast is probably more valuable
+	 * than the rest of this.
+	 */
+	if (work & _TIF_SECCOMP) {
+		struct seccomp_data sd;
+
+		sd.arch = arch;
+		sd.nr = regs->orig_ax;
+		sd.instruction_pointer = regs->ip;
+#ifdef CONFIG_X86_64
+		if (arch == AUDIT_ARCH_X86_64) {
+			sd.args[0] = regs->di;
+			sd.args[1] = regs->si;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->r10;
+			sd.args[4] = regs->r8;
+			sd.args[5] = regs->r9;
+		} else
+#endif
+		{
+			sd.args[0] = regs->bx;
+			sd.args[1] = regs->cx;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->si;
+			sd.args[4] = regs->di;
+			sd.args[5] = regs->bp;
+		}
+
+		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
+		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
+
+		ret = seccomp_phase1(&sd);
+		if (ret == SECCOMP_PHASE1_SKIP) {
+			regs->orig_ax = -1;
+			ret = 0;
+		} else if (ret != SECCOMP_PHASE1_OK) {
+			return ret;  /* Go directly to phase 2 */
+		}
+
+		work &= ~_TIF_SECCOMP;
+	}
+#endif
+
+	/* Do our best to finish without phase 2. */
+	if (work == 0)
+		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (work == _TIF_SYSCALL_AUDIT) {
+		/*
+		 * If there is no more work to be done except auditing,
+		 * then audit in phase 1.  Phase 2 always audits, so, if
+		 * we audit here, then we can't go on to phase 2.
+		 */
+		do_audit_syscall_entry(regs, arch);
+		return 0;
+	}
+#endif
+
+	return 1;  /* Something is enabled that we can't handle in phase 1 */
+}
+
+/* Returns the syscall nr to run (which should match regs->orig_ax). */
+long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
+				unsigned long phase1_result)
 {
 	long ret = 0;
+	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
 
-	user_exit();
+	BUG_ON(regs != task_pt_regs(current));
 
 	/*
 	 * If we stepped into a sysenter/syscall insn, it trapped in
@@ -1467,17 +1569,21 @@ long syscall_trace_enter(struct pt_regs *regs)
 	 * do_debug() and we need to set it again to restore the user
 	 * state.  If we entered on the slow path, TF was already set.
 	 */
-	if (test_thread_flag(TIF_SINGLESTEP))
+	if (work & _TIF_SINGLESTEP)
 		regs->flags |= X86_EFLAGS_TF;
 
-	/* do the secure computing check first */
-	if (secure_computing(regs->orig_ax)) {
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Call seccomp_phase2 before running the other hooks so that
+	 * they can see any changes made by a seccomp tracer.
+	 */
+	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
 		/* seccomp failures shouldn't expose any additional code. */
-		ret = -1L;
-		goto out;
+		return -1;
 	}
+#endif
 
-	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+	if (unlikely(work & _TIF_SYSCALL_EMU))
 		ret = -1L;
 
 	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
@@ -1487,23 +1593,22 @@ long syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->orig_ax);
 
-	if (IS_IA32)
-		audit_syscall_entry(AUDIT_ARCH_I386,
-				    regs->orig_ax,
-				    regs->bx, regs->cx,
-				    regs->dx, regs->si);
-#ifdef CONFIG_X86_64
-	else
-		audit_syscall_entry(AUDIT_ARCH_X86_64,
-				    regs->orig_ax,
-				    regs->di, regs->si,
-				    regs->dx, regs->r10);
-#endif
+	do_audit_syscall_entry(regs, arch);
 
-out:
 	return ret ?: regs->orig_ax;
 }
 
+long syscall_trace_enter(struct pt_regs *regs)
+{
+	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
+
+	if (phase1_result == 0)
+		return regs->orig_ax;
+	else
+		return syscall_trace_enter_phase2(regs, arch, phase1_result);
+}
+
 void syscall_trace_leave(struct pt_regs *regs)
 {
 	bool step;
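Phase 1's return value is overloaded: 0 resumes the syscall, 1 requests phase 2 with no seccomp work, and anything else is a raw SECCOMP_RET_TRACE filter result that phase 2 decodes with the ACTION/DATA masks. A standalone sketch of that decoding; the mask values match the uapi seccomp header of this era, everything else is illustrative:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SECCOMP_RET_TRACE  0x7ff00000u
#define SECCOMP_RET_ACTION 0x7fff0000u
#define SECCOMP_RET_DATA   0x0000ffffu

int main(void)
{
        /* A filter result: SECCOMP_RET_TRACE with 16 bits of data. */
        uint32_t phase1_result = SECCOMP_RET_TRACE | 0x1234;

        uint32_t action = phase1_result & SECCOMP_RET_ACTION;
        uint16_t data   = phase1_result & SECCOMP_RET_DATA;

        /* Only TRACE results are deferred to phase 2 in this series. */
        assert(action == SECCOMP_RET_TRACE);

        /* The data bits become the ptrace event message. */
        printf("action=%#x data=%#x\n", action, data);
        return 0;
}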
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index e1e1e80fc6a6..957779f4eb40 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -216,7 +216,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	 */
 	regs->orig_ax = syscall_nr;
 	regs->ax = -ENOSYS;
-	tmp = secure_computing(syscall_nr);
+	tmp = secure_computing();
 	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
 		warn_bad_vsyscall(KERN_DEBUG, regs,
 				  "seccomp tried to change syscall nr or ip");
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 5d586a45a319..a19ddacdac30 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -27,19 +27,23 @@ struct seccomp {
 	struct seccomp_filter *filter;
 };
 
-extern int __secure_computing(int);
-static inline int secure_computing(int this_syscall)
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+extern int __secure_computing(void);
+static inline int secure_computing(void)
 {
 	if (unlikely(test_thread_flag(TIF_SECCOMP)))
-		return __secure_computing(this_syscall);
+		return __secure_computing();
 	return 0;
 }
 
-/* A wrapper for architectures supporting only SECCOMP_MODE_STRICT. */
-static inline void secure_computing_strict(int this_syscall)
-{
-	BUG_ON(secure_computing(this_syscall) != 0);
-}
+#define SECCOMP_PHASE1_OK	0
+#define SECCOMP_PHASE1_SKIP	1
+
+extern u32 seccomp_phase1(struct seccomp_data *sd);
+int seccomp_phase2(u32 phase1_result);
+#else
+extern void secure_computing_strict(int this_syscall);
+#endif
 
 extern long prctl_get_seccomp(void);
 extern long prctl_set_seccomp(unsigned long, char __user *);
@@ -56,8 +60,11 @@ static inline int seccomp_mode(struct seccomp *s)
 struct seccomp { };
 struct seccomp_filter { };
 
-static inline int secure_computing(int this_syscall) { return 0; }
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+static inline int secure_computing(void) { return 0; }
+#else
 static inline void secure_computing_strict(int this_syscall) { return; }
+#endif
 
 static inline long prctl_get_seccomp(void)
 {
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 84922befea84..4ef9687ac115 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -21,10 +21,11 @@
 #include <linux/slab.h>
 #include <linux/syscalls.h>
 
-/* #define SECCOMP_DEBUG 1 */
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+#include <asm/syscall.h>
+#endif
 
 #ifdef CONFIG_SECCOMP_FILTER
-#include <asm/syscall.h>
 #include <linux/filter.h>
 #include <linux/pid.h>
 #include <linux/ptrace.h>
@@ -172,10 +173,10 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
  *
  * Returns valid seccomp BPF response codes.
  */
-static u32 seccomp_run_filters(int syscall)
+static u32 seccomp_run_filters(struct seccomp_data *sd)
 {
 	struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
-	struct seccomp_data sd;
+	struct seccomp_data sd_local;
 	u32 ret = SECCOMP_RET_ALLOW;
 
 	/* Ensure unexpected behavior doesn't result in failing open. */
@@ -185,14 +186,17 @@ static u32 seccomp_run_filters(int syscall)
 	/* Make sure cross-thread synced filter points somewhere sane. */
 	smp_read_barrier_depends();
 
-	populate_seccomp_data(&sd);
+	if (!sd) {
+		populate_seccomp_data(&sd_local);
+		sd = &sd_local;
+	}
 
 	/*
 	 * All filters in the list are evaluated and the lowest BPF return
 	 * value always takes priority (ignoring the DATA).
 	 */
 	for (; f; f = f->prev) {
-		u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd);
+		u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
 
 		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
 			ret = cur_ret;
@@ -563,11 +567,55 @@ static int mode1_syscalls_32[] = {
 };
 #endif
 
-int __secure_computing(int this_syscall)
+static void __secure_computing_strict(int this_syscall)
+{
+	int *syscall_whitelist = mode1_syscalls;
+#ifdef CONFIG_COMPAT
+	if (is_compat_task())
+		syscall_whitelist = mode1_syscalls_32;
+#endif
+	do {
+		if (*syscall_whitelist == this_syscall)
+			return;
+	} while (*++syscall_whitelist);
+
+#ifdef SECCOMP_DEBUG
+	dump_stack();
+#endif
+	audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
+	do_exit(SIGKILL);
+}
+
+#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+void secure_computing_strict(int this_syscall)
+{
+	int mode = current->seccomp.mode;
+
+	if (mode == 0)
+		return;
+	else if (mode == SECCOMP_MODE_STRICT)
+		__secure_computing_strict(this_syscall);
+	else
+		BUG();
+}
+#else
+int __secure_computing(void)
 {
-	int exit_sig = 0;
-	int *syscall;
-	u32 ret;
+	u32 phase1_result = seccomp_phase1(NULL);
+
+	if (likely(phase1_result == SECCOMP_PHASE1_OK))
+		return 0;
+	else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
+		return -1;
+	else
+		return seccomp_phase2(phase1_result);
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
+{
+	u32 filter_ret, action;
+	int data;
 
 	/*
 	 * Make sure that any changes to mode from another thread have
@@ -575,85 +623,127 @@ int __secure_computing(int this_syscall)
 	 */
 	rmb();
 
-	switch (current->seccomp.mode) {
-	case SECCOMP_MODE_STRICT:
-		syscall = mode1_syscalls;
-#ifdef CONFIG_COMPAT
-		if (is_compat_task())
-			syscall = mode1_syscalls_32;
+	filter_ret = seccomp_run_filters(sd);
+	data = filter_ret & SECCOMP_RET_DATA;
+	action = filter_ret & SECCOMP_RET_ACTION;
+
+	switch (action) {
+	case SECCOMP_RET_ERRNO:
+		/* Set the low-order 16-bits as a errno. */
+		syscall_set_return_value(current, task_pt_regs(current),
+					 -data, 0);
+		goto skip;
+
+	case SECCOMP_RET_TRAP:
+		/* Show the handler the original registers. */
+		syscall_rollback(current, task_pt_regs(current));
+		/* Let the filter pass back 16 bits of data. */
+		seccomp_send_sigsys(this_syscall, data);
+		goto skip;
+
+	case SECCOMP_RET_TRACE:
+		return filter_ret;  /* Save the rest for phase 2. */
+
+	case SECCOMP_RET_ALLOW:
+		return SECCOMP_PHASE1_OK;
+
+	case SECCOMP_RET_KILL:
+	default:
+		audit_seccomp(this_syscall, SIGSYS, action);
+		do_exit(SIGSYS);
+	}
+
+	unreachable();
+
+skip:
+	audit_seccomp(this_syscall, 0, action);
+	return SECCOMP_PHASE1_SKIP;
+}
 #endif
-	do {
-		if (*syscall == this_syscall)
-			return 0;
-	} while (*++syscall);
-	exit_sig = SIGKILL;
-	ret = SECCOMP_RET_KILL;
-	break;
+
+/**
+ * seccomp_phase1() - run fast path seccomp checks on the current syscall
+ * @arg sd: The seccomp_data or NULL
+ *
+ * This only reads pt_regs via the syscall_xyz helpers.  The only change
+ * it will make to pt_regs is via syscall_set_return_value, and it will
+ * only do that if it returns SECCOMP_PHASE1_SKIP.
+ *
+ * If sd is provided, it will not read pt_regs at all.
+ *
+ * It may also call do_exit or force a signal; these actions must be
+ * safe.
+ *
+ * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
+ * be processed normally.
+ *
+ * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
+ * invoked.  In this case, seccomp_phase1 will have set the return value
+ * using syscall_set_return_value.
+ *
+ * If it returns anything else, then the return value should be passed
+ * to seccomp_phase2 from a context in which ptrace hooks are safe.
+ */
+u32 seccomp_phase1(struct seccomp_data *sd)
+{
+	int mode = current->seccomp.mode;
+	int this_syscall = sd ? sd->nr :
+		syscall_get_nr(current, task_pt_regs(current));
+
+	switch (mode) {
+	case SECCOMP_MODE_STRICT:
+		__secure_computing_strict(this_syscall);  /* may call do_exit */
+		return SECCOMP_PHASE1_OK;
 #ifdef CONFIG_SECCOMP_FILTER
-	case SECCOMP_MODE_FILTER: {
-		int data;
-		struct pt_regs *regs = task_pt_regs(current);
-		ret = seccomp_run_filters(this_syscall);
-		data = ret & SECCOMP_RET_DATA;
-		ret &= SECCOMP_RET_ACTION;
-		switch (ret) {
-		case SECCOMP_RET_ERRNO:
-			/* Set the low-order 16-bits as a errno. */
-			syscall_set_return_value(current, regs,
-						 -data, 0);
-			goto skip;
-		case SECCOMP_RET_TRAP:
-			/* Show the handler the original registers. */
-			syscall_rollback(current, regs);
-			/* Let the filter pass back 16 bits of data. */
-			seccomp_send_sigsys(this_syscall, data);
-			goto skip;
-		case SECCOMP_RET_TRACE:
-			/* Skip these calls if there is no tracer. */
-			if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
-				syscall_set_return_value(current, regs,
-							 -ENOSYS, 0);
-				goto skip;
-			}
-			/* Allow the BPF to provide the event message */
-			ptrace_event(PTRACE_EVENT_SECCOMP, data);
-			/*
-			 * The delivery of a fatal signal during event
-			 * notification may silently skip tracer notification.
-			 * Terminating the task now avoids executing a system
-			 * call that may not be intended.
-			 */
-			if (fatal_signal_pending(current))
-				break;
-			if (syscall_get_nr(current, regs) < 0)
-				goto skip;  /* Explicit request to skip. */
-
-			return 0;
-		case SECCOMP_RET_ALLOW:
-			return 0;
-		case SECCOMP_RET_KILL:
-		default:
-			break;
-		}
-		exit_sig = SIGSYS;
-		break;
-	}
+	case SECCOMP_MODE_FILTER:
+		return __seccomp_phase1_filter(this_syscall, sd);
 #endif
 	default:
 		BUG();
 	}
+}
 
-#ifdef SECCOMP_DEBUG
-	dump_stack();
-#endif
-	audit_seccomp(this_syscall, exit_sig, ret);
-	do_exit(exit_sig);
-#ifdef CONFIG_SECCOMP_FILTER
-skip:
-	audit_seccomp(this_syscall, exit_sig, ret);
-#endif
-	return -1;
+/**
+ * seccomp_phase2() - finish slow path seccomp work for the current syscall
+ * @phase1_result: The return value from seccomp_phase1()
+ *
+ * This must be called from a context in which ptrace hooks can be used.
+ *
+ * Returns 0 if the syscall should be processed or -1 to skip the syscall.
+ */
+int seccomp_phase2(u32 phase1_result)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	u32 action = phase1_result & SECCOMP_RET_ACTION;
+	int data = phase1_result & SECCOMP_RET_DATA;
+
+	BUG_ON(action != SECCOMP_RET_TRACE);
+
+	audit_seccomp(syscall_get_nr(current, regs), 0, action);
+
+	/* Skip these calls if there is no tracer. */
+	if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+		syscall_set_return_value(current, regs,
+					 -ENOSYS, 0);
+		return -1;
+	}
+
+	/* Allow the BPF to provide the event message */
+	ptrace_event(PTRACE_EVENT_SECCOMP, data);
+	/*
+	 * The delivery of a fatal signal during event
+	 * notification may silently skip tracer notification.
+	 * Terminating the task now avoids executing a system
+	 * call that may not be intended.
+	 */
+	if (fatal_signal_pending(current))
+		do_exit(SIGSYS);
+	if (syscall_get_nr(current, regs) < 0)
+		return -1;  /* Explicit request to skip. */
+
+	return 0;
 }
+#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
 
 long prctl_get_seccomp(void)
 {
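seccomp_run_filters() above now takes either a caller-provided seccomp_data or NULL, in which case it populates a local copy before running the filters. A sketch of that sd-or-NULL fallback pattern in isolation, with all names as illustrative stand-ins:

#include <stdio.h>

struct example_data {
        int nr;
};

static void populate_example_data(struct example_data *d)
{
        d->nr = 42;  /* stand-in for reading pt_regs via the helpers */
}

static int run_with_optional_data(const struct example_data *sd)
{
        struct example_data sd_local;

        if (!sd) {                        /* slow path: no caller data */
                populate_example_data(&sd_local);
                sd = &sd_local;
        }
        return sd->nr;                    /* use sd uniformly from here */
}

int main(void)
{
        struct example_data fast = { .nr = 1 };

        printf("%d %d\n", run_with_optional_data(&fast),
               run_with_optional_data(NULL));
        return 0;
}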