 arch/Kconfig                   |  11
 arch/arm/kernel/ptrace.c       |   7
 arch/mips/kernel/ptrace.c      |   2
 arch/s390/kernel/ptrace.c      |   2
 arch/x86/include/asm/calling.h |   6
 arch/x86/include/asm/ptrace.h  |   5
 arch/x86/kernel/entry_64.S     |  51
 arch/x86/kernel/ptrace.c       | 165
 arch/x86/kernel/vsyscall_64.c  |   2
 include/linux/seccomp.h        |  25
 kernel/seccomp.c               | 252
 11 files changed, 371 insertions(+), 157 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 0eae9df35b88..05d7a8a458d5 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -323,6 +323,17 @@ config HAVE_ARCH_SECCOMP_FILTER
 	  results in the system call being skipped immediately.
 	  - seccomp syscall wired up
 
+	  For best performance, an arch should use seccomp_phase1 and
+	  seccomp_phase2 directly.  It should call seccomp_phase1 for all
+	  syscalls if TIF_SECCOMP is set, but seccomp_phase1 does not
+	  need to be called from a ptrace-safe context.  It must then
+	  call seccomp_phase2 if seccomp_phase1 returns anything other
+	  than SECCOMP_PHASE1_OK or SECCOMP_PHASE1_SKIP.
+
+	  As an additional optimization, an arch may provide seccomp_data
+	  directly to seccomp_phase1; this avoids multiple calls
+	  to the syscall_xyz helpers for every syscall.
+
 config SECCOMP_FILTER
 	def_bool y
 	depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
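The contract in this help text, in code form: a minimal sketch of the arch-side wiring (illustrative only; the hook name below is hypothetical, while seccomp_phase1(), seccomp_phase2(), the SECCOMP_PHASE1_* values, and syscall_get_nr() are the real interfaces used elsewhere in this patch):

	/* Hypothetical arch syscall-entry hook using the two-phase API. */
	static long arch_seccomp_entry(struct pt_regs *regs)
	{
		if (test_thread_flag(TIF_SECCOMP)) {
			/* Phase 1 is cheap and need not be ptrace-safe. */
			u32 ret = seccomp_phase1(NULL);	/* or pass a prefilled seccomp_data */

			if (ret == SECCOMP_PHASE1_SKIP)
				return -1;	/* return value already set; skip the syscall */
			if (ret != SECCOMP_PHASE1_OK &&
			    seccomp_phase2(ret) != 0)	/* must be ptrace-safe here */
				return -1;
		}
		return syscall_get_nr(current, regs);
	}

Passing a prefilled seccomp_data instead of NULL is the optimization the last paragraph of the help text describes: seccomp then never has to go back through the syscall_xyz helpers.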
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 0c27ed6f3f23..5e772a21ab97 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -933,8 +933,13 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno)
 	current_thread_info()->syscall = scno;
 
 	/* Do the secure computing check first; failures should be fast. */
-	if (secure_computing(scno) == -1)
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+	if (secure_computing() == -1)
 		return -1;
+#else
+	/* XXX: remove this once OABI gets fixed */
+	secure_computing_strict(scno);
+#endif
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 645b3c4fcfba..f7aac5b57b4b 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -770,7 +770,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall)
 	long ret = 0;
 	user_exit();
 
-	if (secure_computing(syscall) == -1)
+	if (secure_computing() == -1)
 		return -1;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 5dc7ad9e2fbf..bebacad48305 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -803,7 +803,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 	long ret = 0;
 
 	/* Do the secure computing check first. */
-	if (secure_computing(regs->gprs[2])) {
+	if (secure_computing()) {
 		/* seccomp failures shouldn't expose any additional code. */
 		ret = -1;
 		goto out;
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index cb4c73bfeb48..76659b67fd11 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -85,7 +85,7 @@ For 32-bit we have the following conventions - kernel is built with
 #define ARGOFFSET	R11
 #define SWFRAME		ORIG_RAX
 
-	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
+	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
 	subq  $9*8+\addskip, %rsp
 	CFI_ADJUST_CFA_OFFSET	9*8+\addskip
 	movq_cfi rdi, 8*8
@@ -96,7 +96,11 @@ For 32-bit we have the following conventions - kernel is built with
 	movq_cfi rcx, 5*8
 	.endif
 
+	.if \rax_enosys
+	movq $-ENOSYS, 4*8(%rsp)
+	.else
 	movq_cfi rax, 4*8
+	.endif
 
 	.if \save_r891011
 	movq_cfi r8, 3*8
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6205f0c434db..86fc2bb82287 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 			 int error_code, int si_code);
 
+
+extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch);
+extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch,
+				       unsigned long phase1_result);
+
 extern long syscall_trace_enter(struct pt_regs *);
 extern void syscall_trace_leave(struct pt_regs *);
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2fac1343a90b..df088bb03fb3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -404,8 +404,8 @@ GLOBAL(system_call_after_swapgs)
 	 * and short:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_ARGS 8,0
-	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
+	SAVE_ARGS 8, 0, rax_enosys=1
+	movq_cfi rax,(ORIG_RAX-ARGOFFSET)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
@@ -417,7 +417,7 @@ system_call_fastpath:
 	andl $__SYSCALL_MASK,%eax
 	cmpl $__NR_syscall_max,%eax
 #endif
-	ja badsys
+	ja ret_from_sys_call		/* and return regs->ax */
 	movq %r10,%rcx
 	call *sys_call_table(,%rax,8)	# XXX: rip relative
 	movq %rax,RAX-ARGOFFSET(%rsp)
@@ -476,28 +476,8 @@ sysret_signal:
 	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
 	jmp int_check_syscall_exit_work
 
-badsys:
-	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
-	jmp ret_from_sys_call
-
 #ifdef CONFIG_AUDITSYSCALL
 	/*
-	 * Fast path for syscall audit without full syscall trace.
-	 * We just call __audit_syscall_entry() directly, and then
-	 * jump back to the normal fast path.
-	 */
-auditsys:
-	movq %r10,%r9			/* 6th arg: 4th syscall arg */
-	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
-	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
-	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
-	movq %rax,%rsi			/* 2nd arg: syscall number */
-	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
-	call __audit_syscall_entry
-	LOAD_ARGS 0			/* reload call-clobbered registers */
-	jmp system_call_fastpath
-
-	/*
 	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
 	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
 	 * masked off.
@@ -514,18 +494,25 @@ sysret_audit:
 
 	/* Do syscall tracing */
 tracesys:
-#ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	jz auditsys
-#endif
+	leaq -REST_SKIP(%rsp), %rdi
+	movq $AUDIT_ARCH_X86_64, %rsi
+	call syscall_trace_enter_phase1
+	test %rax, %rax
+	jnz tracesys_phase2		/* if needed, run the slow path */
+	LOAD_ARGS 0			/* else restore clobbered regs */
+	jmp system_call_fastpath	/*      and return to the fast path */
+
+tracesys_phase2:
 	SAVE_REST
-	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
 	FIXUP_TOP_OF_STACK %rdi
-	movq %rsp,%rdi
-	call syscall_trace_enter
+	movq %rsp, %rdi
+	movq $AUDIT_ARCH_X86_64, %rsi
+	movq %rax,%rdx
+	call syscall_trace_enter_phase2
+
 	/*
 	 * Reload arg registers from stack in case ptrace changed them.
-	 * We don't reload %rax because syscall_trace_enter() returned
+	 * We don't reload %rax because syscall_trace_enter_phase2() returned
 	 * the value it wants us to use in the table lookup.
 	 */
 	LOAD_ARGS ARGOFFSET, 1
@@ -536,7 +523,7 @@ tracesys:
 	andl $__SYSCALL_MASK,%eax
 	cmpl $__NR_syscall_max,%eax
 #endif
-	ja int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
+	ja int_ret_from_sys_call	/* RAX(%rsp) is already set */
 	movq %r10,%rcx	/* fixup for C */
 	call *sys_call_table(,%rax,8)
 	movq %rax,RAX-ARGOFFSET(%rsp)
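In C terms, the reworked tracesys above behaves roughly as follows (a sketch of the assembly's control flow, not code from this patch):

	/* Approximate C rendering of the new tracesys path. */
	unsigned long phase1_result =
		syscall_trace_enter_phase1(regs, AUDIT_ARCH_X86_64);

	if (phase1_result == 0) {
		/* LOAD_ARGS 0: restore the clobbered registers... */
		goto system_call_fastpath;	/* ...and stay on the fast path */
	}
	/* tracesys_phase2: SAVE_REST builds the full pt_regs first. */
	nr = syscall_trace_enter_phase2(regs, AUDIT_ARCH_X86_64, phase1_result);

The payoff of the split is visible here: phase 1 runs on the partial SAVE_ARGS frame, so a syscall that only needs seccomp or audit work never pays for SAVE_REST and FIXUP_TOP_OF_STACK.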
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 678c0ada3b3c..29576c244699 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1441,24 +1441,126 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 	force_sig_info(SIGTRAP, &info, tsk);
 }
 
-
-#ifdef CONFIG_X86_32
-# define IS_IA32	1
-#elif defined CONFIG_IA32_EMULATION
-# define IS_IA32	is_compat_task()
-#else
-# define IS_IA32	0
-#endif
+static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
+{
+#ifdef CONFIG_X86_64
+	if (arch == AUDIT_ARCH_X86_64) {
+		audit_syscall_entry(arch, regs->orig_ax, regs->di,
+				    regs->si, regs->dx, regs->r10);
+	} else
+#endif
+	{
+		audit_syscall_entry(arch, regs->orig_ax, regs->bx,
+				    regs->cx, regs->dx, regs->si);
+	}
+}
 
 /*
- * We must return the syscall number to actually look up in the table.
- * This can be -1L to skip running any syscall at all.
+ * We can return 0 to resume the syscall or anything else to go to phase
+ * 2.  If we resume the syscall, we need to put something appropriate in
+ * regs->orig_ax.
+ *
+ * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
+ * are fully functional.
+ *
+ * For phase 2's benefit, our return value is:
+ * 0:			resume the syscall
+ * 1:			go to phase 2; no seccomp phase 2 needed
+ * anything else:	go to phase 2; pass return value to seccomp
  */
-long syscall_trace_enter(struct pt_regs *regs)
+unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
+{
+	unsigned long ret = 0;
+	u32 work;
+
+	BUG_ON(regs != task_pt_regs(current));
+
+	work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
+
+	/*
+	 * If TIF_NOHZ is set, we are required to call user_exit() before
+	 * doing anything that could touch RCU.
+	 */
+	if (work & _TIF_NOHZ) {
+		user_exit();
+		work &= ~_TIF_NOHZ;
+	}
+
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Do seccomp first -- it should minimize exposure of other
+	 * code, and keeping seccomp fast is probably more valuable
+	 * than the rest of this.
+	 */
+	if (work & _TIF_SECCOMP) {
+		struct seccomp_data sd;
+
+		sd.arch = arch;
+		sd.nr = regs->orig_ax;
+		sd.instruction_pointer = regs->ip;
+#ifdef CONFIG_X86_64
+		if (arch == AUDIT_ARCH_X86_64) {
+			sd.args[0] = regs->di;
+			sd.args[1] = regs->si;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->r10;
+			sd.args[4] = regs->r8;
+			sd.args[5] = regs->r9;
+		} else
+#endif
+		{
+			sd.args[0] = regs->bx;
+			sd.args[1] = regs->cx;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->si;
+			sd.args[4] = regs->di;
+			sd.args[5] = regs->bp;
+		}
+
+		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
+		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
+
+		ret = seccomp_phase1(&sd);
+		if (ret == SECCOMP_PHASE1_SKIP) {
+			regs->orig_ax = -1;
+			ret = 0;
+		} else if (ret != SECCOMP_PHASE1_OK) {
+			return ret;  /* Go directly to phase 2 */
+		}
+
+		work &= ~_TIF_SECCOMP;
+	}
+#endif
+
+	/* Do our best to finish without phase 2. */
+	if (work == 0)
+		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (work == _TIF_SYSCALL_AUDIT) {
+		/*
+		 * If there is no more work to be done except auditing,
+		 * then audit in phase 1.  Phase 2 always audits, so, if
+		 * we audit here, then we can't go on to phase 2.
+		 */
+		do_audit_syscall_entry(regs, arch);
+		return 0;
+	}
+#endif
+
+	return 1;  /* Something is enabled that we can't handle in phase 1 */
+}
+
+/* Returns the syscall nr to run (which should match regs->orig_ax). */
+long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
+				unsigned long phase1_result)
 {
 	long ret = 0;
+	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
 
-	user_exit();
+	BUG_ON(regs != task_pt_regs(current));
 
 	/*
 	 * If we stepped into a sysenter/syscall insn, it trapped in
@@ -1467,17 +1569,21 @@ long syscall_trace_enter(struct pt_regs *regs)
 	 * do_debug() and we need to set it again to restore the user
 	 * state.  If we entered on the slow path, TF was already set.
 	 */
-	if (test_thread_flag(TIF_SINGLESTEP))
+	if (work & _TIF_SINGLESTEP)
 		regs->flags |= X86_EFLAGS_TF;
 
-	/* do the secure computing check first */
-	if (secure_computing(regs->orig_ax)) {
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Call seccomp_phase2 before running the other hooks so that
+	 * they can see any changes made by a seccomp tracer.
+	 */
+	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
 		/* seccomp failures shouldn't expose any additional code. */
-		ret = -1L;
-		goto out;
+		return -1;
 	}
+#endif
 
-	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+	if (unlikely(work & _TIF_SYSCALL_EMU))
 		ret = -1L;
 
 	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
@@ -1487,23 +1593,22 @@ long syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->orig_ax);
 
-	if (IS_IA32)
-		audit_syscall_entry(AUDIT_ARCH_I386,
-				    regs->orig_ax,
-				    regs->bx, regs->cx,
-				    regs->dx, regs->si);
-#ifdef CONFIG_X86_64
-	else
-		audit_syscall_entry(AUDIT_ARCH_X86_64,
-				    regs->orig_ax,
-				    regs->di, regs->si,
-				    regs->dx, regs->r10);
-#endif
+	do_audit_syscall_entry(regs, arch);
 
-out:
 	return ret ?: regs->orig_ax;
 }
 
+long syscall_trace_enter(struct pt_regs *regs)
+{
+	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
+
+	if (phase1_result == 0)
+		return regs->orig_ax;
+	else
+		return syscall_trace_enter_phase2(regs, arch, phase1_result);
+}
+
 void syscall_trace_leave(struct pt_regs *regs)
 {
 	bool step;
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index e1e1e80fc6a6..957779f4eb40 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -216,7 +216,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	 */
 	regs->orig_ax = syscall_nr;
 	regs->ax = -ENOSYS;
-	tmp = secure_computing(syscall_nr);
+	tmp = secure_computing();
 	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
 		warn_bad_vsyscall(KERN_DEBUG, regs,
 				  "seccomp tried to change syscall nr or ip");
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 5d586a45a319..a19ddacdac30 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -27,19 +27,23 @@ struct seccomp {
 	struct seccomp_filter *filter;
 };
 
-extern int __secure_computing(int);
-static inline int secure_computing(int this_syscall)
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+extern int __secure_computing(void);
+static inline int secure_computing(void)
 {
 	if (unlikely(test_thread_flag(TIF_SECCOMP)))
-		return  __secure_computing(this_syscall);
+		return  __secure_computing();
 	return 0;
 }
 
-/* A wrapper for architectures supporting only SECCOMP_MODE_STRICT. */
-static inline void secure_computing_strict(int this_syscall)
-{
-	BUG_ON(secure_computing(this_syscall) != 0);
-}
+#define SECCOMP_PHASE1_OK	0
+#define SECCOMP_PHASE1_SKIP	1
+
+extern u32 seccomp_phase1(struct seccomp_data *sd);
+int seccomp_phase2(u32 phase1_result);
+#else
+extern void secure_computing_strict(int this_syscall);
+#endif
 
 extern long prctl_get_seccomp(void);
 extern long prctl_set_seccomp(unsigned long, char __user *);
@@ -56,8 +60,11 @@ static inline int seccomp_mode(struct seccomp *s)
 struct seccomp { };
 struct seccomp_filter { };
 
-static inline int secure_computing(int this_syscall) { return 0; }
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+static inline int secure_computing(void) { return 0; }
+#else
 static inline void secure_computing_strict(int this_syscall) { return; }
+#endif
 
 static inline long prctl_get_seccomp(void)
 {
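From the caller's side, the reworked header gives each arch one of two entry points; this mirrors the arch/arm/kernel/ptrace.c hunk earlier in the patch:

	#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
		if (secure_computing() == -1)		/* filter-capable arches */
			return -1;			/* skip the syscall */
	#else
		secure_computing_strict(scno);		/* strict mode only */
	#endif

secure_computing() remains an inline wrapper, so the TIF_SECCOMP test costs nothing when seccomp is off; secure_computing_strict() moves out of line and, as the kernel/seccomp.c hunk below shows, never returns on a violation (it ends in do_exit(SIGKILL)).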
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 84922befea84..4ef9687ac115 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -21,10 +21,11 @@
 #include <linux/slab.h>
 #include <linux/syscalls.h>
 
-/* #define SECCOMP_DEBUG 1 */
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+#include <asm/syscall.h>
+#endif
 
 #ifdef CONFIG_SECCOMP_FILTER
-#include <asm/syscall.h>
 #include <linux/filter.h>
 #include <linux/pid.h>
 #include <linux/ptrace.h>
@@ -172,10 +173,10 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
  *
  * Returns valid seccomp BPF response codes.
  */
-static u32 seccomp_run_filters(int syscall)
+static u32 seccomp_run_filters(struct seccomp_data *sd)
 {
 	struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
-	struct seccomp_data sd;
+	struct seccomp_data sd_local;
 	u32 ret = SECCOMP_RET_ALLOW;
 
 	/* Ensure unexpected behavior doesn't result in failing open. */
@@ -185,14 +186,17 @@ static u32 seccomp_run_filters(int syscall)
 	/* Make sure cross-thread synced filter points somewhere sane. */
 	smp_read_barrier_depends();
 
-	populate_seccomp_data(&sd);
+	if (!sd) {
+		populate_seccomp_data(&sd_local);
+		sd = &sd_local;
+	}
 
 	/*
 	 * All filters in the list are evaluated and the lowest BPF return
 	 * value always takes priority (ignoring the DATA).
 	 */
 	for (; f; f = f->prev) {
-		u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd);
+		u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
 
 		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
 			ret = cur_ret;
@@ -563,11 +567,55 @@ static int mode1_syscalls_32[] = {
 };
 #endif
 
-int __secure_computing(int this_syscall)
+static void __secure_computing_strict(int this_syscall)
+{
+	int *syscall_whitelist = mode1_syscalls;
+#ifdef CONFIG_COMPAT
+	if (is_compat_task())
+		syscall_whitelist = mode1_syscalls_32;
+#endif
+	do {
+		if (*syscall_whitelist == this_syscall)
+			return;
+	} while (*++syscall_whitelist);
+
+#ifdef SECCOMP_DEBUG
+	dump_stack();
+#endif
+	audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
+	do_exit(SIGKILL);
+}
+
+#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+void secure_computing_strict(int this_syscall)
+{
+	int mode = current->seccomp.mode;
+
+	if (mode == 0)
+		return;
+	else if (mode == SECCOMP_MODE_STRICT)
+		__secure_computing_strict(this_syscall);
+	else
+		BUG();
+}
+#else
+int __secure_computing(void)
 {
-	int exit_sig = 0;
-	int *syscall;
-	u32 ret;
+	u32 phase1_result = seccomp_phase1(NULL);
+
+	if (likely(phase1_result == SECCOMP_PHASE1_OK))
+		return 0;
+	else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
+		return -1;
+	else
+		return seccomp_phase2(phase1_result);
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
+{
+	u32 filter_ret, action;
+	int data;
 
 	/*
 	 * Make sure that any changes to mode from another thread have
@@ -575,85 +623,127 @@ int __secure_computing(int this_syscall)
 	 */
 	rmb();
 
-	switch (current->seccomp.mode) {
-	case SECCOMP_MODE_STRICT:
-		syscall = mode1_syscalls;
-#ifdef CONFIG_COMPAT
-		if (is_compat_task())
-			syscall = mode1_syscalls_32;
+	filter_ret = seccomp_run_filters(sd);
+	data = filter_ret & SECCOMP_RET_DATA;
+	action = filter_ret & SECCOMP_RET_ACTION;
+
+	switch (action) {
+	case SECCOMP_RET_ERRNO:
+		/* Set the low-order 16 bits as an errno. */
+		syscall_set_return_value(current, task_pt_regs(current),
+					 -data, 0);
+		goto skip;
+
+	case SECCOMP_RET_TRAP:
+		/* Show the handler the original registers. */
+		syscall_rollback(current, task_pt_regs(current));
+		/* Let the filter pass back 16 bits of data. */
+		seccomp_send_sigsys(this_syscall, data);
+		goto skip;
+
+	case SECCOMP_RET_TRACE:
+		return filter_ret;  /* Save the rest for phase 2. */
+
+	case SECCOMP_RET_ALLOW:
+		return SECCOMP_PHASE1_OK;
+
+	case SECCOMP_RET_KILL:
+	default:
+		audit_seccomp(this_syscall, SIGSYS, action);
+		do_exit(SIGSYS);
+	}
+
+	unreachable();
+
+skip:
+	audit_seccomp(this_syscall, 0, action);
+	return SECCOMP_PHASE1_SKIP;
+}
 #endif
-	do {
-		if (*syscall == this_syscall)
-			return 0;
-	} while (*++syscall);
-	exit_sig = SIGKILL;
-	ret = SECCOMP_RET_KILL;
-	break;
+
+/**
+ * seccomp_phase1() - run fast path seccomp checks on the current syscall
+ * @sd: The seccomp_data or NULL
+ *
+ * This only reads pt_regs via the syscall_xyz helpers.  The only change
+ * it will make to pt_regs is via syscall_set_return_value, and it will
+ * only do that if it returns SECCOMP_PHASE1_SKIP.
+ *
+ * If sd is provided, it will not read pt_regs at all.
+ *
+ * It may also call do_exit or force a signal; these actions must be
+ * safe.
+ *
+ * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
+ * be processed normally.
+ *
+ * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
+ * invoked.  In this case, seccomp_phase1 will have set the return value
+ * using syscall_set_return_value.
+ *
+ * If it returns anything else, then the return value should be passed
+ * to seccomp_phase2 from a context in which ptrace hooks are safe.
+ */
+u32 seccomp_phase1(struct seccomp_data *sd)
+{
+	int mode = current->seccomp.mode;
+	int this_syscall = sd ? sd->nr :
+		syscall_get_nr(current, task_pt_regs(current));
+
+	switch (mode) {
+	case SECCOMP_MODE_STRICT:
+		__secure_computing_strict(this_syscall);  /* may call do_exit */
+		return SECCOMP_PHASE1_OK;
 #ifdef CONFIG_SECCOMP_FILTER
-	case SECCOMP_MODE_FILTER: {
-		int data;
-		struct pt_regs *regs = task_pt_regs(current);
-		ret = seccomp_run_filters(this_syscall);
-		data = ret & SECCOMP_RET_DATA;
-		ret &= SECCOMP_RET_ACTION;
-		switch (ret) {
-		case SECCOMP_RET_ERRNO:
-			/* Set the low-order 16-bits as a errno. */
-			syscall_set_return_value(current, regs,
-						 -data, 0);
-			goto skip;
-		case SECCOMP_RET_TRAP:
-			/* Show the handler the original registers. */
-			syscall_rollback(current, regs);
-			/* Let the filter pass back 16 bits of data. */
-			seccomp_send_sigsys(this_syscall, data);
-			goto skip;
-		case SECCOMP_RET_TRACE:
-			/* Skip these calls if there is no tracer. */
-			if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
-				syscall_set_return_value(current, regs,
-							 -ENOSYS, 0);
-				goto skip;
-			}
-			/* Allow the BPF to provide the event message */
-			ptrace_event(PTRACE_EVENT_SECCOMP, data);
-			/*
-			 * The delivery of a fatal signal during event
-			 * notification may silently skip tracer notification.
-			 * Terminating the task now avoids executing a system
-			 * call that may not be intended.
-			 */
-			if (fatal_signal_pending(current))
-				break;
-			if (syscall_get_nr(current, regs) < 0)
-				goto skip;  /* Explicit request to skip. */
-
-			return 0;
-		case SECCOMP_RET_ALLOW:
-			return 0;
-		case SECCOMP_RET_KILL:
-		default:
-			break;
-		}
-		exit_sig = SIGSYS;
-		break;
-	}
+	case SECCOMP_MODE_FILTER:
+		return __seccomp_phase1_filter(this_syscall, sd);
 #endif
 	default:
 		BUG();
 	}
+}
 
-#ifdef SECCOMP_DEBUG
-	dump_stack();
-#endif
-	audit_seccomp(this_syscall, exit_sig, ret);
-	do_exit(exit_sig);
-#ifdef CONFIG_SECCOMP_FILTER
-skip:
-	audit_seccomp(this_syscall, exit_sig, ret);
-#endif
-	return -1;
+/**
+ * seccomp_phase2() - finish slow path seccomp work for the current syscall
+ * @phase1_result: The return value from seccomp_phase1()
+ *
+ * This must be called from a context in which ptrace hooks can be used.
+ *
+ * Returns 0 if the syscall should be processed or -1 to skip the syscall.
+ */
+int seccomp_phase2(u32 phase1_result)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	u32 action = phase1_result & SECCOMP_RET_ACTION;
+	int data = phase1_result & SECCOMP_RET_DATA;
+
+	BUG_ON(action != SECCOMP_RET_TRACE);
+
+	audit_seccomp(syscall_get_nr(current, regs), 0, action);
+
+	/* Skip these calls if there is no tracer. */
+	if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+		syscall_set_return_value(current, regs,
+					 -ENOSYS, 0);
+		return -1;
+	}
+
+	/* Allow the BPF to provide the event message */
+	ptrace_event(PTRACE_EVENT_SECCOMP, data);
+	/*
+	 * The delivery of a fatal signal during event
+	 * notification may silently skip tracer notification.
+	 * Terminating the task now avoids executing a system
+	 * call that may not be intended.
+	 */
+	if (fatal_signal_pending(current))
+		do_exit(SIGSYS);
+	if (syscall_get_nr(current, regs) < 0)
+		return -1;  /* Explicit request to skip. */
+
+	return 0;
 }
+#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
 
 long prctl_get_seccomp(void)
 {