-rw-r--r--  MAINTAINERS                  |  11
-rw-r--r--  arch/Kconfig                 |   7
-rw-r--r--  fs/exec.c                    |   1
-rw-r--r--  include/linux/sched.h        | 134
-rw-r--r--  include/linux/syscalls.h     |   4
-rw-r--r--  include/trace/events/rseq.h  |  57
-rw-r--r--  include/uapi/linux/rseq.h    | 133
-rw-r--r--  init/Kconfig                 |  23
-rw-r--r--  kernel/Makefile              |   1
-rw-r--r--  kernel/fork.c                |   2
-rw-r--r--  kernel/rseq.c                | 357
-rw-r--r--  kernel/sched/core.c          |   2
-rw-r--r--  kernel/sys_ni.c              |   3
13 files changed, 734 insertions(+), 1 deletion(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index aa635837a6af..a384243d911b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11976,6 +11976,17 @@ F: include/dt-bindings/reset/
 F:	include/linux/reset.h
 F:	include/linux/reset-controller.h
 
+RESTARTABLE SEQUENCES SUPPORT
+M:	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+M:	Peter Zijlstra <peterz@infradead.org>
+M:	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+M:	Boqun Feng <boqun.feng@gmail.com>
+L:	linux-kernel@vger.kernel.org
+S:	Supported
+F:	kernel/rseq.c
+F:	include/uapi/linux/rseq.h
+F:	include/trace/events/rseq.h
+
 RFKILL
 M:	Johannes Berg <johannes@sipsolutions.net>
 L:	linux-wireless@vger.kernel.org
diff --git a/arch/Kconfig b/arch/Kconfig
index b695a3e3e922..095ba99968c1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -272,6 +272,13 @@ config HAVE_REGS_AND_STACK_ACCESS_API
 	  declared in asm/ptrace.h
 	  For example the kprobes-based event tracer needs this API.
 
+config HAVE_RSEQ
+	bool
+	depends on HAVE_REGS_AND_STACK_ACCESS_API
+	help
+	  This symbol should be selected by an architecture if it
+	  supports an implementation of restartable sequences.
+
 config HAVE_CLK
 	bool
 	help
diff --git a/fs/exec.c b/fs/exec.c
index 183059c427b9..2c3911612b22 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1822,6 +1822,7 @@ static int do_execveat_common(int fd, struct filename *filename,
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	membarrier_execve(current);
+	rseq_execve(current);
 	acct_update_integrals(current);
 	task_numa_free(current);
 	free_bprm(bprm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 14e4f9c12337..3aa4fcb74e76 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -27,6 +27,7 @@
 #include <linux/signal_types.h>
 #include <linux/mm_types_task.h>
 #include <linux/task_io_accounting.h>
+#include <linux/rseq.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
 struct audit_context;
@@ -1047,6 +1048,17 @@ struct task_struct {
 	unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_RSEQ
+	struct rseq __user *rseq;
+	u32 rseq_len;
+	u32 rseq_sig;
+	/*
+	 * RmW on rseq_event_mask must be performed atomically
+	 * with respect to preemption.
+	 */
+	unsigned long rseq_event_mask;
+#endif
+
 	struct tlbflush_unmap_batch tlb_ubc;
 
 	struct rcu_head rcu;
@@ -1757,4 +1769,126 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
+#ifdef CONFIG_RSEQ
+
+/*
+ * Map the event mask on the user-space ABI enum rseq_cs_flags
+ * for direct mask checks.
+ */
+enum rseq_event_mask_bits {
+	RSEQ_EVENT_PREEMPT_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
+	RSEQ_EVENT_SIGNAL_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
+	RSEQ_EVENT_MIGRATE_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
+};
+
+enum rseq_event_mask {
+	RSEQ_EVENT_PREEMPT	= (1U << RSEQ_EVENT_PREEMPT_BIT),
+	RSEQ_EVENT_SIGNAL	= (1U << RSEQ_EVENT_SIGNAL_BIT),
+	RSEQ_EVENT_MIGRATE	= (1U << RSEQ_EVENT_MIGRATE_BIT),
+};
+
+static inline void rseq_set_notify_resume(struct task_struct *t)
+{
+	if (t->rseq)
+		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+}
+
+void __rseq_handle_notify_resume(struct pt_regs *regs);
+
+static inline void rseq_handle_notify_resume(struct pt_regs *regs)
+{
+	if (current->rseq)
+		__rseq_handle_notify_resume(regs);
+}
+
+static inline void rseq_signal_deliver(struct pt_regs *regs)
+{
+	preempt_disable();
+	__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
+	preempt_enable();
+	rseq_handle_notify_resume(regs);
+}
+
+/* rseq_preempt() requires preemption to be disabled. */
+static inline void rseq_preempt(struct task_struct *t)
+{
+	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
+	rseq_set_notify_resume(t);
+}
+
+/* rseq_migrate() requires preemption to be disabled. */
+static inline void rseq_migrate(struct task_struct *t)
+{
+	__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
+	rseq_set_notify_resume(t);
+}
+
+/*
+ * If the parent process has a registered restartable sequences area,
+ * the child inherits it. This only applies when forking a process,
+ * not a thread. If the parent fork()s in the middle of a restartable
+ * sequence, set the resume notifier to force the child to retry.
+ */
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+{
+	if (clone_flags & CLONE_THREAD) {
+		t->rseq = NULL;
+		t->rseq_len = 0;
+		t->rseq_sig = 0;
+		t->rseq_event_mask = 0;
+	} else {
+		t->rseq = current->rseq;
+		t->rseq_len = current->rseq_len;
+		t->rseq_sig = current->rseq_sig;
+		t->rseq_event_mask = current->rseq_event_mask;
+		rseq_preempt(t);
+	}
+}
+
+static inline void rseq_execve(struct task_struct *t)
+{
+	t->rseq = NULL;
+	t->rseq_len = 0;
+	t->rseq_sig = 0;
+	t->rseq_event_mask = 0;
+}
+
+#else
+
+static inline void rseq_set_notify_resume(struct task_struct *t)
+{
+}
+static inline void rseq_handle_notify_resume(struct pt_regs *regs)
+{
+}
+static inline void rseq_signal_deliver(struct pt_regs *regs)
+{
+}
+static inline void rseq_preempt(struct task_struct *t)
+{
+}
+static inline void rseq_migrate(struct task_struct *t)
+{
+}
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+{
+}
+static inline void rseq_execve(struct task_struct *t)
+{
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_RSEQ
+
+void rseq_syscall(struct pt_regs *regs);
+
+#else
+
+static inline void rseq_syscall(struct pt_regs *regs)
+{
+}
+
+#endif
+
 #endif
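
The hooks above only take effect once an architecture wires them into its entry code. As a hedged sketch of that wiring (modeled on the separate x86 enablement patch; the function names here are illustrative and not part of this diff), signal delivery and the TIF_NOTIFY_RESUME work loop would call into rseq like this:

	/* Hypothetical arch glue -- a sketch, not part of this patch. */

	/* On signal delivery, before building the signal frame: */
	static void arch_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
	{
		/* Abort a critical section so the handler never sees it half-done. */
		rseq_signal_deliver(regs);
		/* ... set up the signal frame and handler ... */
	}

	/* On return to user-space, when TIF_NOTIFY_RESUME is set: */
	static void arch_exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
	{
		if (cached_flags & _TIF_NOTIFY_RESUME) {
			clear_thread_flag(TIF_NOTIFY_RESUME);
			tracehook_notify_resume(regs);
			/* Fixes up regs->ip and refreshes the cpu_id fields. */
			rseq_handle_notify_resume(regs);
		}
	}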
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 390e814fdc8d..73810808cdf2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -66,6 +66,7 @@ struct old_linux_dirent;
 struct perf_event_attr;
 struct file_handle;
 struct sigaltstack;
+struct rseq;
 union bpf_attr;
 
 #include <linux/types.h>
@@ -897,7 +898,8 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
 asmlinkage long sys_pkey_free(int pkey);
 asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
-
+asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
+			 int flags, uint32_t sig);
 
 /*
  * Architecture-specific system calls
diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h
new file mode 100644
index 000000000000..a04a64bc1a00
--- /dev/null
+++ b/include/trace/events/rseq.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rseq
+
+#if !defined(_TRACE_RSEQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RSEQ_H
+
+#include <linux/tracepoint.h>
+#include <linux/types.h>
+
+TRACE_EVENT(rseq_update,
+
+	TP_PROTO(struct task_struct *t),
+
+	TP_ARGS(t),
+
+	TP_STRUCT__entry(
+		__field(s32, cpu_id)
+	),
+
+	TP_fast_assign(
+		__entry->cpu_id = raw_smp_processor_id();
+	),
+
+	TP_printk("cpu_id=%d", __entry->cpu_id)
+);
+
+TRACE_EVENT(rseq_ip_fixup,
+
+	TP_PROTO(unsigned long regs_ip, unsigned long start_ip,
+		 unsigned long post_commit_offset, unsigned long abort_ip),
+
+	TP_ARGS(regs_ip, start_ip, post_commit_offset, abort_ip),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, regs_ip)
+		__field(unsigned long, start_ip)
+		__field(unsigned long, post_commit_offset)
+		__field(unsigned long, abort_ip)
+	),
+
+	TP_fast_assign(
+		__entry->regs_ip = regs_ip;
+		__entry->start_ip = start_ip;
+		__entry->post_commit_offset = post_commit_offset;
+		__entry->abort_ip = abort_ip;
+	),
+
+	TP_printk("regs_ip=0x%lx start_ip=0x%lx post_commit_offset=%lu abort_ip=0x%lx",
+		  __entry->regs_ip, __entry->start_ip,
+		  __entry->post_commit_offset, __entry->abort_ip)
+);
+
+#endif /* _TRACE_RSEQ_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
new file mode 100644
index 000000000000..d620fa43756c
--- /dev/null
+++ b/include/uapi/linux/rseq.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_RSEQ_H
+#define _UAPI_LINUX_RSEQ_H
+
+/*
+ * linux/rseq.h
+ *
+ * Restartable sequences system call API
+ *
+ * Copyright (c) 2015-2018 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include <stdint.h>
+#endif
+
+#include <linux/types_32_64.h>
+
+enum rseq_cpu_id_state {
+	RSEQ_CPU_ID_UNINITIALIZED		= -1,
+	RSEQ_CPU_ID_REGISTRATION_FAILED		= -2,
+};
+
+enum rseq_flags {
+	RSEQ_FLAG_UNREGISTER = (1 << 0),
+};
+
+enum rseq_cs_flags_bit {
+	RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT	= 0,
+	RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT	= 1,
+	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT	= 2,
+};
+
+enum rseq_cs_flags {
+	RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT	=
+		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT),
+	RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL	=
+		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
+	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE	=
+		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+};
+
+/*
+ * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always
+ * contained within a single cache-line. It is usually declared as
+ * link-time constant data.
+ */
+struct rseq_cs {
+	/* Version of this structure. */
+	__u32 version;
+	/* enum rseq_cs_flags */
+	__u32 flags;
+	LINUX_FIELD_u32_u64(start_ip);
+	/* Offset from start_ip. */
+	LINUX_FIELD_u32_u64(post_commit_offset);
+	LINUX_FIELD_u32_u64(abort_ip);
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+/*
+ * struct rseq is aligned on 4 * 8 bytes to ensure it is always
+ * contained within a single cache-line.
+ *
+ * A single struct rseq per thread is allowed.
+ */
+struct rseq {
+	/*
+	 * Restartable sequences cpu_id_start field. Updated by the
+	 * kernel, and read by user-space with single-copy atomicity
+	 * semantics. Aligned on 32-bit. Always contains a value in the
+	 * range of possible CPUs, although the value may not be the
+	 * actual current CPU (e.g. if rseq is not initialized). This
+	 * CPU number value should always be compared against the value
+	 * of the cpu_id field before performing a rseq commit or
+	 * returning a value read from a data structure indexed using
+	 * the cpu_id_start value.
+	 */
+	__u32 cpu_id_start;
+	/*
+	 * Restartable sequences cpu_id field. Updated by the kernel,
+	 * and read by user-space with single-copy atomicity semantics.
+	 * Aligned on 32-bit. Values RSEQ_CPU_ID_UNINITIALIZED and
+	 * RSEQ_CPU_ID_REGISTRATION_FAILED have a special semantic: the
+	 * former means "rseq uninitialized", and the latter means "rseq
+	 * initialization failed". This value is meant to be read within
+	 * rseq critical sections and compared with the cpu_id_start
+	 * value previously read, before performing the commit instruction,
+	 * or read and compared with the cpu_id_start value before returning
+	 * a value loaded from a data structure indexed using the
+	 * cpu_id_start value.
+	 */
+	__u32 cpu_id;
+	/*
+	 * Restartable sequences rseq_cs field.
+	 *
+	 * Contains NULL when no critical section is active for the current
+	 * thread, or holds a pointer to the currently active struct rseq_cs.
+	 *
+	 * Updated by user-space, which sets the address of the currently
+	 * active rseq_cs at the beginning of assembly instruction sequence
+	 * block, and set to NULL by the kernel when it restarts an assembly
+	 * instruction sequence block, as well as when the kernel detects that
+	 * it is preempting or delivering a signal outside of the range
+	 * targeted by the rseq_cs. Also needs to be set to NULL by user-space
+	 * before reclaiming memory that contains the targeted struct rseq_cs.
+	 *
+	 * Read and set by the kernel with single-copy atomicity semantics.
+	 * Set by user-space with single-copy atomicity semantics. Aligned
+	 * on 64-bit.
+	 */
+	LINUX_FIELD_u32_u64(rseq_cs);
+	/*
+	 * - RSEQ_DISABLE flag:
+	 *
+	 *   Fallback fast-track flag for single-stepping.
+	 *   Set by user-space if lack of progress is detected.
+	 *   Cleared by user-space after rseq finish.
+	 *   Read by the kernel.
+	 * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
+	 *   Inhibit instruction sequence block restart and event
+	 *   counter increment on preemption for this thread.
+	 * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
+	 *   Inhibit instruction sequence block restart and event
+	 *   counter increment on signal delivery for this thread.
+	 * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
+	 *   Inhibit instruction sequence block restart and event
+	 *   counter increment on migration for this thread.
+	 */
+	__u32 flags;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+#endif /* _UAPI_LINUX_RSEQ_H */
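
For reference, registering the ABI above from user-space looks roughly like this. A minimal sketch, assuming __NR_rseq is wired up for the architecture; RSEQ_SIG is an arbitrary application-chosen signature that must match the u32 placed before each abort handler:

	#include <linux/rseq.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <stdio.h>

	#define RSEQ_SIG	0x53053053

	/* One registration per thread; alignment comes from the uapi type. */
	static __thread struct rseq rseq_abi;

	static int rseq_register_current_thread(void)
	{
		return syscall(__NR_rseq, &rseq_abi, sizeof(rseq_abi), 0, RSEQ_SIG);
	}

	static int rseq_unregister_current_thread(void)
	{
		return syscall(__NR_rseq, &rseq_abi, sizeof(rseq_abi),
			       RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
	}

	int main(void)
	{
		if (rseq_register_current_thread()) {
			perror("rseq");
			return 1;
		}
		printf("running on cpu %u\n", rseq_abi.cpu_id_start);
		return rseq_unregister_current_thread();
	}

Per the sys_rseq() implementation below, re-registering the same address returns -EBUSY, and unregistration must pass the same address, length and signature used at registration time.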
diff --git a/init/Kconfig b/init/Kconfig
index 18b151f0ddc1..33ec06fddaaa 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1417,6 +1417,29 @@ config ARCH_HAS_MEMBARRIER_CALLBACKS
 config ARCH_HAS_MEMBARRIER_SYNC_CORE
 	bool
 
+config RSEQ
+	bool "Enable rseq() system call" if EXPERT
+	default y
+	depends on HAVE_RSEQ
+	select MEMBARRIER
+	help
+	  Enable the restartable sequences system call. It provides a
+	  user-space cache for the current CPU number value, which
+	  speeds up getting the current CPU number from user-space,
+	  as well as an ABI to speed up user-space operations on
+	  per-CPU data.
+
+	  If unsure, say Y.
+
+config DEBUG_RSEQ
+	default n
+	bool "Enable debugging of rseq() system call" if EXPERT
+	depends on RSEQ && DEBUG_KERNEL
+	help
+	  Enable extra debugging checks for the rseq system call.
+
+	  If unsure, say N.
+
 config EMBEDDED
 	bool "Embedded system"
 	option allnoconfig_y
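
The "user-space cache for the current CPU number" mentioned in the help text enables a fast sched_getcpu() replacement. A hedged sketch, reusing the registered rseq_abi TLS area from the earlier example:

	#include <sched.h>

	static inline int fast_getcpu(void)
	{
		/* Single-copy atomic read of the kernel-updated field. */
		int cpu = (int)__atomic_load_n(&rseq_abi.cpu_id, __ATOMIC_RELAXED);

		/* Negative values mean uninitialized/failed; fall back to the syscall. */
		return cpu >= 0 ? cpu : sched_getcpu();
	}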
diff --git a/kernel/Makefile b/kernel/Makefile
index f85ae5dfa474..7085c841c413 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -113,6 +113,7 @@ obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
 
 obj-$(CONFIG_HAS_IOMEM) += memremap.o
+obj-$(CONFIG_RSEQ) += rseq.o
 
 $(obj)/configs.o: $(obj)/config_data.h
 
diff --git a/kernel/fork.c b/kernel/fork.c
index a5d21c42acfc..70992bfeba81 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1899,6 +1899,8 @@ static __latent_entropy struct task_struct *copy_process(
 	 */
 	copy_seccomp(p);
 
+	rseq_fork(p, clone_flags);
+
 	/*
 	 * Process group and session signals need to be delivered to just the
 	 * parent before the fork or both the parent and the child after the
diff --git a/kernel/rseq.c b/kernel/rseq.c
new file mode 100644
index 000000000000..ae306f90c514
--- /dev/null
+++ b/kernel/rseq.c
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Restartable sequences system call
+ *
+ * Copyright (C) 2015, Google, Inc.,
+ * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
+ * Copyright (C) 2015-2018, EfficiOS Inc.,
+ * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/rseq.h>
+#include <linux/types.h>
+#include <asm/ptrace.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/rseq.h>
+
+#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
+				       RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)
+
+/*
+ * Restartable sequences are a lightweight interface that allows
+ * user-level code to be executed atomically relative to scheduler
+ * preemption and signal delivery. Typically used for implementing
+ * per-cpu operations.
+ *
+ * It allows user-space to perform update operations on per-cpu data
+ * without requiring heavy-weight atomic operations.
+ *
+ * Detailed algorithm of rseq user-space assembly sequences:
+ *
+ *                     init(rseq_cs)
+ *                     cpu = TLS->rseq::cpu_id_start
+ *   [1]               TLS->rseq::rseq_cs = rseq_cs
+ *   [start_ip]        ----------------------------
+ *   [2]               if (cpu != TLS->rseq::cpu_id)
+ *                             goto abort_ip;
+ *   [3]               <last_instruction_in_cs>
+ *   [post_commit_ip]  ----------------------------
+ *
+ * The address of jump target abort_ip must be outside the critical
+ * region, i.e.:
+ *
+ *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
+ *
+ * Steps [2]-[3] (inclusive) need to be a sequence of instructions in
+ * userspace that can handle being interrupted between any of those
+ * instructions, and then resumed to the abort_ip.
+ *
+ * 1.  Userspace stores the address of the struct rseq_cs assembly
+ *     block descriptor into the rseq_cs field of the registered
+ *     struct rseq TLS area. This update is performed through a single
+ *     store within the inline assembly instruction sequence.
+ *     [start_ip]
+ *
+ * 2.  Userspace tests to check whether the current cpu_id field
+ *     matches the cpu number loaded before start_ip, branching to
+ *     abort_ip in case of a mismatch.
+ *
+ *     If the sequence is preempted or interrupted by a signal
+ *     at or after start_ip and before post_commit_ip, then the kernel
+ *     clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
+ *     ip to abort_ip before returning to user-space, so the preempted
+ *     execution resumes at abort_ip.
+ *
+ * 3.  Userspace critical section final instruction before
+ *     post_commit_ip is the commit. The critical section is
+ *     self-terminating.
+ *     [post_commit_ip]
+ *
+ * 4.  <success>
+ *
+ * On failure at [2], or if interrupted by preempt or signal delivery
+ * between [1] and [3]:
+ *
+ *     [abort_ip]
+ * F1. <failure>
+ */
+
+static int rseq_update_cpu_id(struct task_struct *t)
+{
+	u32 cpu_id = raw_smp_processor_id();
+
+	if (__put_user(cpu_id, &t->rseq->cpu_id_start))
+		return -EFAULT;
+	if (__put_user(cpu_id, &t->rseq->cpu_id))
+		return -EFAULT;
+	trace_rseq_update(t);
+	return 0;
+}
+
+static int rseq_reset_rseq_cpu_id(struct task_struct *t)
+{
+	u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+
+	/*
+	 * Reset cpu_id_start to its initial state (0).
+	 */
+	if (__put_user(cpu_id_start, &t->rseq->cpu_id_start))
+		return -EFAULT;
+	/*
+	 * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming
+	 * in after unregistration can figure out that rseq needs to be
+	 * registered again.
+	 */
+	if (__put_user(cpu_id, &t->rseq->cpu_id))
+		return -EFAULT;
+	return 0;
+}
+
+static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
+{
+	struct rseq_cs __user *urseq_cs;
+	unsigned long ptr;
+	u32 __user *usig;
+	u32 sig;
+	int ret;
+
+	ret = __get_user(ptr, &t->rseq->rseq_cs);
+	if (ret)
+		return ret;
+	if (!ptr) {
+		memset(rseq_cs, 0, sizeof(*rseq_cs));
+		return 0;
+	}
+	urseq_cs = (struct rseq_cs __user *)ptr;
+	if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
+		return -EFAULT;
+	if (rseq_cs->version > 0)
+		return -EINVAL;
+
+	/* Ensure that abort_ip is not in the critical section. */
+	if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
+		return -EINVAL;
+
+	usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32));
+	ret = get_user(sig, usig);
+	if (ret)
+		return ret;
+
+	if (current->rseq_sig != sig) {
+		printk_ratelimited(KERN_WARNING
+			"Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
+			sig, current->rseq_sig, current->pid, usig);
+		return -EPERM;
+	}
+	return 0;
+}
+
+static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
+{
+	u32 flags, event_mask;
+	int ret;
+
+	/* Get thread flags. */
+	ret = __get_user(flags, &t->rseq->flags);
+	if (ret)
+		return ret;
+
+	/* Take critical section flags into account. */
+	flags |= cs_flags;
+
+	/*
+	 * Restart on signal can only be inhibited when restart on
+	 * preempt and restart on migrate are inhibited too. Otherwise,
+	 * a preempted signal handler could fail to restart the prior
+	 * execution context on sigreturn.
+	 */
+	if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) &&
+		     (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) !=
+		     RSEQ_CS_PREEMPT_MIGRATE_FLAGS))
+		return -EINVAL;
+
+	/*
+	 * Load and clear event mask atomically with respect to
+	 * scheduler preemption.
+	 */
+	preempt_disable();
+	event_mask = t->rseq_event_mask;
+	t->rseq_event_mask = 0;
+	preempt_enable();
+
+	return !!(event_mask & ~flags);
+}
+
+static int clear_rseq_cs(struct task_struct *t)
+{
+	/*
+	 * The rseq_cs field is set to NULL on preemption or signal
+	 * delivery on top of rseq assembly block, as well as on top
+	 * of code outside of the rseq assembly block. This performs
+	 * a lazy clear of the rseq_cs field.
+	 *
+	 * Set rseq_cs to NULL with single-copy atomicity.
+	 */
+	return __put_user(0UL, &t->rseq->rseq_cs);
+}
+
+/*
+ * Unsigned comparison will be true when ip >= start_ip, and when
+ * ip < start_ip + post_commit_offset.
+ */
+static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
+{
+	return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
+}
+
+static int rseq_ip_fixup(struct pt_regs *regs)
+{
+	unsigned long ip = instruction_pointer(regs);
+	struct task_struct *t = current;
+	struct rseq_cs rseq_cs;
+	int ret;
+
+	ret = rseq_get_rseq_cs(t, &rseq_cs);
+	if (ret)
+		return ret;
+
+	/*
+	 * Handle potentially not being within a critical section.
+	 * If not nested over a rseq critical section, restart is useless.
+	 * Clear the rseq_cs pointer and return.
+	 */
+	if (!in_rseq_cs(ip, &rseq_cs))
+		return clear_rseq_cs(t);
+	ret = rseq_need_restart(t, rseq_cs.flags);
+	if (ret <= 0)
+		return ret;
+	ret = clear_rseq_cs(t);
+	if (ret)
+		return ret;
+	trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
+			    rseq_cs.abort_ip);
+	instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
+	return 0;
+}
+
+/*
+ * This resume handler must always be executed between any of:
+ * - preemption,
+ * - signal delivery,
+ * and return to user-space.
+ *
+ * This is how we can ensure that the entire rseq critical section,
+ * consisting of both the C part and the assembly instruction sequence,
+ * will issue the commit instruction only if executed atomically with
+ * respect to other threads scheduled on the same CPU, and with respect
+ * to signal handlers.
+ */
+void __rseq_handle_notify_resume(struct pt_regs *regs)
+{
+	struct task_struct *t = current;
+	int ret;
+
+	if (unlikely(t->flags & PF_EXITING))
+		return;
+	if (unlikely(!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq))))
+		goto error;
+	ret = rseq_ip_fixup(regs);
+	if (unlikely(ret < 0))
+		goto error;
+	if (unlikely(rseq_update_cpu_id(t)))
+		goto error;
+	return;
+
+error:
+	force_sig(SIGSEGV, t);
+}
+
+#ifdef CONFIG_DEBUG_RSEQ
+
+/*
+ * Terminate the process if a syscall is issued within a restartable
+ * sequence.
+ */
+void rseq_syscall(struct pt_regs *regs)
+{
+	unsigned long ip = instruction_pointer(regs);
+	struct task_struct *t = current;
+	struct rseq_cs rseq_cs;
+
+	if (!t->rseq)
+		return;
+	if (!access_ok(VERIFY_READ, t->rseq, sizeof(*t->rseq)) ||
+	    rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
+		force_sig(SIGSEGV, t);
+}
+
+#endif
+
+/*
+ * sys_rseq - setup restartable sequences for caller thread.
+ */
+SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
+		int, flags, u32, sig)
+{
+	int ret;
+
+	if (flags & RSEQ_FLAG_UNREGISTER) {
+		/* Unregister rseq for current thread. */
+		if (current->rseq != rseq || !current->rseq)
+			return -EINVAL;
+		if (current->rseq_len != rseq_len)
+			return -EINVAL;
+		if (current->rseq_sig != sig)
+			return -EPERM;
+		ret = rseq_reset_rseq_cpu_id(current);
+		if (ret)
+			return ret;
+		current->rseq = NULL;
+		current->rseq_len = 0;
+		current->rseq_sig = 0;
+		return 0;
+	}
+
+	if (unlikely(flags))
+		return -EINVAL;
+
+	if (current->rseq) {
+		/*
+		 * If rseq is already registered, check whether
+		 * the provided address differs from the prior
+		 * one.
+		 */
+		if (current->rseq != rseq || current->rseq_len != rseq_len)
+			return -EINVAL;
+		if (current->rseq_sig != sig)
+			return -EPERM;
+		/* Already registered. */
+		return -EBUSY;
+	}
+
+	/*
+	 * If there was no rseq previously registered,
+	 * ensure the provided rseq is properly aligned and valid.
+	 */
+	if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
+	    rseq_len != sizeof(*rseq))
+		return -EINVAL;
+	if (!access_ok(VERIFY_WRITE, rseq, rseq_len))
+		return -EFAULT;
+	current->rseq = rseq;
+	current->rseq_len = rseq_len;
+	current->rseq_sig = sig;
+	/*
+	 * If rseq was previously inactive, and has just been
+	 * registered, ensure the cpu_id_start and cpu_id fields
+	 * are updated before returning to user-space.
+	 */
+	rseq_set_notify_resume(current);
+
+	return 0;
+}
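
To make steps [1]-[3] of the algorithm comment concrete, here is a hedged user-space sketch of a restartable per-CPU counter increment on x86-64. It assumes gcc's asm goto, the rseq_abi TLS area and RSEQ_SIG value from the registration example above, and borrows the __rseq_cs/__rseq_failure section names used by the kernel selftests; it is illustrative, not a reference implementation:

	/* Increment this CPU's slot; returns -1 if aborted (caller retries). */
	static int percpu_inc(int *counts /* one slot per possible CPU */)
	{
		/* Requires a successful rseq registration for this thread. */
		int cpu = (int)__atomic_load_n(&rseq_abi.cpu_id_start, __ATOMIC_RELAXED);

		__asm__ __volatile__ goto (
			/* struct rseq_cs descriptor: version, flags, start_ip,
			 * post_commit_offset, abort_ip. */
			".pushsection __rseq_cs, \"aw\"\n\t"
			".balign 32\n\t"
			"3:\n\t"
			".long 0x0, 0x0\n\t"
			".quad 1f, 2f - 1f, 4f\n\t"
			".popsection\n\t"
			/* [1] Publish the descriptor in TLS->rseq::rseq_cs. */
			"leaq 3b(%%rip), %%rax\n\t"
			"movq %%rax, %[rseq_cs]\n\t"
			"1:\n\t"
			/* [2] Abort if this thread migrated since loading cpu. */
			"cmpl %[cpu], %[cpu_id]\n\t"
			"jnz 4f\n\t"
			/* [3] Commit: the single store ending the critical section. */
			"incl (%[slot])\n\t"
			"2:\n\t"
			/* Abort handler, preceded by the signature the kernel checks. */
			".pushsection __rseq_failure, \"ax\"\n\t"
			".long 0x53053053\n\t"	/* RSEQ_SIG */
			"4:\n\t"
			"jmp %l[abort]\n\t"
			".popsection\n\t"
			:
			: [rseq_cs]	"m" (rseq_abi.rseq_cs),
			  [cpu_id]	"m" (rseq_abi.cpu_id),
			  [cpu]		"r" (cpu),
			  [slot]	"r" (&counts[cpu])
			: "rax", "memory", "cc"
			: abort);
		return 0;
	abort:
		return -1;
	}

Note that the success path does not clear rseq_abi.rseq_cs: as the clear_rseq_cs() comment above explains, the kernel lazily clears the pointer the next time it preempts the thread or delivers a signal outside the descriptor's ip range.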
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e9866f86f304..a98d54cd5535 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1191,6 +1191,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	if (p->sched_class->migrate_task_rq)
 		p->sched_class->migrate_task_rq(p);
 	p->se.nr_migrations++;
+	rseq_migrate(p);
 	perf_event_task_migrate(p);
 }
 
@@ -2634,6 +2635,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 {
 	sched_info_switch(rq, prev, next);
 	perf_event_task_sched_out(prev, next);
+	rseq_preempt(prev);
 	fire_sched_out_preempt_notifiers(prev, next);
 	prepare_task(next);
 	prepare_arch_switch(next);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 183169c2a75b..86f832d6ff6f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -432,3 +432,6 @@ COND_SYSCALL(setresgid16);
 COND_SYSCALL(setresuid16);
 COND_SYSCALL(setreuid16);
 COND_SYSCALL(setuid16);
+
+/* restartable sequence */
+COND_SYSCALL(rseq);