diff options
-rw-r--r-- | MAINTAINERS | 11 | ||||
-rw-r--r-- | arch/Kconfig | 7 | ||||
-rw-r--r-- | fs/exec.c | 1 | ||||
-rw-r--r-- | include/linux/sched.h | 134 | ||||
-rw-r--r-- | include/linux/syscalls.h | 4 | ||||
-rw-r--r-- | include/trace/events/rseq.h | 57 | ||||
-rw-r--r-- | include/uapi/linux/rseq.h | 133 | ||||
-rw-r--r-- | init/Kconfig | 23 | ||||
-rw-r--r-- | kernel/Makefile | 1 | ||||
-rw-r--r-- | kernel/fork.c | 2 | ||||
-rw-r--r-- | kernel/rseq.c | 357 | ||||
-rw-r--r-- | kernel/sched/core.c | 2 | ||||
-rw-r--r-- | kernel/sys_ni.c | 3 |
13 files changed, 734 insertions, 1 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index aa635837a6af..a384243d911b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
@@ -11976,6 +11976,17 @@ F: include/dt-bindings/reset/ | |||
11976 | F: include/linux/reset.h | 11976 | F: include/linux/reset.h |
11977 | F: include/linux/reset-controller.h | 11977 | F: include/linux/reset-controller.h |
11978 | 11978 | ||
11979 | RESTARTABLE SEQUENCES SUPPORT | ||
11980 | M: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> | ||
11981 | M: Peter Zijlstra <peterz@infradead.org> | ||
11982 | M: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> | ||
11983 | M: Boqun Feng <boqun.feng@gmail.com> | ||
11984 | L: linux-kernel@vger.kernel.org | ||
11985 | S: Supported | ||
11986 | F: kernel/rseq.c | ||
11987 | F: include/uapi/linux/rseq.h | ||
11988 | F: include/trace/events/rseq.h | ||
11989 | |||
11979 | RFKILL | 11990 | RFKILL |
11980 | M: Johannes Berg <johannes@sipsolutions.net> | 11991 | M: Johannes Berg <johannes@sipsolutions.net> |
11981 | L: linux-wireless@vger.kernel.org | 11992 | L: linux-wireless@vger.kernel.org |
diff --git a/arch/Kconfig b/arch/Kconfig index b695a3e3e922..095ba99968c1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig | |||
@@ -272,6 +272,13 @@ config HAVE_REGS_AND_STACK_ACCESS_API | |||
272 | declared in asm/ptrace.h | 272 | declared in asm/ptrace.h |
273 | For example the kprobes-based event tracer needs this API. | 273 | For example the kprobes-based event tracer needs this API. |
274 | 274 | ||
275 | config HAVE_RSEQ | ||
276 | bool | ||
277 | depends on HAVE_REGS_AND_STACK_ACCESS_API | ||
278 | help | ||
279 | This symbol should be selected by an architecture if it | ||
280 | supports an implementation of restartable sequences. | ||
281 | |||
275 | config HAVE_CLK | 282 | config HAVE_CLK |
276 | bool | 283 | bool |
277 | help | 284 | help |
@@ -1822,6 +1822,7 @@ static int do_execveat_common(int fd, struct filename *filename, | |||
1822 | current->fs->in_exec = 0; | 1822 | current->fs->in_exec = 0; |
1823 | current->in_execve = 0; | 1823 | current->in_execve = 0; |
1824 | membarrier_execve(current); | 1824 | membarrier_execve(current); |
1825 | rseq_execve(current); | ||
1825 | acct_update_integrals(current); | 1826 | acct_update_integrals(current); |
1826 | task_numa_free(current); | 1827 | task_numa_free(current); |
1827 | free_bprm(bprm); | 1828 | free_bprm(bprm); |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 14e4f9c12337..3aa4fcb74e76 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/signal_types.h> | 27 | #include <linux/signal_types.h> |
28 | #include <linux/mm_types_task.h> | 28 | #include <linux/mm_types_task.h> |
29 | #include <linux/task_io_accounting.h> | 29 | #include <linux/task_io_accounting.h> |
30 | #include <linux/rseq.h> | ||
30 | 31 | ||
31 | /* task_struct member predeclarations (sorted alphabetically): */ | 32 | /* task_struct member predeclarations (sorted alphabetically): */ |
32 | struct audit_context; | 33 | struct audit_context; |
@@ -1047,6 +1048,17 @@ struct task_struct { | |||
1047 | unsigned long numa_pages_migrated; | 1048 | unsigned long numa_pages_migrated; |
1048 | #endif /* CONFIG_NUMA_BALANCING */ | 1049 | #endif /* CONFIG_NUMA_BALANCING */ |
1049 | 1050 | ||
1051 | #ifdef CONFIG_RSEQ | ||
1052 | struct rseq __user *rseq; | ||
1053 | u32 rseq_len; | ||
1054 | u32 rseq_sig; | ||
1055 | /* | ||
1056 | * RmW on rseq_event_mask must be performed atomically | ||
1057 | * with respect to preemption. | ||
1058 | */ | ||
1059 | unsigned long rseq_event_mask; | ||
1060 | #endif | ||
1061 | |||
1050 | struct tlbflush_unmap_batch tlb_ubc; | 1062 | struct tlbflush_unmap_batch tlb_ubc; |
1051 | 1063 | ||
1052 | struct rcu_head rcu; | 1064 | struct rcu_head rcu; |
@@ -1757,4 +1769,126 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); | |||
1757 | #define TASK_SIZE_OF(tsk) TASK_SIZE | 1769 | #define TASK_SIZE_OF(tsk) TASK_SIZE |
1758 | #endif | 1770 | #endif |
1759 | 1771 | ||
1772 | #ifdef CONFIG_RSEQ | ||
1773 | |||
1774 | /* | ||
1775 | * Map the event mask on the user-space ABI enum rseq_cs_flags | ||
1776 | * for direct mask checks. | ||
1777 | */ | ||
1778 | enum rseq_event_mask_bits { | ||
1779 | RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, | ||
1780 | RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, | ||
1781 | RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, | ||
1782 | }; | ||
1783 | |||
1784 | enum rseq_event_mask { | ||
1785 | RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), | ||
1786 | RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), | ||
1787 | RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), | ||
1788 | }; | ||
1789 | |||
1790 | static inline void rseq_set_notify_resume(struct task_struct *t) | ||
1791 | { | ||
1792 | if (t->rseq) | ||
1793 | set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); | ||
1794 | } | ||
1795 | |||
1796 | void __rseq_handle_notify_resume(struct pt_regs *regs); | ||
1797 | |||
1798 | static inline void rseq_handle_notify_resume(struct pt_regs *regs) | ||
1799 | { | ||
1800 | if (current->rseq) | ||
1801 | __rseq_handle_notify_resume(regs); | ||
1802 | } | ||
1803 | |||
1804 | static inline void rseq_signal_deliver(struct pt_regs *regs) | ||
1805 | { | ||
1806 | preempt_disable(); | ||
1807 | __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); | ||
1808 | preempt_enable(); | ||
1809 | rseq_handle_notify_resume(regs); | ||
1810 | } | ||
1811 | |||
1812 | /* rseq_preempt() requires preemption to be disabled. */ | ||
1813 | static inline void rseq_preempt(struct task_struct *t) | ||
1814 | { | ||
1815 | __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); | ||
1816 | rseq_set_notify_resume(t); | ||
1817 | } | ||
1818 | |||
1819 | /* rseq_migrate() requires preemption to be disabled. */ | ||
1820 | static inline void rseq_migrate(struct task_struct *t) | ||
1821 | { | ||
1822 | __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); | ||
1823 | rseq_set_notify_resume(t); | ||
1824 | } | ||
1825 | |||
1826 | /* | ||
1827 | * If the parent process has a registered restartable sequences area, the | ||
1828 | * child inherits it. This only applies when forking a process, not a thread. | ||
1829 | * If the parent calls fork() in the middle of a restartable sequence, set | ||
1830 | * the resume notifier to force the child to retry. | ||
1831 | */ | ||
1832 | static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) | ||
1833 | { | ||
1834 | if (clone_flags & CLONE_THREAD) { | ||
1835 | t->rseq = NULL; | ||
1836 | t->rseq_len = 0; | ||
1837 | t->rseq_sig = 0; | ||
1838 | t->rseq_event_mask = 0; | ||
1839 | } else { | ||
1840 | t->rseq = current->rseq; | ||
1841 | t->rseq_len = current->rseq_len; | ||
1842 | t->rseq_sig = current->rseq_sig; | ||
1843 | t->rseq_event_mask = current->rseq_event_mask; | ||
1844 | rseq_preempt(t); | ||
1845 | } | ||
1846 | } | ||
1847 | |||
1848 | static inline void rseq_execve(struct task_struct *t) | ||
1849 | { | ||
1850 | t->rseq = NULL; | ||
1851 | t->rseq_len = 0; | ||
1852 | t->rseq_sig = 0; | ||
1853 | t->rseq_event_mask = 0; | ||
1854 | } | ||
1855 | |||
1856 | #else | ||
1857 | |||
1858 | static inline void rseq_set_notify_resume(struct task_struct *t) | ||
1859 | { | ||
1860 | } | ||
1861 | static inline void rseq_handle_notify_resume(struct pt_regs *regs) | ||
1862 | { | ||
1863 | } | ||
1864 | static inline void rseq_signal_deliver(struct pt_regs *regs) | ||
1865 | { | ||
1866 | } | ||
1867 | static inline void rseq_preempt(struct task_struct *t) | ||
1868 | { | ||
1869 | } | ||
1870 | static inline void rseq_migrate(struct task_struct *t) | ||
1871 | { | ||
1872 | } | ||
1873 | static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) | ||
1874 | { | ||
1875 | } | ||
1876 | static inline void rseq_execve(struct task_struct *t) | ||
1877 | { | ||
1878 | } | ||
1879 | |||
1880 | #endif | ||
1881 | |||
1882 | #ifdef CONFIG_DEBUG_RSEQ | ||
1883 | |||
1884 | void rseq_syscall(struct pt_regs *regs); | ||
1885 | |||
1886 | #else | ||
1887 | |||
1888 | static inline void rseq_syscall(struct pt_regs *regs) | ||
1889 | { | ||
1890 | } | ||
1891 | |||
1892 | #endif | ||
1893 | |||
1760 | #endif | 1894 | #endif |
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 390e814fdc8d..73810808cdf2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h | |||
@@ -66,6 +66,7 @@ struct old_linux_dirent; | |||
66 | struct perf_event_attr; | 66 | struct perf_event_attr; |
67 | struct file_handle; | 67 | struct file_handle; |
68 | struct sigaltstack; | 68 | struct sigaltstack; |
69 | struct rseq; | ||
69 | union bpf_attr; | 70 | union bpf_attr; |
70 | 71 | ||
71 | #include <linux/types.h> | 72 | #include <linux/types.h> |
@@ -897,7 +898,8 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val); | |||
897 | asmlinkage long sys_pkey_free(int pkey); | 898 | asmlinkage long sys_pkey_free(int pkey); |
898 | asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, | 899 | asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, |
899 | unsigned mask, struct statx __user *buffer); | 900 | unsigned mask, struct statx __user *buffer); |
900 | 901 | asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, | |
902 | int flags, uint32_t sig); | ||
901 | 903 | ||
902 | /* | 904 | /* |
903 | * Architecture-specific system calls | 905 | * Architecture-specific system calls |
diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h new file mode 100644 index 000000000000..a04a64bc1a00 --- /dev/null +++ b/include/trace/events/rseq.h | |||
@@ -0,0 +1,57 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0+ */ | ||
2 | #undef TRACE_SYSTEM | ||
3 | #define TRACE_SYSTEM rseq | ||
4 | |||
5 | #if !defined(_TRACE_RSEQ_H) || defined(TRACE_HEADER_MULTI_READ) | ||
6 | #define _TRACE_RSEQ_H | ||
7 | |||
8 | #include <linux/tracepoint.h> | ||
9 | #include <linux/types.h> | ||
10 | |||
11 | TRACE_EVENT(rseq_update, | ||
12 | |||
13 | TP_PROTO(struct task_struct *t), | ||
14 | |||
15 | TP_ARGS(t), | ||
16 | |||
17 | TP_STRUCT__entry( | ||
18 | __field(s32, cpu_id) | ||
19 | ), | ||
20 | |||
21 | TP_fast_assign( | ||
22 | __entry->cpu_id = raw_smp_processor_id(); | ||
23 | ), | ||
24 | |||
25 | TP_printk("cpu_id=%d", __entry->cpu_id) | ||
26 | ); | ||
27 | |||
28 | TRACE_EVENT(rseq_ip_fixup, | ||
29 | |||
30 | TP_PROTO(unsigned long regs_ip, unsigned long start_ip, | ||
31 | unsigned long post_commit_offset, unsigned long abort_ip), | ||
32 | |||
33 | TP_ARGS(regs_ip, start_ip, post_commit_offset, abort_ip), | ||
34 | |||
35 | TP_STRUCT__entry( | ||
36 | __field(unsigned long, regs_ip) | ||
37 | __field(unsigned long, start_ip) | ||
38 | __field(unsigned long, post_commit_offset) | ||
39 | __field(unsigned long, abort_ip) | ||
40 | ), | ||
41 | |||
42 | TP_fast_assign( | ||
43 | __entry->regs_ip = regs_ip; | ||
44 | __entry->start_ip = start_ip; | ||
45 | __entry->post_commit_offset = post_commit_offset; | ||
46 | __entry->abort_ip = abort_ip; | ||
47 | ), | ||
48 | |||
49 | TP_printk("regs_ip=0x%lx start_ip=0x%lx post_commit_offset=%lu abort_ip=0x%lx", | ||
50 | __entry->regs_ip, __entry->start_ip, | ||
51 | __entry->post_commit_offset, __entry->abort_ip) | ||
52 | ); | ||
53 | |||
54 | #endif /* _TRACE_RSEQ_H */ | ||
55 | |||
56 | /* This part must be outside protection */ | ||
57 | #include <trace/define_trace.h> | ||
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h new file mode 100644 index 000000000000..d620fa43756c --- /dev/null +++ b/include/uapi/linux/rseq.h | |||
@@ -0,0 +1,133 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ | ||
2 | #ifndef _UAPI_LINUX_RSEQ_H | ||
3 | #define _UAPI_LINUX_RSEQ_H | ||
4 | |||
5 | /* | ||
6 | * linux/rseq.h | ||
7 | * | ||
8 | * Restartable sequences system call API | ||
9 | * | ||
10 | * Copyright (c) 2015-2018 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> | ||
11 | */ | ||
12 | |||
13 | #ifdef __KERNEL__ | ||
14 | # include <linux/types.h> | ||
15 | #else | ||
16 | # include <stdint.h> | ||
17 | #endif | ||
18 | |||
19 | #include <linux/types_32_64.h> | ||
20 | |||
21 | enum rseq_cpu_id_state { | ||
22 | RSEQ_CPU_ID_UNINITIALIZED = -1, | ||
23 | RSEQ_CPU_ID_REGISTRATION_FAILED = -2, | ||
24 | }; | ||
25 | |||
26 | enum rseq_flags { | ||
27 | RSEQ_FLAG_UNREGISTER = (1 << 0), | ||
28 | }; | ||
29 | |||
30 | enum rseq_cs_flags_bit { | ||
31 | RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, | ||
32 | RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, | ||
33 | RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, | ||
34 | }; | ||
35 | |||
36 | enum rseq_cs_flags { | ||
37 | RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = | ||
38 | (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT), | ||
39 | RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = | ||
40 | (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), | ||
41 | RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = | ||
42 | (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), | ||
43 | }; | ||
44 | |||
45 | /* | ||
46 | * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always | ||
47 | * contained within a single cache-line. It is usually declared as | ||
48 | * link-time constant data. | ||
49 | */ | ||
50 | struct rseq_cs { | ||
51 | /* Version of this structure. */ | ||
52 | __u32 version; | ||
53 | /* enum rseq_cs_flags */ | ||
54 | __u32 flags; | ||
55 | LINUX_FIELD_u32_u64(start_ip); | ||
56 | /* Offset from start_ip. */ | ||
57 | LINUX_FIELD_u32_u64(post_commit_offset); | ||
58 | LINUX_FIELD_u32_u64(abort_ip); | ||
59 | } __attribute__((aligned(4 * sizeof(__u64)))); | ||
60 | |||
61 | /* | ||
62 | * struct rseq is aligned on 4 * 8 bytes to ensure it is always | ||
63 | * contained within a single cache-line. | ||
64 | * | ||
65 | * A single struct rseq per thread is allowed. | ||
66 | */ | ||
67 | struct rseq { | ||
68 | /* | ||
69 | * Restartable sequences cpu_id_start field. Updated by the | ||
70 | * kernel, and read by user-space with single-copy atomicity | ||
71 | * semantics. Aligned on 32-bit. Always contains a value in the | ||
72 | * range of possible CPUs, although the value may not be the | ||
73 | * actual current CPU (e.g. if rseq is not initialized). This | ||
74 | * CPU number value should always be compared against the value | ||
75 | * of the cpu_id field before performing a rseq commit or | ||
76 | * returning a value read from a data structure indexed using | ||
77 | * the cpu_id_start value. | ||
78 | */ | ||
79 | __u32 cpu_id_start; | ||
80 | /* | ||
81 | * Restartable sequences cpu_id field. Updated by the kernel, | ||
82 | * and read by user-space with single-copy atomicity semantics. | ||
83 | * Aligned on 32-bit. Values RSEQ_CPU_ID_UNINITIALIZED and | ||
84 | * RSEQ_CPU_ID_REGISTRATION_FAILED have a special semantic: the | ||
85 | * former means "rseq uninitialized", and latter means "rseq | ||
86 | * initialization failed". This value is meant to be read within | ||
87 | * rseq critical sections and compared with the cpu_id_start | ||
88 | * value previously read, before performing the commit instruction, | ||
89 | * or read and compared with the cpu_id_start value before returning | ||
90 | * a value loaded from a data structure indexed using the | ||
91 | * cpu_id_start value. | ||
92 | */ | ||
93 | __u32 cpu_id; | ||
94 | /* | ||
95 | * Restartable sequences rseq_cs field. | ||
96 | * | ||
97 | * Contains NULL when no critical section is active for the current | ||
98 | * thread, or holds a pointer to the currently active struct rseq_cs. | ||
99 | * | ||
100 | * Updated by user-space, which sets the address of the currently | ||
101 | * active rseq_cs at the beginning of assembly instruction sequence | ||
102 | * block, and set to NULL by the kernel when it restarts an assembly | ||
103 | * instruction sequence block, as well as when the kernel detects that | ||
104 | * it is preempting or delivering a signal outside of the range | ||
105 | * targeted by the rseq_cs. Also needs to be set to NULL by user-space | ||
106 | * before reclaiming memory that contains the targeted struct rseq_cs. | ||
107 | * | ||
108 | * Read and set by the kernel with single-copy atomicity semantics. | ||
109 | * Set by user-space with single-copy atomicity semantics. Aligned | ||
110 | * on 64-bit. | ||
111 | */ | ||
112 | LINUX_FIELD_u32_u64(rseq_cs); | ||
113 | /* | ||
114 | * - RSEQ_DISABLE flag: | ||
115 | * | ||
116 | * Fallback fast-track flag for single-stepping. | ||
117 | * Set by user-space if lack of progress is detected. | ||
118 | * Cleared by user-space after rseq finish. | ||
119 | * Read by the kernel. | ||
120 | * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | ||
121 | * Inhibit instruction sequence block restart and event | ||
122 | * counter increment on preemption for this thread. | ||
123 | * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | ||
124 | * Inhibit instruction sequence block restart and event | ||
125 | * counter increment on signal delivery for this thread. | ||
126 | * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | ||
127 | * Inhibit instruction sequence block restart and event | ||
128 | * counter increment on migration for this thread. | ||
129 | */ | ||
130 | __u32 flags; | ||
131 | } __attribute__((aligned(4 * sizeof(__u64)))); | ||
132 | |||
133 | #endif /* _UAPI_LINUX_RSEQ_H */ | ||
diff --git a/init/Kconfig b/init/Kconfig index 18b151f0ddc1..33ec06fddaaa 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -1417,6 +1417,29 @@ config ARCH_HAS_MEMBARRIER_CALLBACKS | |||
1417 | config ARCH_HAS_MEMBARRIER_SYNC_CORE | 1417 | config ARCH_HAS_MEMBARRIER_SYNC_CORE |
1418 | bool | 1418 | bool |
1419 | 1419 | ||
1420 | config RSEQ | ||
1421 | bool "Enable rseq() system call" if EXPERT | ||
1422 | default y | ||
1423 | depends on HAVE_RSEQ | ||
1424 | select MEMBARRIER | ||
1425 | help | ||
1426 | Enable the restartable sequences system call. It provides a | ||
1427 | user-space cache for the current CPU number value, which | ||
1428 | speeds up getting the current CPU number from user-space, | ||
1429 | as well as an ABI to speed up user-space operations on | ||
1430 | per-CPU data. | ||
1431 | |||
1432 | If unsure, say Y. | ||
1433 | |||
1434 | config DEBUG_RSEQ | ||
1435 | default n | ||
1436 | bool "Enabled debugging of rseq() system call" if EXPERT | ||
1437 | depends on RSEQ && DEBUG_KERNEL | ||
1438 | help | ||
1439 | Enable extra debugging checks for the rseq system call. | ||
1440 | |||
1441 | If unsure, say N. | ||
1442 | |||
1420 | config EMBEDDED | 1443 | config EMBEDDED |
1421 | bool "Embedded system" | 1444 | bool "Embedded system" |
1422 | option allnoconfig_y | 1445 | option allnoconfig_y |
diff --git a/kernel/Makefile b/kernel/Makefile index f85ae5dfa474..7085c841c413 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -113,6 +113,7 @@ obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o | |||
113 | obj-$(CONFIG_TORTURE_TEST) += torture.o | 113 | obj-$(CONFIG_TORTURE_TEST) += torture.o |
114 | 114 | ||
115 | obj-$(CONFIG_HAS_IOMEM) += memremap.o | 115 | obj-$(CONFIG_HAS_IOMEM) += memremap.o |
116 | obj-$(CONFIG_RSEQ) += rseq.o | ||
116 | 117 | ||
117 | $(obj)/configs.o: $(obj)/config_data.h | 118 | $(obj)/configs.o: $(obj)/config_data.h |
118 | 119 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index a5d21c42acfc..70992bfeba81 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1899,6 +1899,8 @@ static __latent_entropy struct task_struct *copy_process( | |||
1899 | */ | 1899 | */ |
1900 | copy_seccomp(p); | 1900 | copy_seccomp(p); |
1901 | 1901 | ||
1902 | rseq_fork(p, clone_flags); | ||
1903 | |||
1902 | /* | 1904 | /* |
1903 | * Process group and session signals need to be delivered to just the | 1905 | * Process group and session signals need to be delivered to just the |
1904 | * parent before the fork or both the parent and the child after the | 1906 | * parent before the fork or both the parent and the child after the |
diff --git a/kernel/rseq.c b/kernel/rseq.c new file mode 100644 index 000000000000..ae306f90c514 --- /dev/null +++ b/kernel/rseq.c | |||
@@ -0,0 +1,357 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0+ | ||
2 | /* | ||
3 | * Restartable sequences system call | ||
4 | * | ||
5 | * Copyright (C) 2015, Google, Inc., | ||
6 | * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com> | ||
7 | * Copyright (C) 2015-2018, EfficiOS Inc., | ||
8 | * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> | ||
9 | */ | ||
10 | |||
11 | #include <linux/sched.h> | ||
12 | #include <linux/uaccess.h> | ||
13 | #include <linux/syscalls.h> | ||
14 | #include <linux/rseq.h> | ||
15 | #include <linux/types.h> | ||
16 | #include <asm/ptrace.h> | ||
17 | |||
18 | #define CREATE_TRACE_POINTS | ||
19 | #include <trace/events/rseq.h> | ||
20 | |||
21 | #define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \ | ||
22 | RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) | ||
23 | |||
24 | /* | ||
25 | * | ||
26 | * Restartable sequences are a lightweight interface that allows | ||
27 | * user-level code to be executed atomically relative to scheduler | ||
28 | * preemption and signal delivery. Typically used for implementing | ||
29 | * per-cpu operations. | ||
30 | * | ||
31 | * It allows user-space to perform update operations on per-cpu data | ||
32 | * without requiring heavy-weight atomic operations. | ||
33 | * | ||
34 | * Detailed algorithm of rseq user-space assembly sequences: | ||
35 | * | ||
36 | * init(rseq_cs) | ||
37 | * cpu = TLS->rseq::cpu_id_start | ||
38 | * [1] TLS->rseq::rseq_cs = rseq_cs | ||
39 | * [start_ip] ---------------------------- | ||
40 | * [2] if (cpu != TLS->rseq::cpu_id) | ||
41 | * goto abort_ip; | ||
42 | * [3] <last_instruction_in_cs> | ||
43 | * [post_commit_ip] ---------------------------- | ||
44 | * | ||
45 | * The address of jump target abort_ip must be outside the critical | ||
46 | * region, i.e.: | ||
47 | * | ||
48 | * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip] | ||
49 | * | ||
50 | * Steps [2]-[3] (inclusive) need to be a sequence of instructions in | ||
51 | * userspace that can handle being interrupted between any of those | ||
52 | * instructions, and then resumed to the abort_ip. | ||
53 | * | ||
54 | * 1. Userspace stores the address of the struct rseq_cs assembly | ||
55 | * block descriptor into the rseq_cs field of the registered | ||
56 | * struct rseq TLS area. This update is performed through a single | ||
57 | * store within the inline assembly instruction sequence. | ||
58 | * [start_ip] | ||
59 | * | ||
60 | * 2. Userspace tests to check whether the current cpu_id field matches | ||
61 | * the cpu number loaded before start_ip, branching to abort_ip | ||
62 | * in case of a mismatch. | ||
63 | * | ||
64 | * If the sequence is preempted or interrupted by a signal | ||
65 | * at or after start_ip and before post_commit_ip, then the kernel | ||
66 | * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return | ||
67 | * ip to abort_ip before returning to user-space, so the preempted | ||
68 | * execution resumes at abort_ip. | ||
69 | * | ||
70 | * 3. Userspace critical section final instruction before | ||
71 | * post_commit_ip is the commit. The critical section is | ||
72 | * self-terminating. | ||
73 | * [post_commit_ip] | ||
74 | * | ||
75 | * 4. <success> | ||
76 | * | ||
77 | * On failure at [2], or if interrupted by preempt or signal delivery | ||
78 | * between [1] and [3]: | ||
79 | * | ||
80 | * [abort_ip] | ||
81 | * F1. <failure> | ||
82 | */ | ||
83 | |||
84 | static int rseq_update_cpu_id(struct task_struct *t) | ||
85 | { | ||
86 | u32 cpu_id = raw_smp_processor_id(); | ||
87 | |||
88 | if (__put_user(cpu_id, &t->rseq->cpu_id_start)) | ||
89 | return -EFAULT; | ||
90 | if (__put_user(cpu_id, &t->rseq->cpu_id)) | ||
91 | return -EFAULT; | ||
92 | trace_rseq_update(t); | ||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | static int rseq_reset_rseq_cpu_id(struct task_struct *t) | ||
97 | { | ||
98 | u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED; | ||
99 | |||
100 | /* | ||
101 | * Reset cpu_id_start to its initial state (0). | ||
102 | */ | ||
103 | if (__put_user(cpu_id_start, &t->rseq->cpu_id_start)) | ||
104 | return -EFAULT; | ||
105 | /* | ||
106 | * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming | ||
107 | * in after unregistration can figure out that rseq needs to be | ||
108 | * registered again. | ||
109 | */ | ||
110 | if (__put_user(cpu_id, &t->rseq->cpu_id)) | ||
111 | return -EFAULT; | ||
112 | return 0; | ||
113 | } | ||
114 | |||
115 | static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) | ||
116 | { | ||
117 | struct rseq_cs __user *urseq_cs; | ||
118 | unsigned long ptr; | ||
119 | u32 __user *usig; | ||
120 | u32 sig; | ||
121 | int ret; | ||
122 | |||
123 | ret = __get_user(ptr, &t->rseq->rseq_cs); | ||
124 | if (ret) | ||
125 | return ret; | ||
126 | if (!ptr) { | ||
127 | memset(rseq_cs, 0, sizeof(*rseq_cs)); | ||
128 | return 0; | ||
129 | } | ||
130 | urseq_cs = (struct rseq_cs __user *)ptr; | ||
131 | if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) | ||
132 | return -EFAULT; | ||
133 | if (rseq_cs->version > 0) | ||
134 | return -EINVAL; | ||
135 | |||
136 | /* Ensure that abort_ip is not in the critical section. */ | ||
137 | if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) | ||
138 | return -EINVAL; | ||
139 | |||
140 | usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32)); | ||
141 | ret = get_user(sig, usig); | ||
142 | if (ret) | ||
143 | return ret; | ||
144 | |||
145 | if (current->rseq_sig != sig) { | ||
146 | printk_ratelimited(KERN_WARNING | ||
147 | "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", | ||
148 | sig, current->rseq_sig, current->pid, usig); | ||
149 | return -EPERM; | ||
150 | } | ||
151 | return 0; | ||
152 | } | ||
153 | |||
154 | static int rseq_need_restart(struct task_struct *t, u32 cs_flags) | ||
155 | { | ||
156 | u32 flags, event_mask; | ||
157 | int ret; | ||
158 | |||
159 | /* Get thread flags. */ | ||
160 | ret = __get_user(flags, &t->rseq->flags); | ||
161 | if (ret) | ||
162 | return ret; | ||
163 | |||
164 | /* Take critical section flags into account. */ | ||
165 | flags |= cs_flags; | ||
166 | |||
167 | /* | ||
168 | * Restart on signal can only be inhibited when restart on | ||
169 | * preempt and restart on migrate are inhibited too. Otherwise, | ||
170 | * a preempted signal handler could fail to restart the prior | ||
171 | * execution context on sigreturn. | ||
172 | */ | ||
173 | if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) && | ||
174 | (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) != | ||
175 | RSEQ_CS_PREEMPT_MIGRATE_FLAGS)) | ||
176 | return -EINVAL; | ||
177 | |||
178 | /* | ||
179 | * Load and clear event mask atomically with respect to | ||
180 | * scheduler preemption. | ||
181 | */ | ||
182 | preempt_disable(); | ||
183 | event_mask = t->rseq_event_mask; | ||
184 | t->rseq_event_mask = 0; | ||
185 | preempt_enable(); | ||
186 | |||
187 | return !!(event_mask & ~flags); | ||
188 | } | ||
189 | |||
190 | static int clear_rseq_cs(struct task_struct *t) | ||
191 | { | ||
192 | /* | ||
193 | * The rseq_cs field is set to NULL on preemption or signal | ||
194 | * delivery on top of rseq assembly block, as well as on top | ||
195 | * of code outside of the rseq assembly block. This performs | ||
196 | * a lazy clear of the rseq_cs field. | ||
197 | * | ||
198 | * Set rseq_cs to NULL with single-copy atomicity. | ||
199 | */ | ||
200 | return __put_user(0UL, &t->rseq->rseq_cs); | ||
201 | } | ||
202 | |||
203 | /* | ||
204 | * Unsigned comparison will be true when ip >= start_ip, and when | ||
205 | * ip < start_ip + post_commit_offset. | ||
206 | */ | ||
207 | static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) | ||
208 | { | ||
209 | return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; | ||
210 | } | ||
211 | |||
212 | static int rseq_ip_fixup(struct pt_regs *regs) | ||
213 | { | ||
214 | unsigned long ip = instruction_pointer(regs); | ||
215 | struct task_struct *t = current; | ||
216 | struct rseq_cs rseq_cs; | ||
217 | int ret; | ||
218 | |||
219 | ret = rseq_get_rseq_cs(t, &rseq_cs); | ||
220 | if (ret) | ||
221 | return ret; | ||
222 | |||
223 | /* | ||
224 | * Handle potentially not being within a critical section. | ||
225 | * If not nested over a rseq critical section, restart is useless. | ||
226 | * Clear the rseq_cs pointer and return. | ||
227 | */ | ||
228 | if (!in_rseq_cs(ip, &rseq_cs)) | ||
229 | return clear_rseq_cs(t); | ||
230 | ret = rseq_need_restart(t, rseq_cs.flags); | ||
231 | if (ret <= 0) | ||
232 | return ret; | ||
233 | ret = clear_rseq_cs(t); | ||
234 | if (ret) | ||
235 | return ret; | ||
236 | trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, | ||
237 | rseq_cs.abort_ip); | ||
238 | instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); | ||
239 | return 0; | ||
240 | } | ||
241 | |||
242 | /* | ||
243 | * This resume handler must always be executed between any of: | ||
244 | * - preemption, | ||
245 | * - signal delivery, | ||
246 | * and return to user-space. | ||
247 | * | ||
248 | * This is how we can ensure that the entire rseq critical section, | ||
249 | * consisting of both the C part and the assembly instruction sequence, | ||
250 | * will issue the commit instruction only if executed atomically with | ||
251 | * respect to other threads scheduled on the same CPU, and with respect | ||
252 | * to signal handlers. | ||
253 | */ | ||
254 | void __rseq_handle_notify_resume(struct pt_regs *regs) | ||
255 | { | ||
256 | struct task_struct *t = current; | ||
257 | int ret; | ||
258 | |||
259 | if (unlikely(t->flags & PF_EXITING)) | ||
260 | return; | ||
261 | if (unlikely(!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq)))) | ||
262 | goto error; | ||
263 | ret = rseq_ip_fixup(regs); | ||
264 | if (unlikely(ret < 0)) | ||
265 | goto error; | ||
266 | if (unlikely(rseq_update_cpu_id(t))) | ||
267 | goto error; | ||
268 | return; | ||
269 | |||
270 | error: | ||
271 | force_sig(SIGSEGV, t); | ||
272 | } | ||
273 | |||
274 | #ifdef CONFIG_DEBUG_RSEQ | ||
275 | |||
276 | /* | ||
277 | * Terminate the process if a syscall is issued within a restartable | ||
278 | * sequence. | ||
279 | */ | ||
280 | void rseq_syscall(struct pt_regs *regs) | ||
281 | { | ||
282 | unsigned long ip = instruction_pointer(regs); | ||
283 | struct task_struct *t = current; | ||
284 | struct rseq_cs rseq_cs; | ||
285 | |||
286 | if (!t->rseq) | ||
287 | return; | ||
288 | if (!access_ok(VERIFY_READ, t->rseq, sizeof(*t->rseq)) || | ||
289 | rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) | ||
290 | force_sig(SIGSEGV, t); | ||
291 | } | ||
292 | |||
293 | #endif | ||
294 | |||
295 | /* | ||
296 | * sys_rseq - setup restartable sequences for caller thread. | ||
297 | */ | ||
298 | SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, | ||
299 | int, flags, u32, sig) | ||
300 | { | ||
301 | int ret; | ||
302 | |||
303 | if (flags & RSEQ_FLAG_UNREGISTER) { | ||
304 | /* Unregister rseq for current thread. */ | ||
305 | if (current->rseq != rseq || !current->rseq) | ||
306 | return -EINVAL; | ||
307 | if (current->rseq_len != rseq_len) | ||
308 | return -EINVAL; | ||
309 | if (current->rseq_sig != sig) | ||
310 | return -EPERM; | ||
311 | ret = rseq_reset_rseq_cpu_id(current); | ||
312 | if (ret) | ||
313 | return ret; | ||
314 | current->rseq = NULL; | ||
315 | current->rseq_len = 0; | ||
316 | current->rseq_sig = 0; | ||
317 | return 0; | ||
318 | } | ||
319 | |||
320 | if (unlikely(flags)) | ||
321 | return -EINVAL; | ||
322 | |||
323 | if (current->rseq) { | ||
324 | /* | ||
325 | * If rseq is already registered, check whether | ||
326 | * the provided address differs from the prior | ||
327 | * one. | ||
328 | */ | ||
329 | if (current->rseq != rseq || current->rseq_len != rseq_len) | ||
330 | return -EINVAL; | ||
331 | if (current->rseq_sig != sig) | ||
332 | return -EPERM; | ||
333 | /* Already registered. */ | ||
334 | return -EBUSY; | ||
335 | } | ||
336 | |||
337 | /* | ||
338 | * If there was no rseq previously registered, | ||
339 | * ensure the provided rseq is properly aligned and valid. | ||
340 | */ | ||
341 | if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || | ||
342 | rseq_len != sizeof(*rseq)) | ||
343 | return -EINVAL; | ||
344 | if (!access_ok(VERIFY_WRITE, rseq, rseq_len)) | ||
345 | return -EFAULT; | ||
346 | current->rseq = rseq; | ||
347 | current->rseq_len = rseq_len; | ||
348 | current->rseq_sig = sig; | ||
349 | /* | ||
350 | * If rseq was previously inactive, and has just been | ||
351 | * registered, ensure the cpu_id_start and cpu_id fields | ||
352 | * are updated before returning to user-space. | ||
353 | */ | ||
354 | rseq_set_notify_resume(current); | ||
355 | |||
356 | return 0; | ||
357 | } | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e9866f86f304..a98d54cd5535 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1191,6 +1191,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1191 | if (p->sched_class->migrate_task_rq) | 1191 | if (p->sched_class->migrate_task_rq) |
1192 | p->sched_class->migrate_task_rq(p); | 1192 | p->sched_class->migrate_task_rq(p); |
1193 | p->se.nr_migrations++; | 1193 | p->se.nr_migrations++; |
1194 | rseq_migrate(p); | ||
1194 | perf_event_task_migrate(p); | 1195 | perf_event_task_migrate(p); |
1195 | } | 1196 | } |
1196 | 1197 | ||
@@ -2634,6 +2635,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
2634 | { | 2635 | { |
2635 | sched_info_switch(rq, prev, next); | 2636 | sched_info_switch(rq, prev, next); |
2636 | perf_event_task_sched_out(prev, next); | 2637 | perf_event_task_sched_out(prev, next); |
2638 | rseq_preempt(prev); | ||
2637 | fire_sched_out_preempt_notifiers(prev, next); | 2639 | fire_sched_out_preempt_notifiers(prev, next); |
2638 | prepare_task(next); | 2640 | prepare_task(next); |
2639 | prepare_arch_switch(next); | 2641 | prepare_arch_switch(next); |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 183169c2a75b..86f832d6ff6f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -432,3 +432,6 @@ COND_SYSCALL(setresgid16); | |||
432 | COND_SYSCALL(setresuid16); | 432 | COND_SYSCALL(setresuid16); |
433 | COND_SYSCALL(setreuid16); | 433 | COND_SYSCALL(setreuid16); |
434 | COND_SYSCALL(setuid16); | 434 | COND_SYSCALL(setuid16); |
435 | |||
436 | /* restartable sequence */ | ||
437 | COND_SYSCALL(rseq); | ||