Diffstat:

 MAINTAINERS                 |  11
 arch/Kconfig                |   7
 fs/exec.c                   |   1
 include/linux/sched.h       | 134
 include/linux/syscalls.h    |   4
 include/trace/events/rseq.h |  57
 include/uapi/linux/rseq.h   | 133
 init/Kconfig                |  23
 kernel/Makefile             |   1
 kernel/fork.c               |   2
 kernel/rseq.c               | 357
 kernel/sched/core.c         |   2
 kernel/sys_ni.c             |   3
 13 files changed, 734 insertions(+), 1 deletion(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index aa635837a6af..a384243d911b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11976,6 +11976,17 @@ F: include/dt-bindings/reset/
 F:	include/linux/reset.h
 F:	include/linux/reset-controller.h
 
+RESTARTABLE SEQUENCES SUPPORT
+M:	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+M:	Peter Zijlstra <peterz@infradead.org>
+M:	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+M:	Boqun Feng <boqun.feng@gmail.com>
+L:	linux-kernel@vger.kernel.org
+S:	Supported
+F:	kernel/rseq.c
+F:	include/uapi/linux/rseq.h
+F:	include/trace/events/rseq.h
+
 RFKILL
 M:	Johannes Berg <johannes@sipsolutions.net>
 L:	linux-wireless@vger.kernel.org
diff --git a/arch/Kconfig b/arch/Kconfig
index b695a3e3e922..095ba99968c1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -272,6 +272,13 @@ config HAVE_REGS_AND_STACK_ACCESS_API
 	  declared in asm/ptrace.h
 	  For example the kprobes-based event tracer needs this API.
 
+config HAVE_RSEQ
+	bool
+	depends on HAVE_REGS_AND_STACK_ACCESS_API
+	help
+	  This symbol should be selected by an architecture if it
+	  supports an implementation of restartable sequences.
+
 config HAVE_CLK
 	bool
 	help
diff --git a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1822,6 +1822,7 @@ static int do_execveat_common(int fd, struct filename *filename,
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	membarrier_execve(current);
+	rseq_execve(current);
 	acct_update_integrals(current);
 	task_numa_free(current);
 	free_bprm(bprm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 14e4f9c12337..3aa4fcb74e76 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -27,6 +27,7 @@
 #include <linux/signal_types.h>
 #include <linux/mm_types_task.h>
 #include <linux/task_io_accounting.h>
+#include <linux/rseq.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
 struct audit_context;
@@ -1047,6 +1048,17 @@ struct task_struct {
 	unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_RSEQ
+	struct rseq __user *rseq;
+	u32 rseq_len;
+	u32 rseq_sig;
+	/*
+	 * RmW on rseq_event_mask must be performed atomically
+	 * with respect to preemption.
+	 */
+	unsigned long rseq_event_mask;
+#endif
+
 	struct tlbflush_unmap_batch tlb_ubc;
 
 	struct rcu_head rcu;
@@ -1757,4 +1769,126 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
+#ifdef CONFIG_RSEQ
+
+/*
+ * Map the event mask on the user-space ABI enum rseq_cs_flags
+ * for direct mask checks.
+ */
+enum rseq_event_mask_bits {
+	RSEQ_EVENT_PREEMPT_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
+	RSEQ_EVENT_SIGNAL_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
+	RSEQ_EVENT_MIGRATE_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
+};
+
+enum rseq_event_mask {
+	RSEQ_EVENT_PREEMPT	= (1U << RSEQ_EVENT_PREEMPT_BIT),
+	RSEQ_EVENT_SIGNAL	= (1U << RSEQ_EVENT_SIGNAL_BIT),
+	RSEQ_EVENT_MIGRATE	= (1U << RSEQ_EVENT_MIGRATE_BIT),
+};
+
+static inline void rseq_set_notify_resume(struct task_struct *t)
+{
+	if (t->rseq)
+		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+}
+
+void __rseq_handle_notify_resume(struct pt_regs *regs);
+
+static inline void rseq_handle_notify_resume(struct pt_regs *regs)
+{
+	if (current->rseq)
+		__rseq_handle_notify_resume(regs);
+}
+
+static inline void rseq_signal_deliver(struct pt_regs *regs)
+{
+	preempt_disable();
+	__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
+	preempt_enable();
+	rseq_handle_notify_resume(regs);
+}
+
+/* rseq_preempt() requires preemption to be disabled. */
+static inline void rseq_preempt(struct task_struct *t)
+{
+	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
+	rseq_set_notify_resume(t);
+}
+
+/* rseq_migrate() requires preemption to be disabled. */
+static inline void rseq_migrate(struct task_struct *t)
+{
+	__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
+	rseq_set_notify_resume(t);
+}
+
+/*
+ * If the parent process has a registered restartable sequences area,
+ * the child inherits it. Only applies when forking a process, not a
+ * thread. If the parent forks in the middle of a restartable sequence,
+ * set the resume notifier to force the child to retry.
+ */
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+{
+	if (clone_flags & CLONE_THREAD) {
+		t->rseq = NULL;
+		t->rseq_len = 0;
+		t->rseq_sig = 0;
+		t->rseq_event_mask = 0;
+	} else {
+		t->rseq = current->rseq;
+		t->rseq_len = current->rseq_len;
+		t->rseq_sig = current->rseq_sig;
+		t->rseq_event_mask = current->rseq_event_mask;
+		rseq_preempt(t);
+	}
+}
+
+static inline void rseq_execve(struct task_struct *t)
+{
+	t->rseq = NULL;
+	t->rseq_len = 0;
+	t->rseq_sig = 0;
+	t->rseq_event_mask = 0;
+}
+
+#else
+
+static inline void rseq_set_notify_resume(struct task_struct *t)
+{
+}
+static inline void rseq_handle_notify_resume(struct pt_regs *regs)
+{
+}
+static inline void rseq_signal_deliver(struct pt_regs *regs)
+{
+}
+static inline void rseq_preempt(struct task_struct *t)
+{
+}
+static inline void rseq_migrate(struct task_struct *t)
+{
+}
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+{
+}
+static inline void rseq_execve(struct task_struct *t)
+{
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_RSEQ
+
+void rseq_syscall(struct pt_regs *regs);
+
+#else
+
+static inline void rseq_syscall(struct pt_regs *regs)
+{
+}
+
+#endif
+
 #endif
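The inline helpers above are the hooks an architecture selecting HAVE_RSEQ is expected to call on its return-to-user and signal-delivery paths. As a hedged illustration only (the actual x86 wiring lands in a separate patch of this series; the function names below are placeholders, not API introduced here), the arch glue looks roughly like:

/*
 * Illustrative arch glue; arch_notify_resume() and arch_signal_deliver()
 * are stand-ins for the arch's real exit-to-usermode and signal paths.
 */
#include <linux/sched.h>
#include <linux/signal.h>

/* Runs with TIF_NOTIFY_RESUME set, just before returning to user-space. */
static void arch_notify_resume(struct pt_regs *regs)
{
	rseq_handle_notify_resume(regs);	/* fix up ip if inside a cs */
}

/* Runs when delivering a signal, before building the handler frame. */
static void arch_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
{
	rseq_signal_deliver(regs);	/* record the signal event, then fixup */
}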
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 390e814fdc8d..73810808cdf2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -66,6 +66,7 @@ struct old_linux_dirent;
 struct perf_event_attr;
 struct file_handle;
 struct sigaltstack;
+struct rseq;
 union bpf_attr;
 
 #include <linux/types.h>
@@ -897,7 +898,8 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
 asmlinkage long sys_pkey_free(int pkey);
 asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
-
+asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
+			 int flags, uint32_t sig);
 
 /*
  * Architecture-specific system calls
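No libc wrapper exists for this new system call, so user-space is expected to go through syscall(2). A minimal registration sketch against the prototype above — the __NR_rseq number, the RSEQ_SIG value, and the rseq_abi variable name are assumptions of this example, not defined by the patch:

/* Hedged user-space sketch; assumes an arch that wires up __NR_rseq. */
#include <linux/rseq.h>		/* struct rseq, RSEQ_FLAG_UNREGISTER */
#include <sys/syscall.h>
#include <unistd.h>

#define RSEQ_SIG	0x53053053	/* example: any fixed 32-bit app-chosen value */

/* One registered struct rseq per thread; alignment comes from the uapi header. */
static __thread struct rseq rseq_abi = {
	.cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
};

static int rseq_register_current_thread(void)
{
	return syscall(__NR_rseq, &rseq_abi, sizeof(rseq_abi), 0, RSEQ_SIG);
}

static int rseq_unregister_current_thread(void)
{
	return syscall(__NR_rseq, &rseq_abi, sizeof(rseq_abi),
		       RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
}

Unregistration must pass the same address, length and signature; sys_rseq() in kernel/rseq.c below enforces exactly that.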
diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h
new file mode 100644
index 000000000000..a04a64bc1a00
--- /dev/null
+++ b/include/trace/events/rseq.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rseq
+
+#if !defined(_TRACE_RSEQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RSEQ_H
+
+#include <linux/tracepoint.h>
+#include <linux/types.h>
+
+TRACE_EVENT(rseq_update,
+
+	TP_PROTO(struct task_struct *t),
+
+	TP_ARGS(t),
+
+	TP_STRUCT__entry(
+		__field(s32, cpu_id)
+	),
+
+	TP_fast_assign(
+		__entry->cpu_id = raw_smp_processor_id();
+	),
+
+	TP_printk("cpu_id=%d", __entry->cpu_id)
+);
+
+TRACE_EVENT(rseq_ip_fixup,
+
+	TP_PROTO(unsigned long regs_ip, unsigned long start_ip,
+		 unsigned long post_commit_offset, unsigned long abort_ip),
+
+	TP_ARGS(regs_ip, start_ip, post_commit_offset, abort_ip),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, regs_ip)
+		__field(unsigned long, start_ip)
+		__field(unsigned long, post_commit_offset)
+		__field(unsigned long, abort_ip)
+	),
+
+	TP_fast_assign(
+		__entry->regs_ip = regs_ip;
+		__entry->start_ip = start_ip;
+		__entry->post_commit_offset = post_commit_offset;
+		__entry->abort_ip = abort_ip;
+	),
+
+	TP_printk("regs_ip=0x%lx start_ip=0x%lx post_commit_offset=%lu abort_ip=0x%lx",
+		  __entry->regs_ip, __entry->start_ip,
+		  __entry->post_commit_offset, __entry->abort_ip)
+);
+
+#endif /* _TRACE_RSEQ_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
new file mode 100644
index 000000000000..d620fa43756c
--- /dev/null
+++ b/include/uapi/linux/rseq.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_RSEQ_H
+#define _UAPI_LINUX_RSEQ_H
+
+/*
+ * linux/rseq.h
+ *
+ * Restartable sequences system call API
+ *
+ * Copyright (c) 2015-2018 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include <stdint.h>
+#endif
+
+#include <linux/types_32_64.h>
+
+enum rseq_cpu_id_state {
+	RSEQ_CPU_ID_UNINITIALIZED	= -1,
+	RSEQ_CPU_ID_REGISTRATION_FAILED	= -2,
+};
+
+enum rseq_flags {
+	RSEQ_FLAG_UNREGISTER = (1 << 0),
+};
+
+enum rseq_cs_flags_bit {
+	RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT	= 0,
+	RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT	= 1,
+	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT	= 2,
+};
+
+enum rseq_cs_flags {
+	RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT	=
+		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT),
+	RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL	=
+		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
+	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE	=
+		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+};
+
+/*
+ * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always
+ * contained within a single cache-line. It is usually declared as
+ * link-time constant data.
+ */
+struct rseq_cs {
+	/* Version of this structure. */
+	__u32 version;
+	/* enum rseq_cs_flags */
+	__u32 flags;
+	LINUX_FIELD_u32_u64(start_ip);
+	/* Offset from start_ip. */
+	LINUX_FIELD_u32_u64(post_commit_offset);
+	LINUX_FIELD_u32_u64(abort_ip);
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+/*
+ * struct rseq is aligned on 4 * 8 bytes to ensure it is always
+ * contained within a single cache-line.
+ *
+ * A single struct rseq per thread is allowed.
+ */
+struct rseq {
+	/*
+	 * Restartable sequences cpu_id_start field. Updated by the
+	 * kernel, and read by user-space with single-copy atomicity
+	 * semantics. Aligned on 32-bit. Always contains a value in the
+	 * range of possible CPUs, although the value may not be the
+	 * actual current CPU (e.g. if rseq is not initialized). This
+	 * CPU number value should always be compared against the value
+	 * of the cpu_id field before performing a rseq commit or
+	 * returning a value read from a data structure indexed using
+	 * the cpu_id_start value.
+	 */
+	__u32 cpu_id_start;
+	/*
+	 * Restartable sequences cpu_id field. Updated by the kernel,
+	 * and read by user-space with single-copy atomicity semantics.
+	 * Aligned on 32-bit. Values RSEQ_CPU_ID_UNINITIALIZED and
+	 * RSEQ_CPU_ID_REGISTRATION_FAILED have a special semantic: the
+	 * former means "rseq uninitialized", and the latter means "rseq
+	 * initialization failed". This value is meant to be read within
+	 * rseq critical sections and compared with the cpu_id_start
+	 * value previously read, before performing the commit instruction,
+	 * or read and compared with the cpu_id_start value before returning
+	 * a value loaded from a data structure indexed using the
+	 * cpu_id_start value.
+	 */
+	__u32 cpu_id;
+	/*
+	 * Restartable sequences rseq_cs field.
+	 *
+	 * Contains NULL when no critical section is active for the current
+	 * thread, or holds a pointer to the currently active struct rseq_cs.
+	 *
+	 * Updated by user-space, which sets the address of the currently
+	 * active rseq_cs at the beginning of assembly instruction sequence
+	 * block, and set to NULL by the kernel when it restarts an assembly
+	 * instruction sequence block, as well as when the kernel detects that
+	 * it is preempting or delivering a signal outside of the range
+	 * targeted by the rseq_cs. Also needs to be set to NULL by user-space
+	 * before reclaiming memory that contains the targeted struct rseq_cs.
+	 *
+	 * Read and set by the kernel with single-copy atomicity semantics.
+	 * Set by user-space with single-copy atomicity semantics. Aligned
+	 * on 64-bit.
+	 */
+	LINUX_FIELD_u32_u64(rseq_cs);
+	/*
+	 * - RSEQ_DISABLE flag:
+	 *
+	 *   Fallback fast-track flag for single-stepping.
+	 *   Set by user-space if lack of progress is detected.
+	 *   Cleared by user-space after rseq finish.
+	 *   Read by the kernel.
+	 * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
+	 *   Inhibit instruction sequence block restart and event
+	 *   counter increment on preemption for this thread.
+	 * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
+	 *   Inhibit instruction sequence block restart and event
+	 *   counter increment on signal delivery for this thread.
+	 * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
+	 *   Inhibit instruction sequence block restart and event
+	 *   counter increment on migration for this thread.
+	 */
+	__u32 flags;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+#endif /* _UAPI_LINUX_RSEQ_H */
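Given the field semantics documented above, the cheap way for user-space to read its current CPU number is a plain load of cpu_id_start, with cpu_id available when a strict answer (or a registration check) is needed. A hedged sketch, reusing the rseq_abi thread-local variable assumed in the earlier registration example:

#include <linux/rseq.h>

extern __thread struct rseq rseq_abi;	/* registered via sys_rseq(), see above */

/*
 * Fast path: usable for CPU-indexed lookups even before registration,
 * since cpu_id_start always lies in the valid CPU range (our static
 * TLS initializer simply reads as 0 until the kernel updates it).
 */
static inline unsigned int rseq_cpu_start(void)
{
	return __atomic_load_n(&rseq_abi.cpu_id_start, __ATOMIC_RELAXED);
}

/* Strict variant: negative (RSEQ_CPU_ID_UNINITIALIZED, ...) when unusable. */
static inline int rseq_current_cpu(void)
{
	return (int)__atomic_load_n(&rseq_abi.cpu_id, __ATOMIC_RELAXED);
}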
diff --git a/init/Kconfig b/init/Kconfig
index 18b151f0ddc1..33ec06fddaaa 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1417,6 +1417,29 @@ config ARCH_HAS_MEMBARRIER_CALLBACKS
 config ARCH_HAS_MEMBARRIER_SYNC_CORE
 	bool
 
+config RSEQ
+	bool "Enable rseq() system call" if EXPERT
+	default y
+	depends on HAVE_RSEQ
+	select MEMBARRIER
+	help
+	  Enable the restartable sequences system call. It provides a
+	  user-space cache for the current CPU number value, which
+	  speeds up getting the current CPU number from user-space,
+	  as well as an ABI to speed up user-space operations on
+	  per-CPU data.
+
+	  If unsure, say Y.
+
+config DEBUG_RSEQ
+	default n
+	bool "Enable debugging of rseq() system call" if EXPERT
+	depends on RSEQ && DEBUG_KERNEL
+	help
+	  Enable extra debugging checks for the rseq system call.
+
+	  If unsure, say N.
+
 config EMBEDDED
 	bool "Embedded system"
 	option allnoconfig_y
diff --git a/kernel/Makefile b/kernel/Makefile
index f85ae5dfa474..7085c841c413 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -113,6 +113,7 @@ obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
 
 obj-$(CONFIG_HAS_IOMEM) += memremap.o
+obj-$(CONFIG_RSEQ) += rseq.o
 
 $(obj)/configs.o: $(obj)/config_data.h
 
diff --git a/kernel/fork.c b/kernel/fork.c
index a5d21c42acfc..70992bfeba81 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1899,6 +1899,8 @@ static __latent_entropy struct task_struct *copy_process(
 	 */
 	copy_seccomp(p);
 
+	rseq_fork(p, clone_flags);
+
 	/*
 	 * Process group and session signals need to be delivered to just the
 	 * parent before the fork or both the parent and the child after the
diff --git a/kernel/rseq.c b/kernel/rseq.c
new file mode 100644
index 000000000000..ae306f90c514
--- /dev/null
+++ b/kernel/rseq.c
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Restartable sequences system call
+ *
+ * Copyright (C) 2015, Google, Inc.,
+ * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
+ * Copyright (C) 2015-2018, EfficiOS Inc.,
+ * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/rseq.h>
+#include <linux/types.h>
+#include <asm/ptrace.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/rseq.h>
+
+#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
+				       RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)
+
+/*
+ *
+ * Restartable sequences are a lightweight interface that allows
+ * user-level code to be executed atomically relative to scheduler
+ * preemption and signal delivery. Typically used for implementing
+ * per-cpu operations.
+ *
+ * It allows user-space to perform update operations on per-cpu data
+ * without requiring heavy-weight atomic operations.
+ *
+ * Detailed algorithm of rseq user-space assembly sequences:
+ *
+ *                     init(rseq_cs)
+ *                     cpu = TLS->rseq::cpu_id_start
+ *   [1]               TLS->rseq::rseq_cs = rseq_cs
+ *   [start_ip]        ----------------------------
+ *   [2]               if (cpu != TLS->rseq::cpu_id)
+ *                             goto abort_ip;
+ *   [3]               <last_instruction_in_cs>
+ *   [post_commit_ip]  ----------------------------
+ *
+ * The address of jump target abort_ip must be outside the critical
+ * region, i.e.:
+ *
+ *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
+ *
+ * Steps [2]-[3] (inclusive) need to be a sequence of instructions in
+ * userspace that can handle being interrupted between any of those
+ * instructions, and then resumed to the abort_ip.
+ *
+ * 1.  Userspace stores the address of the struct rseq_cs assembly
+ *     block descriptor into the rseq_cs field of the registered
+ *     struct rseq TLS area. This update is performed through a single
+ *     store within the inline assembly instruction sequence.
+ *     [start_ip]
+ *
+ * 2.  Userspace tests to check whether the current cpu_id field matches
+ *     the cpu number loaded before start_ip, branching to abort_ip
+ *     in case of a mismatch.
+ *
+ *     If the sequence is preempted or interrupted by a signal
+ *     at or after start_ip and before post_commit_ip, then the kernel
+ *     clears TLS->rseq::rseq_cs, and sets the user-space return
+ *     ip to abort_ip before returning to user-space, so the preempted
+ *     execution resumes at abort_ip.
+ *
+ * 3.  Userspace critical section final instruction before
+ *     post_commit_ip is the commit. The critical section is
+ *     self-terminating.
+ *     [post_commit_ip]
+ *
+ * 4.  <success>
+ *
+ * On failure at [2], or if interrupted by preemption or signal delivery
+ * between [1] and [3]:
+ *
+ *     [abort_ip]
+ * F1. <failure>
+ */
+
+static int rseq_update_cpu_id(struct task_struct *t)
+{
+	u32 cpu_id = raw_smp_processor_id();
+
+	if (__put_user(cpu_id, &t->rseq->cpu_id_start))
+		return -EFAULT;
+	if (__put_user(cpu_id, &t->rseq->cpu_id))
+		return -EFAULT;
+	trace_rseq_update(t);
+	return 0;
+}
+
+static int rseq_reset_rseq_cpu_id(struct task_struct *t)
+{
+	u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+
+	/*
+	 * Reset cpu_id_start to its initial state (0).
+	 */
+	if (__put_user(cpu_id_start, &t->rseq->cpu_id_start))
+		return -EFAULT;
+	/*
+	 * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming
+	 * in after unregistration can figure out that rseq needs to be
+	 * registered again.
+	 */
+	if (__put_user(cpu_id, &t->rseq->cpu_id))
+		return -EFAULT;
+	return 0;
+}
+
+static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
+{
+	struct rseq_cs __user *urseq_cs;
+	unsigned long ptr;
+	u32 __user *usig;
+	u32 sig;
+	int ret;
+
+	ret = __get_user(ptr, &t->rseq->rseq_cs);
+	if (ret)
+		return ret;
+	if (!ptr) {
+		memset(rseq_cs, 0, sizeof(*rseq_cs));
+		return 0;
+	}
+	urseq_cs = (struct rseq_cs __user *)ptr;
+	if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
+		return -EFAULT;
+	if (rseq_cs->version > 0)
+		return -EINVAL;
+
+	/* Ensure that abort_ip is not in the critical section. */
+	if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
+		return -EINVAL;
+
+	usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32));
+	ret = get_user(sig, usig);
+	if (ret)
+		return ret;
+
+	if (current->rseq_sig != sig) {
+		printk_ratelimited(KERN_WARNING
+			"Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
+			sig, current->rseq_sig, current->pid, usig);
+		return -EPERM;
+	}
+	return 0;
+}
+
+static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
+{
+	u32 flags, event_mask;
+	int ret;
+
+	/* Get thread flags. */
+	ret = __get_user(flags, &t->rseq->flags);
+	if (ret)
+		return ret;
+
+	/* Take critical section flags into account. */
+	flags |= cs_flags;
+
+	/*
+	 * Restart on signal can only be inhibited when restart on
+	 * preempt and restart on migrate are inhibited too. Otherwise,
+	 * a preempted signal handler could fail to restart the prior
+	 * execution context on sigreturn.
+	 */
+	if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) &&
+		     (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) !=
+		     RSEQ_CS_PREEMPT_MIGRATE_FLAGS))
+		return -EINVAL;
+
+	/*
+	 * Load and clear event mask atomically with respect to
+	 * scheduler preemption.
+	 */
+	preempt_disable();
+	event_mask = t->rseq_event_mask;
+	t->rseq_event_mask = 0;
+	preempt_enable();
+
+	return !!(event_mask & ~flags);
+}
+
+static int clear_rseq_cs(struct task_struct *t)
+{
+	/*
+	 * The rseq_cs field is set to NULL on preemption or signal
+	 * delivery on top of rseq assembly block, as well as on top
+	 * of code outside of the rseq assembly block. This performs
+	 * a lazy clear of the rseq_cs field.
+	 *
+	 * Set rseq_cs to NULL with single-copy atomicity.
+	 */
+	return __put_user(0UL, &t->rseq->rseq_cs);
+}
+
+/*
+ * Unsigned comparison will be true when ip >= start_ip, and when
+ * ip < start_ip + post_commit_offset.
+ */
+static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
+{
+	return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
+}
+
+static int rseq_ip_fixup(struct pt_regs *regs)
+{
+	unsigned long ip = instruction_pointer(regs);
+	struct task_struct *t = current;
+	struct rseq_cs rseq_cs;
+	int ret;
+
+	ret = rseq_get_rseq_cs(t, &rseq_cs);
+	if (ret)
+		return ret;
+
+	/*
+	 * Handle potentially not being within a critical section.
+	 * If not nested over a rseq critical section, restart is useless.
+	 * Clear the rseq_cs pointer and return.
+	 */
+	if (!in_rseq_cs(ip, &rseq_cs))
+		return clear_rseq_cs(t);
+	ret = rseq_need_restart(t, rseq_cs.flags);
+	if (ret <= 0)
+		return ret;
+	ret = clear_rseq_cs(t);
+	if (ret)
+		return ret;
+	trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
+			    rseq_cs.abort_ip);
+	instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
+	return 0;
+}
+
+/*
+ * This resume handler must always be executed between any of:
+ * - preemption,
+ * - signal delivery,
+ * and return to user-space.
+ *
+ * This is how we can ensure that the entire rseq critical section,
+ * consisting of both the C part and the assembly instruction sequence,
+ * will issue the commit instruction only if executed atomically with
+ * respect to other threads scheduled on the same CPU, and with respect
+ * to signal handlers.
+ */
+void __rseq_handle_notify_resume(struct pt_regs *regs)
+{
+	struct task_struct *t = current;
+	int ret;
+
+	if (unlikely(t->flags & PF_EXITING))
+		return;
+	if (unlikely(!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq))))
+		goto error;
+	ret = rseq_ip_fixup(regs);
+	if (unlikely(ret < 0))
+		goto error;
+	if (unlikely(rseq_update_cpu_id(t)))
+		goto error;
+	return;
+
+error:
+	force_sig(SIGSEGV, t);
+}
+
+#ifdef CONFIG_DEBUG_RSEQ
+
+/*
+ * Terminate the process if a syscall is issued within a restartable
+ * sequence.
+ */
+void rseq_syscall(struct pt_regs *regs)
+{
+	unsigned long ip = instruction_pointer(regs);
+	struct task_struct *t = current;
+	struct rseq_cs rseq_cs;
+
+	if (!t->rseq)
+		return;
+	if (!access_ok(VERIFY_READ, t->rseq, sizeof(*t->rseq)) ||
+	    rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
+		force_sig(SIGSEGV, t);
+}
+
+#endif
+
+/*
+ * sys_rseq - set up restartable sequences for the caller thread.
+ */
+SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
+		int, flags, u32, sig)
+{
+	int ret;
+
+	if (flags & RSEQ_FLAG_UNREGISTER) {
+		/* Unregister rseq for current thread. */
+		if (current->rseq != rseq || !current->rseq)
+			return -EINVAL;
+		if (current->rseq_len != rseq_len)
+			return -EINVAL;
+		if (current->rseq_sig != sig)
+			return -EPERM;
+		ret = rseq_reset_rseq_cpu_id(current);
+		if (ret)
+			return ret;
+		current->rseq = NULL;
+		current->rseq_len = 0;
+		current->rseq_sig = 0;
+		return 0;
+	}
+
+	if (unlikely(flags))
+		return -EINVAL;
+
+	if (current->rseq) {
+		/*
+		 * If rseq is already registered, check whether
+		 * the provided address differs from the prior
+		 * one.
+		 */
+		if (current->rseq != rseq || current->rseq_len != rseq_len)
+			return -EINVAL;
+		if (current->rseq_sig != sig)
+			return -EPERM;
+		/* Already registered. */
+		return -EBUSY;
+	}
+
+	/*
+	 * If there was no rseq previously registered,
+	 * ensure the provided rseq is properly aligned and valid.
+	 */
+	if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
+	    rseq_len != sizeof(*rseq))
+		return -EINVAL;
+	if (!access_ok(VERIFY_WRITE, rseq, rseq_len))
+		return -EFAULT;
+	current->rseq = rseq;
+	current->rseq_len = rseq_len;
+	current->rseq_sig = sig;
+	/*
+	 * If rseq was previously inactive, and has just been
+	 * registered, ensure the cpu_id_start and cpu_id fields
+	 * are updated before returning to user-space.
+	 */
+	rseq_set_notify_resume(current);
+
+	return 0;
+}
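To make the algorithm described at the top of kernel/rseq.c concrete, here is a hedged, x86-64-only user-space sketch of a per-CPU add following steps [1]-[3] of that comment. The __rseq_cs/__rseq_failure section names, the 0x53053053 signature, and the rseq_abi TLS variable are conventions of this example (borrowed from common rseq user-space code), not mandated by this patch; only the struct rseq_cs layout and the signature word at abort_ip - 4 are ABI.

/* x86-64 sketch; assumes rseq_abi was registered with signature 0x53053053. */
#include <linux/rseq.h>
#include <stdint.h>

extern __thread struct rseq rseq_abi;

/* Add @count to @v if still running on @cpu; returns -1 on abort (retry). */
static int rseq_addv(intptr_t *v, intptr_t count, int cpu)
{
	__asm__ __volatile__ goto (
		/* struct rseq_cs descriptor: version=0, flags=0, ips. */
		".pushsection __rseq_cs, \"aw\"\n\t"
		".balign 32\n\t"
		"3:\n\t"
		".long 0x0, 0x0\n\t"
		".quad 1f, 2f - 1f, 4f\n\t"	/* start_ip, post_commit_offset, abort_ip */
		".popsection\n\t"
		/* [1] publish the descriptor as the active critical section. */
		"leaq 3b(%%rip), %%rax\n\t"
		"movq %%rax, %[rseq_cs]\n\t"
		"1:\n\t"
		/* [2] still on the expected CPU? Otherwise take abort_ip. */
		"cmpl %[cpu_id], %[current_cpu_id]\n\t"
		"jnz 4f\n\t"
		/* [3] single commit instruction ends the critical section. */
		"addq %[count], %[v]\n\t"
		"2:\n\t"
		".pushsection __rseq_failure, \"ax\"\n\t"
		/* Signature word the kernel checks at abort_ip - 4. */
		".long 0x53053053\n\t"
		"4:\n\t"
		"jmp %l[abort]\n\t"
		".popsection\n\t"
		: /* no outputs */
		: [cpu_id]		"r" (cpu),
		  [current_cpu_id]	"m" (rseq_abi.cpu_id),
		  [rseq_cs]		"m" (rseq_abi.rseq_cs),
		  [v]			"m" (*v),
		  [count]		"er" (count)
		: "memory", "cc", "rax"
		: abort);
	return 0;
abort:
	return -1;	/* preempted/migrated/signalled: caller retries */
}

A caller loops on failure: re-read cpu_id_start, then retry rseq_addv() against the (possibly different) CPU's slot.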
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e9866f86f304..a98d54cd5535 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1191,6 +1191,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p);
 		p->se.nr_migrations++;
+		rseq_migrate(p);
 		perf_event_task_migrate(p);
 	}
 
@@ -2634,6 +2635,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 {
 	sched_info_switch(rq, prev, next);
 	perf_event_task_sched_out(prev, next);
+	rseq_preempt(prev);
 	fire_sched_out_preempt_notifiers(prev, next);
 	prepare_task(next);
 	prepare_arch_switch(next);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 183169c2a75b..86f832d6ff6f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -432,3 +432,6 @@ COND_SYSCALL(setresgid16);
 COND_SYSCALL(setresuid16);
 COND_SYSCALL(setreuid16);
 COND_SYSCALL(setuid16);
+
+/* restartable sequence */
+COND_SYSCALL(rseq);