author		Mathieu Desnoyers <mathieu.desnoyers@efficios.com>	2018-06-02 08:43:54 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2018-06-06 05:58:31 -0400
commit		d7822b1e24f2df5df98c76f0e94a5416349ff759 (patch)
tree		bb67c4fc4f588a110c6277aba639cfe79430e54a /include/linux/sched.h
parent		b575e837215325544b0dbcef912f13f369de4f3f (diff)
rseq: Introduce restartable sequences system call
Expose a new system call allowing each thread to register one userspace
memory area to be used as an ABI between kernel and user-space for two
purposes: user-space restartable sequences and quick access to read the
current CPU number value from user-space.

* Restartable sequences (per-cpu atomics)

Restartable sequences allow user-space to perform update operations on
per-cpu data without requiring heavy-weight atomic operations.

The restartable critical sections (per-cpu atomics) work was started by
Paul Turner and Andrew Hunter. It lets the kernel handle restart of
critical sections. [1] [2] The re-implementation proposed here brings a
few simplifications to the ABI which facilitate porting to other
architectures and speed up the user-space fast path.

Here are benchmarks of various rseq use-cases.

Test hardware:

arm32:  ARMv7 Processor rev 4 (v7l) "Cubietruck", 2-core
x86-64: Intel E5-2630 v3@2.40GHz, 16-core, hyperthreading

The following benchmarks were all performed on a single thread.

* Per-CPU statistic counter increment

            getcpu+atomic (ns/op)    rseq (ns/op)    speedup
arm32:             344.0                 31.4          11.0
x86-64:             15.3                  2.0           7.7

* LTTng-UST: write event 32-bit header, 32-bit payload into tracer
  per-cpu buffer

            getcpu+atomic (ns/op)    rseq (ns/op)    speedup
arm32:            2502.0               2250.0           1.1
x86-64:            117.4                 98.0           1.2

* liburcu percpu: lock-unlock pair, dereference, read/compare word

            getcpu+atomic (ns/op)    rseq (ns/op)    speedup
arm32:             751.0                128.5           5.8
x86-64:             53.4                 28.6           1.9

* jemalloc memory allocator adapted to use rseq

Using rseq with per-cpu memory pools in jemalloc at Facebook (based on
the 2016 rseq implementation): the production workload sees a 1-2%
improvement in average response-time latency, and the P99 overall
latency drops by 2-3%.

* Reading the current CPU number

Reading the current CPU number on which the caller thread is running is
sped up by keeping the current CPU number up to date within the cpu_id
field of the memory area registered by the thread.

This is done by making scheduler preemption set the TIF_NOTIFY_RESUME
flag on the current thread. Upon return to user-space, a notify-resume
handler updates the current CPU value within the registered user-space
memory area. User-space can then read the current CPU number directly
from memory (a user-space sketch of this registration and read follows
this commit message).

Keeping the current cpu id in a memory area shared between kernel and
user-space is an improvement over the current mechanisms available to
read the current CPU number, and has the following benefits over
alternative approaches:

- 35x speedup on ARM vs a system call through glibc,
- 20x speedup on x86 compared to calling glibc, which calls the vdso
  executing a "lsl" instruction,
- 14x speedup on x86 compared to an inlined "lsl" instruction,
- Unlike vdso approaches, this cpu_id value can be read from inline
  assembly, which makes it a useful building block for restartable
  sequences,
- The approach of reading the cpu id through a memory mapping shared
  between kernel and user-space is portable (e.g. ARM), which is not
  the case for the lsl-based x86 vdso.

On x86, yet another possible approach would be to use the gs segment
selector to point to user-space per-cpu data. This approach performs
similarly to the cpu id cache, but it has two disadvantages: it is not
portable, and it is incompatible with existing applications already
using the gs segment selector for other purposes.
Benchmarking various approaches for reading the current CPU number:

ARMv7 Processor rev 4 (v7l)
Machine model: Cubietruck
- Baseline (empty loop):                            8.4 ns
- Read CPU from rseq cpu_id:                       16.7 ns
- Read CPU from rseq cpu_id (lazy register):       19.8 ns
- glibc 2.19-0ubuntu6.6 getcpu:                   301.8 ns
- getcpu system call:                             234.9 ns

x86-64 Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz:
- Baseline (empty loop):                            0.8 ns
- Read CPU from rseq cpu_id:                        0.8 ns
- Read CPU from rseq cpu_id (lazy register):        0.8 ns
- Read using gs segment selector:                   0.8 ns
- "lsl" inline assembly:                           13.0 ns
- glibc 2.19-0ubuntu6 getcpu:                      16.6 ns
- getcpu system call:                              53.9 ns

- Speed (benchmark taken on v8 of patchset)

Running 10 runs of hackbench -l 100000 seems to indicate, contrary to
expectations, that enabling CONFIG_RSEQ slightly accelerates the
scheduler:

Configuration: 2 sockets * 8-core Intel(R) Xeon(R) CPU E5-2630 v3 @
2.40GHz (directly on hardware, hyperthreading disabled in BIOS, energy
saving disabled in BIOS, turboboost disabled in BIOS, cpuidle.off=1
kernel parameter), with a Linux v4.6 defconfig+localyesconfig,
restartable sequences series applied.

* CONFIG_RSEQ=n
  avg.:      41.37 s
  std.dev.:   0.36 s

* CONFIG_RSEQ=y
  avg.:      40.46 s
  std.dev.:   0.33 s

- Size

On x86-64, between CONFIG_RSEQ=n/y, the text size increase of vmlinux
is 567 bytes, and the data size increase of vmlinux is 5696 bytes.

[1] https://lwn.net/Articles/650333/
[2] http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Watson <davejwatson@fb.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: "H . Peter Anvin" <hpa@zytor.com>
Cc: Chris Lameter <cl@linux.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Andrew Hunter <ahh@google.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Maurer <bmaurer@fb.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: linux-api@vger.kernel.org
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20151027235635.16059.11630.stgit@pjt-glaptop.roam.corp.google.com
Link: http://lkml.kernel.org/r/20150624222609.6116.86035.stgit@kitami.mtv.corp.google.com
Link: https://lkml.kernel.org/r/20180602124408.8430-3-mathieu.desnoyers@efficios.com
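[Editor's sketch] A minimal, hedged user-space illustration of the registration and cpu_id read described in the commit message above. It is not part of this patch: it assumes the uapi struct rseq layout, the 32-byte alignment requirement, and the rseq(rseq, rseq_len, flags, sig) argument order added by the uapi header in this series, that __NR_rseq is available from kernel headers at least as new as this series, and RSEQ_SIG is simply an application-chosen signature value.

/*
 * Illustrative sketch only (not from this patch): register a per-thread
 * struct rseq area and read the current CPU number from it.
 */
#define _GNU_SOURCE
#include <linux/rseq.h>		/* uapi struct rseq, added by this series */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define RSEQ_SIG	0x53053053	/* application-chosen abort signature */

/* One registered area per thread; the uapi type is 32-byte aligned. */
static __thread struct rseq rseq_area __attribute__((aligned(32)));

static int rseq_register_current_thread(void)
{
	/* sys_rseq(rseq, rseq_len, flags, sig): flags == 0 registers. */
	return syscall(__NR_rseq, &rseq_area, sizeof(rseq_area), 0, RSEQ_SIG);
}

int main(void)
{
	if (rseq_register_current_thread()) {
		perror("rseq registration");
		return 1;
	}
	/*
	 * The notify-resume handler keeps cpu_id up to date, so the value
	 * can be read directly from memory, without a system call.
	 */
	printf("current cpu: %u\n", rseq_area.cpu_id);
	return 0;
}

Since glibc of this era does not register rseq itself, each thread has to perform the registration manually, which is why the area above is thread-local.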
Diffstat (limited to 'include/linux/sched.h')
-rw-r--r--	include/linux/sched.h	134
1 file changed, 134 insertions(+), 0 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 14e4f9c12337..3aa4fcb74e76 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -27,6 +27,7 @@
 #include <linux/signal_types.h>
 #include <linux/mm_types_task.h>
 #include <linux/task_io_accounting.h>
+#include <linux/rseq.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
 struct audit_context;
@@ -1047,6 +1048,17 @@ struct task_struct {
 	unsigned long			numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_RSEQ
+	struct rseq __user *rseq;
+	u32 rseq_len;
+	u32 rseq_sig;
+	/*
+	 * RmW on rseq_event_mask must be performed atomically
+	 * with respect to preemption.
+	 */
+	unsigned long rseq_event_mask;
+#endif
+
 	struct tlbflush_unmap_batch	tlb_ubc;
 
 	struct rcu_head			rcu;
@@ -1757,4 +1769,126 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
+#ifdef CONFIG_RSEQ
+
+/*
+ * Map the event mask on the user-space ABI enum rseq_cs_flags
+ * for direct mask checks.
+ */
+enum rseq_event_mask_bits {
+	RSEQ_EVENT_PREEMPT_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
+	RSEQ_EVENT_SIGNAL_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
+	RSEQ_EVENT_MIGRATE_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
+};
+
+enum rseq_event_mask {
+	RSEQ_EVENT_PREEMPT	= (1U << RSEQ_EVENT_PREEMPT_BIT),
+	RSEQ_EVENT_SIGNAL	= (1U << RSEQ_EVENT_SIGNAL_BIT),
+	RSEQ_EVENT_MIGRATE	= (1U << RSEQ_EVENT_MIGRATE_BIT),
+};
+
+static inline void rseq_set_notify_resume(struct task_struct *t)
+{
+	if (t->rseq)
+		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+}
+
+void __rseq_handle_notify_resume(struct pt_regs *regs);
+
+static inline void rseq_handle_notify_resume(struct pt_regs *regs)
+{
+	if (current->rseq)
+		__rseq_handle_notify_resume(regs);
+}
+
+static inline void rseq_signal_deliver(struct pt_regs *regs)
+{
+	preempt_disable();
+	__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
+	preempt_enable();
+	rseq_handle_notify_resume(regs);
+}
+
+/* rseq_preempt() requires preemption to be disabled. */
+static inline void rseq_preempt(struct task_struct *t)
+{
+	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
+	rseq_set_notify_resume(t);
+}
+
+/* rseq_migrate() requires preemption to be disabled. */
+static inline void rseq_migrate(struct task_struct *t)
+{
+	__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
+	rseq_set_notify_resume(t);
+}
+
+/*
+ * If parent process has a registered restartable sequences area, the
+ * child inherits. Only applies when forking a process, not a thread. In
+ * case a parent fork() in the middle of a restartable sequence, set the
+ * resume notifier to force the child to retry.
+ */
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+{
+	if (clone_flags & CLONE_THREAD) {
+		t->rseq = NULL;
+		t->rseq_len = 0;
+		t->rseq_sig = 0;
+		t->rseq_event_mask = 0;
+	} else {
+		t->rseq = current->rseq;
+		t->rseq_len = current->rseq_len;
+		t->rseq_sig = current->rseq_sig;
+		t->rseq_event_mask = current->rseq_event_mask;
+		rseq_preempt(t);
+	}
+}
+
+static inline void rseq_execve(struct task_struct *t)
+{
+	t->rseq = NULL;
+	t->rseq_len = 0;
+	t->rseq_sig = 0;
+	t->rseq_event_mask = 0;
+}
+
+#else
+
+static inline void rseq_set_notify_resume(struct task_struct *t)
+{
+}
+static inline void rseq_handle_notify_resume(struct pt_regs *regs)
+{
+}
+static inline void rseq_signal_deliver(struct pt_regs *regs)
+{
+}
+static inline void rseq_preempt(struct task_struct *t)
+{
+}
+static inline void rseq_migrate(struct task_struct *t)
+{
+}
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+{
+}
+static inline void rseq_execve(struct task_struct *t)
+{
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_RSEQ
+
+void rseq_syscall(struct pt_regs *regs);
+
+#else
+
+static inline void rseq_syscall(struct pt_regs *regs)
+{
+}
+
+#endif
+
 #endif
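[Editor's sketch] A hypothetical illustration of how the helpers declared in this header are expected to be called from architecture code; the example_* function names are invented here, and the real call sites live in the signal-delivery and return-to-user-space paths added by other patches in this series. The empty !CONFIG_RSEQ stubs above exist precisely so such call sites need no #ifdefs.

#include <linux/sched.h>

/* Hypothetical arch glue, for illustration only. */
static void example_exit_to_usermode(struct pt_regs *regs)
{
	/*
	 * TIF_NOTIFY_RESUME was set via rseq_set_notify_resume(), e.g. by
	 * rseq_preempt() or rseq_migrate() from the scheduler/migration
	 * paths elsewhere in this series.  Fix up any interrupted
	 * restartable critical section and refresh the cpu_id field
	 * before returning to user-space.
	 */
	rseq_handle_notify_resume(regs);
}

static void example_setup_signal_frame(struct pt_regs *regs)
{
	/*
	 * Record the signal event and perform the fixup before the signal
	 * handler runs on top of the interrupted critical section.
	 */
	rseq_signal_deliver(regs);
}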