Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 26
-rw-r--r--  kernel/acct.c | 10
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/bpf/Makefile | 1
-rw-r--r--  kernel/bpf/core.c | 534
-rw-r--r--  kernel/capability.c | 4
-rw-r--r--  kernel/cgroup.c | 461
-rw-r--r--  kernel/cgroup_freezer.c | 2
-rw-r--r--  kernel/cpu.c | 33
-rw-r--r--  kernel/cpuset.c | 500
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2
-rw-r--r--  kernel/delayacct.c | 62
-rw-r--r--  kernel/events/core.c | 40
-rw-r--r--  kernel/fork.c | 55
-rw-r--r--  kernel/futex.c | 402
-rw-r--r--  kernel/irq/generic-chip.c | 5
-rw-r--r--  kernel/irq/irqdomain.c | 2
-rw-r--r--  kernel/irq_work.c | 110
-rw-r--r--  kernel/kexec.c | 4
-rw-r--r--  kernel/kprobes.c | 14
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/locking/lockdep.c | 2
-rw-r--r--  kernel/locking/mcs_spinlock.c | 8
-rw-r--r--  kernel/locking/mcs_spinlock.h | 4
-rw-r--r--  kernel/locking/mutex.c | 39
-rw-r--r--  kernel/locking/qrwlock.c | 9
-rw-r--r--  kernel/locking/rtmutex-debug.c | 5
-rw-r--r--  kernel/locking/rtmutex-debug.h | 7
-rw-r--r--  kernel/locking/rtmutex.c | 562
-rw-r--r--  kernel/locking/rtmutex.h | 7
-rw-r--r--  kernel/locking/rtmutex_common.h | 22
-rw-r--r--  kernel/locking/rwsem-xadd.c | 4
-rw-r--r--  kernel/module.c | 4
-rw-r--r--  kernel/params.c | 1
-rw-r--r--  kernel/power/Kconfig | 3
-rw-r--r--  kernel/power/hibernate.c | 6
-rw-r--r--  kernel/power/main.c | 4
-rw-r--r--  kernel/power/suspend.c | 2
-rw-r--r--  kernel/ptrace.c | 8
-rw-r--r--  kernel/rcu/rcu.h | 8
-rw-r--r--  kernel/rcu/rcutorture.c | 4
-rw-r--r--  kernel/rcu/srcu.c | 4
-rw-r--r--  kernel/rcu/tree.c | 59
-rw-r--r--  kernel/rcu/tree.h | 36
-rw-r--r--  kernel/rcu/tree_plugin.h | 302
-rw-r--r--  kernel/rcu/update.c | 3
-rw-r--r--  kernel/sched/core.c | 128
-rw-r--r--  kernel/sched/cpuacct.c | 2
-rw-r--r--  kernel/sched/deadline.c | 18
-rw-r--r--  kernel/sched/fair.c | 244
-rw-r--r--  kernel/sched/idle.c | 8
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/proc.c | 7
-rw-r--r--  kernel/sched/rt.c | 30
-rw-r--r--  kernel/sched/sched.h | 38
-rw-r--r--  kernel/sched/wait.c | 30
-rw-r--r--  kernel/seccomp.c | 430
-rw-r--r--  kernel/signal.c | 4
-rw-r--r--  kernel/smp.c | 9
-rw-r--r--  kernel/sys.c | 4
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  kernel/system_keyring.c | 1
-rw-r--r--  kernel/time/Kconfig | 9
-rw-r--r--  kernel/time/Makefile | 19
-rw-r--r--  kernel/time/alarmtimer.c | 2
-rw-r--r--  kernel/time/clockevents.c | 10
-rw-r--r--  kernel/time/clocksource.c | 12
-rw-r--r--  kernel/time/hrtimer.c (renamed from kernel/hrtimer.c) | 125
-rw-r--r--  kernel/time/itimer.c (renamed from kernel/itimer.c) | 0
-rw-r--r--  kernel/time/ntp.c | 15
-rw-r--r--  kernel/time/ntp_internal.h | 2
-rw-r--r--  kernel/time/posix-cpu-timers.c (renamed from kernel/posix-cpu-timers.c) | 0
-rw-r--r--  kernel/time/posix-timers.c (renamed from kernel/posix-timers.c) | 2
-rw-r--r--  kernel/time/sched_clock.c | 4
-rw-r--r--  kernel/time/tick-internal.h | 2
-rw-r--r--  kernel/time/tick-sched.c | 20
-rw-r--r--  kernel/time/time.c (renamed from kernel/time.c) | 64
-rw-r--r--  kernel/time/timeconst.bc (renamed from kernel/timeconst.bc) | 0
-rw-r--r--  kernel/time/timekeeping.c | 1147
-rw-r--r--  kernel/time/timekeeping.h | 20
-rw-r--r--  kernel/time/timekeeping_debug.c | 2
-rw-r--r--  kernel/time/timekeeping_internal.h | 17
-rw-r--r--  kernel/time/timer.c (renamed from kernel/timer.c) | 34
-rw-r--r--  kernel/time/udelay_test.c | 168
-rw-r--r--  kernel/torture.c | 2
-rw-r--r--  kernel/trace/Kconfig | 5
-rw-r--r--  kernel/trace/Makefile | 1
-rw-r--r--  kernel/trace/ftrace.c | 445
-rw-r--r--  kernel/trace/ring_buffer.c | 26
-rw-r--r--  kernel/trace/trace.c | 107
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_clock.c | 9
-rw-r--r--  kernel/trace/trace_event_perf.c | 12
-rw-r--r--  kernel/trace/trace_events.c | 60
-rw-r--r--  kernel/trace/trace_events_filter.c | 73
-rw-r--r--  kernel/trace/trace_functions_graph.c | 43
-rw-r--r--  kernel/trace/trace_output.c | 282
-rw-r--r--  kernel/trace/trace_output.h | 4
-rw-r--r--  kernel/trace/trace_seq.c | 428
-rw-r--r--  kernel/trace/trace_uprobe.c | 3
-rw-r--r--  kernel/tsacct.c | 19
-rw-r--r--  kernel/workqueue.c | 206
103 files changed, 5096 insertions, 2648 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f2a8b6246ce9..0026cf531769 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,12 +3,11 @@
 #
 
 obj-y = fork.o exec_domain.o panic.o \
-	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
-	    sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
+	    cpu.o exit.o softirq.o resource.o \
+	    sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
-	    extable.o params.o posix-timers.o \
-	    kthread.o sys_ni.o posix-cpu-timers.o \
-	    hrtimer.o nsproxy.o \
+	    extable.o params.o \
+	    kthread.o sys_ni.o nsproxy.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o groups.o smpboot.o
 
@@ -87,6 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_CPU_PM) += cpu_pm.o
+obj-$(CONFIG_NET) += bpf/
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
@@ -110,22 +110,6 @@ targets += config_data.h
 $(obj)/config_data.h: $(obj)/config_data.gz FORCE
 	$(call filechk,ikconfiggz)
 
-$(obj)/time.o: $(obj)/timeconst.h
-
-quiet_cmd_hzfile = HZFILE $@
-      cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
-
-targets += hz.bc
-$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
-	$(call if_changed,hzfile)
-
-quiet_cmd_bc = BC $@
-      cmd_bc = bc -q $(filter-out FORCE,$^) > $@
-
-targets += timeconst.h
-$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
-	$(call if_changed,bc)
-
 ###############################################################################
 #
 # Roll all the X.509 certificates that we can find together and pull them into
diff --git a/kernel/acct.c b/kernel/acct.c
index 808a86ff229d..a1844f14c6d6 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -458,9 +458,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	acct_t ac;
 	mm_segment_t fs;
 	unsigned long flim;
-	u64 elapsed;
-	u64 run_time;
-	struct timespec uptime;
+	u64 elapsed, run_time;
 	struct tty_struct *tty;
 	const struct cred *orig_cred;
 
@@ -484,10 +482,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
 
 	/* calculate run_time in nsec*/
-	do_posix_clock_monotonic_gettime(&uptime);
-	run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
-	run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
-		+ current->group_leader->start_time.tv_nsec;
+	run_time = ktime_get_ns();
+	run_time -= current->group_leader->start_time;
 	/* convert nsec -> AHZ */
 	elapsed = nsec_to_AHZ(run_time);
 #if ACCT_VERSION==3
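The acct.c hunk above collapses the timespec-based uptime arithmetic into a single 64-bit nanosecond subtraction: ktime_get_ns() minus the group leader's now-u64 start_time. The following is a minimal user-space sketch of the same pattern; clock_gettime(CLOCK_MONOTONIC) stands in for the kernel's ktime_get_ns(), and the AHZ value of 100 is an illustrative assumption standing in for nsec_to_AHZ(), not the kernel's definitions.

/* User-space sketch of the new run_time calculation in do_acct_process(). */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000ULL
#define AHZ 100	/* assumed accounting tick rate, mirrors what nsec_to_AHZ() targets */

static uint64_t monotonic_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

int main(void)
{
	uint64_t start_ns = monotonic_ns();	/* recorded once, like task start_time */
	/* ... the process does its work here ... */
	uint64_t run_time = monotonic_ns() - start_ns;	/* single u64 subtraction */
	uint64_t elapsed = run_time / (NSEC_PER_SEC / AHZ);	/* nsec -> AHZ ticks */

	printf("elapsed: %llu AHZ ticks\n", (unsigned long long)elapsed);
	return 0;
}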
diff --git a/kernel/audit.c b/kernel/audit.c
index 3ef2e0e797e8..ba2ff5a5c600 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1677,7 +1677,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
 	audit_log_format(ab, " %s=", prefix);
 	CAP_FOR_EACH_U32(i) {
 		audit_log_format(ab, "%08x",
-				 cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
+				 cap->cap[CAP_LAST_U32 - i]);
 	}
 }
 
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
new file mode 100644
index 000000000000..6a71145e2769
--- /dev/null
+++ b/kernel/bpf/Makefile
@@ -0,0 +1 @@
obj-y := core.o
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
new file mode 100644
index 000000000000..7f0dbcbb34af
--- /dev/null
+++ b/kernel/bpf/core.c
@@ -0,0 +1,534 @@
1/*
2 * Linux Socket Filter - Kernel level socket filtering
3 *
4 * Based on the design of the Berkeley Packet Filter. The new
5 * internal format has been designed by PLUMgrid:
6 *
7 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
8 *
9 * Authors:
10 *
11 * Jay Schulist <jschlst@samba.org>
12 * Alexei Starovoitov <ast@plumgrid.com>
13 * Daniel Borkmann <dborkman@redhat.com>
14 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version
18 * 2 of the License, or (at your option) any later version.
19 *
20 * Andi Kleen - Fix a few bad bugs and races.
21 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
22 */
23#include <linux/filter.h>
24#include <linux/skbuff.h>
25#include <asm/unaligned.h>
26
27/* Registers */
28#define BPF_R0 regs[BPF_REG_0]
29#define BPF_R1 regs[BPF_REG_1]
30#define BPF_R2 regs[BPF_REG_2]
31#define BPF_R3 regs[BPF_REG_3]
32#define BPF_R4 regs[BPF_REG_4]
33#define BPF_R5 regs[BPF_REG_5]
34#define BPF_R6 regs[BPF_REG_6]
35#define BPF_R7 regs[BPF_REG_7]
36#define BPF_R8 regs[BPF_REG_8]
37#define BPF_R9 regs[BPF_REG_9]
38#define BPF_R10 regs[BPF_REG_10]
39
40/* Named registers */
41#define DST regs[insn->dst_reg]
42#define SRC regs[insn->src_reg]
43#define FP regs[BPF_REG_FP]
44#define ARG1 regs[BPF_REG_ARG1]
45#define CTX regs[BPF_REG_CTX]
46#define IMM insn->imm
47
48/* No hurry in this branch
49 *
50 * Exported for the bpf jit load helper.
51 */
52void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
53{
54 u8 *ptr = NULL;
55
56 if (k >= SKF_NET_OFF)
57 ptr = skb_network_header(skb) + k - SKF_NET_OFF;
58 else if (k >= SKF_LL_OFF)
59 ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
60 if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
61 return ptr;
62
63 return NULL;
64}
65
66/* Base function for offset calculation. Needs to go into .text section,
67 * therefore keeping it non-static as well; will also be used by JITs
68 * anyway later on, so do not let the compiler omit it.
69 */
70noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
71{
72 return 0;
73}
74
75/**
76 * __bpf_prog_run - run eBPF program on a given context
77 * @ctx: is the data we are operating on
78 * @insn: is the array of eBPF instructions
79 *
80 * Decode and execute eBPF instructions.
81 */
82static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
83{
84 u64 stack[MAX_BPF_STACK / sizeof(u64)];
85 u64 regs[MAX_BPF_REG], tmp;
86 static const void *jumptable[256] = {
87 [0 ... 255] = &&default_label,
88 /* Now overwrite non-defaults ... */
89 /* 32 bit ALU operations */
90 [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
91 [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
92 [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
93 [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
94 [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
95 [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
96 [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
97 [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
98 [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
99 [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
100 [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
101 [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
102 [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
103 [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
104 [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
105 [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
106 [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
107 [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
108 [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
109 [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
110 [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
111 [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
112 [BPF_ALU | BPF_NEG] = &&ALU_NEG,
113 [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
114 [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
115 /* 64 bit ALU operations */
116 [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
117 [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
118 [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
119 [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
120 [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
121 [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
122 [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
123 [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
124 [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
125 [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
126 [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
127 [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
128 [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
129 [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
130 [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
131 [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
132 [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
133 [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
134 [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
135 [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
136 [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
137 [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
138 [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
139 [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
140 [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
141 /* Call instruction */
142 [BPF_JMP | BPF_CALL] = &&JMP_CALL,
143 /* Jumps */
144 [BPF_JMP | BPF_JA] = &&JMP_JA,
145 [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
146 [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
147 [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
148 [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
149 [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
150 [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
151 [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
152 [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
153 [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
154 [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
155 [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
156 [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
157 [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
158 [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
159 /* Program return */
160 [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
161 /* Store instructions */
162 [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
163 [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
164 [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
165 [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
166 [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
167 [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
168 [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
169 [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
170 [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
171 [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
172 /* Load instructions */
173 [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
174 [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
175 [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
176 [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
177 [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
178 [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
179 [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
180 [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
181 [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
182 [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
183 };
184 void *ptr;
185 int off;
186
187#define CONT ({ insn++; goto select_insn; })
188#define CONT_JMP ({ insn++; goto select_insn; })
189
190 FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
191 ARG1 = (u64) (unsigned long) ctx;
192
193 /* Registers used in classic BPF programs need to be reset first. */
194 regs[BPF_REG_A] = 0;
195 regs[BPF_REG_X] = 0;
196
197select_insn:
198 goto *jumptable[insn->code];
199
200 /* ALU */
201#define ALU(OPCODE, OP) \
202 ALU64_##OPCODE##_X: \
203 DST = DST OP SRC; \
204 CONT; \
205 ALU_##OPCODE##_X: \
206 DST = (u32) DST OP (u32) SRC; \
207 CONT; \
208 ALU64_##OPCODE##_K: \
209 DST = DST OP IMM; \
210 CONT; \
211 ALU_##OPCODE##_K: \
212 DST = (u32) DST OP (u32) IMM; \
213 CONT;
214
215 ALU(ADD, +)
216 ALU(SUB, -)
217 ALU(AND, &)
218 ALU(OR, |)
219 ALU(LSH, <<)
220 ALU(RSH, >>)
221 ALU(XOR, ^)
222 ALU(MUL, *)
223#undef ALU
224 ALU_NEG:
225 DST = (u32) -DST;
226 CONT;
227 ALU64_NEG:
228 DST = -DST;
229 CONT;
230 ALU_MOV_X:
231 DST = (u32) SRC;
232 CONT;
233 ALU_MOV_K:
234 DST = (u32) IMM;
235 CONT;
236 ALU64_MOV_X:
237 DST = SRC;
238 CONT;
239 ALU64_MOV_K:
240 DST = IMM;
241 CONT;
242 ALU64_ARSH_X:
243 (*(s64 *) &DST) >>= SRC;
244 CONT;
245 ALU64_ARSH_K:
246 (*(s64 *) &DST) >>= IMM;
247 CONT;
248 ALU64_MOD_X:
249 if (unlikely(SRC == 0))
250 return 0;
251 tmp = DST;
252 DST = do_div(tmp, SRC);
253 CONT;
254 ALU_MOD_X:
255 if (unlikely(SRC == 0))
256 return 0;
257 tmp = (u32) DST;
258 DST = do_div(tmp, (u32) SRC);
259 CONT;
260 ALU64_MOD_K:
261 tmp = DST;
262 DST = do_div(tmp, IMM);
263 CONT;
264 ALU_MOD_K:
265 tmp = (u32) DST;
266 DST = do_div(tmp, (u32) IMM);
267 CONT;
268 ALU64_DIV_X:
269 if (unlikely(SRC == 0))
270 return 0;
271 do_div(DST, SRC);
272 CONT;
273 ALU_DIV_X:
274 if (unlikely(SRC == 0))
275 return 0;
276 tmp = (u32) DST;
277 do_div(tmp, (u32) SRC);
278 DST = (u32) tmp;
279 CONT;
280 ALU64_DIV_K:
281 do_div(DST, IMM);
282 CONT;
283 ALU_DIV_K:
284 tmp = (u32) DST;
285 do_div(tmp, (u32) IMM);
286 DST = (u32) tmp;
287 CONT;
288 ALU_END_TO_BE:
289 switch (IMM) {
290 case 16:
291 DST = (__force u16) cpu_to_be16(DST);
292 break;
293 case 32:
294 DST = (__force u32) cpu_to_be32(DST);
295 break;
296 case 64:
297 DST = (__force u64) cpu_to_be64(DST);
298 break;
299 }
300 CONT;
301 ALU_END_TO_LE:
302 switch (IMM) {
303 case 16:
304 DST = (__force u16) cpu_to_le16(DST);
305 break;
306 case 32:
307 DST = (__force u32) cpu_to_le32(DST);
308 break;
309 case 64:
310 DST = (__force u64) cpu_to_le64(DST);
311 break;
312 }
313 CONT;
314
315 /* CALL */
316 JMP_CALL:
317 /* Function call scratches BPF_R1-BPF_R5 registers,
318 * preserves BPF_R6-BPF_R9, and stores return value
319 * into BPF_R0.
320 */
321 BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
322 BPF_R4, BPF_R5);
323 CONT;
324
325 /* JMP */
326 JMP_JA:
327 insn += insn->off;
328 CONT;
329 JMP_JEQ_X:
330 if (DST == SRC) {
331 insn += insn->off;
332 CONT_JMP;
333 }
334 CONT;
335 JMP_JEQ_K:
336 if (DST == IMM) {
337 insn += insn->off;
338 CONT_JMP;
339 }
340 CONT;
341 JMP_JNE_X:
342 if (DST != SRC) {
343 insn += insn->off;
344 CONT_JMP;
345 }
346 CONT;
347 JMP_JNE_K:
348 if (DST != IMM) {
349 insn += insn->off;
350 CONT_JMP;
351 }
352 CONT;
353 JMP_JGT_X:
354 if (DST > SRC) {
355 insn += insn->off;
356 CONT_JMP;
357 }
358 CONT;
359 JMP_JGT_K:
360 if (DST > IMM) {
361 insn += insn->off;
362 CONT_JMP;
363 }
364 CONT;
365 JMP_JGE_X:
366 if (DST >= SRC) {
367 insn += insn->off;
368 CONT_JMP;
369 }
370 CONT;
371 JMP_JGE_K:
372 if (DST >= IMM) {
373 insn += insn->off;
374 CONT_JMP;
375 }
376 CONT;
377 JMP_JSGT_X:
378 if (((s64) DST) > ((s64) SRC)) {
379 insn += insn->off;
380 CONT_JMP;
381 }
382 CONT;
383 JMP_JSGT_K:
384 if (((s64) DST) > ((s64) IMM)) {
385 insn += insn->off;
386 CONT_JMP;
387 }
388 CONT;
389 JMP_JSGE_X:
390 if (((s64) DST) >= ((s64) SRC)) {
391 insn += insn->off;
392 CONT_JMP;
393 }
394 CONT;
395 JMP_JSGE_K:
396 if (((s64) DST) >= ((s64) IMM)) {
397 insn += insn->off;
398 CONT_JMP;
399 }
400 CONT;
401 JMP_JSET_X:
402 if (DST & SRC) {
403 insn += insn->off;
404 CONT_JMP;
405 }
406 CONT;
407 JMP_JSET_K:
408 if (DST & IMM) {
409 insn += insn->off;
410 CONT_JMP;
411 }
412 CONT;
413 JMP_EXIT:
414 return BPF_R0;
415
416 /* STX and ST and LDX*/
417#define LDST(SIZEOP, SIZE) \
418 STX_MEM_##SIZEOP: \
419 *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
420 CONT; \
421 ST_MEM_##SIZEOP: \
422 *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
423 CONT; \
424 LDX_MEM_##SIZEOP: \
425 DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
426 CONT;
427
428 LDST(B, u8)
429 LDST(H, u16)
430 LDST(W, u32)
431 LDST(DW, u64)
432#undef LDST
433 STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
434 atomic_add((u32) SRC, (atomic_t *)(unsigned long)
435 (DST + insn->off));
436 CONT;
437 STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
438 atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
439 (DST + insn->off));
440 CONT;
441 LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
442 off = IMM;
443load_word:
444 /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
445 * only appearing in the programs where ctx ==
446 * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
447 * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
448 * internal BPF verifier will check that BPF_R6 ==
449 * ctx.
450 *
451 * BPF_ABS and BPF_IND are wrappers of function calls,
452 * so they scratch BPF_R1-BPF_R5 registers, preserve
453 * BPF_R6-BPF_R9, and store return value into BPF_R0.
454 *
455 * Implicit input:
456 * ctx == skb == BPF_R6 == CTX
457 *
458 * Explicit input:
459 * SRC == any register
460 * IMM == 32-bit immediate
461 *
462 * Output:
463 * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
464 */
465
466 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
467 if (likely(ptr != NULL)) {
468 BPF_R0 = get_unaligned_be32(ptr);
469 CONT;
470 }
471
472 return 0;
473 LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
474 off = IMM;
475load_half:
476 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
477 if (likely(ptr != NULL)) {
478 BPF_R0 = get_unaligned_be16(ptr);
479 CONT;
480 }
481
482 return 0;
483 LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
484 off = IMM;
485load_byte:
486 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
487 if (likely(ptr != NULL)) {
488 BPF_R0 = *(u8 *)ptr;
489 CONT;
490 }
491
492 return 0;
493 LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
494 off = IMM + SRC;
495 goto load_word;
496 LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
497 off = IMM + SRC;
498 goto load_half;
499 LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
500 off = IMM + SRC;
501 goto load_byte;
502
503 default_label:
504 /* If we ever reach this, we have a bug somewhere. */
505 WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
506 return 0;
507}
508
509void __weak bpf_int_jit_compile(struct bpf_prog *prog)
510{
511}
512
513/**
514 * bpf_prog_select_runtime - select execution runtime for BPF program
515 * @fp: bpf_prog populated with internal BPF program
516 *
517 * try to JIT internal BPF program, if JIT is not available select interpreter
518 * BPF program will be executed via BPF_PROG_RUN() macro
519 */
520void bpf_prog_select_runtime(struct bpf_prog *fp)
521{
522 fp->bpf_func = (void *) __bpf_prog_run;
523
524 /* Probe if internal BPF can be JITed */
525 bpf_int_jit_compile(fp);
526}
527EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
528
529/* free internal BPF program */
530void bpf_prog_free(struct bpf_prog *fp)
531{
532 bpf_jit_free(fp);
533}
534EXPORT_SYMBOL_GPL(bpf_prog_free);
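The interpreter added above, __bpf_prog_run(), dispatches through a 256-entry table of label addresses, so decoding an instruction costs one indirect goto instead of a switch. Below is a stripped-down sketch of that dispatch technique only; the three-opcode toy machine (OP_ADD/OP_MUL/OP_HALT) is a hypothetical stand-in for the real eBPF instruction set, and the code relies on the same GCC/clang "labels as values" extension the kernel uses.

/* Toy threaded interpreter illustrating the jump-table dispatch pattern. */
#include <stdint.h>
#include <stdio.h>

enum { OP_ADD, OP_MUL, OP_HALT };

struct insn {
	uint8_t code;
	int32_t imm;
};

static int64_t run(const struct insn *insn)
{
	static const void *jumptable[256] = {
		[0 ... 255] = &&default_label,	/* unknown opcodes */
		[OP_ADD]  = &&ADD,
		[OP_MUL]  = &&MUL,
		[OP_HALT] = &&HALT,
	};
	int64_t acc = 0;

#define CONT ({ insn++; goto select_insn; })

select_insn:
	goto *jumptable[insn->code];	/* one indirect branch per instruction */

	ADD:
		acc += insn->imm;
		CONT;
	MUL:
		acc *= insn->imm;
		CONT;
	HALT:
		return acc;
	default_label:
		fprintf(stderr, "unknown opcode %02x\n", insn->code);
		return 0;
}

int main(void)
{
	const struct insn prog[] = {
		{ OP_ADD, 7 }, { OP_MUL, 6 }, { OP_HALT, 0 },
	};

	printf("%lld\n", (long long)run(prog));	/* prints 42 */
	return 0;
}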
diff --git a/kernel/capability.c b/kernel/capability.c
index a5cf13c018ce..989f5bfc57dc 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -258,6 +258,10 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 		i++;
 	}
 
+	effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+	permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+	inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+
 	new = prepare_creds();
 	if (!new)
 		return -ENOMEM;
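The capset() hunk above masks the top 32-bit word of each capability set so user space cannot set bits above the last defined capability. A small sketch of the masking idea follows; the placeholder value 36 and the mask computation are illustrative assumptions that mirror, but are not, the kernel's CAP_LAST_CAP and CAP_LAST_U32_VALID_MASK definitions.

/* Sketch of clamping the last 32-bit word of a capability bitmap. */
#include <stdint.h>
#include <stdio.h>

#define LAST_CAP_EXAMPLE 36	/* placeholder for CAP_LAST_CAP */
#define CAPS_PER_U32 32

int main(void)
{
	/* number of valid bits in the last 32-bit word of the capability set */
	unsigned int valid_bits = LAST_CAP_EXAMPLE % CAPS_PER_U32 + 1;
	uint32_t valid_mask = (uint32_t)((1ULL << valid_bits) - 1);
	uint32_t requested = 0xffffffff;	/* user space asks for every bit */

	/* as in capset(): drop bits above the last defined capability */
	printf("last word masked: %#x (mask %#x)\n",
	       (unsigned int)(requested & valid_mask), (unsigned int)valid_mask);
	return 0;
}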
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 70776aec2562..7dc8788cfd52 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root;
149 */ 149 */
150static bool cgrp_dfl_root_visible; 150static bool cgrp_dfl_root_visible;
151 151
152/*
153 * Set by the boot param of the same name and makes subsystems with NULL
154 * ->dfl_files to use ->legacy_files on the default hierarchy.
155 */
156static bool cgroup_legacy_files_on_dfl;
157
152/* some controllers are not supported in the default hierarchy */ 158/* some controllers are not supported in the default hierarchy */
153static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 159static unsigned int cgrp_dfl_root_inhibit_ss_mask;
154#ifdef CONFIG_CGROUP_DEBUG
155 | (1 << debug_cgrp_id)
156#endif
157 ;
158 160
159/* The list of hierarchy roots */ 161/* The list of hierarchy roots */
160 162
@@ -180,13 +182,15 @@ static u64 css_serial_nr_next = 1;
180 */ 182 */
181static int need_forkexit_callback __read_mostly; 183static int need_forkexit_callback __read_mostly;
182 184
183static struct cftype cgroup_base_files[]; 185static struct cftype cgroup_dfl_base_files[];
186static struct cftype cgroup_legacy_base_files[];
184 187
185static void cgroup_put(struct cgroup *cgrp); 188static void cgroup_put(struct cgroup *cgrp);
186static int rebind_subsystems(struct cgroup_root *dst_root, 189static int rebind_subsystems(struct cgroup_root *dst_root,
187 unsigned int ss_mask); 190 unsigned int ss_mask);
188static int cgroup_destroy_locked(struct cgroup *cgrp); 191static int cgroup_destroy_locked(struct cgroup *cgrp);
189static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); 192static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
193 bool visible);
190static void css_release(struct percpu_ref *ref); 194static void css_release(struct percpu_ref *ref);
191static void kill_css(struct cgroup_subsys_state *css); 195static void kill_css(struct cgroup_subsys_state *css);
192static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 196static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
@@ -1037,6 +1041,58 @@ static void cgroup_put(struct cgroup *cgrp)
1037} 1041}
1038 1042
1039/** 1043/**
1044 * cgroup_refresh_child_subsys_mask - update child_subsys_mask
1045 * @cgrp: the target cgroup
1046 *
1047 * On the default hierarchy, a subsystem may request other subsystems to be
1048 * enabled together through its ->depends_on mask. In such cases, more
1049 * subsystems than specified in "cgroup.subtree_control" may be enabled.
1050 *
1051 * This function determines which subsystems need to be enabled given the
1052 * current @cgrp->subtree_control and records it in
1053 * @cgrp->child_subsys_mask. The resulting mask is always a superset of
1054 * @cgrp->subtree_control and follows the usual hierarchy rules.
1055 */
1056static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
1057{
1058 struct cgroup *parent = cgroup_parent(cgrp);
1059 unsigned int cur_ss_mask = cgrp->subtree_control;
1060 struct cgroup_subsys *ss;
1061 int ssid;
1062
1063 lockdep_assert_held(&cgroup_mutex);
1064
1065 if (!cgroup_on_dfl(cgrp)) {
1066 cgrp->child_subsys_mask = cur_ss_mask;
1067 return;
1068 }
1069
1070 while (true) {
1071 unsigned int new_ss_mask = cur_ss_mask;
1072
1073 for_each_subsys(ss, ssid)
1074 if (cur_ss_mask & (1 << ssid))
1075 new_ss_mask |= ss->depends_on;
1076
1077 /*
1078 * Mask out subsystems which aren't available. This can
1079 * happen only if some depended-upon subsystems were bound
1080 * to non-default hierarchies.
1081 */
1082 if (parent)
1083 new_ss_mask &= parent->child_subsys_mask;
1084 else
1085 new_ss_mask &= cgrp->root->subsys_mask;
1086
1087 if (new_ss_mask == cur_ss_mask)
1088 break;
1089 cur_ss_mask = new_ss_mask;
1090 }
1091
1092 cgrp->child_subsys_mask = cur_ss_mask;
1093}
1094
1095/**
1040 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods 1096 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1041 * @kn: the kernfs_node being serviced 1097 * @kn: the kernfs_node being serviced
1042 * 1098 *
@@ -1208,12 +1264,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
1208 up_write(&css_set_rwsem); 1264 up_write(&css_set_rwsem);
1209 1265
1210 src_root->subsys_mask &= ~(1 << ssid); 1266 src_root->subsys_mask &= ~(1 << ssid);
1211 src_root->cgrp.child_subsys_mask &= ~(1 << ssid); 1267 src_root->cgrp.subtree_control &= ~(1 << ssid);
1268 cgroup_refresh_child_subsys_mask(&src_root->cgrp);
1212 1269
1213 /* default hierarchy doesn't enable controllers by default */ 1270 /* default hierarchy doesn't enable controllers by default */
1214 dst_root->subsys_mask |= 1 << ssid; 1271 dst_root->subsys_mask |= 1 << ssid;
1215 if (dst_root != &cgrp_dfl_root) 1272 if (dst_root != &cgrp_dfl_root) {
1216 dst_root->cgrp.child_subsys_mask |= 1 << ssid; 1273 dst_root->cgrp.subtree_control |= 1 << ssid;
1274 cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
1275 }
1217 1276
1218 if (ss->bind) 1277 if (ss->bind)
1219 ss->bind(css); 1278 ss->bind(css);
@@ -1233,8 +1292,6 @@ static int cgroup_show_options(struct seq_file *seq,
1233 for_each_subsys(ss, ssid) 1292 for_each_subsys(ss, ssid)
1234 if (root->subsys_mask & (1 << ssid)) 1293 if (root->subsys_mask & (1 << ssid))
1235 seq_printf(seq, ",%s", ss->name); 1294 seq_printf(seq, ",%s", ss->name);
1236 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1237 seq_puts(seq, ",sane_behavior");
1238 if (root->flags & CGRP_ROOT_NOPREFIX) 1295 if (root->flags & CGRP_ROOT_NOPREFIX)
1239 seq_puts(seq, ",noprefix"); 1296 seq_puts(seq, ",noprefix");
1240 if (root->flags & CGRP_ROOT_XATTR) 1297 if (root->flags & CGRP_ROOT_XATTR)
@@ -1268,6 +1325,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1268 bool all_ss = false, one_ss = false; 1325 bool all_ss = false, one_ss = false;
1269 unsigned int mask = -1U; 1326 unsigned int mask = -1U;
1270 struct cgroup_subsys *ss; 1327 struct cgroup_subsys *ss;
1328 int nr_opts = 0;
1271 int i; 1329 int i;
1272 1330
1273#ifdef CONFIG_CPUSETS 1331#ifdef CONFIG_CPUSETS
@@ -1277,6 +1335,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1277 memset(opts, 0, sizeof(*opts)); 1335 memset(opts, 0, sizeof(*opts));
1278 1336
1279 while ((token = strsep(&o, ",")) != NULL) { 1337 while ((token = strsep(&o, ",")) != NULL) {
1338 nr_opts++;
1339
1280 if (!*token) 1340 if (!*token)
1281 return -EINVAL; 1341 return -EINVAL;
1282 if (!strcmp(token, "none")) { 1342 if (!strcmp(token, "none")) {
@@ -1361,37 +1421,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1361 return -ENOENT; 1421 return -ENOENT;
1362 } 1422 }
1363 1423
1364 /* Consistency checks */
1365
1366 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1424 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1367 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1425 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1368 1426 if (nr_opts != 1) {
1369 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || 1427 pr_err("sane_behavior: no other mount options allowed\n");
1370 opts->cpuset_clone_children || opts->release_agent ||
1371 opts->name) {
1372 pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1373 return -EINVAL; 1428 return -EINVAL;
1374 } 1429 }
1375 } else { 1430 return 0;
1376 /*
1377 * If the 'all' option was specified select all the
1378 * subsystems, otherwise if 'none', 'name=' and a subsystem
1379 * name options were not specified, let's default to 'all'
1380 */
1381 if (all_ss || (!one_ss && !opts->none && !opts->name))
1382 for_each_subsys(ss, i)
1383 if (!ss->disabled)
1384 opts->subsys_mask |= (1 << i);
1385
1386 /*
1387 * We either have to specify by name or by subsystems. (So
1388 * all empty hierarchies must have a name).
1389 */
1390 if (!opts->subsys_mask && !opts->name)
1391 return -EINVAL;
1392 } 1431 }
1393 1432
1394 /* 1433 /*
1434 * If the 'all' option was specified select all the subsystems,
1435 * otherwise if 'none', 'name=' and a subsystem name options were
1436 * not specified, let's default to 'all'
1437 */
1438 if (all_ss || (!one_ss && !opts->none && !opts->name))
1439 for_each_subsys(ss, i)
1440 if (!ss->disabled)
1441 opts->subsys_mask |= (1 << i);
1442
1443 /*
1444 * We either have to specify by name or by subsystems. (So all
1445 * empty hierarchies must have a name).
1446 */
1447 if (!opts->subsys_mask && !opts->name)
1448 return -EINVAL;
1449
1450 /*
1395 * Option noprefix was introduced just for backward compatibility 1451 * Option noprefix was introduced just for backward compatibility
1396 * with the old cpuset, so we allow noprefix only if mounting just 1452 * with the old cpuset, so we allow noprefix only if mounting just
1397 * the cpuset subsystem. 1453 * the cpuset subsystem.
@@ -1399,7 +1455,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1399 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) 1455 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1400 return -EINVAL; 1456 return -EINVAL;
1401 1457
1402
1403 /* Can't specify "none" and some subsystems */ 1458 /* Can't specify "none" and some subsystems */
1404 if (opts->subsys_mask && opts->none) 1459 if (opts->subsys_mask && opts->none)
1405 return -EINVAL; 1460 return -EINVAL;
@@ -1414,8 +1469,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1414 struct cgroup_sb_opts opts; 1469 struct cgroup_sb_opts opts;
1415 unsigned int added_mask, removed_mask; 1470 unsigned int added_mask, removed_mask;
1416 1471
1417 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1472 if (root == &cgrp_dfl_root) {
1418 pr_err("sane_behavior: remount is not allowed\n"); 1473 pr_err("remount is not allowed\n");
1419 return -EINVAL; 1474 return -EINVAL;
1420 } 1475 }
1421 1476
@@ -1434,11 +1489,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1434 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1489 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1435 1490
1436 /* Don't allow flags or name to change at remount */ 1491 /* Don't allow flags or name to change at remount */
1437 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1492 if ((opts.flags ^ root->flags) ||
1438 (opts.name && strcmp(opts.name, root->name))) { 1493 (opts.name && strcmp(opts.name, root->name))) {
1439 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", 1494 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1440 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", 1495 opts.flags, opts.name ?: "", root->flags, root->name);
1441 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1442 ret = -EINVAL; 1496 ret = -EINVAL;
1443 goto out_unlock; 1497 goto out_unlock;
1444 } 1498 }
@@ -1563,6 +1617,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1563{ 1617{
1564 LIST_HEAD(tmp_links); 1618 LIST_HEAD(tmp_links);
1565 struct cgroup *root_cgrp = &root->cgrp; 1619 struct cgroup *root_cgrp = &root->cgrp;
1620 struct cftype *base_files;
1566 struct css_set *cset; 1621 struct css_set *cset;
1567 int i, ret; 1622 int i, ret;
1568 1623
@@ -1600,7 +1655,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1600 } 1655 }
1601 root_cgrp->kn = root->kf_root->kn; 1656 root_cgrp->kn = root->kf_root->kn;
1602 1657
1603 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); 1658 if (root == &cgrp_dfl_root)
1659 base_files = cgroup_dfl_base_files;
1660 else
1661 base_files = cgroup_legacy_base_files;
1662
1663 ret = cgroup_addrm_files(root_cgrp, base_files, true);
1604 if (ret) 1664 if (ret)
1605 goto destroy_root; 1665 goto destroy_root;
1606 1666
@@ -1638,7 +1698,7 @@ destroy_root:
1638exit_root_id: 1698exit_root_id:
1639 cgroup_exit_root_id(root); 1699 cgroup_exit_root_id(root);
1640cancel_ref: 1700cancel_ref:
1641 percpu_ref_cancel_init(&root_cgrp->self.refcnt); 1701 percpu_ref_exit(&root_cgrp->self.refcnt);
1642out: 1702out:
1643 free_cgrp_cset_links(&tmp_links); 1703 free_cgrp_cset_links(&tmp_links);
1644 return ret; 1704 return ret;
@@ -1672,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1672 goto out_unlock; 1732 goto out_unlock;
1673 1733
1674 /* look for a matching existing root */ 1734 /* look for a matching existing root */
1675 if (!opts.subsys_mask && !opts.none && !opts.name) { 1735 if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
1676 cgrp_dfl_root_visible = true; 1736 cgrp_dfl_root_visible = true;
1677 root = &cgrp_dfl_root; 1737 root = &cgrp_dfl_root;
1678 cgroup_get(&root->cgrp); 1738 cgroup_get(&root->cgrp);
@@ -1730,15 +1790,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1730 goto out_unlock; 1790 goto out_unlock;
1731 } 1791 }
1732 1792
1733 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1793 if (root->flags ^ opts.flags)
1734 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1794 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1735 pr_err("sane_behavior: new mount options should match the existing superblock\n");
1736 ret = -EINVAL;
1737 goto out_unlock;
1738 } else {
1739 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1740 }
1741 }
1742 1795
1743 /* 1796 /*
1744 * We want to reuse @root whose lifetime is governed by its 1797 * We want to reuse @root whose lifetime is governed by its
@@ -2457,9 +2510,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2457 2510
2458static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) 2511static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2459{ 2512{
2460 struct cgroup *cgrp = seq_css(seq)->cgroup; 2513 seq_puts(seq, "0\n");
2461
2462 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2463 return 0; 2514 return 0;
2464} 2515}
2465 2516
@@ -2496,7 +2547,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v)
2496{ 2547{
2497 struct cgroup *cgrp = seq_css(seq)->cgroup; 2548 struct cgroup *cgrp = seq_css(seq)->cgroup;
2498 2549
2499 cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); 2550 cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
2500 return 0; 2551 return 0;
2501} 2552}
2502 2553
@@ -2505,7 +2556,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2505{ 2556{
2506 struct cgroup *cgrp = seq_css(seq)->cgroup; 2557 struct cgroup *cgrp = seq_css(seq)->cgroup;
2507 2558
2508 cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); 2559 cgroup_print_ss_mask(seq, cgrp->subtree_control);
2509 return 0; 2560 return 0;
2510} 2561}
2511 2562
@@ -2611,6 +2662,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2611 loff_t off) 2662 loff_t off)
2612{ 2663{
2613 unsigned int enable = 0, disable = 0; 2664 unsigned int enable = 0, disable = 0;
2665 unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
2614 struct cgroup *cgrp, *child; 2666 struct cgroup *cgrp, *child;
2615 struct cgroup_subsys *ss; 2667 struct cgroup_subsys *ss;
2616 char *tok; 2668 char *tok;
@@ -2650,11 +2702,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2650 2702
2651 for_each_subsys(ss, ssid) { 2703 for_each_subsys(ss, ssid) {
2652 if (enable & (1 << ssid)) { 2704 if (enable & (1 << ssid)) {
2653 if (cgrp->child_subsys_mask & (1 << ssid)) { 2705 if (cgrp->subtree_control & (1 << ssid)) {
2654 enable &= ~(1 << ssid); 2706 enable &= ~(1 << ssid);
2655 continue; 2707 continue;
2656 } 2708 }
2657 2709
2710 /* unavailable or not enabled on the parent? */
2711 if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2712 (cgroup_parent(cgrp) &&
2713 !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
2714 ret = -ENOENT;
2715 goto out_unlock;
2716 }
2717
2718 /*
2719 * @ss is already enabled through dependency and
2720 * we'll just make it visible. Skip draining.
2721 */
2722 if (cgrp->child_subsys_mask & (1 << ssid))
2723 continue;
2724
2658 /* 2725 /*
2659 * Because css offlining is asynchronous, userland 2726 * Because css offlining is asynchronous, userland
2660 * might try to re-enable the same controller while 2727 * might try to re-enable the same controller while
@@ -2677,23 +2744,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2677 2744
2678 return restart_syscall(); 2745 return restart_syscall();
2679 } 2746 }
2680
2681 /* unavailable or not enabled on the parent? */
2682 if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2683 (cgroup_parent(cgrp) &&
2684 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
2685 ret = -ENOENT;
2686 goto out_unlock;
2687 }
2688 } else if (disable & (1 << ssid)) { 2747 } else if (disable & (1 << ssid)) {
2689 if (!(cgrp->child_subsys_mask & (1 << ssid))) { 2748 if (!(cgrp->subtree_control & (1 << ssid))) {
2690 disable &= ~(1 << ssid); 2749 disable &= ~(1 << ssid);
2691 continue; 2750 continue;
2692 } 2751 }
2693 2752
2694 /* a child has it enabled? */ 2753 /* a child has it enabled? */
2695 cgroup_for_each_live_child(child, cgrp) { 2754 cgroup_for_each_live_child(child, cgrp) {
2696 if (child->child_subsys_mask & (1 << ssid)) { 2755 if (child->subtree_control & (1 << ssid)) {
2697 ret = -EBUSY; 2756 ret = -EBUSY;
2698 goto out_unlock; 2757 goto out_unlock;
2699 } 2758 }
@@ -2707,7 +2766,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2707 } 2766 }
2708 2767
2709 /* 2768 /*
2710 * Except for the root, child_subsys_mask must be zero for a cgroup 2769 * Except for the root, subtree_control must be zero for a cgroup
2711 * with tasks so that child cgroups don't compete against tasks. 2770 * with tasks so that child cgroups don't compete against tasks.
2712 */ 2771 */
2713 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { 2772 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
@@ -2716,36 +2775,75 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2716 } 2775 }
2717 2776
2718 /* 2777 /*
2719 * Create csses for enables and update child_subsys_mask. This 2778 * Update subsys masks and calculate what needs to be done. More
2720 * changes cgroup_e_css() results which in turn makes the 2779 * subsystems than specified may need to be enabled or disabled
2721 * subsequent cgroup_update_dfl_csses() associate all tasks in the 2780 * depending on subsystem dependencies.
2722 * subtree to the updated csses. 2781 */
2782 cgrp->subtree_control |= enable;
2783 cgrp->subtree_control &= ~disable;
2784
2785 old_ctrl = cgrp->child_subsys_mask;
2786 cgroup_refresh_child_subsys_mask(cgrp);
2787 new_ctrl = cgrp->child_subsys_mask;
2788
2789 css_enable = ~old_ctrl & new_ctrl;
2790 css_disable = old_ctrl & ~new_ctrl;
2791 enable |= css_enable;
2792 disable |= css_disable;
2793
2794 /*
2795 * Create new csses or make the existing ones visible. A css is
2796 * created invisible if it's being implicitly enabled through
2797 * dependency. An invisible css is made visible when the userland
2798 * explicitly enables it.
2723 */ 2799 */
2724 for_each_subsys(ss, ssid) { 2800 for_each_subsys(ss, ssid) {
2725 if (!(enable & (1 << ssid))) 2801 if (!(enable & (1 << ssid)))
2726 continue; 2802 continue;
2727 2803
2728 cgroup_for_each_live_child(child, cgrp) { 2804 cgroup_for_each_live_child(child, cgrp) {
2729 ret = create_css(child, ss); 2805 if (css_enable & (1 << ssid))
2806 ret = create_css(child, ss,
2807 cgrp->subtree_control & (1 << ssid));
2808 else
2809 ret = cgroup_populate_dir(child, 1 << ssid);
2730 if (ret) 2810 if (ret)
2731 goto err_undo_css; 2811 goto err_undo_css;
2732 } 2812 }
2733 } 2813 }
2734 2814
2735 cgrp->child_subsys_mask |= enable; 2815 /*
2736 cgrp->child_subsys_mask &= ~disable; 2816 * At this point, cgroup_e_css() results reflect the new csses
2737 2817 * making the following cgroup_update_dfl_csses() properly update
2818 * css associations of all tasks in the subtree.
2819 */
2738 ret = cgroup_update_dfl_csses(cgrp); 2820 ret = cgroup_update_dfl_csses(cgrp);
2739 if (ret) 2821 if (ret)
2740 goto err_undo_css; 2822 goto err_undo_css;
2741 2823
2742 /* all tasks are now migrated away from the old csses, kill them */ 2824 /*
2825 * All tasks are migrated out of disabled csses. Kill or hide
2826 * them. A css is hidden when the userland requests it to be
2827 * disabled while other subsystems are still depending on it. The
2828 * css must not actively control resources and be in the vanilla
2829 * state if it's made visible again later. Controllers which may
2830 * be depended upon should provide ->css_reset() for this purpose.
2831 */
2743 for_each_subsys(ss, ssid) { 2832 for_each_subsys(ss, ssid) {
2744 if (!(disable & (1 << ssid))) 2833 if (!(disable & (1 << ssid)))
2745 continue; 2834 continue;
2746 2835
2747 cgroup_for_each_live_child(child, cgrp) 2836 cgroup_for_each_live_child(child, cgrp) {
2748 kill_css(cgroup_css(child, ss)); 2837 struct cgroup_subsys_state *css = cgroup_css(child, ss);
2838
2839 if (css_disable & (1 << ssid)) {
2840 kill_css(css);
2841 } else {
2842 cgroup_clear_dir(child, 1 << ssid);
2843 if (ss->css_reset)
2844 ss->css_reset(css);
2845 }
2846 }
2749 } 2847 }
2750 2848
2751 kernfs_activate(cgrp->kn); 2849 kernfs_activate(cgrp->kn);
@@ -2755,8 +2853,9 @@ out_unlock:
2755 return ret ?: nbytes; 2853 return ret ?: nbytes;
2756 2854
2757err_undo_css: 2855err_undo_css:
2758 cgrp->child_subsys_mask &= ~enable; 2856 cgrp->subtree_control &= ~enable;
2759 cgrp->child_subsys_mask |= disable; 2857 cgrp->subtree_control |= disable;
2858 cgroup_refresh_child_subsys_mask(cgrp);
2760 2859
2761 for_each_subsys(ss, ssid) { 2860 for_each_subsys(ss, ssid) {
2762 if (!(enable & (1 << ssid))) 2861 if (!(enable & (1 << ssid)))
@@ -2764,8 +2863,14 @@ err_undo_css:
2764 2863
2765 cgroup_for_each_live_child(child, cgrp) { 2864 cgroup_for_each_live_child(child, cgrp) {
2766 struct cgroup_subsys_state *css = cgroup_css(child, ss); 2865 struct cgroup_subsys_state *css = cgroup_css(child, ss);
2767 if (css) 2866
2867 if (!css)
2868 continue;
2869
2870 if (css_enable & (1 << ssid))
2768 kill_css(css); 2871 kill_css(css);
2872 else
2873 cgroup_clear_dir(child, 1 << ssid);
2769 } 2874 }
2770 } 2875 }
2771 goto out_unlock; 2876 goto out_unlock;
@@ -2878,9 +2983,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2878 2983
2879 /* 2984 /*
2880 * This isn't a proper migration and its usefulness is very 2985 * This isn't a proper migration and its usefulness is very
2881 * limited. Disallow if sane_behavior. 2986 * limited. Disallow on the default hierarchy.
2882 */ 2987 */
2883 if (cgroup_sane_behavior(cgrp)) 2988 if (cgroup_on_dfl(cgrp))
2884 return -EPERM; 2989 return -EPERM;
2885 2990
2886 /* 2991 /*
@@ -2964,9 +3069,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2964 3069
2965 for (cft = cfts; cft->name[0] != '\0'; cft++) { 3070 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2966 /* does cft->flags tell us to skip this file on @cgrp? */ 3071 /* does cft->flags tell us to skip this file on @cgrp? */
2967 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) 3072 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2968 continue; 3073 continue;
2969 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 3074 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
2970 continue; 3075 continue;
2971 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) 3076 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
2972 continue; 3077 continue;
@@ -3024,6 +3129,9 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
3024 kfree(cft->kf_ops); 3129 kfree(cft->kf_ops);
3025 cft->kf_ops = NULL; 3130 cft->kf_ops = NULL;
3026 cft->ss = NULL; 3131 cft->ss = NULL;
3132
3133 /* revert flags set by cgroup core while adding @cfts */
3134 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3027 } 3135 }
3028} 3136}
3029 3137
@@ -3109,7 +3217,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)
3109 * function currently returns 0 as long as @cfts registration is successful 3217 * function currently returns 0 as long as @cfts registration is successful
3110 * even if some file creation attempts on existing cgroups fail. 3218 * even if some file creation attempts on existing cgroups fail.
3111 */ 3219 */
3112int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 3220static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3113{ 3221{
3114 int ret; 3222 int ret;
3115 3223
@@ -3135,6 +3243,40 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3135} 3243}
3136 3244
3137/** 3245/**
3246 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
3247 * @ss: target cgroup subsystem
3248 * @cfts: zero-length name terminated array of cftypes
3249 *
3250 * Similar to cgroup_add_cftypes() but the added files are only used for
3251 * the default hierarchy.
3252 */
3253int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3254{
3255 struct cftype *cft;
3256
3257 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3258 cft->flags |= __CFTYPE_ONLY_ON_DFL;
3259 return cgroup_add_cftypes(ss, cfts);
3260}
3261
3262/**
3263 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
3264 * @ss: target cgroup subsystem
3265 * @cfts: zero-length name terminated array of cftypes
3266 *
3267 * Similar to cgroup_add_cftypes() but the added files are only used for
3268 * the legacy hierarchies.
3269 */
3270int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3271{
3272 struct cftype *cft;
3273
3274 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3275 cft->flags |= __CFTYPE_NOT_ON_DFL;
3276 return cgroup_add_cftypes(ss, cfts);
3277}
3278
3279/**
3138 * cgroup_task_count - count the number of tasks in a cgroup. 3280 * cgroup_task_count - count the number of tasks in a cgroup.
3139 * @cgrp: the cgroup in question 3281 * @cgrp: the cgroup in question
3140 * 3282 *
@@ -3699,8 +3841,9 @@ after:
3699 * 3841 *
3700 * All this extra complexity was caused by the original implementation 3842 * All this extra complexity was caused by the original implementation
3701 * committing to an entirely unnecessary property. In the long term, we 3843 * committing to an entirely unnecessary property. In the long term, we
3702 * want to do away with it. Explicitly scramble sort order if 3844 * want to do away with it. Explicitly scramble sort order if on the
3703 * sane_behavior so that no such expectation exists in the new interface. 3845 * default hierarchy so that no such expectation exists in the new
3846 * interface.
3704 * 3847 *
3705 * Scrambling is done by swapping every two consecutive bits, which is 3848 * Scrambling is done by swapping every two consecutive bits, which is
3706 * non-identity one-to-one mapping which disturbs sort order sufficiently. 3849 * non-identity one-to-one mapping which disturbs sort order sufficiently.
@@ -3715,7 +3858,7 @@ static pid_t pid_fry(pid_t pid)
3715 3858
3716static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) 3859static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3717{ 3860{
3718 if (cgroup_sane_behavior(cgrp)) 3861 if (cgroup_on_dfl(cgrp))
3719 return pid_fry(pid); 3862 return pid_fry(pid);
3720 else 3863 else
3721 return pid; 3864 return pid;
@@ -3818,7 +3961,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3818 css_task_iter_end(&it); 3961 css_task_iter_end(&it);
3819 length = n; 3962 length = n;
3820 /* now sort & (if procs) strip out duplicates */ 3963 /* now sort & (if procs) strip out duplicates */
3821 if (cgroup_sane_behavior(cgrp)) 3964 if (cgroup_on_dfl(cgrp))
3822 sort(array, length, sizeof(pid_t), fried_cmppid, NULL); 3965 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3823 else 3966 else
3824 sort(array, length, sizeof(pid_t), cmppid, NULL); 3967 sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -4040,7 +4183,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4040 return 0; 4183 return 0;
4041} 4184}
4042 4185
4043static struct cftype cgroup_base_files[] = { 4186/* cgroup core interface files for the default hierarchy */
4187static struct cftype cgroup_dfl_base_files[] = {
4044 { 4188 {
4045 .name = "cgroup.procs", 4189 .name = "cgroup.procs",
4046 .seq_start = cgroup_pidlist_start, 4190 .seq_start = cgroup_pidlist_start,
@@ -4052,46 +4196,52 @@ static struct cftype cgroup_base_files[] = {
4052 .mode = S_IRUGO | S_IWUSR, 4196 .mode = S_IRUGO | S_IWUSR,
4053 }, 4197 },
4054 { 4198 {
4055 .name = "cgroup.clone_children",
4056 .flags = CFTYPE_INSANE,
4057 .read_u64 = cgroup_clone_children_read,
4058 .write_u64 = cgroup_clone_children_write,
4059 },
4060 {
4061 .name = "cgroup.sane_behavior",
4062 .flags = CFTYPE_ONLY_ON_ROOT,
4063 .seq_show = cgroup_sane_behavior_show,
4064 },
4065 {
4066 .name = "cgroup.controllers", 4199 .name = "cgroup.controllers",
4067 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, 4200 .flags = CFTYPE_ONLY_ON_ROOT,
4068 .seq_show = cgroup_root_controllers_show, 4201 .seq_show = cgroup_root_controllers_show,
4069 }, 4202 },
4070 { 4203 {
4071 .name = "cgroup.controllers", 4204 .name = "cgroup.controllers",
4072 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, 4205 .flags = CFTYPE_NOT_ON_ROOT,
4073 .seq_show = cgroup_controllers_show, 4206 .seq_show = cgroup_controllers_show,
4074 }, 4207 },
4075 { 4208 {
4076 .name = "cgroup.subtree_control", 4209 .name = "cgroup.subtree_control",
4077 .flags = CFTYPE_ONLY_ON_DFL,
4078 .seq_show = cgroup_subtree_control_show, 4210 .seq_show = cgroup_subtree_control_show,
4079 .write = cgroup_subtree_control_write, 4211 .write = cgroup_subtree_control_write,
4080 }, 4212 },
4081 { 4213 {
4082 .name = "cgroup.populated", 4214 .name = "cgroup.populated",
4083 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, 4215 .flags = CFTYPE_NOT_ON_ROOT,
4084 .seq_show = cgroup_populated_show, 4216 .seq_show = cgroup_populated_show,
4085 }, 4217 },
4218 { } /* terminate */
4219};
4086 4220
4087 /* 4221/* cgroup core interface files for the legacy hierarchies */
4088 * Historical crazy stuff. These don't have "cgroup." prefix and 4222static struct cftype cgroup_legacy_base_files[] = {
4089 * don't exist if sane_behavior. If you're depending on these, be 4223 {
4090 * prepared to be burned. 4224 .name = "cgroup.procs",
4091 */ 4225 .seq_start = cgroup_pidlist_start,
4226 .seq_next = cgroup_pidlist_next,
4227 .seq_stop = cgroup_pidlist_stop,
4228 .seq_show = cgroup_pidlist_show,
4229 .private = CGROUP_FILE_PROCS,
4230 .write = cgroup_procs_write,
4231 .mode = S_IRUGO | S_IWUSR,
4232 },
4233 {
4234 .name = "cgroup.clone_children",
4235 .read_u64 = cgroup_clone_children_read,
4236 .write_u64 = cgroup_clone_children_write,
4237 },
4238 {
4239 .name = "cgroup.sane_behavior",
4240 .flags = CFTYPE_ONLY_ON_ROOT,
4241 .seq_show = cgroup_sane_behavior_show,
4242 },
4092 { 4243 {
4093 .name = "tasks", 4244 .name = "tasks",
4094 .flags = CFTYPE_INSANE, /* use "procs" instead */
4095 .seq_start = cgroup_pidlist_start, 4245 .seq_start = cgroup_pidlist_start,
4096 .seq_next = cgroup_pidlist_next, 4246 .seq_next = cgroup_pidlist_next,
4097 .seq_stop = cgroup_pidlist_stop, 4247 .seq_stop = cgroup_pidlist_stop,
@@ -4102,13 +4252,12 @@ static struct cftype cgroup_base_files[] = {
4102 }, 4252 },
4103 { 4253 {
4104 .name = "notify_on_release", 4254 .name = "notify_on_release",
4105 .flags = CFTYPE_INSANE,
4106 .read_u64 = cgroup_read_notify_on_release, 4255 .read_u64 = cgroup_read_notify_on_release,
4107 .write_u64 = cgroup_write_notify_on_release, 4256 .write_u64 = cgroup_write_notify_on_release,
4108 }, 4257 },
4109 { 4258 {
4110 .name = "release_agent", 4259 .name = "release_agent",
4111 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 4260 .flags = CFTYPE_ONLY_ON_ROOT,
4112 .seq_show = cgroup_release_agent_show, 4261 .seq_show = cgroup_release_agent_show,
4113 .write = cgroup_release_agent_write, 4262 .write = cgroup_release_agent_write,
4114 .max_write_len = PATH_MAX - 1, 4263 .max_write_len = PATH_MAX - 1,
@@ -4175,6 +4324,8 @@ static void css_free_work_fn(struct work_struct *work)
4175 container_of(work, struct cgroup_subsys_state, destroy_work); 4324 container_of(work, struct cgroup_subsys_state, destroy_work);
4176 struct cgroup *cgrp = css->cgroup; 4325 struct cgroup *cgrp = css->cgroup;
4177 4326
4327 percpu_ref_exit(&css->refcnt);
4328
4178 if (css->ss) { 4329 if (css->ss) {
4179 /* css free path */ 4330 /* css free path */
4180 if (css->parent) 4331 if (css->parent)
@@ -4314,12 +4465,14 @@ static void offline_css(struct cgroup_subsys_state *css)
4314 * create_css - create a cgroup_subsys_state 4465 * create_css - create a cgroup_subsys_state
4315 * @cgrp: the cgroup new css will be associated with 4466 * @cgrp: the cgroup new css will be associated with
4316 * @ss: the subsys of new css 4467 * @ss: the subsys of new css
4468 * @visible: whether to create control knobs for the new css or not
4317 * 4469 *
4318 * Create a new css associated with @cgrp - @ss pair. On success, the new 4470 * Create a new css associated with @cgrp - @ss pair. On success, the new
4319 * css is online and installed in @cgrp with all interface files created. 4471 * css is online and installed in @cgrp with all interface files created if
4320 * Returns 0 on success, -errno on failure. 4472 * @visible. Returns 0 on success, -errno on failure.
4321 */ 4473 */
4322static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 4474static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4475 bool visible)
4323{ 4476{
4324 struct cgroup *parent = cgroup_parent(cgrp); 4477 struct cgroup *parent = cgroup_parent(cgrp);
4325 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); 4478 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
@@ -4343,9 +4496,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4343 goto err_free_percpu_ref; 4496 goto err_free_percpu_ref;
4344 css->id = err; 4497 css->id = err;
4345 4498
4346 err = cgroup_populate_dir(cgrp, 1 << ss->id); 4499 if (visible) {
4347 if (err) 4500 err = cgroup_populate_dir(cgrp, 1 << ss->id);
4348 goto err_free_id; 4501 if (err)
4502 goto err_free_id;
4503 }
4349 4504
4350 /* @css is ready to be brought online now, make it visible */ 4505 /* @css is ready to be brought online now, make it visible */
4351 list_add_tail_rcu(&css->sibling, &parent_css->children); 4506 list_add_tail_rcu(&css->sibling, &parent_css->children);
@@ -4372,7 +4527,7 @@ err_list_del:
4372err_free_id: 4527err_free_id:
4373 cgroup_idr_remove(&ss->css_idr, css->id); 4528 cgroup_idr_remove(&ss->css_idr, css->id);
4374err_free_percpu_ref: 4529err_free_percpu_ref:
4375 percpu_ref_cancel_init(&css->refcnt); 4530 percpu_ref_exit(&css->refcnt);
4376err_free_css: 4531err_free_css:
4377 call_rcu(&css->rcu_head, css_free_rcu_fn); 4532 call_rcu(&css->rcu_head, css_free_rcu_fn);
4378 return err; 4533 return err;
@@ -4385,6 +4540,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4385 struct cgroup_root *root; 4540 struct cgroup_root *root;
4386 struct cgroup_subsys *ss; 4541 struct cgroup_subsys *ss;
4387 struct kernfs_node *kn; 4542 struct kernfs_node *kn;
4543 struct cftype *base_files;
4388 int ssid, ret; 4544 int ssid, ret;
4389 4545
4390 parent = cgroup_kn_lock_live(parent_kn); 4546 parent = cgroup_kn_lock_live(parent_kn);
@@ -4455,14 +4611,20 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4455 if (ret) 4611 if (ret)
4456 goto out_destroy; 4612 goto out_destroy;
4457 4613
4458 ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4614 if (cgroup_on_dfl(cgrp))
4615 base_files = cgroup_dfl_base_files;
4616 else
4617 base_files = cgroup_legacy_base_files;
4618
4619 ret = cgroup_addrm_files(cgrp, base_files, true);
4459 if (ret) 4620 if (ret)
4460 goto out_destroy; 4621 goto out_destroy;
4461 4622
4462 /* let's create and online css's */ 4623 /* let's create and online css's */
4463 for_each_subsys(ss, ssid) { 4624 for_each_subsys(ss, ssid) {
4464 if (parent->child_subsys_mask & (1 << ssid)) { 4625 if (parent->child_subsys_mask & (1 << ssid)) {
4465 ret = create_css(cgrp, ss); 4626 ret = create_css(cgrp, ss,
4627 parent->subtree_control & (1 << ssid));
4466 if (ret) 4628 if (ret)
4467 goto out_destroy; 4629 goto out_destroy;
4468 } 4630 }
@@ -4470,10 +4632,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4470 4632
4471 /* 4633 /*
4472 * On the default hierarchy, a child doesn't automatically inherit 4634 * On the default hierarchy, a child doesn't automatically inherit
4473 * child_subsys_mask from the parent. Each is configured manually. 4635 * subtree_control from the parent. Each is configured manually.
4474 */ 4636 */
4475 if (!cgroup_on_dfl(cgrp)) 4637 if (!cgroup_on_dfl(cgrp)) {
4476 cgrp->child_subsys_mask = parent->child_subsys_mask; 4638 cgrp->subtree_control = parent->subtree_control;
4639 cgroup_refresh_child_subsys_mask(cgrp);
4640 }
4477 4641
4478 kernfs_activate(kn); 4642 kernfs_activate(kn);
4479 4643
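
Because create_css() now populates a controller's interface files only when the bit is set in the parent's subtree_control, enabling a controller is an explicit step on the default hierarchy. A hedged user-space sketch of that step (the mount point, the "example" child name and the "+memory" token are assumptions for illustration):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/sys/fs/cgroup/cgroup.subtree_control", O_WRONLY);

        if (fd < 0) {
            perror("open cgroup.subtree_control");
            return 1;
        }
        /* Enable the memory controller for children of this cgroup. */
        if (write(fd, "+memory", strlen("+memory")) < 0)
            perror("enable memory controller");
        close(fd);

        /* Children created after the write get memory.* interface files. */
        if (mkdir("/sys/fs/cgroup/example", 0755) && errno != EEXIST)
            perror("mkdir child cgroup");
        return 0;
    }
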
@@ -4483,7 +4647,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4483out_free_id: 4647out_free_id:
4484 cgroup_idr_remove(&root->cgroup_idr, cgrp->id); 4648 cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
4485out_cancel_ref: 4649out_cancel_ref:
4486 percpu_ref_cancel_init(&cgrp->self.refcnt); 4650 percpu_ref_exit(&cgrp->self.refcnt);
4487out_free_cgrp: 4651out_free_cgrp:
4488 kfree(cgrp); 4652 kfree(cgrp);
4489out_unlock: 4653out_unlock:
@@ -4736,8 +4900,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4736 */ 4900 */
4737int __init cgroup_init_early(void) 4901int __init cgroup_init_early(void)
4738{ 4902{
4739 static struct cgroup_sb_opts __initdata opts = 4903 static struct cgroup_sb_opts __initdata opts;
4740 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4741 struct cgroup_subsys *ss; 4904 struct cgroup_subsys *ss;
4742 int i; 4905 int i;
4743 4906
@@ -4775,7 +4938,8 @@ int __init cgroup_init(void)
4775 unsigned long key; 4938 unsigned long key;
4776 int ssid, err; 4939 int ssid, err;
4777 4940
4778 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4941 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
4942 BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
4779 4943
4780 mutex_lock(&cgroup_mutex); 4944 mutex_lock(&cgroup_mutex);
4781 4945
@@ -4807,9 +4971,22 @@ int __init cgroup_init(void)
4807 * disabled flag and cftype registration needs kmalloc, 4971 * disabled flag and cftype registration needs kmalloc,
4808 * both of which aren't available during early_init. 4972 * both of which aren't available during early_init.
4809 */ 4973 */
4810 if (!ss->disabled) { 4974 if (ss->disabled)
4811 cgrp_dfl_root.subsys_mask |= 1 << ss->id; 4975 continue;
4812 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); 4976
4977 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4978
4979 if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
4980 ss->dfl_cftypes = ss->legacy_cftypes;
4981
4982 if (!ss->dfl_cftypes)
4983 cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
4984
4985 if (ss->dfl_cftypes == ss->legacy_cftypes) {
4986 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
4987 } else {
4988 WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
4989 WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
4813 } 4990 }
4814 } 4991 }
4815 4992
@@ -5205,6 +5382,14 @@ static int __init cgroup_disable(char *str)
5205} 5382}
5206__setup("cgroup_disable=", cgroup_disable); 5383__setup("cgroup_disable=", cgroup_disable);
5207 5384
5385static int __init cgroup_set_legacy_files_on_dfl(char *str)
5386{
5387 printk("cgroup: using legacy files on the default hierarchy\n");
5388 cgroup_legacy_files_on_dfl = true;
5389 return 0;
5390}
5391__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
5392
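
The controllers registered at init, minus anything turned off with cgroup_disable=, are what user space sees as available. A small sketch that lists them from /proc/cgroups, a long-standing legacy interface (the assumed column layout is "name hierarchy num_cgroups enabled"):

    #include <stdio.h>

    int main(void)
    {
        char header[256], name[64];
        int hier, num, enabled;
        FILE *f = fopen("/proc/cgroups", "r");

        if (!f) {
            perror("/proc/cgroups");
            return 1;
        }
        if (!fgets(header, sizeof(header), f)) {    /* skip the header line */
            fclose(f);
            return 1;
        }
        while (fscanf(f, "%63s %d %d %d", name, &hier, &num, &enabled) == 4)
            printf("%-16s %s\n", name, enabled ? "enabled" : "disabled");
        fclose(f);
        return 0;
    }
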
5208/** 5393/**
5209 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry 5394 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
5210 * @dentry: directory dentry of interest 5395 * @dentry: directory dentry of interest
@@ -5399,6 +5584,6 @@ static struct cftype debug_files[] = {
5399struct cgroup_subsys debug_cgrp_subsys = { 5584struct cgroup_subsys debug_cgrp_subsys = {
5400 .css_alloc = debug_css_alloc, 5585 .css_alloc = debug_css_alloc,
5401 .css_free = debug_css_free, 5586 .css_free = debug_css_free,
5402 .base_cftypes = debug_files, 5587 .legacy_cftypes = debug_files,
5403}; 5588};
5404#endif /* CONFIG_CGROUP_DEBUG */ 5589#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index a79e40f9d700..92b98cc0ee76 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -480,5 +480,5 @@ struct cgroup_subsys freezer_cgrp_subsys = {
480 .css_free = freezer_css_free, 480 .css_free = freezer_css_free,
481 .attach = freezer_attach, 481 .attach = freezer_attach,
482 .fork = freezer_fork, 482 .fork = freezer_fork,
483 .base_cftypes = files, 483 .legacy_cftypes = files,
484}; 484};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a343bde710b1..81e2a388a0f6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -274,21 +274,28 @@ void clear_tasks_mm_cpumask(int cpu)
274 rcu_read_unlock(); 274 rcu_read_unlock();
275} 275}
276 276
277static inline void check_for_tasks(int cpu) 277static inline void check_for_tasks(int dead_cpu)
278{ 278{
279 struct task_struct *p; 279 struct task_struct *g, *p;
280 cputime_t utime, stime;
281 280
282 write_lock_irq(&tasklist_lock); 281 read_lock_irq(&tasklist_lock);
283 for_each_process(p) { 282 do_each_thread(g, p) {
284 task_cputime(p, &utime, &stime); 283 if (!p->on_rq)
285 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 284 continue;
286 (utime || stime)) 285 /*
287 pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", 286 * We do the check with unlocked task_rq(p)->lock.
288 p->comm, task_pid_nr(p), cpu, 287 * Order the reads so we do not warn about a task
289 p->state, p->flags); 288 * that was running on this cpu in the past but
290 } 289 * has just been woken on another cpu.
291 write_unlock_irq(&tasklist_lock); 290 */
291 rmb();
292 if (task_cpu(p) != dead_cpu)
293 continue;
294
295 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
296 p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
297 } while_each_thread(g, p);
298 read_unlock_irq(&tasklist_lock);
292} 299}
293 300
294struct take_cpu_down_param { 301struct take_cpu_down_param {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 116a4164720a..22874d7cf2c0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -76,8 +76,34 @@ struct cpuset {
76 struct cgroup_subsys_state css; 76 struct cgroup_subsys_state css;
77 77
78 unsigned long flags; /* "unsigned long" so bitops work */ 78 unsigned long flags; /* "unsigned long" so bitops work */
79 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 79
80 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 80 /*
81 * On default hierarchy:
82 *
83 * The user-configured masks can only be changed by writing to
84 * cpuset.cpus and cpuset.mems, and won't be limited by the
85 * parent masks.
86 *
87 * The effective masks are the real masks that apply to the tasks
88 * in the cpuset. They may be changed if the configured masks are
89 * changed or hotplug happens.
90 *
91 * effective_mask == configured_mask & parent's effective_mask,
92 * and if it ends up empty, it will inherit the parent's mask.
93 *
94 *
95 * On legacy hierarchy:
96 *
97 * The user-configured masks are always the same as the effective masks.
98 */
99
100 /* user-configured CPUs and Memory Nodes allowed to tasks */
101 cpumask_var_t cpus_allowed;
102 nodemask_t mems_allowed;
103
104 /* effective CPUs and Memory Nodes allowed to tasks */
105 cpumask_var_t effective_cpus;
106 nodemask_t effective_mems;
81 107
82 /* 108 /*
83 * This is old Memory Nodes tasks took on. 109 * This is old Memory Nodes tasks took on.
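
The comment block above fixes the semantics: effective = configured & parent's effective, falling back to the parent's mask when the intersection is empty. A toy model of that rule with a 64-bit word standing in for a cpumask (the helper name and the example masks are made up for illustration):

    #include <inttypes.h>
    #include <stdio.h>

    /* effective = configured & parent_effective, or the parent's mask if empty */
    static uint64_t effective_mask(uint64_t configured, uint64_t parent_effective)
    {
        uint64_t eff = configured & parent_effective;

        return eff ? eff : parent_effective;
    }

    int main(void)
    {
        uint64_t parent = 0x0f;    /* parent's effective mask: CPUs 0-3 */

        /* no overlap with the parent: the child inherits 0x0f */
        printf("0x%" PRIx64 "\n", effective_mask(0xf0, parent));
        /* overlap: the child's effective mask is the intersection, 0x03 */
        printf("0x%" PRIx64 "\n", effective_mask(0x03, parent));
        return 0;
    }
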
@@ -307,9 +333,9 @@ static struct file_system_type cpuset_fs_type = {
307 */ 333 */
308static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) 334static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
309{ 335{
310 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 336 while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
311 cs = parent_cs(cs); 337 cs = parent_cs(cs);
312 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); 338 cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
313} 339}
314 340
315/* 341/*
@@ -325,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
325 */ 351 */
326static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) 352static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
327{ 353{
328 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) 354 while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
329 cs = parent_cs(cs); 355 cs = parent_cs(cs);
330 nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); 356 nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
331} 357}
332 358
333/* 359/*
@@ -376,13 +402,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
376 if (!trial) 402 if (!trial)
377 return NULL; 403 return NULL;
378 404
379 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { 405 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
380 kfree(trial); 406 goto free_cs;
381 return NULL; 407 if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
382 } 408 goto free_cpus;
383 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
384 409
410 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
411 cpumask_copy(trial->effective_cpus, cs->effective_cpus);
385 return trial; 412 return trial;
413
414free_cpus:
415 free_cpumask_var(trial->cpus_allowed);
416free_cs:
417 kfree(trial);
418 return NULL;
386} 419}
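
The reworked error path above is the usual unwind-on-failure idiom: one label per successful allocation, in reverse order, so a failure midway frees exactly what already succeeded. The same shape in a self-contained sketch (the struct and names are invented for the example):

    #include <stdlib.h>

    struct pair {
        int *a;
        int *b;
    };

    static struct pair *pair_alloc(void)
    {
        struct pair *p = malloc(sizeof(*p));

        if (!p)
            return NULL;
        p->a = malloc(sizeof(*p->a));
        if (!p->a)
            goto free_p;
        p->b = malloc(sizeof(*p->b));
        if (!p->b)
            goto free_a;
        return p;

    free_a:
        free(p->a);
    free_p:
        free(p);
        return NULL;
    }

    int main(void)
    {
        struct pair *p = pair_alloc();

        if (p) {
            free(p->b);
            free(p->a);
            free(p);
        }
        return 0;
    }
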
387 420
388/** 421/**
@@ -391,6 +424,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
391 */ 424 */
392static void free_trial_cpuset(struct cpuset *trial) 425static void free_trial_cpuset(struct cpuset *trial)
393{ 426{
427 free_cpumask_var(trial->effective_cpus);
394 free_cpumask_var(trial->cpus_allowed); 428 free_cpumask_var(trial->cpus_allowed);
395 kfree(trial); 429 kfree(trial);
396} 430}
@@ -436,9 +470,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
436 470
437 par = parent_cs(cur); 471 par = parent_cs(cur);
438 472
439 /* We must be a subset of our parent cpuset */ 473 /* On legacy hierarchy, we must be a subset of our parent cpuset. */
440 ret = -EACCES; 474 ret = -EACCES;
441 if (!is_cpuset_subset(trial, par)) 475 if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
442 goto out; 476 goto out;
443 477
444 /* 478 /*
@@ -480,11 +514,11 @@ out:
480#ifdef CONFIG_SMP 514#ifdef CONFIG_SMP
481/* 515/*
482 * Helper routine for generate_sched_domains(). 516 * Helper routine for generate_sched_domains().
483 * Do cpusets a, b have overlapping cpus_allowed masks? 517 * Do cpusets a, b have overlapping effective cpus_allowed masks?
484 */ 518 */
485static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 519static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
486{ 520{
487 return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); 521 return cpumask_intersects(a->effective_cpus, b->effective_cpus);
488} 522}
489 523
490static void 524static void
@@ -601,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
601 *dattr = SD_ATTR_INIT; 635 *dattr = SD_ATTR_INIT;
602 update_domain_attr_tree(dattr, &top_cpuset); 636 update_domain_attr_tree(dattr, &top_cpuset);
603 } 637 }
604 cpumask_copy(doms[0], top_cpuset.cpus_allowed); 638 cpumask_copy(doms[0], top_cpuset.effective_cpus);
605 639
606 goto done; 640 goto done;
607 } 641 }
@@ -705,7 +739,7 @@ restart:
705 struct cpuset *b = csa[j]; 739 struct cpuset *b = csa[j];
706 740
707 if (apn == b->pn) { 741 if (apn == b->pn) {
708 cpumask_or(dp, dp, b->cpus_allowed); 742 cpumask_or(dp, dp, b->effective_cpus);
709 if (dattr) 743 if (dattr)
710 update_domain_attr_tree(dattr + nslot, b); 744 update_domain_attr_tree(dattr + nslot, b);
711 745
@@ -757,7 +791,7 @@ static void rebuild_sched_domains_locked(void)
757 * passing doms with offlined cpu to partition_sched_domains(). 791 * passing doms with offlined cpu to partition_sched_domains().
758 * Anyways, hotplug work item will rebuild sched domains. 792 * Anyways, hotplug work item will rebuild sched domains.
759 */ 793 */
760 if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) 794 if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
761 goto out; 795 goto out;
762 796
763 /* Generate domain masks and attrs */ 797 /* Generate domain masks and attrs */
@@ -781,45 +815,6 @@ void rebuild_sched_domains(void)
781 mutex_unlock(&cpuset_mutex); 815 mutex_unlock(&cpuset_mutex);
782} 816}
783 817
784/*
785 * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
786 * @cs: the cpuset in interest
787 *
788 * A cpuset's effective cpumask is the cpumask of the nearest ancestor
789 * with non-empty cpus. We use effective cpumask whenever:
790 * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
791 * if the cpuset they reside in has no cpus)
792 * - we want to retrieve task_cs(tsk)'s cpus_allowed.
793 *
794 * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
795 * exception. See comments there.
796 */
797static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
798{
799 while (cpumask_empty(cs->cpus_allowed))
800 cs = parent_cs(cs);
801 return cs;
802}
803
804/*
805 * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
806 * @cs: the cpuset in interest
807 *
808 * A cpuset's effective nodemask is the nodemask of the nearest ancestor
809 * with non-empty memss. We use effective nodemask whenever:
810 * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
811 * if the cpuset they reside in has no mems)
812 * - we want to retrieve task_cs(tsk)'s mems_allowed.
813 *
814 * Called with cpuset_mutex held.
815 */
816static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
817{
818 while (nodes_empty(cs->mems_allowed))
819 cs = parent_cs(cs);
820 return cs;
821}
822
823/** 818/**
824 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 819 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
825 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 820 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -830,53 +825,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
830 */ 825 */
831static void update_tasks_cpumask(struct cpuset *cs) 826static void update_tasks_cpumask(struct cpuset *cs)
832{ 827{
833 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
834 struct css_task_iter it; 828 struct css_task_iter it;
835 struct task_struct *task; 829 struct task_struct *task;
836 830
837 css_task_iter_start(&cs->css, &it); 831 css_task_iter_start(&cs->css, &it);
838 while ((task = css_task_iter_next(&it))) 832 while ((task = css_task_iter_next(&it)))
839 set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); 833 set_cpus_allowed_ptr(task, cs->effective_cpus);
840 css_task_iter_end(&it); 834 css_task_iter_end(&it);
841} 835}
842 836
843/* 837/*
844 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 838 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
845 * @root_cs: the root cpuset of the hierarchy 839 * @cs: the cpuset to consider
846 * @update_root: update root cpuset or not? 840 * @new_cpus: temp variable for calculating new effective_cpus
841 *
842 * When the configured cpumask is changed, the effective cpumasks of this cpuset
843 * and all its descendants need to be updated.
847 * 844 *
848 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 845 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
849 * which take on cpumask of @root_cs.
850 * 846 *
851 * Called with cpuset_mutex held 847 * Called with cpuset_mutex held
852 */ 848 */
853static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) 849static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
854{ 850{
855 struct cpuset *cp; 851 struct cpuset *cp;
856 struct cgroup_subsys_state *pos_css; 852 struct cgroup_subsys_state *pos_css;
853 bool need_rebuild_sched_domains = false;
857 854
858 rcu_read_lock(); 855 rcu_read_lock();
859 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { 856 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
860 if (cp == root_cs) { 857 struct cpuset *parent = parent_cs(cp);
861 if (!update_root) 858
862 continue; 859 cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
863 } else { 860
864 /* skip the whole subtree if @cp have some CPU */ 861 /*
865 if (!cpumask_empty(cp->cpus_allowed)) { 862 * If it becomes empty, inherit the effective mask of the
866 pos_css = css_rightmost_descendant(pos_css); 863 * parent, which is guaranteed to have some CPUs.
867 continue; 864 */
868 } 865 if (cpumask_empty(new_cpus))
866 cpumask_copy(new_cpus, parent->effective_cpus);
867
868 /* Skip the whole subtree if the cpumask remains the same. */
869 if (cpumask_equal(new_cpus, cp->effective_cpus)) {
870 pos_css = css_rightmost_descendant(pos_css);
871 continue;
869 } 872 }
873
870 if (!css_tryget_online(&cp->css)) 874 if (!css_tryget_online(&cp->css))
871 continue; 875 continue;
872 rcu_read_unlock(); 876 rcu_read_unlock();
873 877
878 mutex_lock(&callback_mutex);
879 cpumask_copy(cp->effective_cpus, new_cpus);
880 mutex_unlock(&callback_mutex);
881
882 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
883 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
884
874 update_tasks_cpumask(cp); 885 update_tasks_cpumask(cp);
875 886
887 /*
888 * If the effective cpumask of any non-empty cpuset is changed,
889 * we need to rebuild sched domains.
890 */
891 if (!cpumask_empty(cp->cpus_allowed) &&
892 is_sched_load_balance(cp))
893 need_rebuild_sched_domains = true;
894
876 rcu_read_lock(); 895 rcu_read_lock();
877 css_put(&cp->css); 896 css_put(&cp->css);
878 } 897 }
879 rcu_read_unlock(); 898 rcu_read_unlock();
899
900 if (need_rebuild_sched_domains)
901 rebuild_sched_domains_locked();
880} 902}
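
update_cpumasks_hier() is a pre-order walk that recomputes each descendant's effective mask from its parent's and prunes a subtree as soon as the result is unchanged. A toy model of that walk over a made-up four-node tree (array-based parent links and 64-bit masks are simplifications, not the kernel's data structures):

    #include <stdint.h>
    #include <stdio.h>

    #define NCS 4

    struct cs {
        uint64_t configured;
        uint64_t effective;
    };

    static const int parent[NCS] = { 0, 0, 1, 1 };    /* index 0 is the root */
    static struct cs nodes[NCS];

    static void propagate(int i)
    {
        uint64_t eff = nodes[i].configured & nodes[parent[i]].effective;
        int j;

        if (!eff)                       /* empty: inherit the parent's mask */
            eff = nodes[parent[i]].effective;
        if (eff == nodes[i].effective)
            return;                     /* unchanged: prune the subtree */
        nodes[i].effective = eff;
        for (j = 1; j < NCS; j++)
            if (parent[j] == i)
                propagate(j);
    }

    int main(void)
    {
        int j;

        nodes[0].configured = nodes[0].effective = 0x0f;    /* root: CPUs 0-3 */
        nodes[1].configured = 0x0c;
        nodes[2].configured = 0x08;
        nodes[3].configured = 0x30;    /* no overlap: falls back to the parent */

        for (j = 1; j < NCS; j++)
            if (parent[j] == 0)
                propagate(j);
        for (j = 0; j < NCS; j++)
            printf("cpuset %d: effective 0x%02x\n", j,
                   (unsigned)nodes[j].effective);
        return 0;
    }
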
881 903
882/** 904/**
@@ -889,7 +911,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
889 const char *buf) 911 const char *buf)
890{ 912{
891 int retval; 913 int retval;
892 int is_load_balanced;
893 914
894 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ 915 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
895 if (cs == &top_cpuset) 916 if (cs == &top_cpuset)
@@ -908,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
908 if (retval < 0) 929 if (retval < 0)
909 return retval; 930 return retval;
910 931
911 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) 932 if (!cpumask_subset(trialcs->cpus_allowed,
933 top_cpuset.cpus_allowed))
912 return -EINVAL; 934 return -EINVAL;
913 } 935 }
914 936
@@ -920,16 +942,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 if (retval < 0) 942 if (retval < 0)
921 return retval; 943 return retval;
922 944
923 is_load_balanced = is_sched_load_balance(trialcs);
924
925 mutex_lock(&callback_mutex); 945 mutex_lock(&callback_mutex);
926 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 946 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
927 mutex_unlock(&callback_mutex); 947 mutex_unlock(&callback_mutex);
928 948
929 update_tasks_cpumask_hier(cs, true); 949 /* use trialcs->cpus_allowed as a temp variable */
930 950 update_cpumasks_hier(cs, trialcs->cpus_allowed);
931 if (is_load_balanced)
932 rebuild_sched_domains_locked();
933 return 0; 951 return 0;
934} 952}
935 953
@@ -951,15 +969,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
951 const nodemask_t *to) 969 const nodemask_t *to)
952{ 970{
953 struct task_struct *tsk = current; 971 struct task_struct *tsk = current;
954 struct cpuset *mems_cs;
955 972
956 tsk->mems_allowed = *to; 973 tsk->mems_allowed = *to;
957 974
958 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 975 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
959 976
960 rcu_read_lock(); 977 rcu_read_lock();
961 mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 978 guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
962 guarantee_online_mems(mems_cs, &tsk->mems_allowed);
963 rcu_read_unlock(); 979 rcu_read_unlock();
964} 980}
965 981
@@ -1028,13 +1044,12 @@ static void *cpuset_being_rebound;
1028static void update_tasks_nodemask(struct cpuset *cs) 1044static void update_tasks_nodemask(struct cpuset *cs)
1029{ 1045{
1030 static nodemask_t newmems; /* protected by cpuset_mutex */ 1046 static nodemask_t newmems; /* protected by cpuset_mutex */
1031 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1032 struct css_task_iter it; 1047 struct css_task_iter it;
1033 struct task_struct *task; 1048 struct task_struct *task;
1034 1049
1035 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1050 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1036 1051
1037 guarantee_online_mems(mems_cs, &newmems); 1052 guarantee_online_mems(cs, &newmems);
1038 1053
1039 /* 1054 /*
1040 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1055 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1077,36 +1092,52 @@ static void update_tasks_nodemask(struct cpuset *cs)
1077} 1092}
1078 1093
1079/* 1094/*
1080 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1095 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
1081 * @cs: the root cpuset of the hierarchy 1096 * @cs: the cpuset to consider
1082 * @update_root: update the root cpuset or not? 1097 * @new_mems: a temp variable for calculating new effective_mems
1083 * 1098 *
1084 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1099 * When configured nodemask is changed, the effective nodemasks of this cpuset
1085 * which take on nodemask of @root_cs. 1100 * and all its descendants need to be updated.
1101 *
1102 * On legacy hiearchy, effective_mems will be the same with mems_allowed.
1086 * 1103 *
1087 * Called with cpuset_mutex held 1104 * Called with cpuset_mutex held
1088 */ 1105 */
1089static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) 1106static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1090{ 1107{
1091 struct cpuset *cp; 1108 struct cpuset *cp;
1092 struct cgroup_subsys_state *pos_css; 1109 struct cgroup_subsys_state *pos_css;
1093 1110
1094 rcu_read_lock(); 1111 rcu_read_lock();
1095 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { 1112 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1096 if (cp == root_cs) { 1113 struct cpuset *parent = parent_cs(cp);
1097 if (!update_root) 1114
1098 continue; 1115 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1099 } else { 1116
1100 /* skip the whole subtree if @cp have some CPU */ 1117 /*
1101 if (!nodes_empty(cp->mems_allowed)) { 1118 * If it becomes empty, inherit the effective mask of the
1102 pos_css = css_rightmost_descendant(pos_css); 1119 * parent, which is guaranteed to have some MEMs.
1103 continue; 1120 */
1104 } 1121 if (nodes_empty(*new_mems))
1122 *new_mems = parent->effective_mems;
1123
1124 /* Skip the whole subtree if the nodemask remains the same. */
1125 if (nodes_equal(*new_mems, cp->effective_mems)) {
1126 pos_css = css_rightmost_descendant(pos_css);
1127 continue;
1105 } 1128 }
1129
1106 if (!css_tryget_online(&cp->css)) 1130 if (!css_tryget_online(&cp->css))
1107 continue; 1131 continue;
1108 rcu_read_unlock(); 1132 rcu_read_unlock();
1109 1133
1134 mutex_lock(&callback_mutex);
1135 cp->effective_mems = *new_mems;
1136 mutex_unlock(&callback_mutex);
1137
1138 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
1139 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1140
1110 update_tasks_nodemask(cp); 1141 update_tasks_nodemask(cp);
1111 1142
1112 rcu_read_lock(); 1143 rcu_read_lock();
@@ -1156,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1156 goto done; 1187 goto done;
1157 1188
1158 if (!nodes_subset(trialcs->mems_allowed, 1189 if (!nodes_subset(trialcs->mems_allowed,
1159 node_states[N_MEMORY])) { 1190 top_cpuset.mems_allowed)) {
1160 retval = -EINVAL; 1191 retval = -EINVAL;
1161 goto done; 1192 goto done;
1162 } 1193 }
1163 } 1194 }
@@ -1174,7 +1205,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1174 cs->mems_allowed = trialcs->mems_allowed; 1205 cs->mems_allowed = trialcs->mems_allowed;
1175 mutex_unlock(&callback_mutex); 1206 mutex_unlock(&callback_mutex);
1176 1207
1177 update_tasks_nodemask_hier(cs, true); 1208 /* use trialcs->mems_allowed as a temp variable */
1209 update_nodemasks_hier(cs, &cs->mems_allowed);
1178done: 1210done:
1179 return retval; 1211 return retval;
1180} 1212}
@@ -1389,12 +1421,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1389 1421
1390 mutex_lock(&cpuset_mutex); 1422 mutex_lock(&cpuset_mutex);
1391 1423
1392 /* 1424 /* allow moving tasks into an empty cpuset if on default hierarchy */
1393 * We allow to move tasks into an empty cpuset if sane_behavior
1394 * flag is set.
1395 */
1396 ret = -ENOSPC; 1425 ret = -ENOSPC;
1397 if (!cgroup_sane_behavior(css->cgroup) && 1426 if (!cgroup_on_dfl(css->cgroup) &&
1398 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1427 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1399 goto out_unlock; 1428 goto out_unlock;
1400 1429
@@ -1452,8 +1481,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1452 struct task_struct *leader = cgroup_taskset_first(tset); 1481 struct task_struct *leader = cgroup_taskset_first(tset);
1453 struct cpuset *cs = css_cs(css); 1482 struct cpuset *cs = css_cs(css);
1454 struct cpuset *oldcs = cpuset_attach_old_cs; 1483 struct cpuset *oldcs = cpuset_attach_old_cs;
1455 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1456 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1457 1484
1458 mutex_lock(&cpuset_mutex); 1485 mutex_lock(&cpuset_mutex);
1459 1486
@@ -1461,9 +1488,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1461 if (cs == &top_cpuset) 1488 if (cs == &top_cpuset)
1462 cpumask_copy(cpus_attach, cpu_possible_mask); 1489 cpumask_copy(cpus_attach, cpu_possible_mask);
1463 else 1490 else
1464 guarantee_online_cpus(cpus_cs, cpus_attach); 1491 guarantee_online_cpus(cs, cpus_attach);
1465 1492
1466 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1493 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1467 1494
1468 cgroup_taskset_for_each(task, tset) { 1495 cgroup_taskset_for_each(task, tset) {
1469 /* 1496 /*
@@ -1480,11 +1507,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1480 * Change mm, possibly for multiple threads in a threadgroup. This is 1507 * Change mm, possibly for multiple threads in a threadgroup. This is
1481 * expensive and may sleep. 1508 * expensive and may sleep.
1482 */ 1509 */
1483 cpuset_attach_nodemask_to = cs->mems_allowed; 1510 cpuset_attach_nodemask_to = cs->effective_mems;
1484 mm = get_task_mm(leader); 1511 mm = get_task_mm(leader);
1485 if (mm) { 1512 if (mm) {
1486 struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
1487
1488 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1513 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1489 1514
1490 /* 1515 /*
@@ -1495,7 +1520,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1495 * mm from. 1520 * mm from.
1496 */ 1521 */
1497 if (is_memory_migrate(cs)) { 1522 if (is_memory_migrate(cs)) {
1498 cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, 1523 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
1499 &cpuset_attach_nodemask_to); 1524 &cpuset_attach_nodemask_to);
1500 } 1525 }
1501 mmput(mm); 1526 mmput(mm);
@@ -1516,6 +1541,8 @@ typedef enum {
1516 FILE_MEMORY_MIGRATE, 1541 FILE_MEMORY_MIGRATE,
1517 FILE_CPULIST, 1542 FILE_CPULIST,
1518 FILE_MEMLIST, 1543 FILE_MEMLIST,
1544 FILE_EFFECTIVE_CPULIST,
1545 FILE_EFFECTIVE_MEMLIST,
1519 FILE_CPU_EXCLUSIVE, 1546 FILE_CPU_EXCLUSIVE,
1520 FILE_MEM_EXCLUSIVE, 1547 FILE_MEM_EXCLUSIVE,
1521 FILE_MEM_HARDWALL, 1548 FILE_MEM_HARDWALL,
@@ -1694,6 +1721,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1694 case FILE_MEMLIST: 1721 case FILE_MEMLIST:
1695 s += nodelist_scnprintf(s, count, cs->mems_allowed); 1722 s += nodelist_scnprintf(s, count, cs->mems_allowed);
1696 break; 1723 break;
1724 case FILE_EFFECTIVE_CPULIST:
1725 s += cpulist_scnprintf(s, count, cs->effective_cpus);
1726 break;
1727 case FILE_EFFECTIVE_MEMLIST:
1728 s += nodelist_scnprintf(s, count, cs->effective_mems);
1729 break;
1697 default: 1730 default:
1698 ret = -EINVAL; 1731 ret = -EINVAL;
1699 goto out_unlock; 1732 goto out_unlock;
@@ -1779,6 +1812,18 @@ static struct cftype files[] = {
1779 }, 1812 },
1780 1813
1781 { 1814 {
1815 .name = "effective_cpus",
1816 .seq_show = cpuset_common_seq_show,
1817 .private = FILE_EFFECTIVE_CPULIST,
1818 },
1819
1820 {
1821 .name = "effective_mems",
1822 .seq_show = cpuset_common_seq_show,
1823 .private = FILE_EFFECTIVE_MEMLIST,
1824 },
1825
1826 {
1782 .name = "cpu_exclusive", 1827 .name = "cpu_exclusive",
1783 .read_u64 = cpuset_read_u64, 1828 .read_u64 = cpuset_read_u64,
1784 .write_u64 = cpuset_write_u64, 1829 .write_u64 = cpuset_write_u64,
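
The two read-only files added above expose the computed masks to user space; on a legacy cpuset mount they appear as cpuset.effective_cpus and cpuset.effective_mems in each cgroup directory. A sketch that reads the CPU file and counts the CPUs in its cpulist output (the mount point is an assumption):

    #include <stdio.h>
    #include <string.h>

    int main(int argc, char **argv)
    {
        const char *dir = (argc > 1) ? argv[1] : "/sys/fs/cgroup/cpuset";
        char path[4096], buf[4096];
        char *tok;
        int count = 0;
        FILE *f;

        snprintf(path, sizeof(path), "%s/cpuset.effective_cpus", dir);
        f = fopen(path, "r");
        if (!f) {
            perror(path);
            return 1;
        }
        if (!fgets(buf, sizeof(buf), f)) {
            fclose(f);
            return 1;
        }
        fclose(f);

        /* The file uses the cpulist format, e.g. "0-3,8". */
        for (tok = strtok(buf, ",\n"); tok; tok = strtok(NULL, ",\n")) {
            int lo, hi;

            if (sscanf(tok, "%d-%d", &lo, &hi) == 2)
                count += hi - lo + 1;
            else if (sscanf(tok, "%d", &lo) == 1)
                count += 1;
        }
        printf("%d effective CPUs\n", count);
        return 0;
    }
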
@@ -1869,18 +1914,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1869 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1914 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1870 if (!cs) 1915 if (!cs)
1871 return ERR_PTR(-ENOMEM); 1916 return ERR_PTR(-ENOMEM);
1872 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { 1917 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
1873 kfree(cs); 1918 goto free_cs;
1874 return ERR_PTR(-ENOMEM); 1919 if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
1875 } 1920 goto free_cpus;
1876 1921
1877 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1922 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1878 cpumask_clear(cs->cpus_allowed); 1923 cpumask_clear(cs->cpus_allowed);
1879 nodes_clear(cs->mems_allowed); 1924 nodes_clear(cs->mems_allowed);
1925 cpumask_clear(cs->effective_cpus);
1926 nodes_clear(cs->effective_mems);
1880 fmeter_init(&cs->fmeter); 1927 fmeter_init(&cs->fmeter);
1881 cs->relax_domain_level = -1; 1928 cs->relax_domain_level = -1;
1882 1929
1883 return &cs->css; 1930 return &cs->css;
1931
1932free_cpus:
1933 free_cpumask_var(cs->cpus_allowed);
1934free_cs:
1935 kfree(cs);
1936 return ERR_PTR(-ENOMEM);
1884} 1937}
1885 1938
1886static int cpuset_css_online(struct cgroup_subsys_state *css) 1939static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -1903,6 +1956,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1903 1956
1904 cpuset_inc(); 1957 cpuset_inc();
1905 1958
1959 mutex_lock(&callback_mutex);
1960 if (cgroup_on_dfl(cs->css.cgroup)) {
1961 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1962 cs->effective_mems = parent->effective_mems;
1963 }
1964 mutex_unlock(&callback_mutex);
1965
1906 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1966 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1907 goto out_unlock; 1967 goto out_unlock;
1908 1968
@@ -1962,20 +2022,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
1962{ 2022{
1963 struct cpuset *cs = css_cs(css); 2023 struct cpuset *cs = css_cs(css);
1964 2024
2025 free_cpumask_var(cs->effective_cpus);
1965 free_cpumask_var(cs->cpus_allowed); 2026 free_cpumask_var(cs->cpus_allowed);
1966 kfree(cs); 2027 kfree(cs);
1967} 2028}
1968 2029
2030static void cpuset_bind(struct cgroup_subsys_state *root_css)
2031{
2032 mutex_lock(&cpuset_mutex);
2033 mutex_lock(&callback_mutex);
2034
2035 if (cgroup_on_dfl(root_css->cgroup)) {
2036 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2037 top_cpuset.mems_allowed = node_possible_map;
2038 } else {
2039 cpumask_copy(top_cpuset.cpus_allowed,
2040 top_cpuset.effective_cpus);
2041 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2042 }
2043
2044 mutex_unlock(&callback_mutex);
2045 mutex_unlock(&cpuset_mutex);
2046}
2047
1969struct cgroup_subsys cpuset_cgrp_subsys = { 2048struct cgroup_subsys cpuset_cgrp_subsys = {
1970 .css_alloc = cpuset_css_alloc, 2049 .css_alloc = cpuset_css_alloc,
1971 .css_online = cpuset_css_online, 2050 .css_online = cpuset_css_online,
1972 .css_offline = cpuset_css_offline, 2051 .css_offline = cpuset_css_offline,
1973 .css_free = cpuset_css_free, 2052 .css_free = cpuset_css_free,
1974 .can_attach = cpuset_can_attach, 2053 .can_attach = cpuset_can_attach,
1975 .cancel_attach = cpuset_cancel_attach, 2054 .cancel_attach = cpuset_cancel_attach,
1976 .attach = cpuset_attach, 2055 .attach = cpuset_attach,
1977 .base_cftypes = files, 2056 .bind = cpuset_bind,
1978 .early_init = 1, 2057 .legacy_cftypes = files,
2058 .early_init = 1,
1979}; 2059};
1980 2060
1981/** 2061/**
@@ -1990,9 +2070,13 @@ int __init cpuset_init(void)
1990 2070
1991 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) 2071 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1992 BUG(); 2072 BUG();
2073 if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
2074 BUG();
1993 2075
1994 cpumask_setall(top_cpuset.cpus_allowed); 2076 cpumask_setall(top_cpuset.cpus_allowed);
1995 nodes_setall(top_cpuset.mems_allowed); 2077 nodes_setall(top_cpuset.mems_allowed);
2078 cpumask_setall(top_cpuset.effective_cpus);
2079 nodes_setall(top_cpuset.effective_mems);
1996 2080
1997 fmeter_init(&top_cpuset.fmeter); 2081 fmeter_init(&top_cpuset.fmeter);
1998 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 2082 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
@@ -2035,6 +2119,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2035 } 2119 }
2036} 2120}
2037 2121
2122static void
2123hotplug_update_tasks_legacy(struct cpuset *cs,
2124 struct cpumask *new_cpus, nodemask_t *new_mems,
2125 bool cpus_updated, bool mems_updated)
2126{
2127 bool is_empty;
2128
2129 mutex_lock(&callback_mutex);
2130 cpumask_copy(cs->cpus_allowed, new_cpus);
2131 cpumask_copy(cs->effective_cpus, new_cpus);
2132 cs->mems_allowed = *new_mems;
2133 cs->effective_mems = *new_mems;
2134 mutex_unlock(&callback_mutex);
2135
2136 /*
2137 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
2138 * as the tasks will be migrated to an ancestor.
2139 */
2140 if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
2141 update_tasks_cpumask(cs);
2142 if (mems_updated && !nodes_empty(cs->mems_allowed))
2143 update_tasks_nodemask(cs);
2144
2145 is_empty = cpumask_empty(cs->cpus_allowed) ||
2146 nodes_empty(cs->mems_allowed);
2147
2148 mutex_unlock(&cpuset_mutex);
2149
2150 /*
2151 * Move tasks to the nearest ancestor with execution resources;
2152 * this is a full cgroup operation which will also call back into
2153 * cpuset. Should be done outside any lock.
2154 */
2155 if (is_empty)
2156 remove_tasks_in_empty_cpuset(cs);
2157
2158 mutex_lock(&cpuset_mutex);
2159}
2160
2161static void
2162hotplug_update_tasks(struct cpuset *cs,
2163 struct cpumask *new_cpus, nodemask_t *new_mems,
2164 bool cpus_updated, bool mems_updated)
2165{
2166 if (cpumask_empty(new_cpus))
2167 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
2168 if (nodes_empty(*new_mems))
2169 *new_mems = parent_cs(cs)->effective_mems;
2170
2171 mutex_lock(&callback_mutex);
2172 cpumask_copy(cs->effective_cpus, new_cpus);
2173 cs->effective_mems = *new_mems;
2174 mutex_unlock(&callback_mutex);
2175
2176 if (cpus_updated)
2177 update_tasks_cpumask(cs);
2178 if (mems_updated)
2179 update_tasks_nodemask(cs);
2180}
2181
2038/** 2182/**
2039 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug 2183 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
2040 * @cs: cpuset in interest 2184 * @cs: cpuset in interest
@@ -2045,11 +2189,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2045 */ 2189 */
2046static void cpuset_hotplug_update_tasks(struct cpuset *cs) 2190static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2047{ 2191{
2048 static cpumask_t off_cpus; 2192 static cpumask_t new_cpus;
2049 static nodemask_t off_mems; 2193 static nodemask_t new_mems;
2050 bool is_empty; 2194 bool cpus_updated;
2051 bool sane = cgroup_sane_behavior(cs->css.cgroup); 2195 bool mems_updated;
2052
2053retry: 2196retry:
2054 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); 2197 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2055 2198
@@ -2064,51 +2207,20 @@ retry:
2064 goto retry; 2207 goto retry;
2065 } 2208 }
2066 2209
2067 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); 2210 cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
2068 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); 2211 nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
2069
2070 mutex_lock(&callback_mutex);
2071 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2072 mutex_unlock(&callback_mutex);
2073
2074 /*
2075 * If sane_behavior flag is set, we need to update tasks' cpumask
2076 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
2077 * call update_tasks_cpumask() if the cpuset becomes empty, as
2078 * the tasks in it will be migrated to an ancestor.
2079 */
2080 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2081 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
2082 update_tasks_cpumask(cs);
2083 2212
2084 mutex_lock(&callback_mutex); 2213 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
2085 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2214 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
2086 mutex_unlock(&callback_mutex);
2087
2088 /*
2089 * If sane_behavior flag is set, we need to update tasks' nodemask
2090 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
2091 * call update_tasks_nodemask() if the cpuset becomes empty, as
2092 * the tasks in it will be migratd to an ancestor.
2093 */
2094 if ((sane && nodes_empty(cs->mems_allowed)) ||
2095 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
2096 update_tasks_nodemask(cs);
2097 2215
2098 is_empty = cpumask_empty(cs->cpus_allowed) || 2216 if (cgroup_on_dfl(cs->css.cgroup))
2099 nodes_empty(cs->mems_allowed); 2217 hotplug_update_tasks(cs, &new_cpus, &new_mems,
2218 cpus_updated, mems_updated);
2219 else
2220 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
2221 cpus_updated, mems_updated);
2100 2222
2101 mutex_unlock(&cpuset_mutex); 2223 mutex_unlock(&cpuset_mutex);
2102
2103 /*
2104 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
2105 *
2106 * Otherwise move tasks to the nearest ancestor with execution
2107 * resources. This is full cgroup operation which will
2108 * also call back into cpuset. Should be done outside any lock.
2109 */
2110 if (!sane && is_empty)
2111 remove_tasks_in_empty_cpuset(cs);
2112} 2224}
2113 2225
2114/** 2226/**
@@ -2132,6 +2244,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2132 static cpumask_t new_cpus; 2244 static cpumask_t new_cpus;
2133 static nodemask_t new_mems; 2245 static nodemask_t new_mems;
2134 bool cpus_updated, mems_updated; 2246 bool cpus_updated, mems_updated;
2247 bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
2135 2248
2136 mutex_lock(&cpuset_mutex); 2249 mutex_lock(&cpuset_mutex);
2137 2250
@@ -2139,13 +2252,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2139 cpumask_copy(&new_cpus, cpu_active_mask); 2252 cpumask_copy(&new_cpus, cpu_active_mask);
2140 new_mems = node_states[N_MEMORY]; 2253 new_mems = node_states[N_MEMORY];
2141 2254
2142 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); 2255 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
2143 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); 2256 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
2144 2257
2145 /* synchronize cpus_allowed to cpu_active_mask */ 2258 /* synchronize cpus_allowed to cpu_active_mask */
2146 if (cpus_updated) { 2259 if (cpus_updated) {
2147 mutex_lock(&callback_mutex); 2260 mutex_lock(&callback_mutex);
2148 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); 2261 if (!on_dfl)
2262 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2263 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2149 mutex_unlock(&callback_mutex); 2264 mutex_unlock(&callback_mutex);
2150 /* we don't mess with cpumasks of tasks in top_cpuset */ 2265 /* we don't mess with cpumasks of tasks in top_cpuset */
2151 } 2266 }
@@ -2153,7 +2268,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2153 /* synchronize mems_allowed to N_MEMORY */ 2268 /* synchronize mems_allowed to N_MEMORY */
2154 if (mems_updated) { 2269 if (mems_updated) {
2155 mutex_lock(&callback_mutex); 2270 mutex_lock(&callback_mutex);
2156 top_cpuset.mems_allowed = new_mems; 2271 if (!on_dfl)
2272 top_cpuset.mems_allowed = new_mems;
2273 top_cpuset.effective_mems = new_mems;
2157 mutex_unlock(&callback_mutex); 2274 mutex_unlock(&callback_mutex);
2158 update_tasks_nodemask(&top_cpuset); 2275 update_tasks_nodemask(&top_cpuset);
2159 } 2276 }
@@ -2228,6 +2345,9 @@ void __init cpuset_init_smp(void)
2228 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2345 top_cpuset.mems_allowed = node_states[N_MEMORY];
2229 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; 2346 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2230 2347
2348 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
2349 top_cpuset.effective_mems = node_states[N_MEMORY];
2350
2231 register_hotmemory_notifier(&cpuset_track_online_nodes_nb); 2351 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2232} 2352}
2233 2353
@@ -2244,23 +2364,17 @@ void __init cpuset_init_smp(void)
2244 2364
2245void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2365void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2246{ 2366{
2247 struct cpuset *cpus_cs;
2248
2249 mutex_lock(&callback_mutex); 2367 mutex_lock(&callback_mutex);
2250 rcu_read_lock(); 2368 rcu_read_lock();
2251 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2369 guarantee_online_cpus(task_cs(tsk), pmask);
2252 guarantee_online_cpus(cpus_cs, pmask);
2253 rcu_read_unlock(); 2370 rcu_read_unlock();
2254 mutex_unlock(&callback_mutex); 2371 mutex_unlock(&callback_mutex);
2255} 2372}
2256 2373
2257void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2374void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2258{ 2375{
2259 struct cpuset *cpus_cs;
2260
2261 rcu_read_lock(); 2376 rcu_read_lock();
2262 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2377 do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
2263 do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
2264 rcu_read_unlock(); 2378 rcu_read_unlock();
2265 2379
2266 /* 2380 /*
@@ -2299,13 +2413,11 @@ void cpuset_init_current_mems_allowed(void)
2299 2413
2300nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2414nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2301{ 2415{
2302 struct cpuset *mems_cs;
2303 nodemask_t mask; 2416 nodemask_t mask;
2304 2417
2305 mutex_lock(&callback_mutex); 2418 mutex_lock(&callback_mutex);
2306 rcu_read_lock(); 2419 rcu_read_lock();
2307 mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 2420 guarantee_online_mems(task_cs(tsk), &mask);
2308 guarantee_online_mems(mems_cs, &mask);
2309 rcu_read_unlock(); 2421 rcu_read_unlock();
2310 mutex_unlock(&callback_mutex); 2422 mutex_unlock(&callback_mutex);
2311 2423
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2f7c760305ca..379650b984f8 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2472,7 +2472,7 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
2472static void kdb_sysinfo(struct sysinfo *val) 2472static void kdb_sysinfo(struct sysinfo *val)
2473{ 2473{
2474 struct timespec uptime; 2474 struct timespec uptime;
2475 do_posix_clock_monotonic_gettime(&uptime); 2475 ktime_get_ts(&uptime);
2476 memset(val, 0, sizeof(*val)); 2476 memset(val, 0, sizeof(*val));
2477 val->uptime = uptime.tv_sec; 2477 val->uptime = uptime.tv_sec;
2478 val->loads[0] = avenrun[0]; 2478 val->loads[0] = avenrun[0];
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 54996b71e66d..ef90b04d783f 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -46,42 +46,25 @@ void __delayacct_tsk_init(struct task_struct *tsk)
46} 46}
47 47
48/* 48/*
49 * Start accounting for a delay statistic using 49 * Finish delay accounting for a statistic using its timestamps (@start),
50 * its starting timestamp (@start) 50 * accumulator (@total) and @count
51 */ 51 */
52 52static void delayacct_end(u64 *start, u64 *total, u32 *count)
53static inline void delayacct_start(struct timespec *start)
54{ 53{
55 do_posix_clock_monotonic_gettime(start); 54 s64 ns = ktime_get_ns() - *start;
56}
57
58/*
59 * Finish delay accounting for a statistic using
60 * its timestamps (@start, @end), accumalator (@total) and @count
61 */
62
63static void delayacct_end(struct timespec *start, struct timespec *end,
64 u64 *total, u32 *count)
65{
66 struct timespec ts;
67 s64 ns;
68 unsigned long flags; 55 unsigned long flags;
69 56
70 do_posix_clock_monotonic_gettime(end); 57 if (ns > 0) {
71 ts = timespec_sub(*end, *start); 58 spin_lock_irqsave(&current->delays->lock, flags);
72 ns = timespec_to_ns(&ts); 59 *total += ns;
73 if (ns < 0) 60 (*count)++;
74 return; 61 spin_unlock_irqrestore(&current->delays->lock, flags);
75 62 }
76 spin_lock_irqsave(&current->delays->lock, flags);
77 *total += ns;
78 (*count)++;
79 spin_unlock_irqrestore(&current->delays->lock, flags);
80} 63}
81 64
82void __delayacct_blkio_start(void) 65void __delayacct_blkio_start(void)
83{ 66{
84 delayacct_start(&current->delays->blkio_start); 67 current->delays->blkio_start = ktime_get_ns();
85} 68}
86 69
87void __delayacct_blkio_end(void) 70void __delayacct_blkio_end(void)
@@ -89,35 +72,29 @@ void __delayacct_blkio_end(void)
89 if (current->delays->flags & DELAYACCT_PF_SWAPIN) 72 if (current->delays->flags & DELAYACCT_PF_SWAPIN)
90 /* Swapin block I/O */ 73 /* Swapin block I/O */
91 delayacct_end(&current->delays->blkio_start, 74 delayacct_end(&current->delays->blkio_start,
92 &current->delays->blkio_end,
93 &current->delays->swapin_delay, 75 &current->delays->swapin_delay,
94 &current->delays->swapin_count); 76 &current->delays->swapin_count);
95 else /* Other block I/O */ 77 else /* Other block I/O */
96 delayacct_end(&current->delays->blkio_start, 78 delayacct_end(&current->delays->blkio_start,
97 &current->delays->blkio_end,
98 &current->delays->blkio_delay, 79 &current->delays->blkio_delay,
99 &current->delays->blkio_count); 80 &current->delays->blkio_count);
100} 81}
101 82
102int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) 83int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
103{ 84{
104 s64 tmp;
105 unsigned long t1;
106 unsigned long long t2, t3;
107 unsigned long flags;
108 struct timespec ts;
109 cputime_t utime, stime, stimescaled, utimescaled; 85 cputime_t utime, stime, stimescaled, utimescaled;
86 unsigned long long t2, t3;
87 unsigned long flags, t1;
88 s64 tmp;
110 89
111 tmp = (s64)d->cpu_run_real_total;
112 task_cputime(tsk, &utime, &stime); 90 task_cputime(tsk, &utime, &stime);
113 cputime_to_timespec(utime + stime, &ts); 91 tmp = (s64)d->cpu_run_real_total;
114 tmp += timespec_to_ns(&ts); 92 tmp += cputime_to_nsecs(utime + stime);
115 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; 93 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
116 94
117 tmp = (s64)d->cpu_scaled_run_real_total;
118 task_cputime_scaled(tsk, &utimescaled, &stimescaled); 95 task_cputime_scaled(tsk, &utimescaled, &stimescaled);
119 cputime_to_timespec(utimescaled + stimescaled, &ts); 96 tmp = (s64)d->cpu_scaled_run_real_total;
120 tmp += timespec_to_ns(&ts); 97 tmp += cputime_to_nsecs(utimescaled + stimescaled);
121 d->cpu_scaled_run_real_total = 98 d->cpu_scaled_run_real_total =
122 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; 99 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
123 100
@@ -169,13 +146,12 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
169 146
170void __delayacct_freepages_start(void) 147void __delayacct_freepages_start(void)
171{ 148{
172 delayacct_start(&current->delays->freepages_start); 149 current->delays->freepages_start = ktime_get_ns();
173} 150}
174 151
175void __delayacct_freepages_end(void) 152void __delayacct_freepages_end(void)
176{ 153{
177 delayacct_end(&current->delays->freepages_start, 154 delayacct_end(&current->delays->freepages_start,
178 &current->delays->freepages_end,
179 &current->delays->freepages_delay, 155 &current->delays->freepages_delay,
180 &current->delays->freepages_count); 156 &current->delays->freepages_count);
181} 157}
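
The conversion above replaces a pair of struct timespec timestamps with a single monotonic nanosecond counter taken from ktime_get_ns(). A user-space analog of the resulting delayacct_end() shape, with clock_gettime(CLOCK_MONOTONIC) standing in for ktime_get_ns() (names and the sleep are illustrative):

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    static uint64_t now_ns(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
    }

    /* one start timestamp, an accumulator and a count, no timespec arithmetic */
    static void delay_end(uint64_t start, uint64_t *total, uint32_t *count)
    {
        int64_t ns = (int64_t)(now_ns() - start);

        if (ns > 0) {
            *total += ns;
            (*count)++;
        }
    }

    int main(void)
    {
        uint64_t total = 0;
        uint32_t count = 0;
        uint64_t start = now_ns();

        usleep(10000);              /* stand-in for a block I/O delay */
        delay_end(start, &total, &count);
        printf("%u delays, %llu ns total\n", (unsigned)count,
               (unsigned long long)total);
        return 0;
    }
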
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b0c95f0f06fd..1cf24b3e42ec 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5266,6 +5266,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5266 5266
5267 goto got_name; 5267 goto got_name;
5268 } else { 5268 } else {
5269 if (vma->vm_ops && vma->vm_ops->name) {
5270 name = (char *) vma->vm_ops->name(vma);
5271 if (name)
5272 goto cpy_name;
5273 }
5274
5269 name = (char *)arch_vma_name(vma); 5275 name = (char *)arch_vma_name(vma);
5270 if (name) 5276 if (name)
5271 goto cpy_name; 5277 goto cpy_name;
@@ -7458,7 +7464,19 @@ __perf_event_exit_task(struct perf_event *child_event,
7458 struct perf_event_context *child_ctx, 7464 struct perf_event_context *child_ctx,
7459 struct task_struct *child) 7465 struct task_struct *child)
7460{ 7466{
7461 perf_remove_from_context(child_event, true); 7467 /*
7468 * Do not destroy the 'original' grouping; because of the context
7469 * switch optimization the original events could've ended up in a
7470 * random child task.
7471 *
7472 * If we were to destroy the original group, all group related
7473 * operations would cease to function properly after this random
7474 * child dies.
7475 *
7476 * Do destroy all inherited groups; we don't care about those
7477 * and being thorough is better.
7478 */
7479 perf_remove_from_context(child_event, !!child_event->parent);
7462 7480
7463 /* 7481 /*
7464 * It can happen that the parent exits first, and has events 7482 * It can happen that the parent exits first, and has events
@@ -7474,7 +7492,7 @@ __perf_event_exit_task(struct perf_event *child_event,
7474static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 7492static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7475{ 7493{
7476 struct perf_event *child_event, *next; 7494 struct perf_event *child_event, *next;
7477 struct perf_event_context *child_ctx; 7495 struct perf_event_context *child_ctx, *parent_ctx;
7478 unsigned long flags; 7496 unsigned long flags;
7479 7497
7480 if (likely(!child->perf_event_ctxp[ctxn])) { 7498 if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7499,6 +7517,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7499 raw_spin_lock(&child_ctx->lock); 7517 raw_spin_lock(&child_ctx->lock);
7500 task_ctx_sched_out(child_ctx); 7518 task_ctx_sched_out(child_ctx);
7501 child->perf_event_ctxp[ctxn] = NULL; 7519 child->perf_event_ctxp[ctxn] = NULL;
7520
7521 /*
7522 * In order to avoid freeing: child_ctx->parent_ctx->task
7523 * under perf_event_context::lock, grab another reference.
7524 */
7525 parent_ctx = child_ctx->parent_ctx;
7526 if (parent_ctx)
7527 get_ctx(parent_ctx);
7528
7502 /* 7529 /*
7503 * If this context is a clone; unclone it so it can't get 7530 * If this context is a clone; unclone it so it can't get
7504 * swapped to another process while we're removing all 7531 * swapped to another process while we're removing all
@@ -7509,6 +7536,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7509 raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 7536 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7510 7537
7511 /* 7538 /*
7539 * Now that we no longer hold perf_event_context::lock, drop
7540 * our extra child_ctx->parent_ctx reference.
7541 */
7542 if (parent_ctx)
7543 put_ctx(parent_ctx);
7544
7545 /*
7512 * Report the task dead after unscheduling the events so that we 7546 * Report the task dead after unscheduling the events so that we
7513 * won't get any samples after PERF_RECORD_EXIT. We can however still 7547 * won't get any samples after PERF_RECORD_EXIT. We can however still
7514 * get a few PERF_RECORD_READ events. 7548 * get a few PERF_RECORD_READ events.
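
The extra get_ctx()/put_ctx() pair above is the standard way to keep an object alive across a window in which its last reference could be dropped while a lock needed by the free path is still held. A generic sketch of that pattern with a plain reference count (the struct and helpers are invented for the example):

    #include <stdio.h>
    #include <stdlib.h>

    struct obj {
        int refcount;
    };

    static struct obj *obj_get(struct obj *o)
    {
        if (o)
            o->refcount++;
        return o;
    }

    static void obj_put(struct obj *o)
    {
        if (o && --o->refcount == 0) {
            printf("freeing object\n");
            free(o);
        }
    }

    int main(void)
    {
        struct obj *o = calloc(1, sizeof(*o));

        o->refcount = 1;

        obj_get(o);     /* pin across the risky window */
        obj_put(o);     /* the other owner drops its reference ... */
        obj_put(o);     /* ... and only now is the object freed */
        return 0;
    }
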
@@ -7776,7 +7810,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
7776/* 7810/*
7777 * Initialize the perf_event context in task_struct 7811 * Initialize the perf_event context in task_struct
7778 */ 7812 */
7779int perf_event_init_context(struct task_struct *child, int ctxn) 7813static int perf_event_init_context(struct task_struct *child, int ctxn)
7780{ 7814{
7781 struct perf_event_context *child_ctx, *parent_ctx; 7815 struct perf_event_context *child_ctx, *parent_ctx;
7782 struct perf_event_context *cloned_ctx; 7816 struct perf_event_context *cloned_ctx;
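
A quick aside on the parent_ctx handling above: it is the standard take-a-reference-before-dropping-the-lock pattern. child_ctx->parent_ctx is pinned while perf_event_context::lock is held, and the pinning reference (whose release may trigger the final free) is only dropped once the lock has been released. A minimal user-space sketch of the same pattern, with purely illustrative names (ctx_get/ctx_put are stand-ins, not the kernel's get_ctx/put_ctx):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    struct ctx {
            atomic_int      refcount;
            pthread_mutex_t lock;
            struct ctx     *parent;
    };

    static void ctx_get(struct ctx *c) { atomic_fetch_add(&c->refcount, 1); }

    static void ctx_put(struct ctx *c)
    {
            /* atomic_fetch_sub() returns the old value: 1 means last reference. */
            if (atomic_fetch_sub(&c->refcount, 1) == 1)
                    free(c);
    }

    static void detach_child(struct ctx *child)
    {
            struct ctx *parent;

            pthread_mutex_lock(&child->lock);
            parent = child->parent;
            if (parent)
                    ctx_get(parent);  /* pin the parent while child->lock is held */
            child->parent = NULL;     /* the actual unclone/detach work */
            pthread_mutex_unlock(&child->lock);

            if (parent)
                    ctx_put(parent);  /* a possible final free happens outside the lock */
    }
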
diff --git a/kernel/fork.c b/kernel/fork.c
index 6a13c46cd87d..fbd3497b221f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -315,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
315 goto free_ti; 315 goto free_ti;
316 316
317 tsk->stack = ti; 317 tsk->stack = ti;
318#ifdef CONFIG_SECCOMP
319 /*
320 * We must handle setting up seccomp filters once we're under
321 * the sighand lock in case orig has changed between now and
322 * then. Until then, filter must be NULL to avoid messing up
323 * the usage counts on the error path calling free_task.
324 */
325 tsk->seccomp.filter = NULL;
326#endif
318 327
319 setup_thread_stack(tsk, orig); 328 setup_thread_stack(tsk, orig);
320 clear_user_return_notifier(tsk); 329 clear_user_return_notifier(tsk);
@@ -1081,6 +1090,39 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1081 return 0; 1090 return 0;
1082} 1091}
1083 1092
1093static void copy_seccomp(struct task_struct *p)
1094{
1095#ifdef CONFIG_SECCOMP
1096 /*
1097 * Must be called with sighand->lock held, which is common to
1098 * all threads in the group. Holding cred_guard_mutex is not
1099 * needed because this new task is not yet running and cannot
1100 * be racing exec.
1101 */
1102 BUG_ON(!spin_is_locked(&current->sighand->siglock));
1103
1104 /* Ref-count the new filter user, and assign it. */
1105 get_seccomp_filter(current);
1106 p->seccomp = current->seccomp;
1107
1108 /*
1109 * Explicitly enable no_new_privs here in case it got set
1110 * between the task_struct being duplicated and holding the
1111 * sighand lock. The seccomp state and nnp must be in sync.
1112 */
1113 if (task_no_new_privs(current))
1114 task_set_no_new_privs(p);
1115
1116 /*
1117 * If the parent gained a seccomp mode after copying thread
1118 * flags and before we held the sighand lock, we have
1119 * to manually enable the seccomp thread flag here.
1120 */
1121 if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
1122 set_tsk_thread_flag(p, TIF_SECCOMP);
1123#endif
1124}
1125
1084SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) 1126SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
1085{ 1127{
1086 current->clear_child_tid = tidptr; 1128 current->clear_child_tid = tidptr;
@@ -1095,7 +1137,6 @@ static void rt_mutex_init_task(struct task_struct *p)
1095 p->pi_waiters = RB_ROOT; 1137 p->pi_waiters = RB_ROOT;
1096 p->pi_waiters_leftmost = NULL; 1138 p->pi_waiters_leftmost = NULL;
1097 p->pi_blocked_on = NULL; 1139 p->pi_blocked_on = NULL;
1098 p->pi_top_task = NULL;
1099#endif 1140#endif
1100} 1141}
1101 1142
@@ -1196,7 +1237,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1196 goto fork_out; 1237 goto fork_out;
1197 1238
1198 ftrace_graph_init_task(p); 1239 ftrace_graph_init_task(p);
1199 get_seccomp_filter(p);
1200 1240
1201 rt_mutex_init_task(p); 1241 rt_mutex_init_task(p);
1202 1242
@@ -1262,9 +1302,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1262 1302
1263 posix_cpu_timers_init(p); 1303 posix_cpu_timers_init(p);
1264 1304
1265 do_posix_clock_monotonic_gettime(&p->start_time); 1305 p->start_time = ktime_get_ns();
1266 p->real_start_time = p->start_time; 1306 p->real_start_time = ktime_get_boot_ns();
1267 monotonic_to_bootbased(&p->real_start_time);
1268 p->io_context = NULL; 1307 p->io_context = NULL;
1269 p->audit_context = NULL; 1308 p->audit_context = NULL;
1270 if (clone_flags & CLONE_THREAD) 1309 if (clone_flags & CLONE_THREAD)
@@ -1437,6 +1476,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1437 spin_lock(&current->sighand->siglock); 1476 spin_lock(&current->sighand->siglock);
1438 1477
1439 /* 1478 /*
1479 * Copy seccomp details explicitly here, in case they were changed
1480 * before holding sighand lock.
1481 */
1482 copy_seccomp(p);
1483
1484 /*
1440 * Process group and session signals need to be delivered to just the 1485 * Process group and session signals need to be delivered to just the
1441 * parent before the fork or both the parent and the child after the 1486 * parent before the fork or both the parent and the child after the
1442 * fork. Restart if a signal comes in before we add the new process to 1487 * fork. Restart if a signal comes in before we add the new process to
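
The copy_seccomp() ordering above is what makes filter inheritance across fork() reliable: the child's filter pointer stays NULL until the parent's sighand->siglock is held, and only then does the child take a reference on the very same filter object (the get_seccomp_filter(current) call above). A rough user-space illustration of the inherited behaviour, not part of this patch and assuming reasonably recent seccomp-capable headers:

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <sys/wait.h>
    #include <unistd.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>

    int main(void)
    {
            /* Minimal BPF program: allow every syscall. */
            struct sock_filter insns[] = {
                    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
            };
            struct sock_fprog prog = {
                    .len    = sizeof(insns) / sizeof(insns[0]),
                    .filter = insns,
            };

            /* Required unless the caller has CAP_SYS_ADMIN. */
            prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
            if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
                    perror("PR_SET_SECCOMP");

            if (fork() == 0) {
                    /* Child: reports SECCOMP_MODE_FILTER, inherited via copy_seccomp(). */
                    printf("child seccomp mode: %d\n", (int)prctl(PR_GET_SECCOMP));
                    _exit(0);
            }
            wait(NULL);
            return 0;
    }
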
diff --git a/kernel/futex.c b/kernel/futex.c
index b632b5f3f094..d3a9d946d0b7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -792,94 +792,91 @@ void exit_pi_state_list(struct task_struct *curr)
792 * [10] There is no transient state which leaves owner and user space 792 * [10] There is no transient state which leaves owner and user space
793 * TID out of sync. 793 * TID out of sync.
794 */ 794 */
795static int 795
796lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, 796/*
797 union futex_key *key, struct futex_pi_state **ps) 797 * Validate that the existing waiter has a pi_state and sanity check
798 * the pi_state against the user space value. If correct, attach to
799 * it.
800 */
801static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
802 struct futex_pi_state **ps)
798{ 803{
799 struct futex_pi_state *pi_state = NULL;
800 struct futex_q *this, *next;
801 struct task_struct *p;
802 pid_t pid = uval & FUTEX_TID_MASK; 804 pid_t pid = uval & FUTEX_TID_MASK;
803 805
804 plist_for_each_entry_safe(this, next, &hb->chain, list) { 806 /*
805 if (match_futex(&this->key, key)) { 807 * Userspace might have messed up non-PI and PI futexes [3]
806 /* 808 */
807 * Sanity check the waiter before increasing 809 if (unlikely(!pi_state))
808 * the refcount and attaching to it. 810 return -EINVAL;
809 */
810 pi_state = this->pi_state;
811 /*
812 * Userspace might have messed up non-PI and
813 * PI futexes [3]
814 */
815 if (unlikely(!pi_state))
816 return -EINVAL;
817 811
818 WARN_ON(!atomic_read(&pi_state->refcount)); 812 WARN_ON(!atomic_read(&pi_state->refcount));
819 813
814 /*
815 * Handle the owner died case:
816 */
817 if (uval & FUTEX_OWNER_DIED) {
818 /*
819 * exit_pi_state_list sets owner to NULL and wakes the
820 * topmost waiter. The task which acquires the
821 * pi_state->rt_mutex will fixup owner.
822 */
823 if (!pi_state->owner) {
820 /* 824 /*
821 * Handle the owner died case: 825 * No pi state owner, but the user space TID
826 * is not 0. Inconsistent state. [5]
822 */ 827 */
823 if (uval & FUTEX_OWNER_DIED) { 828 if (pid)
824 /* 829 return -EINVAL;
825 * exit_pi_state_list sets owner to NULL and
826 * wakes the topmost waiter. The task which
827 * acquires the pi_state->rt_mutex will fixup
828 * owner.
829 */
830 if (!pi_state->owner) {
831 /*
832 * No pi state owner, but the user
833 * space TID is not 0. Inconsistent
834 * state. [5]
835 */
836 if (pid)
837 return -EINVAL;
838 /*
839 * Take a ref on the state and
840 * return. [4]
841 */
842 goto out_state;
843 }
844
845 /*
846 * If TID is 0, then either the dying owner
847 * has not yet executed exit_pi_state_list()
848 * or some waiter acquired the rtmutex in the
849 * pi state, but did not yet fixup the TID in
850 * user space.
851 *
852 * Take a ref on the state and return. [6]
853 */
854 if (!pid)
855 goto out_state;
856 } else {
857 /*
858 * If the owner died bit is not set,
859 * then the pi_state must have an
860 * owner. [7]
861 */
862 if (!pi_state->owner)
863 return -EINVAL;
864 }
865
866 /* 830 /*
867 * Bail out if user space manipulated the 831 * Take a ref on the state and return success. [4]
868 * futex value. If pi state exists then the
869 * owner TID must be the same as the user
870 * space TID. [9/10]
871 */ 832 */
872 if (pid != task_pid_vnr(pi_state->owner)) 833 goto out_state;
873 return -EINVAL;
874
875 out_state:
876 atomic_inc(&pi_state->refcount);
877 *ps = pi_state;
878 return 0;
879 } 834 }
835
836 /*
837 * If TID is 0, then either the dying owner has not
838 * yet executed exit_pi_state_list() or some waiter
839 * acquired the rtmutex in the pi state, but did not
840 * yet fixup the TID in user space.
841 *
842 * Take a ref on the state and return success. [6]
843 */
844 if (!pid)
845 goto out_state;
846 } else {
847 /*
848 * If the owner died bit is not set, then the pi_state
849 * must have an owner. [7]
850 */
851 if (!pi_state->owner)
852 return -EINVAL;
880 } 853 }
881 854
882 /* 855 /*
856 * Bail out if user space manipulated the futex value. If pi
857 * state exists then the owner TID must be the same as the
858 * user space TID. [9/10]
859 */
860 if (pid != task_pid_vnr(pi_state->owner))
861 return -EINVAL;
862out_state:
863 atomic_inc(&pi_state->refcount);
864 *ps = pi_state;
865 return 0;
866}
867
868/*
869 * Lookup the task for the TID provided from user space and attach to
870 * it after doing proper sanity checks.
871 */
872static int attach_to_pi_owner(u32 uval, union futex_key *key,
873 struct futex_pi_state **ps)
874{
875 pid_t pid = uval & FUTEX_TID_MASK;
876 struct futex_pi_state *pi_state;
877 struct task_struct *p;
878
879 /*
883 * We are the first waiter - try to look up the real owner and attach 880 * We are the first waiter - try to look up the real owner and attach
884 * the new pi_state to it, but bail out when TID = 0 [1] 881 * the new pi_state to it, but bail out when TID = 0 [1]
885 */ 882 */
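
The checks in attach_to_pi_state() above reduce to a small decision table over the user space TID and the FUTEX_OWNER_DIED bit. A condensed, stand-alone sketch of that table, derived from the comments above (illustrative only, not the kernel code; the bracketed numbers refer to the state table documented earlier in futex.c):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <linux/futex.h>

    /* Returns 0 when attaching to an existing pi_state is allowed. */
    static int pi_state_attach_ok(uint32_t uval, bool has_owner, uint32_t owner_tid)
    {
            uint32_t pid = uval & FUTEX_TID_MASK;

            if (uval & FUTEX_OWNER_DIED) {
                    if (!has_owner)
                            return pid ? -EINVAL : 0;       /* [5] vs. [4] */
                    if (!pid)
                            return 0;                       /* [6] */
            } else if (!has_owner) {
                    return -EINVAL;                         /* [7] */
            }

            /* The pi_state owner TID must match the user space TID. [9]/[10] */
            return pid == owner_tid ? 0 : -EINVAL;
    }
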
@@ -920,7 +917,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
920 pi_state = alloc_pi_state(); 917 pi_state = alloc_pi_state();
921 918
922 /* 919 /*
923 * Initialize the pi_mutex in locked state and make 'p' 920 * Initialize the pi_mutex in locked state and make @p
924 * the owner of it: 921 * the owner of it:
925 */ 922 */
926 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 923 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
@@ -940,6 +937,36 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
940 return 0; 937 return 0;
941} 938}
942 939
940static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
941 union futex_key *key, struct futex_pi_state **ps)
942{
943 struct futex_q *match = futex_top_waiter(hb, key);
944
945 /*
946 * If there is a waiter on that futex, validate it and
947 * attach to the pi_state when the validation succeeds.
948 */
949 if (match)
950 return attach_to_pi_state(uval, match->pi_state, ps);
951
952 /*
953 * We are the first waiter - try to look up the owner based on
954 * @uval and attach to it.
955 */
956 return attach_to_pi_owner(uval, key, ps);
957}
958
959static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
960{
961 u32 uninitialized_var(curval);
962
963 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
964 return -EFAULT;
965
966 /* If user space value changed, let the caller retry */
967 return curval != uval ? -EAGAIN : 0;
968}
969
943/** 970/**
944 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex 971 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
945 * @uaddr: the pi futex user address 972 * @uaddr: the pi futex user address
@@ -963,113 +990,69 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
963 struct futex_pi_state **ps, 990 struct futex_pi_state **ps,
964 struct task_struct *task, int set_waiters) 991 struct task_struct *task, int set_waiters)
965{ 992{
966 int lock_taken, ret, force_take = 0; 993 u32 uval, newval, vpid = task_pid_vnr(task);
967 u32 uval, newval, curval, vpid = task_pid_vnr(task); 994 struct futex_q *match;
968 995 int ret;
969retry:
970 ret = lock_taken = 0;
971 996
972 /* 997 /*
973 * To avoid races, we attempt to take the lock here again 998 * Read the user space value first so we can validate a few
974 * (by doing a 0 -> TID atomic cmpxchg), while holding all 999 * things before proceeding further.
975 * the locks. It will most likely not succeed.
976 */ 1000 */
977 newval = vpid; 1001 if (get_futex_value_locked(&uval, uaddr))
978 if (set_waiters)
979 newval |= FUTEX_WAITERS;
980
981 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
982 return -EFAULT; 1002 return -EFAULT;
983 1003
984 /* 1004 /*
985 * Detect deadlocks. 1005 * Detect deadlocks.
986 */ 1006 */
987 if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) 1007 if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
988 return -EDEADLK; 1008 return -EDEADLK;
989 1009
990 /* 1010 /*
991 * Surprise - we got the lock, but we do not trust user space at all. 1011 * Lookup existing state first. If it exists, try to attach to
992 */ 1012 * its pi_state.
993 if (unlikely(!curval)) {
994 /*
995 * We verify whether there is kernel state for this
996 * futex. If not, we can safely assume, that the 0 ->
997 * TID transition is correct. If state exists, we do
998 * not bother to fixup the user space state as it was
999 * corrupted already.
1000 */
1001 return futex_top_waiter(hb, key) ? -EINVAL : 1;
1002 }
1003
1004 uval = curval;
1005
1006 /*
1007 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
1008 * to wake at the next unlock.
1009 */ 1013 */
1010 newval = curval | FUTEX_WAITERS; 1014 match = futex_top_waiter(hb, key);
1015 if (match)
1016 return attach_to_pi_state(uval, match->pi_state, ps);
1011 1017
1012 /* 1018 /*
1013 * Should we force take the futex? See below. 1019 * No waiter and user TID is 0. We are here because the
1020 * waiters or the owner died bit is set or called from
1021 * requeue_cmp_pi or for whatever reason we ended up in the
1022 * syscall.
1014 */ 1023 */
1015 if (unlikely(force_take)) { 1024 if (!(uval & FUTEX_TID_MASK)) {
1016 /* 1025 /*
1017 * Keep the OWNER_DIED and the WAITERS bit and set the 1026 * We take over the futex. No other waiters and the user space
1018 * new TID value. 1027 * TID is 0. We preserve the owner died bit.
1019 */ 1028 */
1020 newval = (curval & ~FUTEX_TID_MASK) | vpid; 1029 newval = uval & FUTEX_OWNER_DIED;
1021 force_take = 0; 1030 newval |= vpid;
1022 lock_taken = 1;
1023 }
1024 1031
1025 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) 1032 /* The futex requeue_pi code can enforce the waiters bit */
1026 return -EFAULT; 1033 if (set_waiters)
1027 if (unlikely(curval != uval)) 1034 newval |= FUTEX_WAITERS;
1028 goto retry; 1035
1036 ret = lock_pi_update_atomic(uaddr, uval, newval);
1037 /* If the take over worked, return 1 */
1038 return ret < 0 ? ret : 1;
1039 }
1029 1040
1030 /* 1041 /*
1031 * We took the lock due to forced take over. 1042 * First waiter. Set the waiters bit before attaching ourself to
1043 * the owner. If owner tries to unlock, it will be forced into
1044 * the kernel and blocked on hb->lock.
1032 */ 1045 */
1033 if (unlikely(lock_taken)) 1046 newval = uval | FUTEX_WAITERS;
1034 return 1; 1047 ret = lock_pi_update_atomic(uaddr, uval, newval);
1035 1048 if (ret)
1049 return ret;
1036 /* 1050 /*
1037 * We dont have the lock. Look up the PI state (or create it if 1051 * If the update of the user space value succeeded, we try to
1038 * we are the first waiter): 1052 * attach to the owner. If that fails, no harm done, we only
1053 * set the FUTEX_WAITERS bit in the user space variable.
1039 */ 1054 */
1040 ret = lookup_pi_state(uval, hb, key, ps); 1055 return attach_to_pi_owner(uval, key, ps);
1041
1042 if (unlikely(ret)) {
1043 switch (ret) {
1044 case -ESRCH:
1045 /*
1046 * We failed to find an owner for this
1047 * futex. So we have no pi_state to block
1048 * on. This can happen in two cases:
1049 *
1050 * 1) The owner died
1051 * 2) A stale FUTEX_WAITERS bit
1052 *
1053 * Re-read the futex value.
1054 */
1055 if (get_futex_value_locked(&curval, uaddr))
1056 return -EFAULT;
1057
1058 /*
1059 * If the owner died or we have a stale
1060 * WAITERS bit the owner TID in the user space
1061 * futex is 0.
1062 */
1063 if (!(curval & FUTEX_TID_MASK)) {
1064 force_take = 1;
1065 goto retry;
1066 }
1067 default:
1068 break;
1069 }
1070 }
1071
1072 return ret;
1073} 1056}
1074 1057
1075/** 1058/**
@@ -1186,22 +1169,6 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
1186 return 0; 1169 return 0;
1187} 1170}
1188 1171
1189static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
1190{
1191 u32 uninitialized_var(oldval);
1192
1193 /*
1194 * There is no waiter, so we unlock the futex. The owner died
1195 * bit has not to be preserved here. We are the owner:
1196 */
1197 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
1198 return -EFAULT;
1199 if (oldval != uval)
1200 return -EAGAIN;
1201
1202 return 0;
1203}
1204
1205/* 1172/*
1206 * Express the locking dependencies for lockdep: 1173 * Express the locking dependencies for lockdep:
1207 */ 1174 */
@@ -1659,7 +1626,12 @@ retry_private:
1659 goto retry; 1626 goto retry;
1660 goto out; 1627 goto out;
1661 case -EAGAIN: 1628 case -EAGAIN:
1662 /* The owner was exiting, try again. */ 1629 /*
1630 * Two reasons for this:
1631 * - Owner is exiting and we just wait for the
1632 * exit to complete.
1633 * - The user space value changed.
1634 */
1663 double_unlock_hb(hb1, hb2); 1635 double_unlock_hb(hb1, hb2);
1664 hb_waiters_dec(hb2); 1636 hb_waiters_dec(hb2);
1665 put_futex_key(&key2); 1637 put_futex_key(&key2);
@@ -1718,7 +1690,7 @@ retry_private:
1718 this->pi_state = pi_state; 1690 this->pi_state = pi_state;
1719 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 1691 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1720 this->rt_waiter, 1692 this->rt_waiter,
1721 this->task, 1); 1693 this->task);
1722 if (ret == 1) { 1694 if (ret == 1) {
1723 /* We got the lock. */ 1695 /* We got the lock. */
1724 requeue_pi_wake_futex(this, &key2, hb2); 1696 requeue_pi_wake_futex(this, &key2, hb2);
@@ -2316,8 +2288,10 @@ retry_private:
2316 goto uaddr_faulted; 2288 goto uaddr_faulted;
2317 case -EAGAIN: 2289 case -EAGAIN:
2318 /* 2290 /*
2319 * Task is exiting and we just wait for the 2291 * Two reasons for this:
2320 * exit to complete. 2292 * - Task is exiting and we just wait for the
2293 * exit to complete.
2294 * - The user space value changed.
2321 */ 2295 */
2322 queue_unlock(hb); 2296 queue_unlock(hb);
2323 put_futex_key(&q.key); 2297 put_futex_key(&q.key);
@@ -2337,9 +2311,9 @@ retry_private:
2337 /* 2311 /*
2338 * Block on the PI mutex: 2312 * Block on the PI mutex:
2339 */ 2313 */
2340 if (!trylock) 2314 if (!trylock) {
2341 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); 2315 ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
2342 else { 2316 } else {
2343 ret = rt_mutex_trylock(&q.pi_state->pi_mutex); 2317 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2344 /* Fixup the trylock return value: */ 2318 /* Fixup the trylock return value: */
2345 ret = ret ? 0 : -EWOULDBLOCK; 2319 ret = ret ? 0 : -EWOULDBLOCK;
@@ -2401,10 +2375,10 @@ uaddr_faulted:
2401 */ 2375 */
2402static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) 2376static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2403{ 2377{
2404 struct futex_hash_bucket *hb; 2378 u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
2405 struct futex_q *this, *next;
2406 union futex_key key = FUTEX_KEY_INIT; 2379 union futex_key key = FUTEX_KEY_INIT;
2407 u32 uval, vpid = task_pid_vnr(current); 2380 struct futex_hash_bucket *hb;
2381 struct futex_q *match;
2408 int ret; 2382 int ret;
2409 2383
2410retry: 2384retry:
@@ -2417,57 +2391,47 @@ retry:
2417 return -EPERM; 2391 return -EPERM;
2418 2392
2419 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); 2393 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2420 if (unlikely(ret != 0)) 2394 if (ret)
2421 goto out; 2395 return ret;
2422 2396
2423 hb = hash_futex(&key); 2397 hb = hash_futex(&key);
2424 spin_lock(&hb->lock); 2398 spin_lock(&hb->lock);
2425 2399
2426 /* 2400 /*
2427 * To avoid races, try to do the TID -> 0 atomic transition 2401 * Check waiters first. We do not trust user space values at
2428 * again. If it succeeds then we can return without waking 2402 * all and we at least want to know if user space fiddled
2429 * anyone else up. We only try this if neither the waiters nor 2403 * with the futex value instead of blindly unlocking.
2430 * the owner died bit are set.
2431 */
2432 if (!(uval & ~FUTEX_TID_MASK) &&
2433 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2434 goto pi_faulted;
2435 /*
2436 * Rare case: we managed to release the lock atomically,
2437 * no need to wake anyone else up:
2438 */
2439 if (unlikely(uval == vpid))
2440 goto out_unlock;
2441
2442 /*
2443 * Ok, other tasks may need to be woken up - check waiters
2444 * and do the wakeup if necessary:
2445 */ 2404 */
2446 plist_for_each_entry_safe(this, next, &hb->chain, list) { 2405 match = futex_top_waiter(hb, &key);
2447 if (!match_futex (&this->key, &key)) 2406 if (match) {
2448 continue; 2407 ret = wake_futex_pi(uaddr, uval, match);
2449 ret = wake_futex_pi(uaddr, uval, this);
2450 /* 2408 /*
2451 * The atomic access to the futex value 2409 * The atomic access to the futex value generated a
2452 * generated a pagefault, so retry the 2410 * pagefault, so retry the user-access and the wakeup:
2453 * user-access and the wakeup:
2454 */ 2411 */
2455 if (ret == -EFAULT) 2412 if (ret == -EFAULT)
2456 goto pi_faulted; 2413 goto pi_faulted;
2457 goto out_unlock; 2414 goto out_unlock;
2458 } 2415 }
2416
2459 /* 2417 /*
2460 * No waiters - kernel unlocks the futex: 2418 * We have no kernel internal state, i.e. no waiters in the
2419 * kernel. Waiters which are about to queue themselves are stuck
2420 * on hb->lock. So we can safely ignore them. We neither
2421 * preserve the WAITERS bit nor the OWNER_DIED one. We are the
2422 * owner.
2461 */ 2423 */
2462 ret = unlock_futex_pi(uaddr, uval); 2424 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
2463 if (ret == -EFAULT)
2464 goto pi_faulted; 2425 goto pi_faulted;
2465 2426
2427 /*
2428 * If uval has changed, let user space handle it.
2429 */
2430 ret = (curval == uval) ? 0 : -EAGAIN;
2431
2466out_unlock: 2432out_unlock:
2467 spin_unlock(&hb->lock); 2433 spin_unlock(&hb->lock);
2468 put_futex_key(&key); 2434 put_futex_key(&key);
2469
2470out:
2471 return ret; 2435 return ret;
2472 2436
2473pi_faulted: 2437pi_faulted:
@@ -2669,7 +2633,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2669 */ 2633 */
2670 WARN_ON(!q.pi_state); 2634 WARN_ON(!q.pi_state);
2671 pi_mutex = &q.pi_state->pi_mutex; 2635 pi_mutex = &q.pi_state->pi_mutex;
2672 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); 2636 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
2673 debug_rt_mutex_free_waiter(&rt_waiter); 2637 debug_rt_mutex_free_waiter(&rt_waiter);
2674 2638
2675 spin_lock(q.lock_ptr); 2639 spin_lock(q.lock_ptr);
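
For context on what the reworked lock and unlock paths are validating, it helps to recall the user space half of a PI futex, roughly the protocol that PTHREAD_PRIO_INHERIT mutexes follow: the uncontended paths are pure cmpxchg transitions of the futex word between 0 and the owner's TID, and the kernel is only entered when those fail, which is exactly where the TID/FUTEX_WAITERS/FUTEX_OWNER_DIED checks above apply. A rough sketch, illustrative only; a real implementation also has to handle owner-died recovery and the -EAGAIN retries discussed above:

    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static long futex(uint32_t *uaddr, int op, uint32_t val)
    {
            return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
    }

    static void pi_lock(uint32_t *f)
    {
            uint32_t tid = (uint32_t)syscall(SYS_gettid), expected = 0;

            /* Fast path: 0 -> TID, the kernel never sees the lock. */
            if (__atomic_compare_exchange_n(f, &expected, tid, 0,
                                            __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
                    return;
            /* Slow path: the kernel sets FUTEX_WAITERS and blocks us on the rt_mutex. */
            futex(f, FUTEX_LOCK_PI, 0);
    }

    static void pi_unlock(uint32_t *f)
    {
            uint32_t expected = (uint32_t)syscall(SYS_gettid);

            /* Fast path: TID -> 0 only succeeds while no waiter bits are set. */
            if (__atomic_compare_exchange_n(f, &expected, 0, 0,
                                            __ATOMIC_RELEASE, __ATOMIC_RELAXED))
                    return;
            /* Slow path: let the kernel hand the futex to the top waiter. */
            futex(f, FUTEX_UNLOCK_PI, 0);
    }

Nothing stops a buggy or malicious caller from scribbling arbitrary values into the futex word between these steps, which is why the kernel side above refuses to trust it and returns -EINVAL or -EAGAIN instead.
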
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 452d6f2ba21d..cf80e7b0ddab 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -341,8 +341,8 @@ static struct lock_class_key irq_nested_lock_class;
341/* 341/*
342 * irq_map_generic_chip - Map a generic chip for an irq domain 342 * irq_map_generic_chip - Map a generic chip for an irq domain
343 */ 343 */
344static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, 344int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
345 irq_hw_number_t hw_irq) 345 irq_hw_number_t hw_irq)
346{ 346{
347 struct irq_data *data = irq_get_irq_data(virq); 347 struct irq_data *data = irq_get_irq_data(virq);
348 struct irq_domain_chip_generic *dgc = d->gc; 348 struct irq_domain_chip_generic *dgc = d->gc;
@@ -394,6 +394,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
394 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); 394 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
395 return 0; 395 return 0;
396} 396}
397EXPORT_SYMBOL_GPL(irq_map_generic_chip);
397 398
398struct irq_domain_ops irq_generic_chip_ops = { 399struct irq_domain_ops irq_generic_chip_ops = {
399 .map = irq_map_generic_chip, 400 .map = irq_map_generic_chip,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index eb5e10e32e05..6534ff6ce02e 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -231,7 +231,7 @@ void irq_set_default_host(struct irq_domain *domain)
231} 231}
232EXPORT_SYMBOL_GPL(irq_set_default_host); 232EXPORT_SYMBOL_GPL(irq_set_default_host);
233 233
234static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) 234void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
235{ 235{
236 struct irq_data *irq_data = irq_get_irq_data(irq); 236 struct irq_data *irq_data = irq_get_irq_data(irq);
237 irq_hw_number_t hwirq; 237 irq_hw_number_t hwirq;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index a82170e2fa78..e6bcbe756663 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -16,11 +16,12 @@
16#include <linux/tick.h> 16#include <linux/tick.h>
17#include <linux/cpu.h> 17#include <linux/cpu.h>
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h>
19#include <asm/processor.h> 20#include <asm/processor.h>
20 21
21 22
22static DEFINE_PER_CPU(struct llist_head, irq_work_list); 23static DEFINE_PER_CPU(struct llist_head, raised_list);
23static DEFINE_PER_CPU(int, irq_work_raised); 24static DEFINE_PER_CPU(struct llist_head, lazy_list);
24 25
25/* 26/*
26 * Claim the entry so that no one else will poke at it. 27 * Claim the entry so that no one else will poke at it.
@@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void)
55 */ 56 */
56} 57}
57 58
59#ifdef CONFIG_SMP
58/* 60/*
59 * Enqueue the irq_work @entry unless it's already pending 61 * Enqueue the irq_work @work on @cpu unless it's already pending
60 * somewhere. 62 * somewhere.
61 * 63 *
62 * Can be re-enqueued while the callback is still in progress. 64 * Can be re-enqueued while the callback is still in progress.
63 */ 65 */
66bool irq_work_queue_on(struct irq_work *work, int cpu)
67{
68 /* All work should have been flushed before going offline */
69 WARN_ON_ONCE(cpu_is_offline(cpu));
70
71 /* Arch remote IPI send/receive backend aren't NMI safe */
72 WARN_ON_ONCE(in_nmi());
73
74 /* Only queue if not already pending */
75 if (!irq_work_claim(work))
76 return false;
77
78 if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
79 arch_send_call_function_single_ipi(cpu);
80
81 return true;
82}
83EXPORT_SYMBOL_GPL(irq_work_queue_on);
84#endif
85
86/* Enqueue the irq work @work on the current CPU */
64bool irq_work_queue(struct irq_work *work) 87bool irq_work_queue(struct irq_work *work)
65{ 88{
66 /* Only queue if not already pending */ 89 /* Only queue if not already pending */
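
irq_work_queue_on() is the new piece of API in this hunk: it queues an entry on another CPU's raised_list and kicks that CPU with the call-function IPI. A hypothetical usage sketch (assumes CONFIG_SMP; error handling elided):

    #include <linux/irq_work.h>
    #include <linux/printk.h>
    #include <linux/smp.h>

    static void remote_report(struct irq_work *work)
    {
            /* Runs in hardirq context on the target CPU. */
            pr_info("irq_work ran on CPU %d\n", smp_processor_id());
    }

    static struct irq_work remote_work;

    static void kick_cpu(int cpu)
    {
            init_irq_work(&remote_work, remote_report);
            /* Returns false if the entry was already pending somewhere. */
            if (!irq_work_queue_on(&remote_work, cpu))
                    pr_info("work already pending\n");
    }

Note the WARN_ON_ONCE(in_nmi()) above: unlike irq_work_queue(), the remote variant rides on the call-function IPI machinery and must not be used from NMI context.
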
@@ -70,15 +93,13 @@ bool irq_work_queue(struct irq_work *work)
70 /* Queue the entry and raise the IPI if needed. */ 93 /* Queue the entry and raise the IPI if needed. */
71 preempt_disable(); 94 preempt_disable();
72 95
73 llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); 96 /* If the work is "lazy", handle it from next tick if any */
74 97 if (work->flags & IRQ_WORK_LAZY) {
75 /* 98 if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) &&
76 * If the work is not "lazy" or the tick is stopped, raise the irq 99 tick_nohz_tick_stopped())
77 * work interrupt (if supported by the arch), otherwise, just wait 100 arch_irq_work_raise();
78 * for the next tick. 101 } else {
79 */ 102 if (llist_add(&work->llnode, &__get_cpu_var(raised_list)))
80 if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
81 if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
82 arch_irq_work_raise(); 103 arch_irq_work_raise();
83 } 104 }
84 105
@@ -90,10 +111,11 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
90 111
91bool irq_work_needs_cpu(void) 112bool irq_work_needs_cpu(void)
92{ 113{
93 struct llist_head *this_list; 114 struct llist_head *raised, *lazy;
94 115
95 this_list = &__get_cpu_var(irq_work_list); 116 raised = &__get_cpu_var(raised_list);
96 if (llist_empty(this_list)) 117 lazy = &__get_cpu_var(lazy_list);
118 if (llist_empty(raised) && llist_empty(lazy))
97 return false; 119 return false;
98 120
99 /* All work should have been flushed before going offline */ 121 /* All work should have been flushed before going offline */
@@ -102,28 +124,18 @@ bool irq_work_needs_cpu(void)
102 return true; 124 return true;
103} 125}
104 126
105static void __irq_work_run(void) 127static void irq_work_run_list(struct llist_head *list)
106{ 128{
107 unsigned long flags; 129 unsigned long flags;
108 struct irq_work *work; 130 struct irq_work *work;
109 struct llist_head *this_list;
110 struct llist_node *llnode; 131 struct llist_node *llnode;
111 132
133 BUG_ON(!irqs_disabled());
112 134
113 /* 135 if (llist_empty(list))
114 * Reset the "raised" state right before we check the list because
115 * an NMI may enqueue after we find the list empty from the runner.
116 */
117 __this_cpu_write(irq_work_raised, 0);
118 barrier();
119
120 this_list = &__get_cpu_var(irq_work_list);
121 if (llist_empty(this_list))
122 return; 136 return;
123 137
124 BUG_ON(!irqs_disabled()); 138 llnode = llist_del_all(list);
125
126 llnode = llist_del_all(this_list);
127 while (llnode != NULL) { 139 while (llnode != NULL) {
128 work = llist_entry(llnode, struct irq_work, llnode); 140 work = llist_entry(llnode, struct irq_work, llnode);
129 141
@@ -149,13 +161,13 @@ static void __irq_work_run(void)
149} 161}
150 162
151/* 163/*
152 * Run the irq_work entries on this cpu. Requires to be ran from hardirq 164 * hotplug calls this through:
153 * context with local IRQs disabled. 165 * hotplug_cfd() -> flush_smp_call_function_queue()
154 */ 166 */
155void irq_work_run(void) 167void irq_work_run(void)
156{ 168{
157 BUG_ON(!in_irq()); 169 irq_work_run_list(&__get_cpu_var(raised_list));
158 __irq_work_run(); 170 irq_work_run_list(&__get_cpu_var(lazy_list));
159} 171}
160EXPORT_SYMBOL_GPL(irq_work_run); 172EXPORT_SYMBOL_GPL(irq_work_run);
161 173
@@ -171,35 +183,3 @@ void irq_work_sync(struct irq_work *work)
171 cpu_relax(); 183 cpu_relax();
172} 184}
173EXPORT_SYMBOL_GPL(irq_work_sync); 185EXPORT_SYMBOL_GPL(irq_work_sync);
174
175#ifdef CONFIG_HOTPLUG_CPU
176static int irq_work_cpu_notify(struct notifier_block *self,
177 unsigned long action, void *hcpu)
178{
179 long cpu = (long)hcpu;
180
181 switch (action) {
182 case CPU_DYING:
183 /* Called from stop_machine */
184 if (WARN_ON_ONCE(cpu != smp_processor_id()))
185 break;
186 __irq_work_run();
187 break;
188 default:
189 break;
190 }
191 return NOTIFY_OK;
192}
193
194static struct notifier_block cpu_notify;
195
196static __init int irq_work_init_cpu_notifier(void)
197{
198 cpu_notify.notifier_call = irq_work_cpu_notify;
199 cpu_notify.priority = 0;
200 register_cpu_notifier(&cpu_notify);
201 return 0;
202}
203device_initcall(irq_work_init_cpu_notifier);
204
205#endif /* CONFIG_HOTPLUG_CPU */
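
The raised_list/lazy_list split above is also what gives IRQ_WORK_LAZY its meaning: lazy entries are only flushed from the next timer tick (or raise the IPI right away when the tick is stopped), while regular entries always raise the irq_work IPI. A hedged sketch of declaring such a lazy entry, assuming the struct irq_work field layout in include/linux/irq_work.h:

    #include <linux/irq_work.h>
    #include <linux/printk.h>

    static void lazy_fn(struct irq_work *work)
    {
            pr_info("deferred to the next tick\n");
    }

    /* Statically initialized lazy entry: goes to lazy_list, no IPI is raised. */
    static struct irq_work lazy_work = {
            .flags = IRQ_WORK_LAZY,
            .func  = lazy_fn,
    };

    static void poke(void)
    {
            irq_work_queue(&lazy_work);
    }
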
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 369f41a94124..4b8f0c925884 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
33#include <linux/swap.h> 33#include <linux/swap.h>
34#include <linux/syscore_ops.h> 34#include <linux/syscore_ops.h>
35#include <linux/compiler.h> 35#include <linux/compiler.h>
36#include <linux/hugetlb.h>
36 37
37#include <asm/page.h> 38#include <asm/page.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
@@ -1619,6 +1620,9 @@ static int __init crash_save_vmcoreinfo_init(void)
1619#endif 1620#endif
1620 VMCOREINFO_NUMBER(PG_head_mask); 1621 VMCOREINFO_NUMBER(PG_head_mask);
1621 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); 1622 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1623#ifdef CONFIG_HUGETLBFS
1624 VMCOREINFO_SYMBOL(free_huge_page);
1625#endif
1622 1626
1623 arch_crash_save_vmcoreinfo(); 1627 arch_crash_save_vmcoreinfo();
1624 update_vmcoreinfo_note(); 1628 update_vmcoreinfo_note();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3214289df5a7..734e9a7d280b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2037,19 +2037,23 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
2037{ 2037{
2038 unsigned long *iter; 2038 unsigned long *iter;
2039 struct kprobe_blacklist_entry *ent; 2039 struct kprobe_blacklist_entry *ent;
2040 unsigned long offset = 0, size = 0; 2040 unsigned long entry, offset = 0, size = 0;
2041 2041
2042 for (iter = start; iter < end; iter++) { 2042 for (iter = start; iter < end; iter++) {
2043 if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) { 2043 entry = arch_deref_entry_point((void *)*iter);
2044 pr_err("Failed to find blacklist %p\n", (void *)*iter); 2044
2045 if (!kernel_text_address(entry) ||
2046 !kallsyms_lookup_size_offset(entry, &size, &offset)) {
2047 pr_err("Failed to find blacklist at %p\n",
2048 (void *)entry);
2045 continue; 2049 continue;
2046 } 2050 }
2047 2051
2048 ent = kmalloc(sizeof(*ent), GFP_KERNEL); 2052 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
2049 if (!ent) 2053 if (!ent)
2050 return -ENOMEM; 2054 return -ENOMEM;
2051 ent->start_addr = *iter; 2055 ent->start_addr = entry;
2052 ent->end_addr = *iter + size; 2056 ent->end_addr = entry + size;
2053 INIT_LIST_HEAD(&ent->list); 2057 INIT_LIST_HEAD(&ent->list);
2054 list_add_tail(&ent->list, &kprobe_blacklist); 2058 list_add_tail(&ent->list, &kprobe_blacklist);
2055 } 2059 }
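
The arch_deref_entry_point() call added above matters on architectures with function descriptors (ia64, 64-bit PowerPC ABIv1), where a blacklist entry points at a descriptor rather than at kernel text; dereferencing it first is what makes the kernel_text_address() and kallsyms lookups meaningful. The generic fallback is, if memory serves, a weak identity helper along these lines (a sketch; kernel/kprobes.c has the authoritative version):

    /*
     * Default (weak) implementation: on most architectures the recorded
     * entry already is a text address, so hand it back unchanged.
     * Descriptor-based architectures override this and return the real
     * function address stored in the descriptor.
     */
    void __weak *arch_deref_entry_point(void *entry)
    {
            return entry;
    }
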
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c2390f41307b..ef483220e855 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -591,7 +591,7 @@ static void insert_kthread_work(struct kthread_worker *worker,
591 591
592 list_add_tail(&work->node, pos); 592 list_add_tail(&work->node, pos);
593 work->worker = worker; 593 work->worker = worker;
594 if (likely(worker->task)) 594 if (!worker->current_work && likely(worker->task))
595 wake_up_process(worker->task); 595 wake_up_process(worker->task);
596} 596}
597 597
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index d24e4339b46d..88d0d4420ad2 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -384,7 +384,9 @@ static void print_lockdep_off(const char *bug_msg)
384{ 384{
385 printk(KERN_DEBUG "%s\n", bug_msg); 385 printk(KERN_DEBUG "%s\n", bug_msg);
386 printk(KERN_DEBUG "turning off the locking correctness validator.\n"); 386 printk(KERN_DEBUG "turning off the locking correctness validator.\n");
387#ifdef CONFIG_LOCK_STAT
387 printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); 388 printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
389#endif
388} 390}
389 391
390static int save_trace(struct stack_trace *trace) 392static int save_trace(struct stack_trace *trace)
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index be9ee1559fca..9887a905a762 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -1,6 +1,4 @@
1
2#include <linux/percpu.h> 1#include <linux/percpu.h>
3#include <linux/mutex.h>
4#include <linux/sched.h> 2#include <linux/sched.h>
5#include "mcs_spinlock.h" 3#include "mcs_spinlock.h"
6 4
@@ -79,7 +77,7 @@ osq_wait_next(struct optimistic_spin_queue *lock,
79 break; 77 break;
80 } 78 }
81 79
82 arch_mutex_cpu_relax(); 80 cpu_relax_lowlatency();
83 } 81 }
84 82
85 return next; 83 return next;
@@ -120,7 +118,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
120 if (need_resched()) 118 if (need_resched())
121 goto unqueue; 119 goto unqueue;
122 120
123 arch_mutex_cpu_relax(); 121 cpu_relax_lowlatency();
124 } 122 }
125 return true; 123 return true;
126 124
@@ -146,7 +144,7 @@ unqueue:
146 if (smp_load_acquire(&node->locked)) 144 if (smp_load_acquire(&node->locked))
147 return true; 145 return true;
148 146
149 arch_mutex_cpu_relax(); 147 cpu_relax_lowlatency();
150 148
151 /* 149 /*
152 * Or we race against a concurrent unqueue()'s step-B, in which 150 * Or we race against a concurrent unqueue()'s step-B, in which
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 74356dc0ce29..23e89c5930e9 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -27,7 +27,7 @@ struct mcs_spinlock {
27#define arch_mcs_spin_lock_contended(l) \ 27#define arch_mcs_spin_lock_contended(l) \
28do { \ 28do { \
29 while (!(smp_load_acquire(l))) \ 29 while (!(smp_load_acquire(l))) \
30 arch_mutex_cpu_relax(); \ 30 cpu_relax_lowlatency(); \
31} while (0) 31} while (0)
32#endif 32#endif
33 33
@@ -104,7 +104,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
104 return; 104 return;
105 /* Wait until the next pointer is set */ 105 /* Wait until the next pointer is set */
106 while (!(next = ACCESS_ONCE(node->next))) 106 while (!(next = ACCESS_ONCE(node->next)))
107 arch_mutex_cpu_relax(); 107 cpu_relax_lowlatency();
108 } 108 }
109 109
110 /* Pass lock to next waiter. */ 110 /* Pass lock to next waiter. */
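
cpu_relax_lowlatency() replaces arch_mutex_cpu_relax() throughout the locking code. On most architectures it is expected to be a plain alias for cpu_relax(); s390 is the outlier, where cpu_relax() can yield to the hypervisor and is therefore too heavy for these tight spin loops. The presumed per-architecture definitions look roughly like this (each asm/processor.h has the real ones):

    /* Most architectures: */
    #define cpu_relax_lowlatency()  cpu_relax()

    /* s390, where cpu_relax() may yield to the hypervisor: */
    #define cpu_relax_lowlatency()  barrier()
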
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index acca2c1a3c5e..ae712b25e492 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -46,12 +46,6 @@
46# include <asm/mutex.h> 46# include <asm/mutex.h>
47#endif 47#endif
48 48
49/*
50 * A negative mutex count indicates that waiters are sleeping waiting for the
51 * mutex.
52 */
53#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
54
55void 49void
56__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) 50__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
57{ 51{
@@ -152,7 +146,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
152 if (need_resched()) 146 if (need_resched())
153 break; 147 break;
154 148
155 arch_mutex_cpu_relax(); 149 cpu_relax_lowlatency();
156 } 150 }
157 rcu_read_unlock(); 151 rcu_read_unlock();
158 152
@@ -388,12 +382,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
388 /* 382 /*
389 * Optimistic spinning. 383 * Optimistic spinning.
390 * 384 *
391 * We try to spin for acquisition when we find that there are no 385 * We try to spin for acquisition when we find that the lock owner
392 * pending waiters and the lock owner is currently running on a 386 * is currently running on a (different) CPU and while we don't
393 * (different) CPU. 387 * need to reschedule. The rationale is that if the lock owner is
394 * 388 * running, it is likely to release the lock soon.
395 * The rationale is that if the lock owner is running, it is likely to
396 * release the lock soon.
397 * 389 *
398 * Since this needs the lock owner, and this mutex implementation 390 * Since this needs the lock owner, and this mutex implementation
399 * doesn't track the owner atomically in the lock field, we need to 391 * doesn't track the owner atomically in the lock field, we need to
@@ -440,7 +432,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
440 if (owner && !mutex_spin_on_owner(lock, owner)) 432 if (owner && !mutex_spin_on_owner(lock, owner))
441 break; 433 break;
442 434
443 if ((atomic_read(&lock->count) == 1) && 435 /* Try to acquire the mutex if it is unlocked. */
436 if (!mutex_is_locked(lock) &&
444 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 437 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
445 lock_acquired(&lock->dep_map, ip); 438 lock_acquired(&lock->dep_map, ip);
446 if (use_ww_ctx) { 439 if (use_ww_ctx) {
@@ -471,7 +464,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
471 * memory barriers as we'll eventually observe the right 464 * memory barriers as we'll eventually observe the right
472 * values at the cost of a few extra spins. 465 * values at the cost of a few extra spins.
473 */ 466 */
474 arch_mutex_cpu_relax(); 467 cpu_relax_lowlatency();
475 } 468 }
476 osq_unlock(&lock->osq); 469 osq_unlock(&lock->osq);
477slowpath: 470slowpath:
@@ -485,8 +478,11 @@ slowpath:
485#endif 478#endif
486 spin_lock_mutex(&lock->wait_lock, flags); 479 spin_lock_mutex(&lock->wait_lock, flags);
487 480
488 /* once more, can we acquire the lock? */ 481 /*
489 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) 482 * Once more, try to acquire the lock. Only try-lock the mutex if
483 * it is unlocked to reduce unnecessary xchg() operations.
484 */
485 if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1))
490 goto skip_wait; 486 goto skip_wait;
491 487
492 debug_mutex_lock_common(lock, &waiter); 488 debug_mutex_lock_common(lock, &waiter);
@@ -506,9 +502,10 @@ slowpath:
506 * it's unlocked. Later on, if we sleep, this is the 502 * it's unlocked. Later on, if we sleep, this is the
507 * operation that gives us the lock. We xchg it to -1, so 503 * operation that gives us the lock. We xchg it to -1, so
508 * that when we release the lock, we properly wake up the 504 * that when we release the lock, we properly wake up the
509 * other waiters: 505 * other waiters. We only attempt the xchg if the count is
506 * non-negative in order to avoid unnecessary xchg operations:
510 */ 507 */
511 if (MUTEX_SHOW_NO_WAITER(lock) && 508 if (atomic_read(&lock->count) >= 0 &&
512 (atomic_xchg(&lock->count, -1) == 1)) 509 (atomic_xchg(&lock->count, -1) == 1))
513 break; 510 break;
514 511
@@ -823,6 +820,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
823 unsigned long flags; 820 unsigned long flags;
824 int prev; 821 int prev;
825 822
823 /* No need to trylock if the mutex is locked. */
824 if (mutex_is_locked(lock))
825 return 0;
826
826 spin_lock_mutex(&lock->wait_lock, flags); 827 spin_lock_mutex(&lock->wait_lock, flags);
827 828
828 prev = atomic_xchg(&lock->count, -1); 829 prev = atomic_xchg(&lock->count, -1);
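
Both mutex hunks above apply the same "check before cmpxchg/xchg" idea: a plain read of the count keeps the cacheline in a shared state and only escalates to an exclusive, atomic access when the lock actually looks free, which stops contended spinners from bouncing the line. A generic illustration of the pattern outside the mutex code (hypothetical, user-space C11):

    #include <stdatomic.h>
    #include <stdbool.h>

    /* 1 = unlocked, 0 = locked, mirroring the mutex count convention. */
    static bool try_take(atomic_int *count)
    {
            /* Cheap shared read first: no exclusive cacheline traffic. */
            if (atomic_load_explicit(count, memory_order_relaxed) != 1)
                    return false;

            /* Only now pay for the read-modify-write. */
            int expected = 1;
            return atomic_compare_exchange_strong(count, &expected, 0);
    }
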
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fb5b8ac411a5..f956ede7f90d 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -20,7 +20,6 @@
20#include <linux/cpumask.h> 20#include <linux/cpumask.h>
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/mutex.h>
24#include <asm/qrwlock.h> 23#include <asm/qrwlock.h>
25 24
26/** 25/**
@@ -35,7 +34,7 @@ static __always_inline void
35rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) 34rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
36{ 35{
37 while ((cnts & _QW_WMASK) == _QW_LOCKED) { 36 while ((cnts & _QW_WMASK) == _QW_LOCKED) {
38 arch_mutex_cpu_relax(); 37 cpu_relax_lowlatency();
39 cnts = smp_load_acquire((u32 *)&lock->cnts); 38 cnts = smp_load_acquire((u32 *)&lock->cnts);
40 } 39 }
41} 40}
@@ -75,7 +74,7 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
75 * to make sure that the write lock isn't taken. 74 * to make sure that the write lock isn't taken.
76 */ 75 */
77 while (atomic_read(&lock->cnts) & _QW_WMASK) 76 while (atomic_read(&lock->cnts) & _QW_WMASK)
78 arch_mutex_cpu_relax(); 77 cpu_relax_lowlatency();
79 78
80 cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; 79 cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
81 rspin_until_writer_unlock(lock, cnts); 80 rspin_until_writer_unlock(lock, cnts);
@@ -114,7 +113,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
114 cnts | _QW_WAITING) == cnts)) 113 cnts | _QW_WAITING) == cnts))
115 break; 114 break;
116 115
117 arch_mutex_cpu_relax(); 116 cpu_relax_lowlatency();
118 } 117 }
119 118
120 /* When no more readers, set the locked flag */ 119 /* When no more readers, set the locked flag */
@@ -125,7 +124,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
125 _QW_LOCKED) == _QW_WAITING)) 124 _QW_LOCKED) == _QW_WAITING))
126 break; 125 break;
127 126
128 arch_mutex_cpu_relax(); 127 cpu_relax_lowlatency();
129 } 128 }
130unlock: 129unlock:
131 arch_spin_unlock(&lock->lock); 130 arch_spin_unlock(&lock->lock);
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 49b2ed3dced8..62b6cee8ea7f 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -66,12 +66,13 @@ void rt_mutex_debug_task_free(struct task_struct *task)
66 * the deadlock. We print when we return. act_waiter can be NULL in 66 * the deadlock. We print when we return. act_waiter can be NULL in
67 * case of a remove waiter operation. 67 * case of a remove waiter operation.
68 */ 68 */
69void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, 69void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
70 struct rt_mutex_waiter *act_waiter,
70 struct rt_mutex *lock) 71 struct rt_mutex *lock)
71{ 72{
72 struct task_struct *task; 73 struct task_struct *task;
73 74
74 if (!debug_locks || detect || !act_waiter) 75 if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
75 return; 76 return;
76 77
77 task = rt_mutex_owner(act_waiter->lock); 78 task = rt_mutex_owner(act_waiter->lock);
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index ab29b6a22669..d0519c3432b6 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -20,14 +20,15 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
20extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, 20extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
21 struct task_struct *powner); 21 struct task_struct *powner);
22extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); 22extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
23extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, 23extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
24 struct rt_mutex_waiter *waiter,
24 struct rt_mutex *lock); 25 struct rt_mutex *lock);
25extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); 26extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
26# define debug_rt_mutex_reset_waiter(w) \ 27# define debug_rt_mutex_reset_waiter(w) \
27 do { (w)->deadlock_lock = NULL; } while (0) 28 do { (w)->deadlock_lock = NULL; } while (0)
28 29
29static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, 30static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
30 int detect) 31 enum rtmutex_chainwalk walk)
31{ 32{
32 return (waiter != NULL); 33 return (waiter != NULL);
33} 34}
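
The chwalk argument replaces the old int detect flag across these debug hooks. The enum itself is introduced elsewhere in this series (in rtmutex_common.h); it presumably looks like the following, with MIN meaning "stop as soon as no further PI adjustment is needed" and FULL meaning "walk the whole chain for deadlock detection":

    enum rtmutex_chainwalk {
            RT_MUTEX_MIN_CHAINWALK,
            RT_MUTEX_FULL_CHAINWALK,
    };
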
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index fc605941b9b8..a0ea2a141b3b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -308,6 +308,32 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
308} 308}
309 309
310/* 310/*
311 * Deadlock detection is conditional:
312 *
313 * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
314 * if the detect argument is == RT_MUTEX_FULL_CHAINWALK.
315 *
316 * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always
317 * conducted independent of the detect argument.
318 *
319 * If the waiter argument is NULL this indicates the deboost path and
320 * deadlock detection is disabled independent of the detect argument
321 * and the config settings.
322 */
323static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
324 enum rtmutex_chainwalk chwalk)
325{
326 /*
327 * This is just a wrapper function for the following call,
328 * because debug_rt_mutex_detect_deadlock() smells like a magic
329 * debug feature and I wanted to keep the cond function in the
330 * main source file along with the comments instead of having
331 * two of the same in the headers.
332 */
333 return debug_rt_mutex_detect_deadlock(waiter, chwalk);
334}
335
336/*
311 * Max number of times we'll walk the boosting chain: 337 * Max number of times we'll walk the boosting chain:
312 */ 338 */
313int max_lock_depth = 1024; 339int max_lock_depth = 1024;
@@ -337,21 +363,65 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
337 * @top_task: the current top waiter 363 * @top_task: the current top waiter
338 * 364 *
339 * Returns 0 or -EDEADLK. 365 * Returns 0 or -EDEADLK.
366 *
367 * Chain walk basics and protection scope
368 *
369 * [R] refcount on task
370 * [P] task->pi_lock held
371 * [L] rtmutex->wait_lock held
372 *
373 * Step Description Protected by
374 * function arguments:
375 * @task [R]
376 * @orig_lock if != NULL @top_task is blocked on it
377 * @next_lock Unprotected. Cannot be
378 * dereferenced. Only used for
379 * comparison.
380 * @orig_waiter if != NULL @top_task is blocked on it
381 * @top_task current, or in case of proxy
382 * locking protected by calling
383 * code
384 * again:
385 * loop_sanity_check();
386 * retry:
387 * [1] lock(task->pi_lock); [R] acquire [P]
388 * [2] waiter = task->pi_blocked_on; [P]
389 * [3] check_exit_conditions_1(); [P]
390 * [4] lock = waiter->lock; [P]
391 * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
392 * unlock(task->pi_lock); release [P]
393 * goto retry;
394 * }
395 * [6] check_exit_conditions_2(); [P] + [L]
396 * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
397 * [8] unlock(task->pi_lock); release [P]
398 * put_task_struct(task); release [R]
399 * [9] check_exit_conditions_3(); [L]
400 * [10] task = owner(lock); [L]
401 * get_task_struct(task); [L] acquire [R]
402 * lock(task->pi_lock); [L] acquire [P]
403 * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
404 * [12] check_exit_conditions_4(); [P] + [L]
405 * [13] unlock(task->pi_lock); release [P]
406 * unlock(lock->wait_lock); release [L]
407 * goto again;
340 */ 408 */
341static int rt_mutex_adjust_prio_chain(struct task_struct *task, 409static int rt_mutex_adjust_prio_chain(struct task_struct *task,
342 int deadlock_detect, 410 enum rtmutex_chainwalk chwalk,
343 struct rt_mutex *orig_lock, 411 struct rt_mutex *orig_lock,
344 struct rt_mutex *next_lock, 412 struct rt_mutex *next_lock,
345 struct rt_mutex_waiter *orig_waiter, 413 struct rt_mutex_waiter *orig_waiter,
346 struct task_struct *top_task) 414 struct task_struct *top_task)
347{ 415{
348 struct rt_mutex *lock;
349 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; 416 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
350 int detect_deadlock, ret = 0, depth = 0; 417 struct rt_mutex_waiter *prerequeue_top_waiter;
418 int ret = 0, depth = 0;
419 struct rt_mutex *lock;
420 bool detect_deadlock;
351 unsigned long flags; 421 unsigned long flags;
422 bool requeue = true;
352 423
353 detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, 424 detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
354 deadlock_detect);
355 425
356 /* 426 /*
357 * The (de)boosting is a step by step approach with a lot of 427 * The (de)boosting is a step by step approach with a lot of
@@ -360,6 +430,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
360 * carefully whether things change under us. 430 * carefully whether things change under us.
361 */ 431 */
362 again: 432 again:
433 /*
434 * We limit the lock chain length for each invocation.
435 */
363 if (++depth > max_lock_depth) { 436 if (++depth > max_lock_depth) {
364 static int prev_max; 437 static int prev_max;
365 438
@@ -377,13 +450,28 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
377 450
378 return -EDEADLK; 451 return -EDEADLK;
379 } 452 }
453
454 /*
455 * We are fully preemptible here and only hold the refcount on
456 * @task. So everything can have changed under us since the
457 * caller or our own code below (goto retry/again) dropped all
458 * locks.
459 */
380 retry: 460 retry:
381 /* 461 /*
382 * Task can not go away as we did a get_task() before ! 462 * [1] Task cannot go away as we did a get_task() before !
383 */ 463 */
384 raw_spin_lock_irqsave(&task->pi_lock, flags); 464 raw_spin_lock_irqsave(&task->pi_lock, flags);
385 465
466 /*
467 * [2] Get the waiter on which @task is blocked on.
468 */
386 waiter = task->pi_blocked_on; 469 waiter = task->pi_blocked_on;
470
471 /*
472 * [3] check_exit_conditions_1() protected by task->pi_lock.
473 */
474
387 /* 475 /*
388 * Check whether the end of the boosting chain has been 476 * Check whether the end of the boosting chain has been
389 * reached or the state of the chain has changed while we 477 * reached or the state of the chain has changed while we
@@ -421,20 +509,41 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
421 goto out_unlock_pi; 509 goto out_unlock_pi;
422 /* 510 /*
423 * If deadlock detection is off, we stop here if we 511 * If deadlock detection is off, we stop here if we
424 * are not the top pi waiter of the task. 512 * are not the top pi waiter of the task. If deadlock
513 * detection is enabled we continue, but stop the
514 * requeueing in the chain walk.
425 */ 515 */
426 if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) 516 if (top_waiter != task_top_pi_waiter(task)) {
427 goto out_unlock_pi; 517 if (!detect_deadlock)
518 goto out_unlock_pi;
519 else
520 requeue = false;
521 }
428 } 522 }
429 523
430 /* 524 /*
431 * When deadlock detection is off then we check, if further 525 * If the waiter priority is the same as the task priority
432 * priority adjustment is necessary. 526 * then there is no further priority adjustment necessary. If
527 * deadlock detection is off, we stop the chain walk. If its
528 * enabled we continue, but stop the requeueing in the chain
529 * walk.
433 */ 530 */
434 if (!detect_deadlock && waiter->prio == task->prio) 531 if (waiter->prio == task->prio) {
435 goto out_unlock_pi; 532 if (!detect_deadlock)
533 goto out_unlock_pi;
534 else
535 requeue = false;
536 }
436 537
538 /*
539 * [4] Get the next lock
540 */
437 lock = waiter->lock; 541 lock = waiter->lock;
542 /*
543 * [5] We need to trylock here as we are holding task->pi_lock,
544 * which is the reverse lock order versus the other rtmutex
545 * operations.
546 */
438 if (!raw_spin_trylock(&lock->wait_lock)) { 547 if (!raw_spin_trylock(&lock->wait_lock)) {
439 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 548 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
440 cpu_relax(); 549 cpu_relax();
@@ -442,79 +551,180 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
442 } 551 }
443 552
444 /* 553 /*
554 * [6] check_exit_conditions_2() protected by task->pi_lock and
555 * lock->wait_lock.
556 *
445 * Deadlock detection. If the lock is the same as the original 557 * Deadlock detection. If the lock is the same as the original
446 * lock which caused us to walk the lock chain or if the 558 * lock which caused us to walk the lock chain or if the
447 * current lock is owned by the task which initiated the chain 559 * current lock is owned by the task which initiated the chain
448 * walk, we detected a deadlock. 560 * walk, we detected a deadlock.
449 */ 561 */
450 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 562 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
451 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 563 debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
452 raw_spin_unlock(&lock->wait_lock); 564 raw_spin_unlock(&lock->wait_lock);
453 ret = -EDEADLK; 565 ret = -EDEADLK;
454 goto out_unlock_pi; 566 goto out_unlock_pi;
455 } 567 }
456 568
457 top_waiter = rt_mutex_top_waiter(lock); 569 /*
570 * If we just follow the lock chain for deadlock detection, no
571 * need to do all the requeue operations. To avoid a truckload
572 * of conditionals around the various places below, just do the
573 * minimum chain walk checks.
574 */
575 if (!requeue) {
576 /*
577 * No requeue[7] here. Just release @task [8]
578 */
579 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
580 put_task_struct(task);
581
582 /*
583 * [9] check_exit_conditions_3 protected by lock->wait_lock.
584 * If there is no owner of the lock, end of chain.
585 */
586 if (!rt_mutex_owner(lock)) {
587 raw_spin_unlock(&lock->wait_lock);
588 return 0;
589 }
590
591 /* [10] Grab the next task, i.e. owner of @lock */
592 task = rt_mutex_owner(lock);
593 get_task_struct(task);
594 raw_spin_lock_irqsave(&task->pi_lock, flags);
595
596 /*
597 * No requeue [11] here. We just do deadlock detection.
598 *
599 * [12] Store whether owner is blocked
600 * itself. Decision is made after dropping the locks
601 */
602 next_lock = task_blocked_on_lock(task);
603 /*
604 * Get the top waiter for the next iteration
605 */
606 top_waiter = rt_mutex_top_waiter(lock);
607
608 /* [13] Drop locks */
609 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
610 raw_spin_unlock(&lock->wait_lock);
611
612 /* If owner is not blocked, end of chain. */
613 if (!next_lock)
614 goto out_put_task;
615 goto again;
616 }
458 617
459 /* Requeue the waiter */ 618 /*
619 * Store the current top waiter before doing the requeue
620 * operation on @lock. We need it for the boost/deboost
621 * decision below.
622 */
623 prerequeue_top_waiter = rt_mutex_top_waiter(lock);
624
625 /* [7] Requeue the waiter in the lock waiter list. */
460 rt_mutex_dequeue(lock, waiter); 626 rt_mutex_dequeue(lock, waiter);
461 waiter->prio = task->prio; 627 waiter->prio = task->prio;
462 rt_mutex_enqueue(lock, waiter); 628 rt_mutex_enqueue(lock, waiter);
463 629
464 /* Release the task */ 630 /* [8] Release the task */
465 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 631 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
632 put_task_struct(task);
633
634 /*
635 * [9] check_exit_conditions_3 protected by lock->wait_lock.
636 *
637 * We must abort the chain walk if there is no lock owner even
                                                                 638                  * in the deadlock detection case, as we have nothing to
639 * follow here. This is the end of the chain we are walking.
640 */
466 if (!rt_mutex_owner(lock)) { 641 if (!rt_mutex_owner(lock)) {
467 /* 642 /*
468 * If the requeue above changed the top waiter, then we need 643 * If the requeue [7] above changed the top waiter,
469 * to wake the new top waiter up to try to get the lock. 644 * then we need to wake the new top waiter up to try
645 * to get the lock.
470 */ 646 */
471 647 if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
472 if (top_waiter != rt_mutex_top_waiter(lock))
473 wake_up_process(rt_mutex_top_waiter(lock)->task); 648 wake_up_process(rt_mutex_top_waiter(lock)->task);
474 raw_spin_unlock(&lock->wait_lock); 649 raw_spin_unlock(&lock->wait_lock);
475 goto out_put_task; 650 return 0;
476 } 651 }
477 put_task_struct(task);
478 652
479 /* Grab the next task */ 653 /* [10] Grab the next task, i.e. the owner of @lock */
480 task = rt_mutex_owner(lock); 654 task = rt_mutex_owner(lock);
481 get_task_struct(task); 655 get_task_struct(task);
482 raw_spin_lock_irqsave(&task->pi_lock, flags); 656 raw_spin_lock_irqsave(&task->pi_lock, flags);
483 657
658 /* [11] requeue the pi waiters if necessary */
484 if (waiter == rt_mutex_top_waiter(lock)) { 659 if (waiter == rt_mutex_top_waiter(lock)) {
485 /* Boost the owner */ 660 /*
486 rt_mutex_dequeue_pi(task, top_waiter); 661 * The waiter became the new top (highest priority)
662 * waiter on the lock. Replace the previous top waiter
                                                                 663                  * in the owner task's pi waiters list with this waiter
664 * and adjust the priority of the owner.
665 */
666 rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
487 rt_mutex_enqueue_pi(task, waiter); 667 rt_mutex_enqueue_pi(task, waiter);
488 __rt_mutex_adjust_prio(task); 668 __rt_mutex_adjust_prio(task);
489 669
490 } else if (top_waiter == waiter) { 670 } else if (prerequeue_top_waiter == waiter) {
491 /* Deboost the owner */ 671 /*
672 * The waiter was the top waiter on the lock, but is
                                                                 673                  * no longer the top priority waiter. Replace waiter in
                                                                 674                  * the owner task's pi waiters list with the new top
675 * (highest priority) waiter and adjust the priority
676 * of the owner.
677 * The new top waiter is stored in @waiter so that
678 * @waiter == @top_waiter evaluates to true below and
679 * we continue to deboost the rest of the chain.
680 */
492 rt_mutex_dequeue_pi(task, waiter); 681 rt_mutex_dequeue_pi(task, waiter);
493 waiter = rt_mutex_top_waiter(lock); 682 waiter = rt_mutex_top_waiter(lock);
494 rt_mutex_enqueue_pi(task, waiter); 683 rt_mutex_enqueue_pi(task, waiter);
495 __rt_mutex_adjust_prio(task); 684 __rt_mutex_adjust_prio(task);
685 } else {
686 /*
687 * Nothing changed. No need to do any priority
688 * adjustment.
689 */
496 } 690 }
497 691
498 /* 692 /*
693 * [12] check_exit_conditions_4() protected by task->pi_lock
694 * and lock->wait_lock. The actual decisions are made after we
695 * dropped the locks.
696 *
499 * Check whether the task which owns the current lock is pi 697 * Check whether the task which owns the current lock is pi
500 * blocked itself. If yes we store a pointer to the lock for 698 * blocked itself. If yes we store a pointer to the lock for
501 * the lock chain change detection above. After we dropped 699 * the lock chain change detection above. After we dropped
502 * task->pi_lock next_lock cannot be dereferenced anymore. 700 * task->pi_lock next_lock cannot be dereferenced anymore.
503 */ 701 */
504 next_lock = task_blocked_on_lock(task); 702 next_lock = task_blocked_on_lock(task);
703 /*
704 * Store the top waiter of @lock for the end of chain walk
705 * decision below.
706 */
707 top_waiter = rt_mutex_top_waiter(lock);
505 708
709 /* [13] Drop the locks */
506 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 710 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
507
508 top_waiter = rt_mutex_top_waiter(lock);
509 raw_spin_unlock(&lock->wait_lock); 711 raw_spin_unlock(&lock->wait_lock);
510 712
511 /* 713 /*
714 * Make the actual exit decisions [12], based on the stored
715 * values.
716 *
512 * We reached the end of the lock chain. Stop right here. No 717 * We reached the end of the lock chain. Stop right here. No
513 * point to go back just to figure that out. 718 * point to go back just to figure that out.
514 */ 719 */
515 if (!next_lock) 720 if (!next_lock)
516 goto out_put_task; 721 goto out_put_task;
517 722
723 /*
724 * If the current waiter is not the top waiter on the lock,
725 * then we can stop the chain walk here if we are not in full
726 * deadlock detection mode.
727 */
518 if (!detect_deadlock && waiter != top_waiter) 728 if (!detect_deadlock && waiter != top_waiter)
519 goto out_put_task; 729 goto out_put_task;
520 730
@@ -533,76 +743,119 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
533 * 743 *
534 * Must be called with lock->wait_lock held. 744 * Must be called with lock->wait_lock held.
535 * 745 *
536 * @lock: the lock to be acquired. 746 * @lock: The lock to be acquired.
537 * @task: the task which wants to acquire the lock 747 * @task: The task which wants to acquire the lock
538 * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) 748 * @waiter: The waiter that is queued to the lock's wait list if the
749 * callsite called task_blocked_on_lock(), otherwise NULL
539 */ 750 */
540static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, 751static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
541 struct rt_mutex_waiter *waiter) 752 struct rt_mutex_waiter *waiter)
542{ 753{
754 unsigned long flags;
755
543 /* 756 /*
544 * We have to be careful here if the atomic speedups are 757 * Before testing whether we can acquire @lock, we set the
545 * enabled, such that, when 758 * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
546 * - no other waiter is on the lock 759 * other tasks which try to modify @lock into the slow path
547 * - the lock has been released since we did the cmpxchg 760 * and they serialize on @lock->wait_lock.
548 * the lock can be released or taken while we are doing the 761 *
549 * checks and marking the lock with RT_MUTEX_HAS_WAITERS. 762 * The RT_MUTEX_HAS_WAITERS bit can have a transitional state
763 * as explained at the top of this file if and only if:
550 * 764 *
551 * The atomic acquire/release aware variant of 765 * - There is a lock owner. The caller must fixup the
552 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting 766 * transient state if it does a trylock or leaves the lock
553 * the WAITERS bit, the atomic release / acquire can not 767 * function due to a signal or timeout.
554 * happen anymore and lock->wait_lock protects us from the
555 * non-atomic case.
556 * 768 *
557 * Note, that this might set lock->owner = 769 * - @task acquires the lock and there are no other
558 * RT_MUTEX_HAS_WAITERS in the case the lock is not contended 770 * waiters. This is undone in rt_mutex_set_owner(@task) at
559 * any more. This is fixed up when we take the ownership. 771 * the end of this function.
560 * This is the transitional state explained at the top of this file.
561 */ 772 */
562 mark_rt_mutex_waiters(lock); 773 mark_rt_mutex_waiters(lock);
563 774
775 /*
776 * If @lock has an owner, give up.
777 */
564 if (rt_mutex_owner(lock)) 778 if (rt_mutex_owner(lock))
565 return 0; 779 return 0;
566 780
567 /* 781 /*
568 * It will get the lock because of one of these conditions: 782 * If @waiter != NULL, @task has already enqueued the waiter
569 * 1) there is no waiter 783 * into @lock waiter list. If @waiter == NULL then this is a
570 * 2) higher priority than waiters 784 * trylock attempt.
571 * 3) it is top waiter
572 */ 785 */
573 if (rt_mutex_has_waiters(lock)) { 786 if (waiter) {
574 if (task->prio >= rt_mutex_top_waiter(lock)->prio) { 787 /*
575 if (!waiter || waiter != rt_mutex_top_waiter(lock)) 788 * If waiter is not the highest priority waiter of
576 return 0; 789 * @lock, give up.
577 } 790 */
578 } 791 if (waiter != rt_mutex_top_waiter(lock))
579 792 return 0;
580 if (waiter || rt_mutex_has_waiters(lock)) {
581 unsigned long flags;
582 struct rt_mutex_waiter *top;
583
584 raw_spin_lock_irqsave(&task->pi_lock, flags);
585 793
586 /* remove the queued waiter. */ 794 /*
587 if (waiter) { 795 * We can acquire the lock. Remove the waiter from the
588 rt_mutex_dequeue(lock, waiter); 796 * lock waiters list.
589 task->pi_blocked_on = NULL; 797 */
590 } 798 rt_mutex_dequeue(lock, waiter);
591 799
800 } else {
592 /* 801 /*
593 * We have to enqueue the top waiter(if it exists) into 802 * If the lock has waiters already we check whether @task is
594 * task->pi_waiters list. 803 * eligible to take over the lock.
804 *
805 * If there are no other waiters, @task can acquire
806 * the lock. @task->pi_blocked_on is NULL, so it does
807 * not need to be dequeued.
595 */ 808 */
596 if (rt_mutex_has_waiters(lock)) { 809 if (rt_mutex_has_waiters(lock)) {
597 top = rt_mutex_top_waiter(lock); 810 /*
598 rt_mutex_enqueue_pi(task, top); 811 * If @task->prio is greater than or equal to
812 * the top waiter priority (kernel view),
813 * @task lost.
814 */
815 if (task->prio >= rt_mutex_top_waiter(lock)->prio)
816 return 0;
817
818 /*
819 * The current top waiter stays enqueued. We
820 * don't have to change anything in the lock
821 * waiters order.
822 */
823 } else {
824 /*
825 * No waiters. Take the lock without the
                                                                 826                          * pi_lock dance. @task->pi_blocked_on is NULL
827 * and we have no waiters to enqueue in @task
828 * pi waiters list.
829 */
830 goto takeit;
599 } 831 }
600 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
601 } 832 }
602 833
834 /*
835 * Clear @task->pi_blocked_on. Requires protection by
836 * @task->pi_lock. Redundant operation for the @waiter == NULL
837 * case, but conditionals are more expensive than a redundant
838 * store.
839 */
840 raw_spin_lock_irqsave(&task->pi_lock, flags);
841 task->pi_blocked_on = NULL;
842 /*
843 * Finish the lock acquisition. @task is the new owner. If
844 * other waiters exist we have to insert the highest priority
845 * waiter into @task->pi_waiters list.
846 */
847 if (rt_mutex_has_waiters(lock))
848 rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
849 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
850
851takeit:
603 /* We got the lock. */ 852 /* We got the lock. */
604 debug_rt_mutex_lock(lock); 853 debug_rt_mutex_lock(lock);
605 854
855 /*
856 * This either preserves the RT_MUTEX_HAS_WAITERS bit if there
857 * are still waiters or clears it.
858 */
606 rt_mutex_set_owner(lock, task); 859 rt_mutex_set_owner(lock, task);
607 860
608 rt_mutex_deadlock_account_lock(lock, task); 861 rt_mutex_deadlock_account_lock(lock, task);
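try_to_take_rt_mutex() now separates the queued-waiter path from the trylock path, and the eligibility rule is easy to state: a queued waiter may take the lock only if it is the top waiter, while a trylock must strictly beat the top waiter's priority. A small stand-alone model of just that test (not kernel code; names are invented):

/* Illustrative model of the eligibility check in try_to_take_rt_mutex():
 *  - with a queued waiter, the task wins only if that waiter is the top
 *    (highest priority) waiter;
 *  - on a trylock, equal priority loses, matching
 *    "task->prio >= rt_mutex_top_waiter(lock)->prio" above. */
#include <stdbool.h>
#include <stdio.h>

static bool may_take_lock(bool waiter_queued, bool waiter_is_top,
                          bool lock_has_waiters, int task_prio, int top_prio)
{
        if (waiter_queued)
                return waiter_is_top;
        if (!lock_has_waiters)
                return true;                    /* nobody else wants it */
        return task_prio < top_prio;            /* strictly higher priority only */
}

int main(void)
{
        /* Trylock against a top waiter of equal priority: must fail. */
        printf("%d\n", may_take_lock(false, false, true, 5, 5));        /* 0 */
        /* Queued waiter that is the top waiter: may take the lock. */
        printf("%d\n", may_take_lock(true, true, true, 5, 5));          /* 1 */
        return 0;
}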
@@ -620,7 +873,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
620static int task_blocks_on_rt_mutex(struct rt_mutex *lock, 873static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
621 struct rt_mutex_waiter *waiter, 874 struct rt_mutex_waiter *waiter,
622 struct task_struct *task, 875 struct task_struct *task,
623 int detect_deadlock) 876 enum rtmutex_chainwalk chwalk)
624{ 877{
625 struct task_struct *owner = rt_mutex_owner(lock); 878 struct task_struct *owner = rt_mutex_owner(lock);
626 struct rt_mutex_waiter *top_waiter = waiter; 879 struct rt_mutex_waiter *top_waiter = waiter;
@@ -666,7 +919,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
666 __rt_mutex_adjust_prio(owner); 919 __rt_mutex_adjust_prio(owner);
667 if (owner->pi_blocked_on) 920 if (owner->pi_blocked_on)
668 chain_walk = 1; 921 chain_walk = 1;
669 } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { 922 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
670 chain_walk = 1; 923 chain_walk = 1;
671 } 924 }
672 925
@@ -691,7 +944,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
691 944
692 raw_spin_unlock(&lock->wait_lock); 945 raw_spin_unlock(&lock->wait_lock);
693 946
694 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, 947 res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
695 next_lock, waiter, task); 948 next_lock, waiter, task);
696 949
697 raw_spin_lock(&lock->wait_lock); 950 raw_spin_lock(&lock->wait_lock);
@@ -753,9 +1006,9 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
753static void remove_waiter(struct rt_mutex *lock, 1006static void remove_waiter(struct rt_mutex *lock,
754 struct rt_mutex_waiter *waiter) 1007 struct rt_mutex_waiter *waiter)
755{ 1008{
756 int first = (waiter == rt_mutex_top_waiter(lock)); 1009 bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
757 struct task_struct *owner = rt_mutex_owner(lock); 1010 struct task_struct *owner = rt_mutex_owner(lock);
758 struct rt_mutex *next_lock = NULL; 1011 struct rt_mutex *next_lock;
759 unsigned long flags; 1012 unsigned long flags;
760 1013
761 raw_spin_lock_irqsave(&current->pi_lock, flags); 1014 raw_spin_lock_irqsave(&current->pi_lock, flags);
@@ -763,29 +1016,31 @@ static void remove_waiter(struct rt_mutex *lock,
763 current->pi_blocked_on = NULL; 1016 current->pi_blocked_on = NULL;
764 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 1017 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
765 1018
766 if (!owner) 1019 /*
1020 * Only update priority if the waiter was the highest priority
1021 * waiter of the lock and there is an owner to update.
1022 */
1023 if (!owner || !is_top_waiter)
767 return; 1024 return;
768 1025
769 if (first) { 1026 raw_spin_lock_irqsave(&owner->pi_lock, flags);
770
771 raw_spin_lock_irqsave(&owner->pi_lock, flags);
772 1027
773 rt_mutex_dequeue_pi(owner, waiter); 1028 rt_mutex_dequeue_pi(owner, waiter);
774 1029
775 if (rt_mutex_has_waiters(lock)) { 1030 if (rt_mutex_has_waiters(lock))
776 struct rt_mutex_waiter *next; 1031 rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
777 1032
778 next = rt_mutex_top_waiter(lock); 1033 __rt_mutex_adjust_prio(owner);
779 rt_mutex_enqueue_pi(owner, next);
780 }
781 __rt_mutex_adjust_prio(owner);
782 1034
783 /* Store the lock on which owner is blocked or NULL */ 1035 /* Store the lock on which owner is blocked or NULL */
784 next_lock = task_blocked_on_lock(owner); 1036 next_lock = task_blocked_on_lock(owner);
785 1037
786 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 1038 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
787 }
788 1039
1040 /*
1041 * Don't walk the chain, if the owner task is not blocked
1042 * itself.
1043 */
789 if (!next_lock) 1044 if (!next_lock)
790 return; 1045 return;
791 1046
@@ -794,7 +1049,8 @@ static void remove_waiter(struct rt_mutex *lock,
794 1049
795 raw_spin_unlock(&lock->wait_lock); 1050 raw_spin_unlock(&lock->wait_lock);
796 1051
797 rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); 1052 rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
1053 next_lock, NULL, current);
798 1054
799 raw_spin_lock(&lock->wait_lock); 1055 raw_spin_lock(&lock->wait_lock);
800} 1056}
@@ -824,7 +1080,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
824 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 1080 /* gets dropped in rt_mutex_adjust_prio_chain()! */
825 get_task_struct(task); 1081 get_task_struct(task);
826 1082
827 rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); 1083 rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
1084 next_lock, NULL, task);
828} 1085}
829 1086
830/** 1087/**
@@ -902,7 +1159,7 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
902static int __sched 1159static int __sched
903rt_mutex_slowlock(struct rt_mutex *lock, int state, 1160rt_mutex_slowlock(struct rt_mutex *lock, int state,
904 struct hrtimer_sleeper *timeout, 1161 struct hrtimer_sleeper *timeout,
905 int detect_deadlock) 1162 enum rtmutex_chainwalk chwalk)
906{ 1163{
907 struct rt_mutex_waiter waiter; 1164 struct rt_mutex_waiter waiter;
908 int ret = 0; 1165 int ret = 0;
@@ -928,7 +1185,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
928 timeout->task = NULL; 1185 timeout->task = NULL;
929 } 1186 }
930 1187
931 ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); 1188 ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
932 1189
933 if (likely(!ret)) 1190 if (likely(!ret))
934 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); 1191 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
@@ -937,7 +1194,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
937 1194
938 if (unlikely(ret)) { 1195 if (unlikely(ret)) {
939 remove_waiter(lock, &waiter); 1196 remove_waiter(lock, &waiter);
940 rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); 1197 rt_mutex_handle_deadlock(ret, chwalk, &waiter);
941 } 1198 }
942 1199
943 /* 1200 /*
@@ -960,22 +1217,31 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
960/* 1217/*
961 * Slow path try-lock function: 1218 * Slow path try-lock function:
962 */ 1219 */
963static inline int 1220static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
964rt_mutex_slowtrylock(struct rt_mutex *lock)
965{ 1221{
966 int ret = 0; 1222 int ret;
1223
1224 /*
1225 * If the lock already has an owner we fail to get the lock.
1226 * This can be done without taking the @lock->wait_lock as
1227 * it is only being read, and this is a trylock anyway.
1228 */
1229 if (rt_mutex_owner(lock))
1230 return 0;
967 1231
1232 /*
1233 * The mutex has currently no owner. Lock the wait lock and
1234 * try to acquire the lock.
1235 */
968 raw_spin_lock(&lock->wait_lock); 1236 raw_spin_lock(&lock->wait_lock);
969 1237
970 if (likely(rt_mutex_owner(lock) != current)) { 1238 ret = try_to_take_rt_mutex(lock, current, NULL);
971 1239
972 ret = try_to_take_rt_mutex(lock, current, NULL); 1240 /*
973 /* 1241 * try_to_take_rt_mutex() sets the lock waiters bit
974 * try_to_take_rt_mutex() sets the lock waiters 1242 * unconditionally. Clean this up.
975 * bit unconditionally. Clean this up. 1243 */
976 */ 1244 fixup_rt_mutex_waiters(lock);
977 fixup_rt_mutex_waiters(lock);
978 }
979 1245
980 raw_spin_unlock(&lock->wait_lock); 1246 raw_spin_unlock(&lock->wait_lock);
981 1247
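The rewritten rt_mutex_slowtrylock() checks the owner field before touching wait_lock, since a trylock against an owned lock can only fail. A rough user-space analogue of the pattern using C11 atomics and a pthread mutex (a sketch with invented names, not the kernel implementation):

/* Sketch of "check the racy condition first, take the lock only if the
 * trylock might succeed", as rt_mutex_slowtrylock() now does. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct fake_lock {
        _Atomic(void *) owner;                  /* NULL means not owned */
        pthread_mutex_t wait_lock;
};

static struct fake_lock the_lock = {
        .owner = NULL,
        .wait_lock = PTHREAD_MUTEX_INITIALIZER,
};

static bool trylock_fast(struct fake_lock *l, void *me)
{
        /* Cheap read: if somebody already owns the lock, a trylock can
         * only fail, so do not touch wait_lock at all. */
        if (atomic_load_explicit(&l->owner, memory_order_relaxed) != NULL)
                return false;

        pthread_mutex_lock(&l->wait_lock);
        void *expected = NULL;
        /* Re-check and claim ownership under wait_lock. */
        bool won = atomic_compare_exchange_strong(&l->owner, &expected, me);
        pthread_mutex_unlock(&l->wait_lock);
        return won;
}

int main(void)
{
        int me;
        return trylock_fast(&the_lock, &me) ? 0 : 1;
}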
@@ -1053,30 +1319,31 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
1053 */ 1319 */
1054static inline int 1320static inline int
1055rt_mutex_fastlock(struct rt_mutex *lock, int state, 1321rt_mutex_fastlock(struct rt_mutex *lock, int state,
1056 int detect_deadlock,
1057 int (*slowfn)(struct rt_mutex *lock, int state, 1322 int (*slowfn)(struct rt_mutex *lock, int state,
1058 struct hrtimer_sleeper *timeout, 1323 struct hrtimer_sleeper *timeout,
1059 int detect_deadlock)) 1324 enum rtmutex_chainwalk chwalk))
1060{ 1325{
1061 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { 1326 if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
1062 rt_mutex_deadlock_account_lock(lock, current); 1327 rt_mutex_deadlock_account_lock(lock, current);
1063 return 0; 1328 return 0;
1064 } else 1329 } else
1065 return slowfn(lock, state, NULL, detect_deadlock); 1330 return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
1066} 1331}
1067 1332
1068static inline int 1333static inline int
1069rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, 1334rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
1070 struct hrtimer_sleeper *timeout, int detect_deadlock, 1335 struct hrtimer_sleeper *timeout,
1336 enum rtmutex_chainwalk chwalk,
1071 int (*slowfn)(struct rt_mutex *lock, int state, 1337 int (*slowfn)(struct rt_mutex *lock, int state,
1072 struct hrtimer_sleeper *timeout, 1338 struct hrtimer_sleeper *timeout,
1073 int detect_deadlock)) 1339 enum rtmutex_chainwalk chwalk))
1074{ 1340{
1075 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { 1341 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
1342 likely(rt_mutex_cmpxchg(lock, NULL, current))) {
1076 rt_mutex_deadlock_account_lock(lock, current); 1343 rt_mutex_deadlock_account_lock(lock, current);
1077 return 0; 1344 return 0;
1078 } else 1345 } else
1079 return slowfn(lock, state, timeout, detect_deadlock); 1346 return slowfn(lock, state, timeout, chwalk);
1080} 1347}
1081 1348
1082static inline int 1349static inline int
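Both fast-path helpers still hinge on rt_mutex_cmpxchg(lock, NULL, current); what changed is that the fast path is now gated on RT_MUTEX_MIN_CHAINWALK rather than on a detect_deadlock flag. A user-space sketch of that structure (an assumed simplification; slowpath_lock() is only a placeholder):

/* Sketch of rt_mutex_timed_fastlock(): the compare-and-exchange fast
 * path is only legal when no full deadlock-detecting chain walk was
 * requested; otherwise the slow path runs even if the lock looks free. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

enum chainwalk { MIN_CHAINWALK, FULL_CHAINWALK };

static _Atomic(void *) lock_owner;              /* NULL: lock is free */

static bool cmpxchg_acquire_owner(void *task)
{
        void *expected = NULL;
        return atomic_compare_exchange_strong(&lock_owner, &expected, task);
}

static int slowpath_lock(void *task, enum chainwalk chwalk)
{
        (void)task; (void)chwalk;
        /* ... enqueue a waiter, walk the PI chain, sleep ... */
        return 0;
}

static int timed_fastlock(void *task, enum chainwalk chwalk)
{
        if (chwalk == MIN_CHAINWALK && cmpxchg_acquire_owner(task))
                return 0;                       /* uncontended fast path */
        return slowpath_lock(task, chwalk);     /* contended or full walk */
}

int main(void)
{
        int me;
        return timed_fastlock(&me, MIN_CHAINWALK);
}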
@@ -1109,54 +1376,61 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
1109{ 1376{
1110 might_sleep(); 1377 might_sleep();
1111 1378
1112 rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); 1379 rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
1113} 1380}
1114EXPORT_SYMBOL_GPL(rt_mutex_lock); 1381EXPORT_SYMBOL_GPL(rt_mutex_lock);
1115 1382
1116/** 1383/**
1117 * rt_mutex_lock_interruptible - lock a rt_mutex interruptible 1384 * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
1118 * 1385 *
1119 * @lock: the rt_mutex to be locked 1386 * @lock: the rt_mutex to be locked
1120 * @detect_deadlock: deadlock detection on/off
1121 * 1387 *
1122 * Returns: 1388 * Returns:
1123 * 0 on success 1389 * 0 on success
1124 * -EINTR when interrupted by a signal 1390 * -EINTR when interrupted by a signal
1125 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
1126 */ 1391 */
1127int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, 1392int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
1128 int detect_deadlock)
1129{ 1393{
1130 might_sleep(); 1394 might_sleep();
1131 1395
1132 return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, 1396 return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
1133 detect_deadlock, rt_mutex_slowlock);
1134} 1397}
1135EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); 1398EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
1136 1399
1400/*
1401 * Futex variant with full deadlock detection.
1402 */
1403int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
1404 struct hrtimer_sleeper *timeout)
1405{
1406 might_sleep();
1407
1408 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
1409 RT_MUTEX_FULL_CHAINWALK,
1410 rt_mutex_slowlock);
1411}
1412
1137/** 1413/**
1138 * rt_mutex_timed_lock - lock a rt_mutex interruptible 1414 * rt_mutex_timed_lock - lock a rt_mutex interruptible
1139 * the timeout structure is provided 1415 * the timeout structure is provided
1140 * by the caller 1416 * by the caller
1141 * 1417 *
1142 * @lock: the rt_mutex to be locked 1418 * @lock: the rt_mutex to be locked
1143 * @timeout: timeout structure or NULL (no timeout) 1419 * @timeout: timeout structure or NULL (no timeout)
1144 * @detect_deadlock: deadlock detection on/off
1145 * 1420 *
1146 * Returns: 1421 * Returns:
1147 * 0 on success 1422 * 0 on success
1148 * -EINTR when interrupted by a signal 1423 * -EINTR when interrupted by a signal
1149 * -ETIMEDOUT when the timeout expired 1424 * -ETIMEDOUT when the timeout expired
1150 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
1151 */ 1425 */
1152int 1426int
1153rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, 1427rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
1154 int detect_deadlock)
1155{ 1428{
1156 might_sleep(); 1429 might_sleep();
1157 1430
1158 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, 1431 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
1159 detect_deadlock, rt_mutex_slowlock); 1432 RT_MUTEX_MIN_CHAINWALK,
1433 rt_mutex_slowlock);
1160} 1434}
1161EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); 1435EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
1162 1436
@@ -1262,7 +1536,6 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1262 * @lock: the rt_mutex to take 1536 * @lock: the rt_mutex to take
1263 * @waiter: the pre-initialized rt_mutex_waiter 1537 * @waiter: the pre-initialized rt_mutex_waiter
1264 * @task: the task to prepare 1538 * @task: the task to prepare
1265 * @detect_deadlock: perform deadlock detection (1) or not (0)
1266 * 1539 *
1267 * Returns: 1540 * Returns:
1268 * 0 - task blocked on lock 1541 * 0 - task blocked on lock
@@ -1273,7 +1546,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1273 */ 1546 */
1274int rt_mutex_start_proxy_lock(struct rt_mutex *lock, 1547int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1275 struct rt_mutex_waiter *waiter, 1548 struct rt_mutex_waiter *waiter,
1276 struct task_struct *task, int detect_deadlock) 1549 struct task_struct *task)
1277{ 1550{
1278 int ret; 1551 int ret;
1279 1552
@@ -1285,7 +1558,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1285 } 1558 }
1286 1559
1287 /* We enforce deadlock detection for futexes */ 1560 /* We enforce deadlock detection for futexes */
1288 ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); 1561 ret = task_blocks_on_rt_mutex(lock, waiter, task,
1562 RT_MUTEX_FULL_CHAINWALK);
1289 1563
1290 if (ret && !rt_mutex_owner(lock)) { 1564 if (ret && !rt_mutex_owner(lock)) {
1291 /* 1565 /*
@@ -1331,22 +1605,20 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
1331 * rt_mutex_finish_proxy_lock() - Complete lock acquisition 1605 * rt_mutex_finish_proxy_lock() - Complete lock acquisition
1332 * @lock: the rt_mutex we were woken on 1606 * @lock: the rt_mutex we were woken on
1333 * @to: the timeout, null if none. hrtimer should already have 1607 * @to: the timeout, null if none. hrtimer should already have
1334 * been started. 1608 * been started.
1335 * @waiter: the pre-initialized rt_mutex_waiter 1609 * @waiter: the pre-initialized rt_mutex_waiter
1336 * @detect_deadlock: perform deadlock detection (1) or not (0)
1337 * 1610 *
 1338         * Complete the lock acquisition started on our behalf by another thread.         1611         * Complete the lock acquisition started on our behalf by another thread.
1339 * 1612 *
1340 * Returns: 1613 * Returns:
1341 * 0 - success 1614 * 0 - success
1342 * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK 1615 * <0 - error, one of -EINTR, -ETIMEDOUT
1343 * 1616 *
1344 * Special API call for PI-futex requeue support 1617 * Special API call for PI-futex requeue support
1345 */ 1618 */
1346int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, 1619int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1347 struct hrtimer_sleeper *to, 1620 struct hrtimer_sleeper *to,
1348 struct rt_mutex_waiter *waiter, 1621 struct rt_mutex_waiter *waiter)
1349 int detect_deadlock)
1350{ 1622{
1351 int ret; 1623 int ret;
1352 1624
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index f6a1f3c133b1..c4060584c407 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -22,10 +22,15 @@
22#define debug_rt_mutex_init(m, n) do { } while (0) 22#define debug_rt_mutex_init(m, n) do { } while (0)
23#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) 23#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
24#define debug_rt_mutex_print_deadlock(w) do { } while (0) 24#define debug_rt_mutex_print_deadlock(w) do { } while (0)
25#define debug_rt_mutex_detect_deadlock(w,d) (d)
26#define debug_rt_mutex_reset_waiter(w) do { } while (0) 25#define debug_rt_mutex_reset_waiter(w) do { } while (0)
27 26
28static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) 27static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
29{ 28{
30 WARN(1, "rtmutex deadlock detected\n"); 29 WARN(1, "rtmutex deadlock detected\n");
31} 30}
31
32static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
33 enum rtmutex_chainwalk walk)
34{
35 return walk == RT_MUTEX_FULL_CHAINWALK;
36}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 7431a9c86f35..855212501407 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -102,6 +102,21 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
102} 102}
103 103
104/* 104/*
105 * Constants for rt mutex functions which have a selectable deadlock
106 * detection.
107 *
108 * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are
109 * no further PI adjustments to be made.
110 *
111 * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full
112 * walk of the lock chain.
113 */
114enum rtmutex_chainwalk {
115 RT_MUTEX_MIN_CHAINWALK,
116 RT_MUTEX_FULL_CHAINWALK,
117};
118
119/*
105 * PI-futex support (proxy locking functions, etc.): 120 * PI-futex support (proxy locking functions, etc.):
106 */ 121 */
107extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); 122extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -111,12 +126,11 @@ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
111 struct task_struct *proxy_owner); 126 struct task_struct *proxy_owner);
112extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, 127extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
113 struct rt_mutex_waiter *waiter, 128 struct rt_mutex_waiter *waiter,
114 struct task_struct *task, 129 struct task_struct *task);
115 int detect_deadlock);
116extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, 130extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
117 struct hrtimer_sleeper *to, 131 struct hrtimer_sleeper *to,
118 struct rt_mutex_waiter *waiter, 132 struct rt_mutex_waiter *waiter);
119 int detect_deadlock); 133extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
120 134
121#ifdef CONFIG_DEBUG_RT_MUTEXES 135#ifdef CONFIG_DEBUG_RT_MUTEXES
122# include "rtmutex-debug.h" 136# include "rtmutex-debug.h"
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index a2391ac135c8..d6203faf2eb1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -329,7 +329,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
329 if (need_resched()) 329 if (need_resched())
330 break; 330 break;
331 331
332 arch_mutex_cpu_relax(); 332 cpu_relax_lowlatency();
333 } 333 }
334 rcu_read_unlock(); 334 rcu_read_unlock();
335 335
@@ -381,7 +381,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
381 * memory barriers as we'll eventually observe the right 381 * memory barriers as we'll eventually observe the right
382 * values at the cost of a few extra spins. 382 * values at the cost of a few extra spins.
383 */ 383 */
384 arch_mutex_cpu_relax(); 384 cpu_relax_lowlatency();
385 } 385 }
386 osq_unlock(&sem->osq); 386 osq_unlock(&sem->osq);
387done: 387done:
diff --git a/kernel/module.c b/kernel/module.c
index 81e727cf6df9..ae79ce615cb9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -60,7 +60,6 @@
60#include <linux/jump_label.h> 60#include <linux/jump_label.h>
61#include <linux/pfn.h> 61#include <linux/pfn.h>
62#include <linux/bsearch.h> 62#include <linux/bsearch.h>
63#include <linux/fips.h>
64#include <uapi/linux/module.h> 63#include <uapi/linux/module.h>
65#include "module-internal.h" 64#include "module-internal.h"
66 65
@@ -2448,9 +2447,6 @@ static int module_sig_check(struct load_info *info)
2448 } 2447 }
2449 2448
2450 /* Not having a signature is only an error if we're strict. */ 2449 /* Not having a signature is only an error if we're strict. */
2451 if (err < 0 && fips_enabled)
2452 panic("Module verification failed with error %d in FIPS mode\n",
2453 err);
2454 if (err == -ENOKEY && !sig_enforce) 2450 if (err == -ENOKEY && !sig_enforce)
2455 err = 0; 2451 err = 0;
2456 2452
diff --git a/kernel/params.c b/kernel/params.c
index 1e52ca233fd9..34f527023794 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -256,6 +256,7 @@ STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
256STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); 256STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
257STANDARD_PARAM_DEF(long, long, "%li", kstrtol); 257STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
258STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); 258STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
259STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
259 260
260int param_set_charp(const char *val, const struct kernel_param *kp) 261int param_set_charp(const char *val, const struct kernel_param *kp)
261{ 262{
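With the new ullong handler in place, a module should be able to declare an unsigned long long parameter through the usual module_param() macro. A sketch of such a declaration (assumed usage, not part of the patch):

/* Hypothetical module snippet using the new "ullong" parameter type.
 * Assumes the standard moduleparam machinery; untested sketch. */
#include <linux/module.h>
#include <linux/moduleparam.h>

static unsigned long long size_bytes = 1ULL << 32;
module_param(size_bytes, ullong, 0444);
MODULE_PARM_DESC(size_bytes, "Example unsigned long long parameter");

MODULE_LICENSE("GPL");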
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9a83d780facd..e4e4121fa327 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -253,9 +253,6 @@ config APM_EMULATION
253 anything, try disabling/enabling this option (or disabling/enabling 253 anything, try disabling/enabling this option (or disabling/enabling
254 APM in your BIOS). 254 APM in your BIOS).
255 255
256config ARCH_HAS_OPP
257 bool
258
259config PM_OPP 256config PM_OPP
260 bool 257 bool
261 ---help--- 258 ---help---
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index fcc2611d3f14..a9dfa79b6bab 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -371,7 +371,6 @@ int hibernation_snapshot(int platform_mode)
371 } 371 }
372 372
373 suspend_console(); 373 suspend_console();
374 ftrace_stop();
375 pm_restrict_gfp_mask(); 374 pm_restrict_gfp_mask();
376 375
377 error = dpm_suspend(PMSG_FREEZE); 376 error = dpm_suspend(PMSG_FREEZE);
@@ -397,7 +396,6 @@ int hibernation_snapshot(int platform_mode)
397 if (error || !in_suspend) 396 if (error || !in_suspend)
398 pm_restore_gfp_mask(); 397 pm_restore_gfp_mask();
399 398
400 ftrace_start();
401 resume_console(); 399 resume_console();
402 dpm_complete(msg); 400 dpm_complete(msg);
403 401
@@ -500,7 +498,6 @@ int hibernation_restore(int platform_mode)
500 498
501 pm_prepare_console(); 499 pm_prepare_console();
502 suspend_console(); 500 suspend_console();
503 ftrace_stop();
504 pm_restrict_gfp_mask(); 501 pm_restrict_gfp_mask();
505 error = dpm_suspend_start(PMSG_QUIESCE); 502 error = dpm_suspend_start(PMSG_QUIESCE);
506 if (!error) { 503 if (!error) {
@@ -508,7 +505,6 @@ int hibernation_restore(int platform_mode)
508 dpm_resume_end(PMSG_RECOVER); 505 dpm_resume_end(PMSG_RECOVER);
509 } 506 }
510 pm_restore_gfp_mask(); 507 pm_restore_gfp_mask();
511 ftrace_start();
512 resume_console(); 508 resume_console();
513 pm_restore_console(); 509 pm_restore_console();
514 return error; 510 return error;
@@ -535,7 +531,6 @@ int hibernation_platform_enter(void)
535 531
536 entering_platform_hibernation = true; 532 entering_platform_hibernation = true;
537 suspend_console(); 533 suspend_console();
538 ftrace_stop();
539 error = dpm_suspend_start(PMSG_HIBERNATE); 534 error = dpm_suspend_start(PMSG_HIBERNATE);
540 if (error) { 535 if (error) {
541 if (hibernation_ops->recover) 536 if (hibernation_ops->recover)
@@ -579,7 +574,6 @@ int hibernation_platform_enter(void)
579 Resume_devices: 574 Resume_devices:
580 entering_platform_hibernation = false; 575 entering_platform_hibernation = false;
581 dpm_resume_end(PMSG_RESTORE); 576 dpm_resume_end(PMSG_RESTORE);
582 ftrace_start();
583 resume_console(); 577 resume_console();
584 578
585 Close: 579 Close:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d57f66a367dc..9a59d042ea84 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -616,7 +616,6 @@ static struct attribute_group attr_group = {
616 .attrs = g, 616 .attrs = g,
617}; 617};
618 618
619#ifdef CONFIG_PM_RUNTIME
620struct workqueue_struct *pm_wq; 619struct workqueue_struct *pm_wq;
621EXPORT_SYMBOL_GPL(pm_wq); 620EXPORT_SYMBOL_GPL(pm_wq);
622 621
@@ -626,9 +625,6 @@ static int __init pm_start_workqueue(void)
626 625
627 return pm_wq ? 0 : -ENOMEM; 626 return pm_wq ? 0 : -ENOMEM;
628} 627}
629#else
630static inline int pm_start_workqueue(void) { return 0; }
631#endif
632 628
633static int __init pm_init(void) 629static int __init pm_init(void)
634{ 630{
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 9a071bea80eb..6dadb25cb0d8 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -290,7 +290,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
290 goto Platform_wake; 290 goto Platform_wake;
291 } 291 }
292 292
293 ftrace_stop();
294 error = disable_nonboot_cpus(); 293 error = disable_nonboot_cpus();
295 if (error || suspend_test(TEST_CPUS)) 294 if (error || suspend_test(TEST_CPUS))
296 goto Enable_cpus; 295 goto Enable_cpus;
@@ -317,7 +316,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
317 316
318 Enable_cpus: 317 Enable_cpus:
319 enable_nonboot_cpus(); 318 enable_nonboot_cpus();
320 ftrace_start();
321 319
322 Platform_wake: 320 Platform_wake:
323 platform_suspend_wake(state); 321 platform_suspend_wake(state);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index adf98622cb32..54e75226c2c4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,12 +28,6 @@
28#include <linux/compat.h> 28#include <linux/compat.h>
29 29
30 30
31static int ptrace_trapping_sleep_fn(void *flags)
32{
33 schedule();
34 return 0;
35}
36
37/* 31/*
38 * ptrace a task: make the debugger its new parent and 32 * ptrace a task: make the debugger its new parent and
39 * move it to the ptrace list. 33 * move it to the ptrace list.
@@ -371,7 +365,7 @@ unlock_creds:
371out: 365out:
372 if (!retval) { 366 if (!retval) {
373 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, 367 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
374 ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); 368 TASK_UNINTERRUPTIBLE);
375 proc_ptrace_connector(task, PTRACE_ATTACH); 369 proc_ptrace_connector(task, PTRACE_ATTACH);
376 } 370 }
377 371
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index bfda2726ca45..ff1a6de62f17 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -99,6 +99,10 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
99 99
100void kfree(const void *); 100void kfree(const void *);
101 101
102/*
103 * Reclaim the specified callback, either by invoking it (non-lazy case)
104 * or freeing it directly (lazy case). Return true if lazy, false otherwise.
105 */
102static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) 106static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
103{ 107{
104 unsigned long offset = (unsigned long)head->func; 108 unsigned long offset = (unsigned long)head->func;
@@ -108,12 +112,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
108 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); 112 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
109 kfree((void *)head - offset); 113 kfree((void *)head - offset);
110 rcu_lock_release(&rcu_callback_map); 114 rcu_lock_release(&rcu_callback_map);
111 return 1; 115 return true;
112 } else { 116 } else {
113 RCU_TRACE(trace_rcu_invoke_callback(rn, head)); 117 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
114 head->func(head); 118 head->func(head);
115 rcu_lock_release(&rcu_callback_map); 119 rcu_lock_release(&rcu_callback_map);
116 return 0; 120 return false;
117 } 121 }
118} 122}
119 123
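The new comment documents the lazy/non-lazy split in __rcu_reclaim(): kfree_rcu() stores the offset of the rcu_head within its enclosing object in the ->func slot, and the reclaim path tells that apart from a real callback address. A user-space illustration of the same trick (the 4096 threshold and all names are illustrative, not the kernel's):

/* Small offsets cannot be valid function addresses, so a tiny value in
 * ->func means "free the enclosing object", anything else is a real
 * callback to invoke. */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct rcu_head_model {
        void (*func)(struct rcu_head_model *);
};

struct widget {
        int payload;
        struct rcu_head_model rh;
};

#define IS_OFFSET(addr)  ((unsigned long)(addr) < 4096)   /* heuristic */

static void reclaim(struct rcu_head_model *head)
{
        unsigned long offset = (unsigned long)head->func;

        if (IS_OFFSET(offset))
                free((char *)head - offset);    /* lazy: free container */
        else
                head->func(head);               /* non-lazy: run callback */
}

static void report(struct rcu_head_model *head)
{
        printf("callback invoked for %p\n", (void *)head);
}

int main(void)
{
        struct widget *w = malloc(sizeof(*w));
        struct rcu_head_model cb = { report };

        /* Emulate kfree_rcu(w, rh): store the offset, not a function. */
        w->rh.func = (void (*)(struct rcu_head_model *))offsetof(struct widget, rh);
        reclaim(&w->rh);        /* frees w */
        reclaim(&cb);           /* invokes report() */
        return 0;
}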
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7fa34f86e5ba..948a7693748e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -18,7 +18,7 @@
18 * Copyright (C) IBM Corporation, 2005, 2006 18 * Copyright (C) IBM Corporation, 2005, 2006
19 * 19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com> 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * Josh Triplett <josh@freedesktop.org> 21 * Josh Triplett <josh@joshtriplett.org>
22 * 22 *
23 * See also: Documentation/RCU/torture.txt 23 * See also: Documentation/RCU/torture.txt
24 */ 24 */
@@ -51,7 +51,7 @@
51#include <linux/torture.h> 51#include <linux/torture.h>
52 52
53MODULE_LICENSE("GPL"); 53MODULE_LICENSE("GPL");
54MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 54MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
55 55
56 56
57torture_param(int, fqs_duration, 0, 57torture_param(int, fqs_duration, 0,
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index c639556f3fa0..e037f3eb2f7b 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -298,9 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
298 298
299 idx = ACCESS_ONCE(sp->completed) & 0x1; 299 idx = ACCESS_ONCE(sp->completed) & 0x1;
300 preempt_disable(); 300 preempt_disable();
301 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; 301 __this_cpu_inc(sp->per_cpu_ref->c[idx]);
302 smp_mb(); /* B */ /* Avoid leaking the critical section. */ 302 smp_mb(); /* B */ /* Avoid leaking the critical section. */
303 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; 303 __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
304 preempt_enable(); 304 preempt_enable();
305 return idx; 305 return idx;
306} 306}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 625d0b0cd75a..1b70cb6fbe3c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1013,10 +1013,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
1013} 1013}
1014 1014
1015/* 1015/*
1016 * Dump stacks of all tasks running on stalled CPUs. This is a fallback 1016 * Dump stacks of all tasks running on stalled CPUs.
1017 * for architectures that do not implement trigger_all_cpu_backtrace().
1018 * The NMI-triggered stack traces are more accurate because they are
1019 * printed by the target CPU.
1020 */ 1017 */
1021static void rcu_dump_cpu_stacks(struct rcu_state *rsp) 1018static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
1022{ 1019{
@@ -1094,7 +1091,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
1094 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1091 (long)rsp->gpnum, (long)rsp->completed, totqlen);
1095 if (ndetected == 0) 1092 if (ndetected == 0)
1096 pr_err("INFO: Stall ended before state dump start\n"); 1093 pr_err("INFO: Stall ended before state dump start\n");
1097 else if (!trigger_all_cpu_backtrace()) 1094 else
1098 rcu_dump_cpu_stacks(rsp); 1095 rcu_dump_cpu_stacks(rsp);
1099 1096
1100 /* Complain about tasks blocking the grace period. */ 1097 /* Complain about tasks blocking the grace period. */
@@ -1125,8 +1122,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
1125 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", 1122 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
1126 jiffies - rsp->gp_start, 1123 jiffies - rsp->gp_start,
1127 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1124 (long)rsp->gpnum, (long)rsp->completed, totqlen);
1128 if (!trigger_all_cpu_backtrace()) 1125 rcu_dump_cpu_stacks(rsp);
1129 dump_stack();
1130 1126
1131 raw_spin_lock_irqsave(&rnp->lock, flags); 1127 raw_spin_lock_irqsave(&rnp->lock, flags);
1132 if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) 1128 if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
@@ -1305,10 +1301,16 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1305 * believe that a grace period is in progress, then we must wait 1301 * believe that a grace period is in progress, then we must wait
1306 * for the one following, which is in "c". Because our request 1302 * for the one following, which is in "c". Because our request
1307 * will be noticed at the end of the current grace period, we don't 1303 * will be noticed at the end of the current grace period, we don't
1308 * need to explicitly start one. 1304 * need to explicitly start one. We only do the lockless check
1305 * of rnp_root's fields if the current rcu_node structure thinks
1306 * there is no grace period in flight, and because we hold rnp->lock,
1307 * the only possible change is when rnp_root's two fields are
1308 * equal, in which case rnp_root->gpnum might be concurrently
1309 * incremented. But that is OK, as it will just result in our
1310 * doing some extra useless work.
1309 */ 1311 */
1310 if (rnp->gpnum != rnp->completed || 1312 if (rnp->gpnum != rnp->completed ||
1311 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { 1313 ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
1312 rnp->need_future_gp[c & 0x1]++; 1314 rnp->need_future_gp[c & 0x1]++;
1313 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); 1315 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
1314 goto out; 1316 goto out;
@@ -1645,11 +1647,6 @@ static int rcu_gp_init(struct rcu_state *rsp)
1645 rnp->level, rnp->grplo, 1647 rnp->level, rnp->grplo,
1646 rnp->grphi, rnp->qsmask); 1648 rnp->grphi, rnp->qsmask);
1647 raw_spin_unlock_irq(&rnp->lock); 1649 raw_spin_unlock_irq(&rnp->lock);
1648#ifdef CONFIG_PROVE_RCU_DELAY
1649 if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
1650 system_state == SYSTEM_RUNNING)
1651 udelay(200);
1652#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1653 cond_resched(); 1650 cond_resched();
1654 } 1651 }
1655 1652
@@ -2347,7 +2344,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2347 } 2344 }
2348 smp_mb(); /* List handling before counting for rcu_barrier(). */ 2345 smp_mb(); /* List handling before counting for rcu_barrier(). */
2349 rdp->qlen_lazy -= count_lazy; 2346 rdp->qlen_lazy -= count_lazy;
2350 ACCESS_ONCE(rdp->qlen) -= count; 2347 ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
2351 rdp->n_cbs_invoked += count; 2348 rdp->n_cbs_invoked += count;
2352 2349
2353 /* Reinstate batch limit if we have worked down the excess. */ 2350 /* Reinstate batch limit if we have worked down the excess. */
@@ -2485,14 +2482,14 @@ static void force_quiescent_state(struct rcu_state *rsp)
2485 struct rcu_node *rnp_old = NULL; 2482 struct rcu_node *rnp_old = NULL;
2486 2483
2487 /* Funnel through hierarchy to reduce memory contention. */ 2484 /* Funnel through hierarchy to reduce memory contention. */
2488 rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; 2485 rnp = __this_cpu_read(rsp->rda->mynode);
2489 for (; rnp != NULL; rnp = rnp->parent) { 2486 for (; rnp != NULL; rnp = rnp->parent) {
2490 ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || 2487 ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
2491 !raw_spin_trylock(&rnp->fqslock); 2488 !raw_spin_trylock(&rnp->fqslock);
2492 if (rnp_old != NULL) 2489 if (rnp_old != NULL)
2493 raw_spin_unlock(&rnp_old->fqslock); 2490 raw_spin_unlock(&rnp_old->fqslock);
2494 if (ret) { 2491 if (ret) {
2495 ACCESS_ONCE(rsp->n_force_qs_lh)++; 2492 rsp->n_force_qs_lh++;
2496 return; 2493 return;
2497 } 2494 }
2498 rnp_old = rnp; 2495 rnp_old = rnp;
@@ -2504,7 +2501,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2504 smp_mb__after_unlock_lock(); 2501 smp_mb__after_unlock_lock();
2505 raw_spin_unlock(&rnp_old->fqslock); 2502 raw_spin_unlock(&rnp_old->fqslock);
2506 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2503 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
2507 ACCESS_ONCE(rsp->n_force_qs_lh)++; 2504 rsp->n_force_qs_lh++;
2508 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2505 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2509 return; /* Someone beat us to it. */ 2506 return; /* Someone beat us to it. */
2510 } 2507 }
@@ -2662,7 +2659,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2662 unsigned long flags; 2659 unsigned long flags;
2663 struct rcu_data *rdp; 2660 struct rcu_data *rdp;
2664 2661
2665 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ 2662 WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
2666 if (debug_rcu_head_queue(head)) { 2663 if (debug_rcu_head_queue(head)) {
2667 /* Probable double call_rcu(), so leak the callback. */ 2664 /* Probable double call_rcu(), so leak the callback. */
2668 ACCESS_ONCE(head->func) = rcu_leak_callback; 2665 ACCESS_ONCE(head->func) = rcu_leak_callback;
@@ -2693,7 +2690,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2693 local_irq_restore(flags); 2690 local_irq_restore(flags);
2694 return; 2691 return;
2695 } 2692 }
2696 ACCESS_ONCE(rdp->qlen)++; 2693 ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
2697 if (lazy) 2694 if (lazy)
2698 rdp->qlen_lazy++; 2695 rdp->qlen_lazy++;
2699 else 2696 else
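Several hunks in this file replace ACCESS_ONCE(x)++ (and -=) with an explicit "ACCESS_ONCE(x) = x + ..." store, presumably so that only the store is a forced volatile access rather than a volatile read-modify-write. A tiny user-space sketch of the difference (the macro mirrors the kernel's definition; GCC/Clang __typeof__ assumed):

/* ACCESS_ONCE() forces a volatile access; applied to ++ it makes both
 * the load and the store volatile, while the assignment form marks only
 * the store and leaves the read ordinary. */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static unsigned long qlen;

static void enqueue_one(void)
{
        /* Old style: volatile load + volatile store. */
        /* ACCESS_ONCE(qlen)++; */

        /* New style: plain load, volatile store only. */
        ACCESS_ONCE(qlen) = qlen + 1;
}

int main(void)
{
        enqueue_one();
        return (int)qlen - 1;           /* 0 on success */
}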
@@ -3257,7 +3254,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3257 * ACCESS_ONCE() to prevent the compiler from speculating 3254 * ACCESS_ONCE() to prevent the compiler from speculating
3258 * the increment to precede the early-exit check. 3255 * the increment to precede the early-exit check.
3259 */ 3256 */
3260 ACCESS_ONCE(rsp->n_barrier_done)++; 3257 ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
3261 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); 3258 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
3262 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); 3259 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
3263 smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ 3260 smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
@@ -3307,7 +3304,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3307 3304
3308 /* Increment ->n_barrier_done to prevent duplicate work. */ 3305 /* Increment ->n_barrier_done to prevent duplicate work. */
3309 smp_mb(); /* Keep increment after above mechanism. */ 3306 smp_mb(); /* Keep increment after above mechanism. */
3310 ACCESS_ONCE(rsp->n_barrier_done)++; 3307 ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
3311 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); 3308 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
3312 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); 3309 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
3313 smp_mb(); /* Keep increment before caller's subsequent code. */ 3310 smp_mb(); /* Keep increment before caller's subsequent code. */
@@ -3564,14 +3561,16 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
3564static void __init rcu_init_one(struct rcu_state *rsp, 3561static void __init rcu_init_one(struct rcu_state *rsp,
3565 struct rcu_data __percpu *rda) 3562 struct rcu_data __percpu *rda)
3566{ 3563{
3567 static char *buf[] = { "rcu_node_0", 3564 static const char * const buf[] = {
3568 "rcu_node_1", 3565 "rcu_node_0",
3569 "rcu_node_2", 3566 "rcu_node_1",
3570 "rcu_node_3" }; /* Match MAX_RCU_LVLS */ 3567 "rcu_node_2",
3571 static char *fqs[] = { "rcu_node_fqs_0", 3568 "rcu_node_3" }; /* Match MAX_RCU_LVLS */
3572 "rcu_node_fqs_1", 3569 static const char * const fqs[] = {
3573 "rcu_node_fqs_2", 3570 "rcu_node_fqs_0",
3574 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ 3571 "rcu_node_fqs_1",
3572 "rcu_node_fqs_2",
3573 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
3575 static u8 fl_mask = 0x1; 3574 static u8 fl_mask = 0x1;
3576 int cpustride = 1; 3575 int cpustride = 1;
3577 int i; 3576 int i;
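The string tables become "static const char * const", which lets them live in read-only data: the first const protects the characters, the second keeps the array slots themselves from being reassigned. A quick stand-alone illustration:

#include <stdio.h>

static const char * const rcu_node_names[] = {
        "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3",
};

int main(void)
{
        /* rcu_node_names[0] = "other";   error: element is a const pointer */
        /* rcu_node_names[0][0] = 'R';    error: characters are const       */
        printf("%s\n", rcu_node_names[1]);
        return 0;
}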
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 0f69a79c5b7d..71e64c718f75 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -172,6 +172,14 @@ struct rcu_node {
172 /* queued on this rcu_node structure that */ 172 /* queued on this rcu_node structure that */
173 /* are blocking the current grace period, */ 173 /* are blocking the current grace period, */
174 /* there can be no such task. */ 174 /* there can be no such task. */
175 struct completion boost_completion;
176 /* Used to ensure that the rt_mutex used */
177 /* to carry out the boosting is fully */
178 /* released with no future boostee accesses */
179 /* before that rt_mutex is re-initialized. */
180 struct rt_mutex boost_mtx;
181 /* Used only for the priority-boosting */
182 /* side effect, not as a lock. */
175 unsigned long boost_time; 183 unsigned long boost_time;
176 /* When to start boosting (jiffies). */ 184 /* When to start boosting (jiffies). */
177 struct task_struct *boost_kthread_task; 185 struct task_struct *boost_kthread_task;
@@ -334,11 +342,29 @@ struct rcu_data {
334 struct rcu_head **nocb_tail; 342 struct rcu_head **nocb_tail;
335 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ 343 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
336 atomic_long_t nocb_q_count_lazy; /* (approximate). */ 344 atomic_long_t nocb_q_count_lazy; /* (approximate). */
345 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
346 struct rcu_head **nocb_follower_tail;
347 atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
348 atomic_long_t nocb_follower_count_lazy; /* (approximate). */
337 int nocb_p_count; /* # CBs being invoked by kthread */ 349 int nocb_p_count; /* # CBs being invoked by kthread */
338 int nocb_p_count_lazy; /* (approximate). */ 350 int nocb_p_count_lazy; /* (approximate). */
339 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 351 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
340 struct task_struct *nocb_kthread; 352 struct task_struct *nocb_kthread;
341 bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 353 bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
354
355 /* The following fields are used by the leader, hence own cacheline. */
356 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
357 /* CBs waiting for GP. */
358 struct rcu_head **nocb_gp_tail;
359 long nocb_gp_count;
360 long nocb_gp_count_lazy;
361 bool nocb_leader_wake; /* Is the nocb leader thread awake? */
362 struct rcu_data *nocb_next_follower;
363 /* Next follower in wakeup chain. */
364
                                                                 365          /* The following fields are used by the follower, hence new cacheline. */
366 struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp;
367 /* Leader CPU takes GP-end wakeups. */
342#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 368#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
343 369
344 /* 8) RCU CPU stall data. */ 370 /* 8) RCU CPU stall data. */
@@ -587,8 +613,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
587/* Sum up queue lengths for tracing. */ 613/* Sum up queue lengths for tracing. */
588static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) 614static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
589{ 615{
590 *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; 616 *ql = atomic_long_read(&rdp->nocb_q_count) +
591 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; 617 rdp->nocb_p_count +
618 atomic_long_read(&rdp->nocb_follower_count) +
619 rdp->nocb_p_count + rdp->nocb_gp_count;
620 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
621 rdp->nocb_p_count_lazy +
622 atomic_long_read(&rdp->nocb_follower_count_lazy) +
623 rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
592} 624}
593#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 625#else /* #ifdef CONFIG_RCU_NOCB_CPU */
594static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) 626static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 02ac0fb186b8..00dc411e9676 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -33,6 +33,7 @@
33#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
34 34
35#ifdef CONFIG_RCU_BOOST 35#ifdef CONFIG_RCU_BOOST
36#include "../locking/rtmutex_common.h"
36#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO 37#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
37#else 38#else
38#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO 39#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
@@ -336,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t)
336 unsigned long flags; 337 unsigned long flags;
337 struct list_head *np; 338 struct list_head *np;
338#ifdef CONFIG_RCU_BOOST 339#ifdef CONFIG_RCU_BOOST
339 struct rt_mutex *rbmp = NULL; 340 bool drop_boost_mutex = false;
340#endif /* #ifdef CONFIG_RCU_BOOST */ 341#endif /* #ifdef CONFIG_RCU_BOOST */
341 struct rcu_node *rnp; 342 struct rcu_node *rnp;
342 int special; 343 int special;
@@ -398,11 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t)
398#ifdef CONFIG_RCU_BOOST 399#ifdef CONFIG_RCU_BOOST
399 if (&t->rcu_node_entry == rnp->boost_tasks) 400 if (&t->rcu_node_entry == rnp->boost_tasks)
400 rnp->boost_tasks = np; 401 rnp->boost_tasks = np;
401 /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ 402 /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
402 if (t->rcu_boost_mutex) { 403 drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
403 rbmp = t->rcu_boost_mutex;
404 t->rcu_boost_mutex = NULL;
405 }
406#endif /* #ifdef CONFIG_RCU_BOOST */ 404#endif /* #ifdef CONFIG_RCU_BOOST */
407 405
408 /* 406 /*
@@ -427,8 +425,10 @@ void rcu_read_unlock_special(struct task_struct *t)
427 425
428#ifdef CONFIG_RCU_BOOST 426#ifdef CONFIG_RCU_BOOST
429 /* Unboost if we were boosted. */ 427 /* Unboost if we were boosted. */
430 if (rbmp) 428 if (drop_boost_mutex) {
431 rt_mutex_unlock(rbmp); 429 rt_mutex_unlock(&rnp->boost_mtx);
430 complete(&rnp->boost_completion);
431 }
432#endif /* #ifdef CONFIG_RCU_BOOST */ 432#endif /* #ifdef CONFIG_RCU_BOOST */
433 433
434 /* 434 /*
@@ -988,6 +988,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
988 988
989/* Because preemptible RCU does not exist, no quieting of tasks. */ 989/* Because preemptible RCU does not exist, no quieting of tasks. */
990static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 990static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
991 __releases(rnp->lock)
991{ 992{
992 raw_spin_unlock_irqrestore(&rnp->lock, flags); 993 raw_spin_unlock_irqrestore(&rnp->lock, flags);
993} 994}
@@ -1149,7 +1150,6 @@ static void rcu_wake_cond(struct task_struct *t, int status)
1149static int rcu_boost(struct rcu_node *rnp) 1150static int rcu_boost(struct rcu_node *rnp)
1150{ 1151{
1151 unsigned long flags; 1152 unsigned long flags;
1152 struct rt_mutex mtx;
1153 struct task_struct *t; 1153 struct task_struct *t;
1154 struct list_head *tb; 1154 struct list_head *tb;
1155 1155
@@ -1200,11 +1200,15 @@ static int rcu_boost(struct rcu_node *rnp)
1200 * section. 1200 * section.
1201 */ 1201 */
1202 t = container_of(tb, struct task_struct, rcu_node_entry); 1202 t = container_of(tb, struct task_struct, rcu_node_entry);
1203 rt_mutex_init_proxy_locked(&mtx, t); 1203 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
1204 t->rcu_boost_mutex = &mtx; 1204 init_completion(&rnp->boost_completion);
1205 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1205 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1206 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1206 /* Lock only for side effect: boosts task t's priority. */
1207 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1207 rt_mutex_lock(&rnp->boost_mtx);
1208 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
1209
1210 /* Wait for boostee to be done w/boost_mtx before reinitializing. */
1211 wait_for_completion(&rnp->boost_completion);
1208 1212
1209 return ACCESS_ONCE(rnp->exp_tasks) != NULL || 1213 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1210 ACCESS_ONCE(rnp->boost_tasks) != NULL; 1214 ACCESS_ONCE(rnp->boost_tasks) != NULL;
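To illustrate the handshake this hunk adds — the booster must not reinitialize the shared rt_mutex until the boosted reader is completely done with it — here is a minimal userspace sketch. It is only an analogy: a POSIX semaphore stands in for rnp->boost_completion, and the thread names are invented.

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

/* Stand-in for rnp->boost_completion; not the kernel API. */
static sem_t boost_done;

static void *boostee(void *arg)
{
	/* ... the boosted reader drops the shared boost mutex here ... */
	sem_post(&boost_done);		/* plays the role of complete() */
	return NULL;
}

int main(void)
{
	pthread_t t;

	sem_init(&boost_done, 0, 0);
	pthread_create(&t, NULL, boostee, NULL);
	sem_wait(&boost_done);		/* wait_for_completion() before reuse */
	pthread_join(&t, NULL);
	puts("boost mutex may now be reinitialized");
	return 0;
}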
@@ -1256,6 +1260,7 @@ static int rcu_boost_kthread(void *arg)
1256 * about it going away. 1260 * about it going away.
1257 */ 1261 */
1258static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1262static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1263 __releases(rnp->lock)
1259{ 1264{
1260 struct task_struct *t; 1265 struct task_struct *t;
1261 1266
@@ -1491,6 +1496,7 @@ static void rcu_prepare_kthreads(int cpu)
1491#else /* #ifdef CONFIG_RCU_BOOST */ 1496#else /* #ifdef CONFIG_RCU_BOOST */
1492 1497
1493static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1498static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1499 __releases(rnp->lock)
1494{ 1500{
1495 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1501 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1496} 1502}
@@ -2060,6 +2066,22 @@ bool rcu_is_nocb_cpu(int cpu)
2060#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 2066#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
2061 2067
2062/* 2068/*
2069 * Kick the leader kthread for this NOCB group.
2070 */
2071static void wake_nocb_leader(struct rcu_data *rdp, bool force)
2072{
2073 struct rcu_data *rdp_leader = rdp->nocb_leader;
2074
2075 if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
2076 return;
2077 if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) {
2078 /* Prior xchg orders against prior callback enqueue. */
2079 ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true;
2080 wake_up(&rdp_leader->nocb_wq);
2081 }
2082}
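The gate implemented by wake_nocb_leader() — skip the wakeup when the leader is already flagged awake, unless the caller insists — can be modeled in userspace roughly as follows. This is a sketch with assumed names: a pthread condition variable stands in for the kernel waitqueue, and a plain mutex replaces the kernel's lockless ACCESS_ONCE() ordering.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;
static bool leader_awake;		/* mirrors ->nocb_leader_wake */

/* Wake the leader only if it is not already flagged awake, or if forced. */
static void wake_leader(bool force)
{
	pthread_mutex_lock(&lock);
	if (!leader_awake || force) {
		leader_awake = true;	/* set the flag before the wakeup */
		pthread_cond_signal(&wq);
	}
	pthread_mutex_unlock(&lock);
}

static void *leader_thread(void *arg)
{
	pthread_mutex_lock(&lock);
	while (!leader_awake)
		pthread_cond_wait(&wq, &lock);
	leader_awake = false;		/* consumed the wakeup */
	pthread_mutex_unlock(&lock);
	puts("leader woke up");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, leader_thread, NULL);
	wake_leader(false);	/* flags the leader and signals it */
	wake_leader(false);	/* filtered if the flag is still set */
	pthread_join(&t, NULL);
	return 0;
}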
2083
2084/*
2063 * Enqueue the specified string of rcu_head structures onto the specified 2085 * Enqueue the specified string of rcu_head structures onto the specified
2064 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the 2086 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2065 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy 2087 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
@@ -2093,7 +2115,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2093 len = atomic_long_read(&rdp->nocb_q_count); 2115 len = atomic_long_read(&rdp->nocb_q_count);
2094 if (old_rhpp == &rdp->nocb_head) { 2116 if (old_rhpp == &rdp->nocb_head) {
2095 if (!irqs_disabled_flags(flags)) { 2117 if (!irqs_disabled_flags(flags)) {
2096 wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */ 2118 /* ... if queue was empty ... */
2119 wake_nocb_leader(rdp, false);
2097 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2120 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2098 TPS("WakeEmpty")); 2121 TPS("WakeEmpty"));
2099 } else { 2122 } else {
@@ -2103,7 +2126,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2103 } 2126 }
2104 rdp->qlen_last_fqs_check = 0; 2127 rdp->qlen_last_fqs_check = 0;
2105 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2106 wake_up_process(t); /* ... or if many callbacks queued. */ 2129 /* ... or if many callbacks queued. */
2130 wake_nocb_leader(rdp, true);
2107 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2131 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2108 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); 2132 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
2109 } else { 2133 } else {
@@ -2213,13 +2237,150 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2213} 2237}
2214 2238
2215/* 2239/*
2240 * Leaders come here to wait for additional callbacks to show up.
2241 * This function does not return until callbacks appear.
2242 */
2243static void nocb_leader_wait(struct rcu_data *my_rdp)
2244{
2245 bool firsttime = true;
2246 bool gotcbs;
2247 struct rcu_data *rdp;
2248 struct rcu_head **tail;
2249
2250wait_again:
2251
2252 /* Wait for callbacks to appear. */
2253 if (!rcu_nocb_poll) {
2254 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
2255 wait_event_interruptible(my_rdp->nocb_wq,
2256 ACCESS_ONCE(my_rdp->nocb_leader_wake));
2257 /* Memory barrier handled by smp_mb() calls below and repoll. */
2258 } else if (firsttime) {
2259 firsttime = false; /* Don't drown trace log with "Poll"! */
2260 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
2261 }
2262
2263 /*
2264 * Each pass through the following loop checks a follower for CBs.
2265 * We are our own first follower. Any CBs found are moved to
2266 * nocb_gp_head, where they await a grace period.
2267 */
2268 gotcbs = false;
2269 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2270 rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
2271 if (!rdp->nocb_gp_head)
2272 continue; /* No CBs here, try next follower. */
2273
2274 /* Move callbacks to wait-for-GP list, which is empty. */
2275 ACCESS_ONCE(rdp->nocb_head) = NULL;
2276 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2277 rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
2278 rdp->nocb_gp_count_lazy =
2279 atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2280 gotcbs = true;
2281 }
2282
2283 /*
2284 * If there were no callbacks, sleep a bit, rescan after a
2285 * memory barrier, and go retry.
2286 */
2287 if (unlikely(!gotcbs)) {
2288 if (!rcu_nocb_poll)
2289 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
2290 "WokeEmpty");
2291 flush_signals(current);
2292 schedule_timeout_interruptible(1);
2293
2294 /* Rescan in case we were a victim of memory ordering. */
2295 my_rdp->nocb_leader_wake = false;
2296 smp_mb(); /* Ensure _wake false before scan. */
2297 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
2298 if (ACCESS_ONCE(rdp->nocb_head)) {
2299 /* Found CB, so short-circuit next wait. */
2300 my_rdp->nocb_leader_wake = true;
2301 break;
2302 }
2303 goto wait_again;
2304 }
2305
2306 /* Wait for one grace period. */
2307 rcu_nocb_wait_gp(my_rdp);
2308
2309 /*
2310 * We left ->nocb_leader_wake set to reduce cache thrashing.
2311 * We clear it now, but recheck for new callbacks while
2312 * traversing our follower list.
2313 */
2314 my_rdp->nocb_leader_wake = false;
2315 smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */
2316
2317 /* Each pass through the following loop wakes a follower, if needed. */
2318 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2319 if (ACCESS_ONCE(rdp->nocb_head))
2320 my_rdp->nocb_leader_wake = true; /* No need to wait. */
2321 if (!rdp->nocb_gp_head)
2322 continue; /* No CBs, so no need to wake follower. */
2323
2324 /* Append callbacks to follower's "done" list. */
2325 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
2326 *tail = rdp->nocb_gp_head;
2327 atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
2328 atomic_long_add(rdp->nocb_gp_count_lazy,
2329 &rdp->nocb_follower_count_lazy);
2330 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2331 /*
2332 * List was empty, wake up the follower.
2333 * Memory barriers supplied by atomic_long_add().
2334 */
2335 wake_up(&rdp->nocb_wq);
2336 }
2337 }
2338
2339 /* If we (the leader) don't have CBs, go wait some more. */
2340 if (!my_rdp->nocb_follower_head)
2341 goto wait_again;
2342}
2343
2344/*
2345 * Followers come here to wait for additional callbacks to show up.
2346 * This function does not return until callbacks appear.
2347 */
2348static void nocb_follower_wait(struct rcu_data *rdp)
2349{
2350 bool firsttime = true;
2351
2352 for (;;) {
2353 if (!rcu_nocb_poll) {
2354 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2355 "FollowerSleep");
2356 wait_event_interruptible(rdp->nocb_wq,
2357 ACCESS_ONCE(rdp->nocb_follower_head));
2358 } else if (firsttime) {
2359 /* Don't drown trace log with "Poll"! */
2360 firsttime = false;
2361 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
2362 }
2363 if (smp_load_acquire(&rdp->nocb_follower_head)) {
2364 /* ^^^ Ensure CB invocation follows _head test. */
2365 return;
2366 }
2367 if (!rcu_nocb_poll)
2368 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2369 "WokeEmpty");
2370 flush_signals(current);
2371 schedule_timeout_interruptible(1);
2372 }
2373}
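Both wait loops above shuffle callbacks between singly linked lists that keep a head pointer plus a pointer to the tail ->next slot, so a whole batch can be detached or appended in O(1). Below is a single-threaded sketch of that list shape with illustrative names; the kernel does the hand-offs with xchg() and atomic counters, which are omitted here.

#include <stdio.h>

struct cb {
	struct cb *next;
	int id;
};

struct cbq {
	struct cb *head;
	struct cb **tail;	/* points at head when the queue is empty */
};

static void cbq_init(struct cbq *q)
{
	q->head = NULL;
	q->tail = &q->head;
}

static void cbq_enqueue(struct cbq *q, struct cb *c)
{
	c->next = NULL;
	*q->tail = c;		/* append at the current tail slot */
	q->tail = &c->next;
}

/* Detach everything queued so far, leaving the queue empty. */
static struct cb *cbq_grab(struct cbq *q, struct cb ***old_tail)
{
	struct cb *list = q->head;

	*old_tail = q->tail;
	cbq_init(q);
	return list;
}

/* Append a grabbed segment onto another queue (leader -> follower list). */
static void cbq_splice(struct cbq *dst, struct cb *seg, struct cb **seg_tail)
{
	if (!seg)
		return;
	*dst->tail = seg;
	dst->tail = seg_tail;
}

int main(void)
{
	struct cbq pending, done;
	struct cb a = { .id = 1 }, b = { .id = 2 };
	struct cb *seg, **seg_tail;

	cbq_init(&pending);
	cbq_init(&done);
	cbq_enqueue(&pending, &a);
	cbq_enqueue(&pending, &b);

	seg = cbq_grab(&pending, &seg_tail);	/* "wait-for-GP" batch */
	cbq_splice(&done, seg, seg_tail);	/* follower's ready list */

	for (seg = done.head; seg; seg = seg->next)
		printf("invoke cb %d\n", seg->id);
	return 0;
}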
2374
2375/*
2216 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes 2376 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
2217 * callbacks queued by the corresponding no-CBs CPU. 2377 * callbacks queued by the corresponding no-CBs CPU, however, there is
2378 * an optional leader-follower relationship so that the grace-period
2379 * kthreads don't have to do quite so many wakeups.
2218 */ 2380 */
2219static int rcu_nocb_kthread(void *arg) 2381static int rcu_nocb_kthread(void *arg)
2220{ 2382{
2221 int c, cl; 2383 int c, cl;
2222 bool firsttime = 1;
2223 struct rcu_head *list; 2384 struct rcu_head *list;
2224 struct rcu_head *next; 2385 struct rcu_head *next;
2225 struct rcu_head **tail; 2386 struct rcu_head **tail;
@@ -2227,41 +2388,22 @@ static int rcu_nocb_kthread(void *arg)
2227 2388
2228 /* Each pass through this loop invokes one batch of callbacks */ 2389 /* Each pass through this loop invokes one batch of callbacks */
2229 for (;;) { 2390 for (;;) {
2230 /* If not polling, wait for next batch of callbacks. */ 2391 /* Wait for callbacks. */
2231 if (!rcu_nocb_poll) { 2392 if (rdp->nocb_leader == rdp)
2232 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2393 nocb_leader_wait(rdp);
2233 TPS("Sleep")); 2394 else
2234 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); 2395 nocb_follower_wait(rdp);
2235 /* Memory barrier provide by xchg() below. */ 2396
2236 } else if (firsttime) { 2397 /* Pull the ready-to-invoke callbacks onto local list. */
2237 firsttime = 0; 2398 list = ACCESS_ONCE(rdp->nocb_follower_head);
2238 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2399 BUG_ON(!list);
2239 TPS("Poll")); 2400 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
2240 } 2401 ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
2241 list = ACCESS_ONCE(rdp->nocb_head); 2402 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
2242 if (!list) { 2403 c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
2243 if (!rcu_nocb_poll) 2404 cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
2244 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2405 rdp->nocb_p_count += c;
2245 TPS("WokeEmpty")); 2406 rdp->nocb_p_count_lazy += cl;
2246 schedule_timeout_interruptible(1);
2247 flush_signals(current);
2248 continue;
2249 }
2250 firsttime = 1;
2251 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2252 TPS("WokeNonEmpty"));
2253
2254 /*
2255 * Extract queued callbacks, update counts, and wait
2256 * for a grace period to elapse.
2257 */
2258 ACCESS_ONCE(rdp->nocb_head) = NULL;
2259 tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2260 c = atomic_long_xchg(&rdp->nocb_q_count, 0);
2261 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2262 ACCESS_ONCE(rdp->nocb_p_count) += c;
2263 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2264 rcu_nocb_wait_gp(rdp);
2265 2407
2266 /* Each pass through the following loop invokes a callback. */ 2408 /* Each pass through the following loop invokes a callback. */
2267 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); 2409 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2305,7 +2447,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2305 if (!rcu_nocb_need_deferred_wakeup(rdp)) 2447 if (!rcu_nocb_need_deferred_wakeup(rdp))
2306 return; 2448 return;
2307 ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; 2449 ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
2308 wake_up(&rdp->nocb_wq); 2450 wake_nocb_leader(rdp, false);
2309 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); 2451 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
2310} 2452}
2311 2453
@@ -2314,19 +2456,57 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2314{ 2456{
2315 rdp->nocb_tail = &rdp->nocb_head; 2457 rdp->nocb_tail = &rdp->nocb_head;
2316 init_waitqueue_head(&rdp->nocb_wq); 2458 init_waitqueue_head(&rdp->nocb_wq);
2459 rdp->nocb_follower_tail = &rdp->nocb_follower_head;
2317} 2460}
2318 2461
2319/* Create a kthread for each RCU flavor for each no-CBs CPU. */ 2462/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
2463static int rcu_nocb_leader_stride = -1;
2464module_param(rcu_nocb_leader_stride, int, 0444);
2465
2466/*
2467 * Create a kthread for each RCU flavor for each no-CBs CPU.
2468 * Also initialize leader-follower relationships.
2469 */
2320static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2470static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2321{ 2471{
2322 int cpu; 2472 int cpu;
2473 int ls = rcu_nocb_leader_stride;
2474 int nl = 0; /* Next leader. */
2323 struct rcu_data *rdp; 2475 struct rcu_data *rdp;
2476 struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
2477 struct rcu_data *rdp_prev = NULL;
2324 struct task_struct *t; 2478 struct task_struct *t;
2325 2479
2326 if (rcu_nocb_mask == NULL) 2480 if (rcu_nocb_mask == NULL)
2327 return; 2481 return;
2482#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL)
2483 if (tick_nohz_full_running)
2484 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
2485#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */
2486 if (ls == -1) {
2487 ls = int_sqrt(nr_cpu_ids);
2488 rcu_nocb_leader_stride = ls;
2489 }
2490
2491 /*
2492 * Each pass through this loop sets up one rcu_data structure and
2493 * spawns one rcu_nocb_kthread().
2494 */
2328 for_each_cpu(cpu, rcu_nocb_mask) { 2495 for_each_cpu(cpu, rcu_nocb_mask) {
2329 rdp = per_cpu_ptr(rsp->rda, cpu); 2496 rdp = per_cpu_ptr(rsp->rda, cpu);
2497 if (rdp->cpu >= nl) {
2498 /* New leader, set up for followers & next leader. */
2499 nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
2500 rdp->nocb_leader = rdp;
2501 rdp_leader = rdp;
2502 } else {
2503 /* Another follower, link to previous leader. */
2504 rdp->nocb_leader = rdp_leader;
2505 rdp_prev->nocb_next_follower = rdp;
2506 }
2507 rdp_prev = rdp;
2508
2509 /* Spawn the kthread for this CPU. */
2330 t = kthread_run(rcu_nocb_kthread, rdp, 2510 t = kthread_run(rcu_nocb_kthread, rdp,
2331 "rcuo%c/%d", rsp->abbr, cpu); 2511 "rcuo%c/%d", rsp->abbr, cpu);
2332 BUG_ON(IS_ERR(t)); 2512 BUG_ON(IS_ERR(t));
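The leader/follower grouping set up in this hunk can be hard to picture from the arithmetic alone. The standalone sketch below uses invented values: plain sqrt() replaces the kernel's int_sqrt(), and every CPU is assumed to be present in rcu_nocb_mask. It prints which CPU leads each NOCB group on a 16-CPU system.

/* build: cc stride.c -lm */
#include <math.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpu_ids = 16;
	int ls = (int)sqrt(nr_cpu_ids);	/* default stride: sqrt(nr_cpu_ids) */
	int nl = 0;			/* next leader */
	int cpu, leader = 0;

	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
		if (cpu >= nl) {
			/* New leader; next group starts at the next multiple of ls. */
			nl = DIV_ROUND_UP(cpu + 1, ls) * ls;
			leader = cpu;
		}
		printf("cpu %2d -> leader %2d\n", cpu, leader);
	}
	return 0;
}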
@@ -2843,12 +3023,16 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
2843 */ 3023 */
2844static void rcu_bind_gp_kthread(void) 3024static void rcu_bind_gp_kthread(void)
2845{ 3025{
2846#ifdef CONFIG_NO_HZ_FULL 3026 int __maybe_unused cpu;
2847 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2848 3027
2849 if (cpu < 0 || cpu >= nr_cpu_ids) 3028 if (!tick_nohz_full_enabled())
2850 return; 3029 return;
2851 if (raw_smp_processor_id() != cpu) 3030#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
3031 cpu = tick_do_timer_cpu;
3032 if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu)
2852 set_cpus_allowed_ptr(current, cpumask_of(cpu)); 3033 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2853#endif /* #ifdef CONFIG_NO_HZ_FULL */ 3034#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
3035 if (!is_housekeeping_cpu(raw_smp_processor_id()))
3036 housekeeping_affine(current);
3037#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2854} 3038}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index bc7883570530..4056d7992a6c 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -90,9 +90,6 @@ void __rcu_read_unlock(void)
90 } else { 90 } else {
91 barrier(); /* critical section before exit code. */ 91 barrier(); /* critical section before exit code. */
92 t->rcu_read_lock_nesting = INT_MIN; 92 t->rcu_read_lock_nesting = INT_MIN;
93#ifdef CONFIG_PROVE_RCU_DELAY
94 udelay(10); /* Make preemption more probable. */
95#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
96 barrier(); /* assign before ->rcu_read_unlock_special load */ 93 barrier(); /* assign before ->rcu_read_unlock_special load */
97 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 94 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
98 rcu_read_unlock_special(t); 95 rcu_read_unlock_special(t);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bc1638b33449..ec1a286684a5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq)
139 return; 139 return;
140 140
141 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 141 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
142 if (delta < 0)
143 return;
142 rq->clock += delta; 144 rq->clock += delta;
143 update_rq_clock_task(rq, delta); 145 update_rq_clock_task(rq, delta);
144} 146}
@@ -243,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
243 char buf[64]; 245 char buf[64];
244 char *cmp; 246 char *cmp;
245 int i; 247 int i;
248 struct inode *inode;
246 249
247 if (cnt > 63) 250 if (cnt > 63)
248 cnt = 63; 251 cnt = 63;
@@ -253,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
253 buf[cnt] = 0; 256 buf[cnt] = 0;
254 cmp = strstrip(buf); 257 cmp = strstrip(buf);
255 258
259 /* Ensure the static_key remains in a consistent state */
260 inode = file_inode(filp);
261 mutex_lock(&inode->i_mutex);
256 i = sched_feat_set(cmp); 262 i = sched_feat_set(cmp);
263 mutex_unlock(&inode->i_mutex);
257 if (i == __SCHED_FEAT_NR) 264 if (i == __SCHED_FEAT_NR)
258 return -EINVAL; 265 return -EINVAL;
259 266
@@ -587,30 +594,31 @@ static bool set_nr_if_polling(struct task_struct *p)
587#endif 594#endif
588 595
589/* 596/*
590 * resched_task - mark a task 'to be rescheduled now'. 597 * resched_curr - mark rq's current task 'to be rescheduled now'.
591 * 598 *
592 * On UP this means the setting of the need_resched flag, on SMP it 599 * On UP this means the setting of the need_resched flag, on SMP it
593 * might also involve a cross-CPU call to trigger the scheduler on 600 * might also involve a cross-CPU call to trigger the scheduler on
594 * the target CPU. 601 * the target CPU.
595 */ 602 */
596void resched_task(struct task_struct *p) 603void resched_curr(struct rq *rq)
597{ 604{
605 struct task_struct *curr = rq->curr;
598 int cpu; 606 int cpu;
599 607
600 lockdep_assert_held(&task_rq(p)->lock); 608 lockdep_assert_held(&rq->lock);
601 609
602 if (test_tsk_need_resched(p)) 610 if (test_tsk_need_resched(curr))
603 return; 611 return;
604 612
605 cpu = task_cpu(p); 613 cpu = cpu_of(rq);
606 614
607 if (cpu == smp_processor_id()) { 615 if (cpu == smp_processor_id()) {
608 set_tsk_need_resched(p); 616 set_tsk_need_resched(curr);
609 set_preempt_need_resched(); 617 set_preempt_need_resched();
610 return; 618 return;
611 } 619 }
612 620
613 if (set_nr_and_not_polling(p)) 621 if (set_nr_and_not_polling(curr))
614 smp_send_reschedule(cpu); 622 smp_send_reschedule(cpu);
615 else 623 else
616 trace_sched_wake_idle_without_ipi(cpu); 624 trace_sched_wake_idle_without_ipi(cpu);
@@ -623,7 +631,7 @@ void resched_cpu(int cpu)
623 631
624 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 632 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
625 return; 633 return;
626 resched_task(cpu_curr(cpu)); 634 resched_curr(rq);
627 raw_spin_unlock_irqrestore(&rq->lock, flags); 635 raw_spin_unlock_irqrestore(&rq->lock, flags);
628} 636}
629 637
@@ -684,10 +692,16 @@ static void wake_up_idle_cpu(int cpu)
684 692
685static bool wake_up_full_nohz_cpu(int cpu) 693static bool wake_up_full_nohz_cpu(int cpu)
686{ 694{
695 /*
696 * We just need the target to call irq_exit() and re-evaluate
697 * the next tick. The nohz full kick at least implies that.
698 * If needed we can still optimize that later with an
699 * empty IRQ.
700 */
687 if (tick_nohz_full_cpu(cpu)) { 701 if (tick_nohz_full_cpu(cpu)) {
688 if (cpu != smp_processor_id() || 702 if (cpu != smp_processor_id() ||
689 tick_nohz_tick_stopped()) 703 tick_nohz_tick_stopped())
690 smp_send_reschedule(cpu); 704 tick_nohz_full_kick_cpu(cpu);
691 return true; 705 return true;
692 } 706 }
693 707
@@ -730,18 +744,15 @@ static inline bool got_nohz_idle_kick(void)
730#ifdef CONFIG_NO_HZ_FULL 744#ifdef CONFIG_NO_HZ_FULL
731bool sched_can_stop_tick(void) 745bool sched_can_stop_tick(void)
732{ 746{
733 struct rq *rq; 747 /*
734 748 * More than one running task need preemption.
735 rq = this_rq(); 749 * nr_running update is assumed to be visible
736 750 * after IPI is sent from wakers.
737 /* Make sure rq->nr_running update is visible after the IPI */ 751 */
738 smp_rmb(); 752 if (this_rq()->nr_running > 1)
739 753 return false;
740 /* More than one running task need preemption */
741 if (rq->nr_running > 1)
742 return false;
743 754
744 return true; 755 return true;
745} 756}
746#endif /* CONFIG_NO_HZ_FULL */ 757#endif /* CONFIG_NO_HZ_FULL */
747 758
@@ -1022,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1022 if (class == rq->curr->sched_class) 1033 if (class == rq->curr->sched_class)
1023 break; 1034 break;
1024 if (class == p->sched_class) { 1035 if (class == p->sched_class) {
1025 resched_task(rq->curr); 1036 resched_curr(rq);
1026 break; 1037 break;
1027 } 1038 }
1028 } 1039 }
@@ -1568,9 +1579,7 @@ void scheduler_ipi(void)
1568 */ 1579 */
1569 preempt_fold_need_resched(); 1580 preempt_fold_need_resched();
1570 1581
1571 if (llist_empty(&this_rq()->wake_list) 1582 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1572 && !tick_nohz_full_cpu(smp_processor_id())
1573 && !got_nohz_idle_kick())
1574 return; 1583 return;
1575 1584
1576 /* 1585 /*
@@ -1587,7 +1596,6 @@ void scheduler_ipi(void)
1587 * somewhat pessimize the simple resched case. 1596 * somewhat pessimize the simple resched case.
1588 */ 1597 */
1589 irq_enter(); 1598 irq_enter();
1590 tick_nohz_full_check();
1591 sched_ttwu_pending(); 1599 sched_ttwu_pending();
1592 1600
1593 /* 1601 /*
@@ -2385,6 +2393,13 @@ unsigned long nr_iowait_cpu(int cpu)
2385 return atomic_read(&this->nr_iowait); 2393 return atomic_read(&this->nr_iowait);
2386} 2394}
2387 2395
2396void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2397{
2398 struct rq *this = this_rq();
2399 *nr_waiters = atomic_read(&this->nr_iowait);
2400 *load = this->cpu_load[0];
2401}
2402
2388#ifdef CONFIG_SMP 2403#ifdef CONFIG_SMP
2389 2404
2390/* 2405/*
@@ -2431,7 +2446,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2431{ 2446{
2432 u64 ns = 0; 2447 u64 ns = 0;
2433 2448
2434 if (task_current(rq, p)) { 2449 /*
2450 * Must be ->curr _and_ ->on_rq. If dequeued, we would
2451 * project cycles that may never be accounted to this
2452 * thread, breaking clock_gettime().
2453 */
2454 if (task_current(rq, p) && p->on_rq) {
2435 update_rq_clock(rq); 2455 update_rq_clock(rq);
2436 ns = rq_clock_task(rq) - p->se.exec_start; 2456 ns = rq_clock_task(rq) - p->se.exec_start;
2437 if ((s64)ns < 0) 2457 if ((s64)ns < 0)
@@ -2474,8 +2494,10 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2474 * If we race with it leaving cpu, we'll take a lock. So we're correct. 2494 * If we race with it leaving cpu, we'll take a lock. So we're correct.
2475 * If we race with it entering cpu, unaccounted time is 0. This is 2495 * If we race with it entering cpu, unaccounted time is 0. This is
2476 * indistinguishable from the read occurring a few cycles earlier. 2496 * indistinguishable from the read occurring a few cycles earlier.
2497 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
2498 * been accounted, so we're correct here as well.
2477 */ 2499 */
2478 if (!p->on_cpu) 2500 if (!p->on_cpu || !p->on_rq)
2479 return p->se.sum_exec_runtime; 2501 return p->se.sum_exec_runtime;
2480#endif 2502#endif
2481 2503
@@ -2971,7 +2993,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2971 } 2993 }
2972 2994
2973 trace_sched_pi_setprio(p, prio); 2995 trace_sched_pi_setprio(p, prio);
2974 p->pi_top_task = rt_mutex_get_top_task(p);
2975 oldprio = p->prio; 2996 oldprio = p->prio;
2976 prev_class = p->sched_class; 2997 prev_class = p->sched_class;
2977 on_rq = p->on_rq; 2998 on_rq = p->on_rq;
@@ -2991,8 +3012,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2991 * running task 3012 * running task
2992 */ 3013 */
2993 if (dl_prio(prio)) { 3014 if (dl_prio(prio)) {
2994 if (!dl_prio(p->normal_prio) || (p->pi_top_task && 3015 struct task_struct *pi_task = rt_mutex_get_top_task(p);
2995 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { 3016 if (!dl_prio(p->normal_prio) ||
3017 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
2996 p->dl.dl_boosted = 1; 3018 p->dl.dl_boosted = 1;
2997 p->dl.dl_throttled = 0; 3019 p->dl.dl_throttled = 0;
2998 enqueue_flag = ENQUEUE_REPLENISH; 3020 enqueue_flag = ENQUEUE_REPLENISH;
@@ -3064,7 +3086,7 @@ void set_user_nice(struct task_struct *p, long nice)
3064 * lowered its priority, then reschedule its CPU: 3086 * lowered its priority, then reschedule its CPU:
3065 */ 3087 */
3066 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3088 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3067 resched_task(rq->curr); 3089 resched_curr(rq);
3068 } 3090 }
3069out_unlock: 3091out_unlock:
3070 task_rq_unlock(rq, p, &flags); 3092 task_rq_unlock(rq, p, &flags);
@@ -3203,12 +3225,18 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3203 dl_se->dl_yielded = 0; 3225 dl_se->dl_yielded = 0;
3204} 3226}
3205 3227
3228/*
3229 * sched_setparam() passes in -1 for its policy, to let the functions
3230 * it calls know not to change it.
3231 */
3232#define SETPARAM_POLICY -1
3233
3206static void __setscheduler_params(struct task_struct *p, 3234static void __setscheduler_params(struct task_struct *p,
3207 const struct sched_attr *attr) 3235 const struct sched_attr *attr)
3208{ 3236{
3209 int policy = attr->sched_policy; 3237 int policy = attr->sched_policy;
3210 3238
3211 if (policy == -1) /* setparam */ 3239 if (policy == SETPARAM_POLICY)
3212 policy = p->policy; 3240 policy = p->policy;
3213 3241
3214 p->policy = policy; 3242 p->policy = policy;
@@ -3557,10 +3585,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
3557 .sched_nice = PRIO_TO_NICE(p->static_prio), 3585 .sched_nice = PRIO_TO_NICE(p->static_prio),
3558 }; 3586 };
3559 3587
3560 /* 3588 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
3561 * Fixup the legacy SCHED_RESET_ON_FORK hack 3589 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
3562 */
3563 if (policy & SCHED_RESET_ON_FORK) {
3564 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3590 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3565 policy &= ~SCHED_RESET_ON_FORK; 3591 policy &= ~SCHED_RESET_ON_FORK;
3566 attr.sched_policy = policy; 3592 attr.sched_policy = policy;
@@ -3730,7 +3756,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3730 */ 3756 */
3731SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3757SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3732{ 3758{
3733 return do_sched_setscheduler(pid, -1, param); 3759 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
3734} 3760}
3735 3761
3736/** 3762/**
@@ -4285,7 +4311,7 @@ again:
4285 * fairness. 4311 * fairness.
4286 */ 4312 */
4287 if (preempt && rq != p_rq) 4313 if (preempt && rq != p_rq)
4288 resched_task(p_rq->curr); 4314 resched_curr(p_rq);
4289 } 4315 }
4290 4316
4291out_unlock: 4317out_unlock:
@@ -6465,6 +6491,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6465 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6491 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6466 child->parent = sd; 6492 child->parent = sd;
6467 sd->child = child; 6493 sd->child = child;
6494
6495 if (!cpumask_subset(sched_domain_span(child),
6496 sched_domain_span(sd))) {
6497 pr_err("BUG: arch topology borken\n");
6498#ifdef CONFIG_SCHED_DEBUG
6499 pr_err(" the %s domain not a subset of the %s domain\n",
6500 child->name, sd->name);
6501#endif
6502 /* Fixup, ensure @sd has at least @child cpus. */
6503 cpumask_or(sched_domain_span(sd),
6504 sched_domain_span(sd),
6505 sched_domain_span(child));
6506 }
6507
6468 } 6508 }
6469 set_domain_attribute(sd, attr); 6509 set_domain_attribute(sd, attr);
6470 6510
@@ -7092,7 +7132,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7092 __setscheduler(rq, p, &attr); 7132 __setscheduler(rq, p, &attr);
7093 if (on_rq) { 7133 if (on_rq) {
7094 enqueue_task(rq, p, 0); 7134 enqueue_task(rq, p, 0);
7095 resched_task(rq->curr); 7135 resched_curr(rq);
7096 } 7136 }
7097 7137
7098 check_class_changed(rq, p, prev_class, old_prio); 7138 check_class_changed(rq, p, prev_class, old_prio);
@@ -7803,6 +7843,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7803 if (period > max_cfs_quota_period) 7843 if (period > max_cfs_quota_period)
7804 return -EINVAL; 7844 return -EINVAL;
7805 7845
7846 /*
7847 * Prevent race between setting of cfs_rq->runtime_enabled and
7848 * unthrottle_offline_cfs_rqs().
7849 */
7850 get_online_cpus();
7806 mutex_lock(&cfs_constraints_mutex); 7851 mutex_lock(&cfs_constraints_mutex);
7807 ret = __cfs_schedulable(tg, period, quota); 7852 ret = __cfs_schedulable(tg, period, quota);
7808 if (ret) 7853 if (ret)
@@ -7828,7 +7873,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7828 } 7873 }
7829 raw_spin_unlock_irq(&cfs_b->lock); 7874 raw_spin_unlock_irq(&cfs_b->lock);
7830 7875
7831 for_each_possible_cpu(i) { 7876 for_each_online_cpu(i) {
7832 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7877 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7833 struct rq *rq = cfs_rq->rq; 7878 struct rq *rq = cfs_rq->rq;
7834 7879
@@ -7844,6 +7889,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7844 cfs_bandwidth_usage_dec(); 7889 cfs_bandwidth_usage_dec();
7845out_unlock: 7890out_unlock:
7846 mutex_unlock(&cfs_constraints_mutex); 7891 mutex_unlock(&cfs_constraints_mutex);
7892 put_online_cpus();
7847 7893
7848 return ret; 7894 return ret;
7849} 7895}
@@ -8083,7 +8129,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
8083 .can_attach = cpu_cgroup_can_attach, 8129 .can_attach = cpu_cgroup_can_attach,
8084 .attach = cpu_cgroup_attach, 8130 .attach = cpu_cgroup_attach,
8085 .exit = cpu_cgroup_exit, 8131 .exit = cpu_cgroup_exit,
8086 .base_cftypes = cpu_files, 8132 .legacy_cftypes = cpu_files,
8087 .early_init = 1, 8133 .early_init = 1,
8088}; 8134};
8089 8135
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9cf350c94ec4..dd7cbb55bbf2 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -278,6 +278,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
278struct cgroup_subsys cpuacct_cgrp_subsys = { 278struct cgroup_subsys cpuacct_cgrp_subsys = {
279 .css_alloc = cpuacct_css_alloc, 279 .css_alloc = cpuacct_css_alloc,
280 .css_free = cpuacct_css_free, 280 .css_free = cpuacct_css_free,
281 .base_cftypes = files, 281 .legacy_cftypes = files,
282 .early_init = 1, 282 .early_init = 1,
283}; 283};
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fc4f98b1258f..255ce138b652 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -306,7 +306,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
306 * the overrunning entity can't interfere with other entity in the system and 306 * the overrunning entity can't interfere with other entity in the system and
307 * can't make them miss their deadlines. Reasons why this kind of overruns 307 * can't make them miss their deadlines. Reasons why this kind of overruns
308 * could happen are, typically, a entity voluntarily trying to overcome its 308 * could happen are, typically, a entity voluntarily trying to overcome its
309 * runtime, or it just underestimated it during sched_setscheduler_ex(). 309 * runtime, or it just underestimated it during sched_setattr().
310 */ 310 */
311static void replenish_dl_entity(struct sched_dl_entity *dl_se, 311static void replenish_dl_entity(struct sched_dl_entity *dl_se,
312 struct sched_dl_entity *pi_se) 312 struct sched_dl_entity *pi_se)
@@ -535,7 +535,7 @@ again:
535 if (task_has_dl_policy(rq->curr)) 535 if (task_has_dl_policy(rq->curr))
536 check_preempt_curr_dl(rq, p, 0); 536 check_preempt_curr_dl(rq, p, 0);
537 else 537 else
538 resched_task(rq->curr); 538 resched_curr(rq);
539#ifdef CONFIG_SMP 539#ifdef CONFIG_SMP
540 /* 540 /*
541 * Queueing this task back might have overloaded rq, 541 * Queueing this task back might have overloaded rq,
@@ -634,7 +634,7 @@ static void update_curr_dl(struct rq *rq)
634 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); 634 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
635 635
636 if (!is_leftmost(curr, &rq->dl)) 636 if (!is_leftmost(curr, &rq->dl))
637 resched_task(curr); 637 resched_curr(rq);
638 } 638 }
639 639
640 /* 640 /*
@@ -964,7 +964,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
964 cpudl_find(&rq->rd->cpudl, p, NULL) != -1) 964 cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
965 return; 965 return;
966 966
967 resched_task(rq->curr); 967 resched_curr(rq);
968} 968}
969 969
970static int pull_dl_task(struct rq *this_rq); 970static int pull_dl_task(struct rq *this_rq);
@@ -979,7 +979,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
979 int flags) 979 int flags)
980{ 980{
981 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { 981 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
982 resched_task(rq->curr); 982 resched_curr(rq);
983 return; 983 return;
984 } 984 }
985 985
@@ -1333,7 +1333,7 @@ retry:
1333 if (dl_task(rq->curr) && 1333 if (dl_task(rq->curr) &&
1334 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && 1334 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
1335 rq->curr->nr_cpus_allowed > 1) { 1335 rq->curr->nr_cpus_allowed > 1) {
1336 resched_task(rq->curr); 1336 resched_curr(rq);
1337 return 0; 1337 return 0;
1338 } 1338 }
1339 1339
@@ -1373,7 +1373,7 @@ retry:
1373 set_task_cpu(next_task, later_rq->cpu); 1373 set_task_cpu(next_task, later_rq->cpu);
1374 activate_task(later_rq, next_task, 0); 1374 activate_task(later_rq, next_task, 0);
1375 1375
1376 resched_task(later_rq->curr); 1376 resched_curr(later_rq);
1377 1377
1378 double_unlock_balance(rq, later_rq); 1378 double_unlock_balance(rq, later_rq);
1379 1379
@@ -1632,14 +1632,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1632 */ 1632 */
1633 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && 1633 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
1634 rq->curr == p) 1634 rq->curr == p)
1635 resched_task(p); 1635 resched_curr(rq);
1636#else 1636#else
1637 /* 1637 /*
1638 * Again, we don't know if p has a earlier 1638 * Again, we don't know if p has a earlier
1639 * or later deadline, so let's blindly set a 1639 * or later deadline, so let's blindly set a
1640 * (maybe not needed) rescheduling point. 1640 * (maybe not needed) rescheduling point.
1641 */ 1641 */
1642 resched_task(p); 1642 resched_curr(rq);
1643#endif /* CONFIG_SMP */ 1643#endif /* CONFIG_SMP */
1644 } else 1644 } else
1645 switched_to_dl(rq, p); 1645 switched_to_dl(rq, p);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fea7d3335e1f..bfa3c86d0d68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1062 if (!cpus) 1062 if (!cpus)
1063 return; 1063 return;
1064 1064
1065 ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
1066 ns->task_capacity = 1065 ns->task_capacity =
1067 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); 1066 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
1068 ns->has_free_capacity = (ns->nr_running < ns->task_capacity); 1067 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
@@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env,
1096 env->best_cpu = env->dst_cpu; 1095 env->best_cpu = env->dst_cpu;
1097} 1096}
1098 1097
1099static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, 1098static bool load_too_imbalanced(long src_load, long dst_load,
1100 long src_load, long dst_load,
1101 struct task_numa_env *env) 1099 struct task_numa_env *env)
1102{ 1100{
1103 long imb, old_imb; 1101 long imb, old_imb;
1102 long orig_src_load, orig_dst_load;
1103 long src_capacity, dst_capacity;
1104
1105 /*
1106 * The load is corrected for the CPU capacity available on each node.
1107 *
1108 * src_load dst_load
1109 * ------------ vs ---------
1110 * src_capacity dst_capacity
1111 */
1112 src_capacity = env->src_stats.compute_capacity;
1113 dst_capacity = env->dst_stats.compute_capacity;
1104 1114
1105 /* We care about the slope of the imbalance, not the direction. */ 1115 /* We care about the slope of the imbalance, not the direction. */
1106 if (dst_load < src_load) 1116 if (dst_load < src_load)
1107 swap(dst_load, src_load); 1117 swap(dst_load, src_load);
1108 1118
1109 /* Is the difference below the threshold? */ 1119 /* Is the difference below the threshold? */
1110 imb = dst_load * 100 - src_load * env->imbalance_pct; 1120 imb = dst_load * src_capacity * 100 -
1121 src_load * dst_capacity * env->imbalance_pct;
1111 if (imb <= 0) 1122 if (imb <= 0)
1112 return false; 1123 return false;
1113 1124
@@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
1115 * The imbalance is above the allowed threshold. 1126 * The imbalance is above the allowed threshold.
1116 * Compare it with the old imbalance. 1127 * Compare it with the old imbalance.
1117 */ 1128 */
1129 orig_src_load = env->src_stats.load;
1130 orig_dst_load = env->dst_stats.load;
1131
1118 if (orig_dst_load < orig_src_load) 1132 if (orig_dst_load < orig_src_load)
1119 swap(orig_dst_load, orig_src_load); 1133 swap(orig_dst_load, orig_src_load);
1120 1134
1121 old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; 1135 old_imb = orig_dst_load * src_capacity * 100 -
1136 orig_src_load * dst_capacity * env->imbalance_pct;
1122 1137
1123 /* Would this change make things worse? */ 1138 /* Would this change make things worse? */
1124 return (imb > old_imb); 1139 return (imb > old_imb);
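A small worked example of the capacity-corrected comparison introduced above, with invented numbers: instead of dividing each node's load by its compute capacity, the code cross-multiplies, so everything stays in integer arithmetic.

#include <stdbool.h>
#include <stdio.h>

/* Same shape as the check above, with illustrative values plugged in. */
static bool too_imbalanced(long src_load, long dst_load,
			   long src_capacity, long dst_capacity,
			   int imbalance_pct)
{
	long imb;

	/* Only the magnitude of the imbalance matters, not its direction. */
	if (dst_load < src_load) {
		long tmp = dst_load;
		dst_load = src_load;
		src_load = tmp;
	}

	/*
	 * Compare dst_load/dst_capacity against src_load/src_capacity
	 * without dividing: dst_load * src_capacity versus
	 * src_load * dst_capacity, with the usual imbalance_pct slack.
	 */
	imb = dst_load * src_capacity * 100 -
	      src_load * dst_capacity * imbalance_pct;
	return imb > 0;
}

int main(void)
{
	/* A node with half the capacity carrying the same load is imbalanced. */
	printf("%d\n", too_imbalanced(1000, 1000, 2048, 1024, 125)); /* 1 */
	/* Equal capacities, 20 percent extra load: within a 125 threshold. */
	printf("%d\n", too_imbalanced(1000, 1200, 1024, 1024, 125)); /* 0 */
	return 0;
}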
@@ -1136,10 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env,
1136 struct rq *src_rq = cpu_rq(env->src_cpu); 1151 struct rq *src_rq = cpu_rq(env->src_cpu);
1137 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1152 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1138 struct task_struct *cur; 1153 struct task_struct *cur;
1139 long orig_src_load, src_load; 1154 long src_load, dst_load;
1140 long orig_dst_load, dst_load;
1141 long load; 1155 long load;
1142 long imp = (groupimp > 0) ? groupimp : taskimp; 1156 long imp = env->p->numa_group ? groupimp : taskimp;
1157 long moveimp = imp;
1143 1158
1144 rcu_read_lock(); 1159 rcu_read_lock();
1145 cur = ACCESS_ONCE(dst_rq->curr); 1160 cur = ACCESS_ONCE(dst_rq->curr);
@@ -1177,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env,
1177 * itself (not part of a group), use the task weight 1192 * itself (not part of a group), use the task weight
1178 * instead. 1193 * instead.
1179 */ 1194 */
1180 if (env->p->numa_group)
1181 imp = groupimp;
1182 else
1183 imp = taskimp;
1184
1185 if (cur->numa_group) 1195 if (cur->numa_group)
1186 imp += group_weight(cur, env->src_nid) - 1196 imp += group_weight(cur, env->src_nid) -
1187 group_weight(cur, env->dst_nid); 1197 group_weight(cur, env->dst_nid);
@@ -1191,7 +1201,7 @@ static void task_numa_compare(struct task_numa_env *env,
1191 } 1201 }
1192 } 1202 }
1193 1203
1194 if (imp < env->best_imp) 1204 if (imp <= env->best_imp && moveimp <= env->best_imp)
1195 goto unlock; 1205 goto unlock;
1196 1206
1197 if (!cur) { 1207 if (!cur) {
@@ -1204,20 +1214,34 @@ static void task_numa_compare(struct task_numa_env *env,
1204 } 1214 }
1205 1215
1206 /* Balance doesn't matter much if we're running a task per cpu */ 1216 /* Balance doesn't matter much if we're running a task per cpu */
1207 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) 1217 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1218 dst_rq->nr_running == 1)
1208 goto assign; 1219 goto assign;
1209 1220
1210 /* 1221 /*
1211 * In the overloaded case, try and keep the load balanced. 1222 * In the overloaded case, try and keep the load balanced.
1212 */ 1223 */
1213balance: 1224balance:
1214 orig_dst_load = env->dst_stats.load;
1215 orig_src_load = env->src_stats.load;
1216
1217 /* XXX missing capacity terms */
1218 load = task_h_load(env->p); 1225 load = task_h_load(env->p);
1219 dst_load = orig_dst_load + load; 1226 dst_load = env->dst_stats.load + load;
1220 src_load = orig_src_load - load; 1227 src_load = env->src_stats.load - load;
1228
1229 if (moveimp > imp && moveimp > env->best_imp) {
1230 /*
1231 * If the improvement from just moving env->p direction is
1232 * better than swapping tasks around, check if a move is
1233 * possible. Store a slightly smaller score than moveimp,
1234 * so an actually idle CPU will win.
1235 */
1236 if (!load_too_imbalanced(src_load, dst_load, env)) {
1237 imp = moveimp - 1;
1238 cur = NULL;
1239 goto assign;
1240 }
1241 }
1242
1243 if (imp <= env->best_imp)
1244 goto unlock;
1221 1245
1222 if (cur) { 1246 if (cur) {
1223 load = task_h_load(cur); 1247 load = task_h_load(cur);
@@ -1225,8 +1249,7 @@ balance:
1225 src_load += load; 1249 src_load += load;
1226 } 1250 }
1227 1251
1228 if (load_too_imbalanced(orig_src_load, orig_dst_load, 1252 if (load_too_imbalanced(src_load, dst_load, env))
1229 src_load, dst_load, env))
1230 goto unlock; 1253 goto unlock;
1231 1254
1232assign: 1255assign:
@@ -1302,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p)
1302 groupimp = group_weight(p, env.dst_nid) - groupweight; 1325 groupimp = group_weight(p, env.dst_nid) - groupweight;
1303 update_numa_stats(&env.dst_stats, env.dst_nid); 1326 update_numa_stats(&env.dst_stats, env.dst_nid);
1304 1327
1305 /* If the preferred nid has free capacity, try to use it. */ 1328 /* Try to find a spot on the preferred nid. */
1306 if (env.dst_stats.has_free_capacity) 1329 task_numa_find_cpu(&env, taskimp, groupimp);
1307 task_numa_find_cpu(&env, taskimp, groupimp);
1308 1330
1309 /* No space available on the preferred nid. Look elsewhere. */ 1331 /* No space available on the preferred nid. Look elsewhere. */
1310 if (env.best_cpu == -1) { 1332 if (env.best_cpu == -1) {
@@ -1324,10 +1346,6 @@ static int task_numa_migrate(struct task_struct *p)
1324 } 1346 }
1325 } 1347 }
1326 1348
1327 /* No better CPU than the current one was found. */
1328 if (env.best_cpu == -1)
1329 return -EAGAIN;
1330
1331 /* 1349 /*
1332 * If the task is part of a workload that spans multiple NUMA nodes, 1350 * If the task is part of a workload that spans multiple NUMA nodes,
1333 * and is migrating into one of the workload's active nodes, remember 1351 * and is migrating into one of the workload's active nodes, remember
@@ -1336,8 +1354,19 @@ static int task_numa_migrate(struct task_struct *p)
1336 * A task that migrated to a second choice node will be better off 1354 * A task that migrated to a second choice node will be better off
1337 * trying for a better one later. Do not set the preferred node here. 1355 * trying for a better one later. Do not set the preferred node here.
1338 */ 1356 */
1339 if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) 1357 if (p->numa_group) {
1340 sched_setnuma(p, env.dst_nid); 1358 if (env.best_cpu == -1)
1359 nid = env.src_nid;
1360 else
1361 nid = env.dst_nid;
1362
1363 if (node_isset(nid, p->numa_group->active_nodes))
1364 sched_setnuma(p, env.dst_nid);
1365 }
1366
1367 /* No better CPU than the current one was found. */
1368 if (env.best_cpu == -1)
1369 return -EAGAIN;
1341 1370
1342 /* 1371 /*
1343 * Reset the scan period if the task is being rescheduled on an 1372 * Reset the scan period if the task is being rescheduled on an
@@ -1415,12 +1444,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
1415/* 1444/*
1416 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 1445 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1417 * increments. The more local the fault statistics are, the higher the scan 1446 * increments. The more local the fault statistics are, the higher the scan
1418 * period will be for the next scan window. If local/remote ratio is below 1447 * period will be for the next scan window. If local/(local+remote) ratio is
1419 * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the 1448 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1420 * scan period will decrease 1449 * the scan period will decrease. Aim for 70% local accesses.
1421 */ 1450 */
1422#define NUMA_PERIOD_SLOTS 10 1451#define NUMA_PERIOD_SLOTS 10
1423#define NUMA_PERIOD_THRESHOLD 3 1452#define NUMA_PERIOD_THRESHOLD 7
1424 1453
1425/* 1454/*
1426 * Increase the scan period (slow down scanning) if the majority of 1455 * Increase the scan period (slow down scanning) if the majority of
@@ -1595,30 +1624,17 @@ static void task_numa_placement(struct task_struct *p)
1595 1624
1596 if (p->numa_group) { 1625 if (p->numa_group) {
1597 update_numa_active_node_mask(p->numa_group); 1626 update_numa_active_node_mask(p->numa_group);
1598 /*
1599 * If the preferred task and group nids are different,
1600 * iterate over the nodes again to find the best place.
1601 */
1602 if (max_nid != max_group_nid) {
1603 unsigned long weight, max_weight = 0;
1604
1605 for_each_online_node(nid) {
1606 weight = task_weight(p, nid) + group_weight(p, nid);
1607 if (weight > max_weight) {
1608 max_weight = weight;
1609 max_nid = nid;
1610 }
1611 }
1612 }
1613
1614 spin_unlock_irq(group_lock); 1627 spin_unlock_irq(group_lock);
1628 max_nid = max_group_nid;
1615 } 1629 }
1616 1630
1617 /* Preferred node as the node with the most faults */ 1631 if (max_faults) {
1618 if (max_faults && max_nid != p->numa_preferred_nid) { 1632 /* Set the new preferred node */
1619 /* Update the preferred nid and migrate task if possible */ 1633 if (max_nid != p->numa_preferred_nid)
1620 sched_setnuma(p, max_nid); 1634 sched_setnuma(p, max_nid);
1621 numa_migrate_preferred(p); 1635
1636 if (task_node(p) != p->numa_preferred_nid)
1637 numa_migrate_preferred(p);
1622 } 1638 }
1623} 1639}
1624 1640
@@ -2899,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2899 ideal_runtime = sched_slice(cfs_rq, curr); 2915 ideal_runtime = sched_slice(cfs_rq, curr);
2900 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 2916 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2901 if (delta_exec > ideal_runtime) { 2917 if (delta_exec > ideal_runtime) {
2902 resched_task(rq_of(cfs_rq)->curr); 2918 resched_curr(rq_of(cfs_rq));
2903 /* 2919 /*
2904 * The current task ran long enough, ensure it doesn't get 2920 * The current task ran long enough, ensure it doesn't get
2905 * re-elected due to buddy favours. 2921 * re-elected due to buddy favours.
@@ -2923,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2923 return; 2939 return;
2924 2940
2925 if (delta > ideal_runtime) 2941 if (delta > ideal_runtime)
2926 resched_task(rq_of(cfs_rq)->curr); 2942 resched_curr(rq_of(cfs_rq));
2927} 2943}
2928 2944
2929static void 2945static void
@@ -3063,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3063 * validating it and just reschedule. 3079 * validating it and just reschedule.
3064 */ 3080 */
3065 if (queued) { 3081 if (queued) {
3066 resched_task(rq_of(cfs_rq)->curr); 3082 resched_curr(rq_of(cfs_rq));
3067 return; 3083 return;
3068 } 3084 }
3069 /* 3085 /*
@@ -3254,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3254 * hierarchy can be throttled 3270 * hierarchy can be throttled
3255 */ 3271 */
3256 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) 3272 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3257 resched_task(rq_of(cfs_rq)->curr); 3273 resched_curr(rq_of(cfs_rq));
3258} 3274}
3259 3275
3260static __always_inline 3276static __always_inline
@@ -3360,7 +3376,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3360 cfs_rq->throttled = 1; 3376 cfs_rq->throttled = 1;
3361 cfs_rq->throttled_clock = rq_clock(rq); 3377 cfs_rq->throttled_clock = rq_clock(rq);
3362 raw_spin_lock(&cfs_b->lock); 3378 raw_spin_lock(&cfs_b->lock);
3363 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3379 /*
3380 * Add to the _head_ of the list, so that an already-started
3381 * distribute_cfs_runtime will not see us
3382 */
3383 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3364 if (!cfs_b->timer_active) 3384 if (!cfs_b->timer_active)
3365 __start_cfs_bandwidth(cfs_b, false); 3385 __start_cfs_bandwidth(cfs_b, false);
3366 raw_spin_unlock(&cfs_b->lock); 3386 raw_spin_unlock(&cfs_b->lock);
@@ -3410,14 +3430,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3410 3430
3411 /* determine whether we need to wake up potentially idle cpu */ 3431 /* determine whether we need to wake up potentially idle cpu */
3412 if (rq->curr == rq->idle && rq->cfs.nr_running) 3432 if (rq->curr == rq->idle && rq->cfs.nr_running)
3413 resched_task(rq->curr); 3433 resched_curr(rq);
3414} 3434}
3415 3435
3416static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, 3436static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3417 u64 remaining, u64 expires) 3437 u64 remaining, u64 expires)
3418{ 3438{
3419 struct cfs_rq *cfs_rq; 3439 struct cfs_rq *cfs_rq;
3420 u64 runtime = remaining; 3440 u64 runtime;
3441 u64 starting_runtime = remaining;
3421 3442
3422 rcu_read_lock(); 3443 rcu_read_lock();
3423 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, 3444 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -3448,7 +3469,7 @@ next:
3448 } 3469 }
3449 rcu_read_unlock(); 3470 rcu_read_unlock();
3450 3471
3451 return remaining; 3472 return starting_runtime - remaining;
3452} 3473}
3453 3474
3454/* 3475/*
@@ -3494,22 +3515,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3494 /* account preceding periods in which throttling occurred */ 3515 /* account preceding periods in which throttling occurred */
3495 cfs_b->nr_throttled += overrun; 3516 cfs_b->nr_throttled += overrun;
3496 3517
3497 /*
3498 * There are throttled entities so we must first use the new bandwidth
3499 * to unthrottle them before making it generally available. This
3500 * ensures that all existing debts will be paid before a new cfs_rq is
3501 * allowed to run.
3502 */
3503 runtime = cfs_b->runtime;
3504 runtime_expires = cfs_b->runtime_expires; 3518 runtime_expires = cfs_b->runtime_expires;
3505 cfs_b->runtime = 0;
3506 3519
3507 /* 3520 /*
3508 * This check is repeated as we are holding onto the new bandwidth 3521 * This check is repeated as we are holding onto the new bandwidth while
3509 * while we unthrottle. This can potentially race with an unthrottled 3522 * we unthrottle. This can potentially race with an unthrottled group
3510 * group trying to acquire new bandwidth from the global pool. 3523 * trying to acquire new bandwidth from the global pool. This can result
3524 * in us over-using our runtime if it is all used during this loop, but
3525 * only by limited amounts in that extreme case.
3511 */ 3526 */
3512 while (throttled && runtime > 0) { 3527 while (throttled && cfs_b->runtime > 0) {
3528 runtime = cfs_b->runtime;
3513 raw_spin_unlock(&cfs_b->lock); 3529 raw_spin_unlock(&cfs_b->lock);
3514 /* we can't nest cfs_b->lock while distributing bandwidth */ 3530 /* we can't nest cfs_b->lock while distributing bandwidth */
3515 runtime = distribute_cfs_runtime(cfs_b, runtime, 3531 runtime = distribute_cfs_runtime(cfs_b, runtime,
@@ -3517,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3517 raw_spin_lock(&cfs_b->lock); 3533 raw_spin_lock(&cfs_b->lock);
3518 3534
3519 throttled = !list_empty(&cfs_b->throttled_cfs_rq); 3535 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3536
3537 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3520 } 3538 }
3521 3539
3522 /* return (any) remaining runtime */
3523 cfs_b->runtime = runtime;
3524 /* 3540 /*
3525 * While we are ensured activity in the period following an 3541 * While we are ensured activity in the period following an
3526 * unthrottle, this also covers the case in which the new bandwidth is 3542 * unthrottle, this also covers the case in which the new bandwidth is
@@ -3631,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3631 return; 3647 return;
3632 } 3648 }
3633 3649
3634 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { 3650 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
3635 runtime = cfs_b->runtime; 3651 runtime = cfs_b->runtime;
3636 cfs_b->runtime = 0; 3652
3637 }
3638 expires = cfs_b->runtime_expires; 3653 expires = cfs_b->runtime_expires;
3639 raw_spin_unlock(&cfs_b->lock); 3654 raw_spin_unlock(&cfs_b->lock);
3640 3655
@@ -3645,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3645 3660
3646 raw_spin_lock(&cfs_b->lock); 3661 raw_spin_lock(&cfs_b->lock);
3647 if (expires == cfs_b->runtime_expires) 3662 if (expires == cfs_b->runtime_expires)
3648 cfs_b->runtime = runtime; 3663 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3649 raw_spin_unlock(&cfs_b->lock); 3664 raw_spin_unlock(&cfs_b->lock);
3650} 3665}
3651 3666
@@ -3775,6 +3790,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3775 hrtimer_cancel(&cfs_b->slack_timer); 3790 hrtimer_cancel(&cfs_b->slack_timer);
3776} 3791}
3777 3792
3793static void __maybe_unused update_runtime_enabled(struct rq *rq)
3794{
3795 struct cfs_rq *cfs_rq;
3796
3797 for_each_leaf_cfs_rq(rq, cfs_rq) {
3798 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
3799
3800 raw_spin_lock(&cfs_b->lock);
3801 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
3802 raw_spin_unlock(&cfs_b->lock);
3803 }
3804}
3805
3778static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) 3806static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3779{ 3807{
3780 struct cfs_rq *cfs_rq; 3808 struct cfs_rq *cfs_rq;
@@ -3788,6 +3816,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3788 * there's some valid quota amount 3816 * there's some valid quota amount
3789 */ 3817 */
3790 cfs_rq->runtime_remaining = 1; 3818 cfs_rq->runtime_remaining = 1;
3819 /*
3820 * Offline rq is schedulable till cpu is completely disabled
3821 * in take_cpu_down(), so we prevent new cfs throttling here.
3822 */
3823 cfs_rq->runtime_enabled = 0;
3824
3791 if (cfs_rq_throttled(cfs_rq)) 3825 if (cfs_rq_throttled(cfs_rq))
3792 unthrottle_cfs_rq(cfs_rq); 3826 unthrottle_cfs_rq(cfs_rq);
3793 } 3827 }
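Clearing cfs_rq->runtime_enabled on the offline path matters because that flag is what the quota-accounting fast path consults before charging anything, so an offline-but-still-schedulable rq can no longer be throttled. A rough sketch of the consumer side, assuming the existing account_cfs_rq_runtime() structure in fair.c (simplified, not the verbatim function):

/* Sketch only: charging is skipped entirely once runtime_enabled is 0. */
static void account_cfs_rq_runtime_sketch(struct cfs_rq *cfs_rq, u64 delta_exec)
{
	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
		return;		/* no bandwidth control on this rq */

	/* ... otherwise charge delta_exec against cfs_rq->runtime_remaining
	 * and throttle the rq when it runs out ... */
}
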
@@ -3831,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3831 return NULL; 3865 return NULL;
3832} 3866}
3833static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} 3867static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3868static inline void update_runtime_enabled(struct rq *rq) {}
3834static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} 3869static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
3835 3870
3836#endif /* CONFIG_CFS_BANDWIDTH */ 3871#endif /* CONFIG_CFS_BANDWIDTH */
@@ -3854,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3854 3889
3855 if (delta < 0) { 3890 if (delta < 0) {
3856 if (rq->curr == p) 3891 if (rq->curr == p)
3857 resched_task(p); 3892 resched_curr(rq);
3858 return; 3893 return;
3859 } 3894 }
3860 3895
@@ -4723,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
4723 return; 4758 return;
4724 4759
4725preempt: 4760preempt:
4726 resched_task(curr); 4761 resched_curr(rq);
4727 /* 4762 /*
4728 * Only set the backward buddy when the current task is still 4763 * Only set the backward buddy when the current task is still
4729 * on the rq. This can happen when a wakeup gets interleaved 4764 * on the rq. This can happen when a wakeup gets interleaved
@@ -5094,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
5094/* 5129/*
5095 * Is this task likely cache-hot: 5130 * Is this task likely cache-hot:
5096 */ 5131 */
5097static int 5132static int task_hot(struct task_struct *p, struct lb_env *env)
5098task_hot(struct task_struct *p, u64 now)
5099{ 5133{
5100 s64 delta; 5134 s64 delta;
5101 5135
@@ -5108,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now)
5108 /* 5142 /*
5109 * Buddy candidates are cache hot: 5143 * Buddy candidates are cache hot:
5110 */ 5144 */
5111 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 5145 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
5112 (&p->se == cfs_rq_of(&p->se)->next || 5146 (&p->se == cfs_rq_of(&p->se)->next ||
5113 &p->se == cfs_rq_of(&p->se)->last)) 5147 &p->se == cfs_rq_of(&p->se)->last))
5114 return 1; 5148 return 1;
@@ -5118,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now)
5118 if (sysctl_sched_migration_cost == 0) 5152 if (sysctl_sched_migration_cost == 0)
5119 return 0; 5153 return 0;
5120 5154
5121 delta = now - p->se.exec_start; 5155 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
5122 5156
5123 return delta < (s64)sysctl_sched_migration_cost; 5157 return delta < (s64)sysctl_sched_migration_cost;
5124} 5158}
@@ -5272,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5272 * 2) task is cache cold, or 5306 * 2) task is cache cold, or
5273 * 3) too many balance attempts have failed. 5307 * 3) too many balance attempts have failed.
5274 */ 5308 */
5275 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); 5309 tsk_cache_hot = task_hot(p, env);
5276 if (!tsk_cache_hot) 5310 if (!tsk_cache_hot)
5277 tsk_cache_hot = migrate_degrades_locality(p, env); 5311 tsk_cache_hot = migrate_degrades_locality(p, env);
5278 5312
@@ -5864,10 +5898,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
5864 * @load_idx: Load index of sched_domain of this_cpu for load calc. 5898 * @load_idx: Load index of sched_domain of this_cpu for load calc.
5865 * @local_group: Does group contain this_cpu. 5899 * @local_group: Does group contain this_cpu.
5866 * @sgs: variable to hold the statistics for this group. 5900 * @sgs: variable to hold the statistics for this group.
5901 * @overload: Indicate more than one runnable task for any CPU.
5867 */ 5902 */
5868static inline void update_sg_lb_stats(struct lb_env *env, 5903static inline void update_sg_lb_stats(struct lb_env *env,
5869 struct sched_group *group, int load_idx, 5904 struct sched_group *group, int load_idx,
5870 int local_group, struct sg_lb_stats *sgs) 5905 int local_group, struct sg_lb_stats *sgs,
5906 bool *overload)
5871{ 5907{
5872 unsigned long load; 5908 unsigned long load;
5873 int i; 5909 int i;
@@ -5885,6 +5921,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5885 5921
5886 sgs->group_load += load; 5922 sgs->group_load += load;
5887 sgs->sum_nr_running += rq->nr_running; 5923 sgs->sum_nr_running += rq->nr_running;
5924
5925 if (rq->nr_running > 1)
5926 *overload = true;
5927
5888#ifdef CONFIG_NUMA_BALANCING 5928#ifdef CONFIG_NUMA_BALANCING
5889 sgs->nr_numa_running += rq->nr_numa_running; 5929 sgs->nr_numa_running += rq->nr_numa_running;
5890 sgs->nr_preferred_running += rq->nr_preferred_running; 5930 sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5995,6 +6035,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
5995 struct sched_group *sg = env->sd->groups; 6035 struct sched_group *sg = env->sd->groups;
5996 struct sg_lb_stats tmp_sgs; 6036 struct sg_lb_stats tmp_sgs;
5997 int load_idx, prefer_sibling = 0; 6037 int load_idx, prefer_sibling = 0;
6038 bool overload = false;
5998 6039
5999 if (child && child->flags & SD_PREFER_SIBLING) 6040 if (child && child->flags & SD_PREFER_SIBLING)
6000 prefer_sibling = 1; 6041 prefer_sibling = 1;
@@ -6015,7 +6056,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6015 update_group_capacity(env->sd, env->dst_cpu); 6056 update_group_capacity(env->sd, env->dst_cpu);
6016 } 6057 }
6017 6058
6018 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 6059 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6060 &overload);
6019 6061
6020 if (local_group) 6062 if (local_group)
6021 goto next_group; 6063 goto next_group;
@@ -6049,6 +6091,13 @@ next_group:
6049 6091
6050 if (env->sd->flags & SD_NUMA) 6092 if (env->sd->flags & SD_NUMA)
6051 env->fbq_type = fbq_classify_group(&sds->busiest_stat); 6093 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6094
6095 if (!env->sd->parent) {
6096 /* update overload indicator if we are at root domain */
6097 if (env->dst_rq->rd->overload != overload)
6098 env->dst_rq->rd->overload = overload;
6099 }
6100
6052} 6101}
6053 6102
6054/** 6103/**
@@ -6767,7 +6816,8 @@ static int idle_balance(struct rq *this_rq)
6767 */ 6816 */
6768 this_rq->idle_stamp = rq_clock(this_rq); 6817 this_rq->idle_stamp = rq_clock(this_rq);
6769 6818
6770 if (this_rq->avg_idle < sysctl_sched_migration_cost) { 6819 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
6820 !this_rq->rd->overload) {
6771 rcu_read_lock(); 6821 rcu_read_lock();
6772 sd = rcu_dereference_check_sched_domain(this_rq->sd); 6822 sd = rcu_dereference_check_sched_domain(this_rq->sd);
6773 if (sd) 6823 if (sd)
@@ -7325,6 +7375,8 @@ void trigger_load_balance(struct rq *rq)
7325static void rq_online_fair(struct rq *rq) 7375static void rq_online_fair(struct rq *rq)
7326{ 7376{
7327 update_sysctl(); 7377 update_sysctl();
7378
7379 update_runtime_enabled(rq);
7328} 7380}
7329 7381
7330static void rq_offline_fair(struct rq *rq) 7382static void rq_offline_fair(struct rq *rq)
@@ -7398,7 +7450,7 @@ static void task_fork_fair(struct task_struct *p)
7398 * 'current' within the tree based on its new key value. 7450 * 'current' within the tree based on its new key value.
7399 */ 7451 */
7400 swap(curr->vruntime, se->vruntime); 7452 swap(curr->vruntime, se->vruntime);
7401 resched_task(rq->curr); 7453 resched_curr(rq);
7402 } 7454 }
7403 7455
7404 se->vruntime -= cfs_rq->min_vruntime; 7456 se->vruntime -= cfs_rq->min_vruntime;
@@ -7423,7 +7475,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7423 */ 7475 */
7424 if (rq->curr == p) { 7476 if (rq->curr == p) {
7425 if (p->prio > oldprio) 7477 if (p->prio > oldprio)
7426 resched_task(rq->curr); 7478 resched_curr(rq);
7427 } else 7479 } else
7428 check_preempt_curr(rq, p, 0); 7480 check_preempt_curr(rq, p, 0);
7429} 7481}
@@ -7486,7 +7538,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
7486 * if we can still preempt the current task. 7538 * if we can still preempt the current task.
7487 */ 7539 */
7488 if (rq->curr == p) 7540 if (rq->curr == p)
7489 resched_task(rq->curr); 7541 resched_curr(rq);
7490 else 7542 else
7491 check_preempt_curr(rq, p, 0); 7543 check_preempt_curr(rq, p, 0);
7492} 7544}
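Every resched_task(rq->curr) call site in this file (and in rt.c and idle_task.c further down) becomes resched_curr(rq); the helper itself is only declared in sched.h below and is defined in kernel/sched/core.c, outside this excerpt. Roughly, it has to mark whatever is current on @rq and kick the owning CPU if that CPU is remote; the following is a simplified sketch using sched.h internals, not the core.c definition:

/* Sketch (assumes rq->lock is held, as the converted call sites do). */
static void resched_curr_sketch(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	int cpu = cpu_of(rq);

	lockdep_assert_held(&rq->lock);

	if (test_tsk_need_resched(curr))
		return;				/* a reschedule is already pending */

	set_tsk_need_resched(curr);
	if (cpu != smp_processor_id())
		smp_send_reschedule(cpu);	/* make the remote CPU notice */
}
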
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index cf009fb0bc25..11e7bc434f43 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -79,7 +79,7 @@ static void cpuidle_idle_call(void)
79 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 79 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
80 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 80 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
81 int next_state, entered_state; 81 int next_state, entered_state;
82 bool broadcast; 82 unsigned int broadcast;
83 83
84 /* 84 /*
85 * Check if the idle task must be rescheduled. If it is the 85 * Check if the idle task must be rescheduled. If it is the
@@ -135,7 +135,7 @@ use_default:
135 goto exit_idle; 135 goto exit_idle;
136 } 136 }
137 137
138 broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); 138 broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP;
139 139
140 /* 140 /*
141 * Tell the time framework to switch to a broadcast timer 141 * Tell the time framework to switch to a broadcast timer
@@ -147,8 +147,6 @@ use_default:
147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) 147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
148 goto use_default; 148 goto use_default;
149 149
150 trace_cpu_idle_rcuidle(next_state, dev->cpu);
151
152 /* 150 /*
153 * Enter the idle state previously returned by the governor decision. 151 * Enter the idle state previously returned by the governor decision.
154 * This function will block until an interrupt occurs and will take 152 * This function will block until an interrupt occurs and will take
@@ -156,8 +154,6 @@ use_default:
156 */ 154 */
157 entered_state = cpuidle_enter(drv, dev, next_state); 155 entered_state = cpuidle_enter(drv, dev, next_state);
158 156
159 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
160
161 if (broadcast) 157 if (broadcast)
162 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); 158 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
163 159
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 879f2b75266a..67ad4e7f506a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
20 */ 20 */
21static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) 21static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
22{ 22{
23 resched_task(rq->idle); 23 resched_curr(rq);
24} 24}
25 25
26static struct task_struct * 26static struct task_struct *
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
index 16f5a30f9c88..8ecd552fe4f2 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/proc.c
@@ -8,13 +8,6 @@
8 8
9#include "sched.h" 9#include "sched.h"
10 10
11unsigned long this_cpu_load(void)
12{
13 struct rq *this = this_rq();
14 return this->cpu_load[0];
15}
16
17
18/* 11/*
19 * Global load-average calculations 12 * Global load-average calculations
20 * 13 *
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a49083192c64..5f6edca4fafd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -463,9 +463,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
463static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 463static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
464{ 464{
465 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 465 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
466 struct rq *rq = rq_of_rt_rq(rt_rq);
466 struct sched_rt_entity *rt_se; 467 struct sched_rt_entity *rt_se;
467 468
468 int cpu = cpu_of(rq_of_rt_rq(rt_rq)); 469 int cpu = cpu_of(rq);
469 470
470 rt_se = rt_rq->tg->rt_se[cpu]; 471 rt_se = rt_rq->tg->rt_se[cpu];
471 472
@@ -476,7 +477,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
476 enqueue_rt_entity(rt_se, false); 477 enqueue_rt_entity(rt_se, false);
477 478
478 if (rt_rq->highest_prio.curr < curr->prio) 479 if (rt_rq->highest_prio.curr < curr->prio)
479 resched_task(curr); 480 resched_curr(rq);
480 } 481 }
481} 482}
482 483
@@ -566,7 +567,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
566 return; 567 return;
567 568
568 enqueue_top_rt_rq(rt_rq); 569 enqueue_top_rt_rq(rt_rq);
569 resched_task(rq->curr); 570 resched_curr(rq);
570} 571}
571 572
572static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 573static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -740,6 +741,9 @@ balanced:
740 rt_rq->rt_throttled = 0; 741 rt_rq->rt_throttled = 0;
741 raw_spin_unlock(&rt_rq->rt_runtime_lock); 742 raw_spin_unlock(&rt_rq->rt_runtime_lock);
742 raw_spin_unlock(&rt_b->rt_runtime_lock); 743 raw_spin_unlock(&rt_b->rt_runtime_lock);
744
745 /* Make rt_rq available for pick_next_task() */
746 sched_rt_rq_enqueue(rt_rq);
743 } 747 }
744} 748}
745 749
@@ -948,7 +952,7 @@ static void update_curr_rt(struct rq *rq)
948 raw_spin_lock(&rt_rq->rt_runtime_lock); 952 raw_spin_lock(&rt_rq->rt_runtime_lock);
949 rt_rq->rt_time += delta_exec; 953 rt_rq->rt_time += delta_exec;
950 if (sched_rt_runtime_exceeded(rt_rq)) 954 if (sched_rt_runtime_exceeded(rt_rq))
951 resched_task(curr); 955 resched_curr(rq);
952 raw_spin_unlock(&rt_rq->rt_runtime_lock); 956 raw_spin_unlock(&rt_rq->rt_runtime_lock);
953 } 957 }
954 } 958 }
@@ -1363,7 +1367,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1363 * to try and push current away: 1367 * to try and push current away:
1364 */ 1368 */
1365 requeue_task_rt(rq, p, 1); 1369 requeue_task_rt(rq, p, 1);
1366 resched_task(rq->curr); 1370 resched_curr(rq);
1367} 1371}
1368 1372
1369#endif /* CONFIG_SMP */ 1373#endif /* CONFIG_SMP */
@@ -1374,7 +1378,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1374static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) 1378static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1375{ 1379{
1376 if (p->prio < rq->curr->prio) { 1380 if (p->prio < rq->curr->prio) {
1377 resched_task(rq->curr); 1381 resched_curr(rq);
1378 return; 1382 return;
1379 } 1383 }
1380 1384
@@ -1690,7 +1694,7 @@ retry:
1690 * just reschedule current. 1694 * just reschedule current.
1691 */ 1695 */
1692 if (unlikely(next_task->prio < rq->curr->prio)) { 1696 if (unlikely(next_task->prio < rq->curr->prio)) {
1693 resched_task(rq->curr); 1697 resched_curr(rq);
1694 return 0; 1698 return 0;
1695 } 1699 }
1696 1700
@@ -1737,7 +1741,7 @@ retry:
1737 activate_task(lowest_rq, next_task, 0); 1741 activate_task(lowest_rq, next_task, 0);
1738 ret = 1; 1742 ret = 1;
1739 1743
1740 resched_task(lowest_rq->curr); 1744 resched_curr(lowest_rq);
1741 1745
1742 double_unlock_balance(rq, lowest_rq); 1746 double_unlock_balance(rq, lowest_rq);
1743 1747
@@ -1936,7 +1940,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1936 return; 1940 return;
1937 1941
1938 if (pull_rt_task(rq)) 1942 if (pull_rt_task(rq))
1939 resched_task(rq->curr); 1943 resched_curr(rq);
1940} 1944}
1941 1945
1942void __init init_sched_rt_class(void) 1946void __init init_sched_rt_class(void)
@@ -1974,7 +1978,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1974 check_resched = 0; 1978 check_resched = 0;
1975#endif /* CONFIG_SMP */ 1979#endif /* CONFIG_SMP */
1976 if (check_resched && p->prio < rq->curr->prio) 1980 if (check_resched && p->prio < rq->curr->prio)
1977 resched_task(rq->curr); 1981 resched_curr(rq);
1978 } 1982 }
1979} 1983}
1980 1984
@@ -2003,11 +2007,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2003 * Only reschedule if p is still on the same runqueue. 2007 * Only reschedule if p is still on the same runqueue.
2004 */ 2008 */
2005 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) 2009 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
2006 resched_task(p); 2010 resched_curr(rq);
2007#else 2011#else
2008 /* For UP simply resched on drop of prio */ 2012 /* For UP simply resched on drop of prio */
2009 if (oldprio < p->prio) 2013 if (oldprio < p->prio)
2010 resched_task(p); 2014 resched_curr(rq);
2011#endif /* CONFIG_SMP */ 2015#endif /* CONFIG_SMP */
2012 } else { 2016 } else {
2013 /* 2017 /*
@@ -2016,7 +2020,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2016 * then reschedule. 2020 * then reschedule.
2017 */ 2021 */
2018 if (p->prio < rq->curr->prio) 2022 if (p->prio < rq->curr->prio)
2019 resched_task(rq->curr); 2023 resched_curr(rq);
2020 } 2024 }
2021} 2025}
2022 2026
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 31cc02ebc54e..579712f4e9d5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -477,6 +477,9 @@ struct root_domain {
477 cpumask_var_t span; 477 cpumask_var_t span;
478 cpumask_var_t online; 478 cpumask_var_t online;
479 479
480 /* Indicate more than one runnable task for any CPU */
481 bool overload;
482
480 /* 483 /*
481 * The bit corresponding to a CPU gets set here if such CPU has more 484 * The bit corresponding to a CPU gets set here if such CPU has more
482 * than one runnable -deadline task (as it is below for RT tasks). 485 * than one runnable -deadline task (as it is below for RT tasks).
@@ -884,20 +887,10 @@ enum {
884#undef SCHED_FEAT 887#undef SCHED_FEAT
885 888
886#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) 889#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
887static __always_inline bool static_branch__true(struct static_key *key)
888{
889 return static_key_true(key); /* Not out of line branch. */
890}
891
892static __always_inline bool static_branch__false(struct static_key *key)
893{
894 return static_key_false(key); /* Out of line branch. */
895}
896
897#define SCHED_FEAT(name, enabled) \ 890#define SCHED_FEAT(name, enabled) \
898static __always_inline bool static_branch_##name(struct static_key *key) \ 891static __always_inline bool static_branch_##name(struct static_key *key) \
899{ \ 892{ \
900 return static_branch__##enabled(key); \ 893 return static_key_##enabled(key); \
901} 894}
902 895
903#include "features.h" 896#include "features.h"
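With the static_branch__true()/static_branch__false() shims gone, SCHED_FEAT() now maps a feature's default directly onto the jump-label primitives. For a feature declared with a 'true' default in features.h (GENTLE_FAIR_SLEEPERS is used here purely as an illustration), the generated helper expands to roughly:

/* Expansion sketch of SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true): */
static __always_inline bool static_branch_GENTLE_FAIR_SLEEPERS(struct static_key *key)
{
	return static_key_true(key);	/* inline, likely-taken branch */
}
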
@@ -1196,7 +1189,7 @@ extern void init_sched_rt_class(void);
1196extern void init_sched_fair_class(void); 1189extern void init_sched_fair_class(void);
1197extern void init_sched_dl_class(void); 1190extern void init_sched_dl_class(void);
1198 1191
1199extern void resched_task(struct task_struct *p); 1192extern void resched_curr(struct rq *rq);
1200extern void resched_cpu(int cpu); 1193extern void resched_cpu(int cpu);
1201 1194
1202extern struct rt_bandwidth def_rt_bandwidth; 1195extern struct rt_bandwidth def_rt_bandwidth;
@@ -1218,15 +1211,26 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
1218 1211
1219 rq->nr_running = prev_nr + count; 1212 rq->nr_running = prev_nr + count;
1220 1213
1221#ifdef CONFIG_NO_HZ_FULL
1222 if (prev_nr < 2 && rq->nr_running >= 2) { 1214 if (prev_nr < 2 && rq->nr_running >= 2) {
1215#ifdef CONFIG_SMP
1216 if (!rq->rd->overload)
1217 rq->rd->overload = true;
1218#endif
1219
1220#ifdef CONFIG_NO_HZ_FULL
1223 if (tick_nohz_full_cpu(rq->cpu)) { 1221 if (tick_nohz_full_cpu(rq->cpu)) {
1224 /* Order rq->nr_running write against the IPI */ 1222 /*
1225 smp_wmb(); 1223 * Tick is needed if more than one task runs on a CPU.
1226 smp_send_reschedule(rq->cpu); 1224 * Send the target an IPI to kick it out of nohz mode.
1225 *
1226 * We assume that IPI implies full memory barrier and the
1227 * new value of rq->nr_running is visible on reception
1228 * from the target.
1229 */
1230 tick_nohz_full_kick_cpu(rq->cpu);
1227 } 1231 }
1228 }
1229#endif 1232#endif
1233 }
1230} 1234}
1231 1235
1232static inline void sub_nr_running(struct rq *rq, unsigned count) 1236static inline void sub_nr_running(struct rq *rq, unsigned count)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 0ffa20ae657b..15cab1a4f84e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -319,14 +319,14 @@ EXPORT_SYMBOL(wake_bit_function);
319 */ 319 */
320int __sched 320int __sched
321__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, 321__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
322 int (*action)(void *), unsigned mode) 322 wait_bit_action_f *action, unsigned mode)
323{ 323{
324 int ret = 0; 324 int ret = 0;
325 325
326 do { 326 do {
327 prepare_to_wait(wq, &q->wait, mode); 327 prepare_to_wait(wq, &q->wait, mode);
328 if (test_bit(q->key.bit_nr, q->key.flags)) 328 if (test_bit(q->key.bit_nr, q->key.flags))
329 ret = (*action)(q->key.flags); 329 ret = (*action)(&q->key);
330 } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); 330 } while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
331 finish_wait(wq, &q->wait); 331 finish_wait(wq, &q->wait);
332 return ret; 332 return ret;
@@ -334,7 +334,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
334EXPORT_SYMBOL(__wait_on_bit); 334EXPORT_SYMBOL(__wait_on_bit);
335 335
336int __sched out_of_line_wait_on_bit(void *word, int bit, 336int __sched out_of_line_wait_on_bit(void *word, int bit,
337 int (*action)(void *), unsigned mode) 337 wait_bit_action_f *action, unsigned mode)
338{ 338{
339 wait_queue_head_t *wq = bit_waitqueue(word, bit); 339 wait_queue_head_t *wq = bit_waitqueue(word, bit);
340 DEFINE_WAIT_BIT(wait, word, bit); 340 DEFINE_WAIT_BIT(wait, word, bit);
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit);
345 345
346int __sched 346int __sched
347__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 347__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
348 int (*action)(void *), unsigned mode) 348 wait_bit_action_f *action, unsigned mode)
349{ 349{
350 do { 350 do {
351 int ret; 351 int ret;
@@ -353,7 +353,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
353 prepare_to_wait_exclusive(wq, &q->wait, mode); 353 prepare_to_wait_exclusive(wq, &q->wait, mode);
354 if (!test_bit(q->key.bit_nr, q->key.flags)) 354 if (!test_bit(q->key.bit_nr, q->key.flags))
355 continue; 355 continue;
356 ret = action(q->key.flags); 356 ret = action(&q->key);
357 if (!ret) 357 if (!ret)
358 continue; 358 continue;
359 abort_exclusive_wait(wq, &q->wait, mode, &q->key); 359 abort_exclusive_wait(wq, &q->wait, mode, &q->key);
@@ -365,7 +365,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
365EXPORT_SYMBOL(__wait_on_bit_lock); 365EXPORT_SYMBOL(__wait_on_bit_lock);
366 366
367int __sched out_of_line_wait_on_bit_lock(void *word, int bit, 367int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
368 int (*action)(void *), unsigned mode) 368 wait_bit_action_f *action, unsigned mode)
369{ 369{
370 wait_queue_head_t *wq = bit_waitqueue(word, bit); 370 wait_queue_head_t *wq = bit_waitqueue(word, bit);
371 DEFINE_WAIT_BIT(wait, word, bit); 371 DEFINE_WAIT_BIT(wait, word, bit);
@@ -502,3 +502,21 @@ void wake_up_atomic_t(atomic_t *p)
502 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); 502 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
503} 503}
504EXPORT_SYMBOL(wake_up_atomic_t); 504EXPORT_SYMBOL(wake_up_atomic_t);
505
506__sched int bit_wait(struct wait_bit_key *word)
507{
508 if (signal_pending_state(current->state, current))
509 return 1;
510 schedule();
511 return 0;
512}
513EXPORT_SYMBOL(bit_wait);
514
515__sched int bit_wait_io(struct wait_bit_key *word)
516{
517 if (signal_pending_state(current->state, current))
518 return 1;
519 io_schedule();
520 return 0;
521}
522EXPORT_SYMBOL(bit_wait_io);
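The action callbacks now receive the struct wait_bit_key rather than a bare flags pointer, and bit_wait()/bit_wait_io() above are the stock implementations. A caller needing different sleep semantics can supply its own wait_bit_action_f; below is a hypothetical killable variant wired into out_of_line_wait_on_bit() (MY_WAIT_BIT, the callback, and the caller are made-up names, shown only as a sketch):

#include <linux/sched.h>
#include <linux/wait.h>

#define MY_WAIT_BIT	0	/* hypothetical bit number */

/* Hypothetical action callback: give up on fatal signals, otherwise sleep. */
static __sched int bit_wait_killable_sketch(struct wait_bit_key *word)
{
	if (fatal_signal_pending(current))
		return -EINTR;
	schedule();
	return 0;
}

/* Hypothetical caller waiting for MY_WAIT_BIT in *flags to clear. */
static int wait_for_my_bit(unsigned long *flags)
{
	return out_of_line_wait_on_bit(flags, MY_WAIT_BIT,
				       bit_wait_killable_sketch, TASK_KILLABLE);
}
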
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 301bbc24739c..25b0043f4755 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -18,15 +18,17 @@
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/seccomp.h> 20#include <linux/seccomp.h>
21#include <linux/slab.h>
22#include <linux/syscalls.h>
21 23
22/* #define SECCOMP_DEBUG 1 */ 24/* #define SECCOMP_DEBUG 1 */
23 25
24#ifdef CONFIG_SECCOMP_FILTER 26#ifdef CONFIG_SECCOMP_FILTER
25#include <asm/syscall.h> 27#include <asm/syscall.h>
26#include <linux/filter.h> 28#include <linux/filter.h>
29#include <linux/pid.h>
27#include <linux/ptrace.h> 30#include <linux/ptrace.h>
28#include <linux/security.h> 31#include <linux/security.h>
29#include <linux/slab.h>
30#include <linux/tracehook.h> 32#include <linux/tracehook.h>
31#include <linux/uaccess.h> 33#include <linux/uaccess.h>
32 34
@@ -54,7 +56,7 @@
54struct seccomp_filter { 56struct seccomp_filter {
55 atomic_t usage; 57 atomic_t usage;
56 struct seccomp_filter *prev; 58 struct seccomp_filter *prev;
57 struct sk_filter *prog; 59 struct bpf_prog *prog;
58}; 60};
59 61
60/* Limit any path through the tree to 256KB worth of instructions. */ 62/* Limit any path through the tree to 256KB worth of instructions. */
@@ -87,7 +89,7 @@ static void populate_seccomp_data(struct seccomp_data *sd)
87 * @filter: filter to verify 89 * @filter: filter to verify
88 * @flen: length of filter 90 * @flen: length of filter
89 * 91 *
90 * Takes a previously checked filter (by sk_chk_filter) and 92 * Takes a previously checked filter (by bpf_check_classic) and
91 * redirects all filter code that loads struct sk_buff data 93 * redirects all filter code that loads struct sk_buff data
92 * and related data through seccomp_bpf_load. It also 94 * and related data through seccomp_bpf_load. It also
93 * enforces length and alignment checking of those loads. 95 * enforces length and alignment checking of those loads.
@@ -172,51 +174,184 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
172 */ 174 */
173static u32 seccomp_run_filters(int syscall) 175static u32 seccomp_run_filters(int syscall)
174{ 176{
175 struct seccomp_filter *f; 177 struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
176 struct seccomp_data sd; 178 struct seccomp_data sd;
177 u32 ret = SECCOMP_RET_ALLOW; 179 u32 ret = SECCOMP_RET_ALLOW;
178 180
179 /* Ensure unexpected behavior doesn't result in failing open. */ 181 /* Ensure unexpected behavior doesn't result in failing open. */
180 if (WARN_ON(current->seccomp.filter == NULL)) 182 if (unlikely(WARN_ON(f == NULL)))
181 return SECCOMP_RET_KILL; 183 return SECCOMP_RET_KILL;
182 184
185 /* Make sure cross-thread synced filter points somewhere sane. */
186 smp_read_barrier_depends();
187
183 populate_seccomp_data(&sd); 188 populate_seccomp_data(&sd);
184 189
185 /* 190 /*
186 * All filters in the list are evaluated and the lowest BPF return 191 * All filters in the list are evaluated and the lowest BPF return
187 * value always takes priority (ignoring the DATA). 192 * value always takes priority (ignoring the DATA).
188 */ 193 */
189 for (f = current->seccomp.filter; f; f = f->prev) { 194 for (; f; f = f->prev) {
190 u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); 195 u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd);
191 196
192 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) 197 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
193 ret = cur_ret; 198 ret = cur_ret;
194 } 199 }
195 return ret; 200 return ret;
196} 201}
202#endif /* CONFIG_SECCOMP_FILTER */
203
204static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
205{
206 BUG_ON(!spin_is_locked(&current->sighand->siglock));
207
208 if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
209 return false;
210
211 return true;
212}
213
214static inline void seccomp_assign_mode(struct task_struct *task,
215 unsigned long seccomp_mode)
216{
217 BUG_ON(!spin_is_locked(&task->sighand->siglock));
218
219 task->seccomp.mode = seccomp_mode;
220 /*
221 * Make sure TIF_SECCOMP cannot be set before the mode (and
222 * filter) is set.
223 */
224 smp_mb__before_atomic();
225 set_tsk_thread_flag(task, TIF_SECCOMP);
226}
227
228#ifdef CONFIG_SECCOMP_FILTER
229/* Returns 1 if the parent is an ancestor of the child. */
230static int is_ancestor(struct seccomp_filter *parent,
231 struct seccomp_filter *child)
232{
233 /* NULL is the root ancestor. */
234 if (parent == NULL)
235 return 1;
236 for (; child; child = child->prev)
237 if (child == parent)
238 return 1;
239 return 0;
240}
197 241
198/** 242/**
199 * seccomp_attach_filter: Attaches a seccomp filter to current. 243 * seccomp_can_sync_threads: checks if all threads can be synchronized
244 *
245 * Expects sighand and cred_guard_mutex locks to be held.
246 *
247 * Returns 0 on success, -ve on error, or the pid of a thread which was
248 * either not in the correct seccomp mode or it did not have an ancestral
249 * seccomp filter.
250 */
251static inline pid_t seccomp_can_sync_threads(void)
252{
253 struct task_struct *thread, *caller;
254
255 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
256 BUG_ON(!spin_is_locked(&current->sighand->siglock));
257
258 /* Validate all threads being eligible for synchronization. */
259 caller = current;
260 for_each_thread(caller, thread) {
261 pid_t failed;
262
263 /* Skip current, since it is initiating the sync. */
264 if (thread == caller)
265 continue;
266
267 if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
268 (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
269 is_ancestor(thread->seccomp.filter,
270 caller->seccomp.filter)))
271 continue;
272
273 /* Return the first thread that cannot be synchronized. */
274 failed = task_pid_vnr(thread);
275 /* If the pid cannot be resolved, then return -ESRCH */
276 if (unlikely(WARN_ON(failed == 0)))
277 failed = -ESRCH;
278 return failed;
279 }
280
281 return 0;
282}
283
284/**
285 * seccomp_sync_threads: sets all threads to use current's filter
286 *
287 * Expects sighand and cred_guard_mutex locks to be held, and for
288 * seccomp_can_sync_threads() to have returned success already
289 * without dropping the locks.
290 *
291 */
292static inline void seccomp_sync_threads(void)
293{
294 struct task_struct *thread, *caller;
295
296 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
297 BUG_ON(!spin_is_locked(&current->sighand->siglock));
298
299 /* Synchronize all threads. */
300 caller = current;
301 for_each_thread(caller, thread) {
302 /* Skip current, since it needs no changes. */
303 if (thread == caller)
304 continue;
305
306 /* Get a task reference for the new leaf node. */
307 get_seccomp_filter(caller);
308 /*
309 * Drop the task reference to the shared ancestor since
310 * current's path will hold a reference. (This also
311 * allows a put before the assignment.)
312 */
313 put_seccomp_filter(thread);
314 smp_store_release(&thread->seccomp.filter,
315 caller->seccomp.filter);
316 /*
317 * Opt the other thread into seccomp if needed.
318 * As threads are considered to be trust-realm
319 * equivalent (see ptrace_may_access), it is safe to
320 * allow one thread to transition the other.
321 */
322 if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
323 /*
324 * Don't let an unprivileged task work around
325 * the no_new_privs restriction by creating
326 * a thread that sets it up, enters seccomp,
327 * then dies.
328 */
329 if (task_no_new_privs(caller))
330 task_set_no_new_privs(thread);
331
332 seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
333 }
334 }
335}
336
337/**
338 * seccomp_prepare_filter: Prepares a seccomp filter for use.
200 * @fprog: BPF program to install 339 * @fprog: BPF program to install
201 * 340 *
202 * Returns 0 on success or an errno on failure. 341 * Returns filter on success or an ERR_PTR on failure.
203 */ 342 */
204static long seccomp_attach_filter(struct sock_fprog *fprog) 343static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
205{ 344{
206 struct seccomp_filter *filter; 345 struct seccomp_filter *filter;
207 unsigned long fp_size = fprog->len * sizeof(struct sock_filter); 346 unsigned long fp_size;
208 unsigned long total_insns = fprog->len;
209 struct sock_filter *fp; 347 struct sock_filter *fp;
210 int new_len; 348 int new_len;
211 long ret; 349 long ret;
212 350
213 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) 351 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
214 return -EINVAL; 352 return ERR_PTR(-EINVAL);
215 353 BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
216 for (filter = current->seccomp.filter; filter; filter = filter->prev) 354 fp_size = fprog->len * sizeof(struct sock_filter);
217 total_insns += filter->prog->len + 4; /* include a 4 instr penalty */
218 if (total_insns > MAX_INSNS_PER_PATH)
219 return -ENOMEM;
220 355
221 /* 356 /*
222 * Installing a seccomp filter requires that the task has 357 * Installing a seccomp filter requires that the task has
@@ -224,14 +359,14 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
224 * This avoids scenarios where unprivileged tasks can affect the 359 * This avoids scenarios where unprivileged tasks can affect the
225 * behavior of privileged children. 360 * behavior of privileged children.
226 */ 361 */
227 if (!current->no_new_privs && 362 if (!task_no_new_privs(current) &&
228 security_capable_noaudit(current_cred(), current_user_ns(), 363 security_capable_noaudit(current_cred(), current_user_ns(),
229 CAP_SYS_ADMIN) != 0) 364 CAP_SYS_ADMIN) != 0)
230 return -EACCES; 365 return ERR_PTR(-EACCES);
231 366
232 fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); 367 fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
233 if (!fp) 368 if (!fp)
234 return -ENOMEM; 369 return ERR_PTR(-ENOMEM);
235 370
236 /* Copy the instructions from fprog. */ 371 /* Copy the instructions from fprog. */
237 ret = -EFAULT; 372 ret = -EFAULT;
@@ -239,7 +374,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
239 goto free_prog; 374 goto free_prog;
240 375
241 /* Check and rewrite the fprog via the skb checker */ 376 /* Check and rewrite the fprog via the skb checker */
242 ret = sk_chk_filter(fp, fprog->len); 377 ret = bpf_check_classic(fp, fprog->len);
243 if (ret) 378 if (ret)
244 goto free_prog; 379 goto free_prog;
245 380
@@ -248,8 +383,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
248 if (ret) 383 if (ret)
249 goto free_prog; 384 goto free_prog;
250 385
251 /* Convert 'sock_filter' insns to 'sock_filter_int' insns */ 386 /* Convert 'sock_filter' insns to 'bpf_insn' insns */
252 ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); 387 ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len);
253 if (ret) 388 if (ret)
254 goto free_prog; 389 goto free_prog;
255 390
@@ -260,12 +395,12 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
260 if (!filter) 395 if (!filter)
261 goto free_prog; 396 goto free_prog;
262 397
263 filter->prog = kzalloc(sk_filter_size(new_len), 398 filter->prog = kzalloc(bpf_prog_size(new_len),
264 GFP_KERNEL|__GFP_NOWARN); 399 GFP_KERNEL|__GFP_NOWARN);
265 if (!filter->prog) 400 if (!filter->prog)
266 goto free_filter; 401 goto free_filter;
267 402
268 ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); 403 ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
269 if (ret) 404 if (ret)
270 goto free_filter_prog; 405 goto free_filter_prog;
271 kfree(fp); 406 kfree(fp);
@@ -273,15 +408,9 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
273 atomic_set(&filter->usage, 1); 408 atomic_set(&filter->usage, 1);
274 filter->prog->len = new_len; 409 filter->prog->len = new_len;
275 410
276 sk_filter_select_runtime(filter->prog); 411 bpf_prog_select_runtime(filter->prog);
277 412
278 /* 413 return filter;
279 * If there is an existing filter, make it the prev and don't drop its
280 * task reference.
281 */
282 filter->prev = current->seccomp.filter;
283 current->seccomp.filter = filter;
284 return 0;
285 414
286free_filter_prog: 415free_filter_prog:
287 kfree(filter->prog); 416 kfree(filter->prog);
@@ -289,19 +418,20 @@ free_filter:
289 kfree(filter); 418 kfree(filter);
290free_prog: 419free_prog:
291 kfree(fp); 420 kfree(fp);
292 return ret; 421 return ERR_PTR(ret);
293} 422}
294 423
295/** 424/**
296 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog 425 * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
297 * @user_filter: pointer to the user data containing a sock_fprog. 426 * @user_filter: pointer to the user data containing a sock_fprog.
298 * 427 *
299 * Returns 0 on success and non-zero otherwise. 428 * Returns 0 on success and non-zero otherwise.
300 */ 429 */
301static long seccomp_attach_user_filter(char __user *user_filter) 430static struct seccomp_filter *
431seccomp_prepare_user_filter(const char __user *user_filter)
302{ 432{
303 struct sock_fprog fprog; 433 struct sock_fprog fprog;
304 long ret = -EFAULT; 434 struct seccomp_filter *filter = ERR_PTR(-EFAULT);
305 435
306#ifdef CONFIG_COMPAT 436#ifdef CONFIG_COMPAT
307 if (is_compat_task()) { 437 if (is_compat_task()) {
@@ -314,9 +444,56 @@ static long seccomp_attach_user_filter(char __user *user_filter)
314#endif 444#endif
315 if (copy_from_user(&fprog, user_filter, sizeof(fprog))) 445 if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
316 goto out; 446 goto out;
317 ret = seccomp_attach_filter(&fprog); 447 filter = seccomp_prepare_filter(&fprog);
318out: 448out:
319 return ret; 449 return filter;
450}
451
452/**
453 * seccomp_attach_filter: validate and attach filter
454 * @flags: flags to change filter behavior
455 * @filter: seccomp filter to add to the current process
456 *
457 * Caller must be holding current->sighand->siglock lock.
458 *
459 * Returns 0 on success, -ve on error.
460 */
461static long seccomp_attach_filter(unsigned int flags,
462 struct seccomp_filter *filter)
463{
464 unsigned long total_insns;
465 struct seccomp_filter *walker;
466
467 BUG_ON(!spin_is_locked(&current->sighand->siglock));
468
469 /* Validate resulting filter length. */
470 total_insns = filter->prog->len;
471 for (walker = current->seccomp.filter; walker; walker = walker->prev)
472 total_insns += walker->prog->len + 4; /* 4 instr penalty */
473 if (total_insns > MAX_INSNS_PER_PATH)
474 return -ENOMEM;
475
476 /* If thread sync has been requested, check that it is possible. */
477 if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
478 int ret;
479
480 ret = seccomp_can_sync_threads();
481 if (ret)
482 return ret;
483 }
484
485 /*
486 * If there is an existing filter, make it the prev and don't drop its
487 * task reference.
488 */
489 filter->prev = current->seccomp.filter;
490 current->seccomp.filter = filter;
491
492 /* Now that the new filter is in place, synchronize to all threads. */
493 if (flags & SECCOMP_FILTER_FLAG_TSYNC)
494 seccomp_sync_threads();
495
496 return 0;
320} 497}
321 498
322/* get_seccomp_filter - increments the reference count of the filter on @tsk */ 499/* get_seccomp_filter - increments the reference count of the filter on @tsk */
@@ -329,6 +506,14 @@ void get_seccomp_filter(struct task_struct *tsk)
329 atomic_inc(&orig->usage); 506 atomic_inc(&orig->usage);
330} 507}
331 508
509static inline void seccomp_filter_free(struct seccomp_filter *filter)
510{
511 if (filter) {
512 bpf_prog_free(filter->prog);
513 kfree(filter);
514 }
515}
516
332/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ 517/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
333void put_seccomp_filter(struct task_struct *tsk) 518void put_seccomp_filter(struct task_struct *tsk)
334{ 519{
@@ -337,8 +522,7 @@ void put_seccomp_filter(struct task_struct *tsk)
337 while (orig && atomic_dec_and_test(&orig->usage)) { 522 while (orig && atomic_dec_and_test(&orig->usage)) {
338 struct seccomp_filter *freeme = orig; 523 struct seccomp_filter *freeme = orig;
339 orig = orig->prev; 524 orig = orig->prev;
340 sk_filter_free(freeme->prog); 525 seccomp_filter_free(freeme);
341 kfree(freeme);
342 } 526 }
343} 527}
344 528
@@ -382,12 +566,17 @@ static int mode1_syscalls_32[] = {
382 566
383int __secure_computing(int this_syscall) 567int __secure_computing(int this_syscall)
384{ 568{
385 int mode = current->seccomp.mode;
386 int exit_sig = 0; 569 int exit_sig = 0;
387 int *syscall; 570 int *syscall;
388 u32 ret; 571 u32 ret;
389 572
390 switch (mode) { 573 /*
574 * Make sure that any changes to mode from another thread have
575 * been seen after TIF_SECCOMP was seen.
576 */
577 rmb();
578
579 switch (current->seccomp.mode) {
391 case SECCOMP_MODE_STRICT: 580 case SECCOMP_MODE_STRICT:
392 syscall = mode1_syscalls; 581 syscall = mode1_syscalls;
393#ifdef CONFIG_COMPAT 582#ifdef CONFIG_COMPAT
@@ -473,47 +662,152 @@ long prctl_get_seccomp(void)
473} 662}
474 663
475/** 664/**
476 * prctl_set_seccomp: configures current->seccomp.mode 665 * seccomp_set_mode_strict: internal function for setting strict seccomp
477 * @seccomp_mode: requested mode to use
478 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
479 * 666 *
480 * This function may be called repeatedly with a @seccomp_mode of 667 * Once current->seccomp.mode is non-zero, it may not be changed.
481 * SECCOMP_MODE_FILTER to install additional filters. Every filter 668 *
482 * successfully installed will be evaluated (in reverse order) for each system 669 * Returns 0 on success or -EINVAL on failure.
483 * call the task makes. 670 */
671static long seccomp_set_mode_strict(void)
672{
673 const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
674 long ret = -EINVAL;
675
676 spin_lock_irq(&current->sighand->siglock);
677
678 if (!seccomp_may_assign_mode(seccomp_mode))
679 goto out;
680
681#ifdef TIF_NOTSC
682 disable_TSC();
683#endif
684 seccomp_assign_mode(current, seccomp_mode);
685 ret = 0;
686
687out:
688 spin_unlock_irq(&current->sighand->siglock);
689
690 return ret;
691}
692
693#ifdef CONFIG_SECCOMP_FILTER
694/**
695 * seccomp_set_mode_filter: internal function for setting seccomp filter
696 * @flags: flags to change filter behavior
697 * @filter: struct sock_fprog containing filter
698 *
699 * This function may be called repeatedly to install additional filters.
700 * Every filter successfully installed will be evaluated (in reverse order)
701 * for each system call the task makes.
484 * 702 *
485 * Once current->seccomp.mode is non-zero, it may not be changed. 703 * Once current->seccomp.mode is non-zero, it may not be changed.
486 * 704 *
487 * Returns 0 on success or -EINVAL on failure. 705 * Returns 0 on success or -EINVAL on failure.
488 */ 706 */
489long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) 707static long seccomp_set_mode_filter(unsigned int flags,
708 const char __user *filter)
490{ 709{
710 const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
711 struct seccomp_filter *prepared = NULL;
491 long ret = -EINVAL; 712 long ret = -EINVAL;
492 713
493 if (current->seccomp.mode && 714 /* Validate flags. */
494 current->seccomp.mode != seccomp_mode) 715 if (flags & ~SECCOMP_FILTER_FLAG_MASK)
716 return -EINVAL;
717
718 /* Prepare the new filter before holding any locks. */
719 prepared = seccomp_prepare_user_filter(filter);
720 if (IS_ERR(prepared))
721 return PTR_ERR(prepared);
722
723 /*
724 * Make sure we cannot change seccomp or nnp state via TSYNC
725 * while another thread is in the middle of calling exec.
726 */
727 if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
728 mutex_lock_killable(&current->signal->cred_guard_mutex))
729 goto out_free;
730
731 spin_lock_irq(&current->sighand->siglock);
732
733 if (!seccomp_may_assign_mode(seccomp_mode))
734 goto out;
735
736 ret = seccomp_attach_filter(flags, prepared);
737 if (ret)
495 goto out; 738 goto out;
739 /* Do not free the successfully attached filter. */
740 prepared = NULL;
741
742 seccomp_assign_mode(current, seccomp_mode);
743out:
744 spin_unlock_irq(&current->sighand->siglock);
745 if (flags & SECCOMP_FILTER_FLAG_TSYNC)
746 mutex_unlock(&current->signal->cred_guard_mutex);
747out_free:
748 seccomp_filter_free(prepared);
749 return ret;
750}
751#else
752static inline long seccomp_set_mode_filter(unsigned int flags,
753 const char __user *filter)
754{
755 return -EINVAL;
756}
757#endif
758
759/* Common entry point for both prctl and syscall. */
760static long do_seccomp(unsigned int op, unsigned int flags,
761 const char __user *uargs)
762{
763 switch (op) {
764 case SECCOMP_SET_MODE_STRICT:
765 if (flags != 0 || uargs != NULL)
766 return -EINVAL;
767 return seccomp_set_mode_strict();
768 case SECCOMP_SET_MODE_FILTER:
769 return seccomp_set_mode_filter(flags, uargs);
770 default:
771 return -EINVAL;
772 }
773}
774
775SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
776 const char __user *, uargs)
777{
778 return do_seccomp(op, flags, uargs);
779}
780
781/**
782 * prctl_set_seccomp: configures current->seccomp.mode
783 * @seccomp_mode: requested mode to use
784 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
785 *
786 * Returns 0 on success or -EINVAL on failure.
787 */
788long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
789{
790 unsigned int op;
791 char __user *uargs;
496 792
497 switch (seccomp_mode) { 793 switch (seccomp_mode) {
498 case SECCOMP_MODE_STRICT: 794 case SECCOMP_MODE_STRICT:
499 ret = 0; 795 op = SECCOMP_SET_MODE_STRICT;
500#ifdef TIF_NOTSC 796 /*
501 disable_TSC(); 797 * Setting strict mode through prctl always ignored filter,
502#endif 798 * so make sure it is always NULL here to pass the internal
799 * check in do_seccomp().
800 */
801 uargs = NULL;
503 break; 802 break;
504#ifdef CONFIG_SECCOMP_FILTER
505 case SECCOMP_MODE_FILTER: 803 case SECCOMP_MODE_FILTER:
506 ret = seccomp_attach_user_filter(filter); 804 op = SECCOMP_SET_MODE_FILTER;
507 if (ret) 805 uargs = filter;
508 goto out;
509 break; 806 break;
510#endif
511 default: 807 default:
512 goto out; 808 return -EINVAL;
513 } 809 }
514 810
515 current->seccomp.mode = seccomp_mode; 811 /* prctl interface doesn't have flags, so they are always zero. */
516 set_thread_flag(TIF_SECCOMP); 812 return do_seccomp(op, 0, uargs);
517out:
518 return ret;
519} 813}
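Taken together, the new seccomp(2) entry point plus SECCOMP_FILTER_FLAG_TSYNC lets one thread install a filter for the whole thread group, with the syscall returning the TID of any sibling that cannot be synchronized. A userspace sketch follows; it assumes uapi headers from this series, and the fallback constant values as well as __NR_seccomp itself are assumptions that depend on the installed headers and architecture:

#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

#ifndef PR_SET_NO_NEW_PRIVS
# define PR_SET_NO_NEW_PRIVS	38	/* for prctl.h older than 3.5 */
#endif
#ifndef SECCOMP_SET_MODE_FILTER
# define SECCOMP_SET_MODE_FILTER	1	/* assumed value, per this series */
#endif
#ifndef SECCOMP_FILTER_FLAG_TSYNC
# define SECCOMP_FILTER_FLAG_TSYNC	1UL	/* assumed value, per this series */
#endif

int main(void)
{
	struct sock_filter insns[] = {
		/* allow every syscall; a real filter would inspect seccomp_data */
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len	= sizeof(insns) / sizeof(insns[0]),
		.filter	= insns,
	};
	long rc;

	/* required unless the caller has CAP_SYS_ADMIN (see seccomp_prepare_filter) */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		perror("prctl(PR_SET_NO_NEW_PRIVS)");

	rc = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
		     SECCOMP_FILTER_FLAG_TSYNC, &prog);
	if (rc > 0)
		fprintf(stderr, "thread %ld prevented TSYNC\n", rc);
	else if (rc < 0)
		perror("seccomp");
	return rc ? 1 : 0;
}
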
diff --git a/kernel/signal.c b/kernel/signal.c
index a4077e90f19f..40b76e351e64 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1263,6 +1263,10 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1263 struct sighand_struct *sighand; 1263 struct sighand_struct *sighand;
1264 1264
1265 for (;;) { 1265 for (;;) {
1266 /*
1267 * Disable interrupts early to avoid deadlocks.
1268 * See rcu_read_unlock() comment header for details.
1269 */
1266 local_irq_save(*flags); 1270 local_irq_save(*flags);
1267 rcu_read_lock(); 1271 rcu_read_lock();
1268 sighand = rcu_dereference(tsk->sighand); 1272 sighand = rcu_dereference(tsk->sighand);
diff --git a/kernel/smp.c b/kernel/smp.c
index 80c33f8de14f..487653b5844f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008 4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
5 */ 5 */
6#include <linux/irq_work.h>
6#include <linux/rcupdate.h> 7#include <linux/rcupdate.h>
7#include <linux/rculist.h> 8#include <linux/rculist.h>
8#include <linux/kernel.h> 9#include <linux/kernel.h>
@@ -251,6 +252,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
251 csd->func(csd->info); 252 csd->func(csd->info);
252 csd_unlock(csd); 253 csd_unlock(csd);
253 } 254 }
255
256 /*
257 * Handle irq works queued remotely by irq_work_queue_on().
258 * Smp functions above are typically synchronous so they
259 * better run first since some other CPUs may be busy waiting
260 * for them.
261 */
262 irq_work_run();
254} 263}
255 264
256/* 265/*
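The irq_work_run() call added above drains work items that another CPU queued here with the new irq_work_queue_on() helper (kernel/irq_work.c in the diffstat, not shown in this excerpt). A sketch of the producer side, assuming that helper takes a work item and a target CPU; the function and variable names are hypothetical:

#include <linux/irq_work.h>
#include <linux/printk.h>
#include <linux/smp.h>

static void remote_report(struct irq_work *work)
{
	pr_info("irq_work ran on CPU %d\n", smp_processor_id());
}

static struct irq_work remote_work;

/* One-shot sketch; 'cpu' must be a valid, online remote CPU and the work
 * item must not still be pending from a previous queueing. */
static void kick_remote_cpu(int cpu)
{
	init_irq_work(&remote_work, remote_report);
	irq_work_queue_on(&remote_work, cpu);	/* assumed signature (work, cpu) */
}
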
diff --git a/kernel/sys.c b/kernel/sys.c
index 66a751ebf9d9..ce8129192a26 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1990,12 +1990,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1990 if (arg2 != 1 || arg3 || arg4 || arg5) 1990 if (arg2 != 1 || arg3 || arg4 || arg5)
1991 return -EINVAL; 1991 return -EINVAL;
1992 1992
1993 current->no_new_privs = 1; 1993 task_set_no_new_privs(current);
1994 break; 1994 break;
1995 case PR_GET_NO_NEW_PRIVS: 1995 case PR_GET_NO_NEW_PRIVS:
1996 if (arg2 || arg3 || arg4 || arg5) 1996 if (arg2 || arg3 || arg4 || arg5)
1997 return -EINVAL; 1997 return -EINVAL;
1998 return current->no_new_privs ? 1 : 0; 1998 return task_no_new_privs(current) ? 1 : 0;
1999 case PR_GET_THP_DISABLE: 1999 case PR_GET_THP_DISABLE:
2000 if (arg2 || arg3 || arg4 || arg5) 2000 if (arg2 || arg3 || arg4 || arg5)
2001 return -EINVAL; 2001 return -EINVAL;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 36441b51b5df..2904a2105914 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -213,3 +213,6 @@ cond_syscall(compat_sys_open_by_handle_at);
213 213
214/* compare kernel pointers */ 214/* compare kernel pointers */
215cond_syscall(sys_kcmp); 215cond_syscall(sys_kcmp);
216
217/* operate on Secure Computing state */
218cond_syscall(sys_seccomp);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 653cbbd9e7ad..e4ba9a5a5ccb 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -522,6 +522,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = {
522 { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, 522 { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
523 { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, 523 { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
524 { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, 524 { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
525 { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" },
525 {} 526 {}
526}; 527};
527 528
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
index 52ebc70263f4..875f64e8935b 100644
--- a/kernel/system_keyring.c
+++ b/kernel/system_keyring.c
@@ -89,6 +89,7 @@ static __init int load_system_certificate_list(void)
89 pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", 89 pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
90 PTR_ERR(key)); 90 PTR_ERR(key));
91 } else { 91 } else {
92 set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
92 pr_notice("Loaded X.509 cert '%s'\n", 93 pr_notice("Loaded X.509 cert '%s'\n",
93 key_ref_to_ptr(key)->description); 94 key_ref_to_ptr(key)->description);
94 key_ref_put(key); 95 key_ref_put(key);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f448513a45ed..d626dc98e8df 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG
12config ARCH_CLOCKSOURCE_DATA 12config ARCH_CLOCKSOURCE_DATA
13 bool 13 bool
14 14
15# Clocksources require validation of the clocksource against the last
16# cycle update - x86/TSC misfeature
17config CLOCKSOURCE_VALIDATE_LAST_CYCLE
18 bool
19
15# Timekeeping vsyscall support 20# Timekeeping vsyscall support
16config GENERIC_TIME_VSYSCALL 21config GENERIC_TIME_VSYSCALL
17 bool 22 bool
@@ -20,10 +25,6 @@ config GENERIC_TIME_VSYSCALL
20config GENERIC_TIME_VSYSCALL_OLD 25config GENERIC_TIME_VSYSCALL_OLD
21 bool 26 bool
22 27
23# ktime_t scalar 64bit nsec representation
24config KTIME_SCALAR
25 bool
26
27# Old style timekeeping 28# Old style timekeeping
28config ARCH_USES_GETTIMEOFFSET 29config ARCH_USES_GETTIMEOFFSET
29 bool 30 bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 57a413fd0ebf..7347426fa68d 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,3 +1,4 @@
1obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o 3obj-y += timeconv.o posix-clock.o alarmtimer.o
3 4
@@ -12,3 +13,21 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
12obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o 13obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
13obj-$(CONFIG_TIMER_STATS) += timer_stats.o 14obj-$(CONFIG_TIMER_STATS) += timer_stats.o
14obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o 15obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
16obj-$(CONFIG_TEST_UDELAY) += udelay_test.o
17
18$(obj)/time.o: $(obj)/timeconst.h
19
20quiet_cmd_hzfile = HZFILE $@
21 cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
22
23targets += hz.bc
24$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
25 $(call if_changed,hzfile)
26
27quiet_cmd_bc = BC $@
28 cmd_bc = bc -q $(filter-out FORCE,$^) > $@
29
30targets += timeconst.h
31$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
32 $(call if_changed,bc)
33
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index fe75444ae7ec..4aec4a457431 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void)
71 71
72 return ret; 72 return ret;
73} 73}
74 74EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
75 75
76static int alarmtimer_rtc_add_device(struct device *dev, 76static int alarmtimer_rtc_add_device(struct device *dev,
77 struct class_interface *class_intf) 77 struct class_interface *class_intf)
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index ad362c260ef4..9c94c19f1305 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -146,7 +146,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
146{ 146{
147 /* Nothing to do if we already reached the limit */ 147 /* Nothing to do if we already reached the limit */
148 if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { 148 if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
149 printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); 149 printk_deferred(KERN_WARNING
150 "CE: Reprogramming failure. Giving up\n");
150 dev->next_event.tv64 = KTIME_MAX; 151 dev->next_event.tv64 = KTIME_MAX;
151 return -ETIME; 152 return -ETIME;
152 } 153 }
@@ -159,9 +160,10 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
159 if (dev->min_delta_ns > MIN_DELTA_LIMIT) 160 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
160 dev->min_delta_ns = MIN_DELTA_LIMIT; 161 dev->min_delta_ns = MIN_DELTA_LIMIT;
161 162
162 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", 163 printk_deferred(KERN_WARNING
163 dev->name ? dev->name : "?", 164 "CE: %s increased min_delta_ns to %llu nsec\n",
164 (unsigned long long) dev->min_delta_ns); 165 dev->name ? dev->name : "?",
166 (unsigned long long) dev->min_delta_ns);
165 return 0; 167 return 0;
166} 168}
167 169
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ba3e502c955a..2e949cc9c9f1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -32,6 +32,7 @@
32#include <linux/kthread.h> 32#include <linux/kthread.h>
33 33
34#include "tick-internal.h" 34#include "tick-internal.h"
35#include "timekeeping_internal.h"
35 36
36void timecounter_init(struct timecounter *tc, 37void timecounter_init(struct timecounter *tc,
37 const struct cyclecounter *cc, 38 const struct cyclecounter *cc,
@@ -249,7 +250,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
249static void clocksource_watchdog(unsigned long data) 250static void clocksource_watchdog(unsigned long data)
250{ 251{
251 struct clocksource *cs; 252 struct clocksource *cs;
252 cycle_t csnow, wdnow; 253 cycle_t csnow, wdnow, delta;
253 int64_t wd_nsec, cs_nsec; 254 int64_t wd_nsec, cs_nsec;
254 int next_cpu, reset_pending; 255 int next_cpu, reset_pending;
255 256
@@ -282,11 +283,12 @@ static void clocksource_watchdog(unsigned long data)
282 continue; 283 continue;
283 } 284 }
284 285
285 wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, 286 delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
286 watchdog->mult, watchdog->shift); 287 wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
288 watchdog->shift);
287 289
288 cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & 290 delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
289 cs->mask, cs->mult, cs->shift); 291 cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
290 cs->cs_last = csnow; 292 cs->cs_last = csnow;
291 cs->wd_last = wdnow; 293 cs->wd_last = wdnow;
292 294
diff --git a/kernel/hrtimer.c b/kernel/time/hrtimer.c
index 3ab28993f6e0..1c2fe7de2842 100644
--- a/kernel/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -54,6 +54,8 @@
54 54
55#include <trace/events/timer.h> 55#include <trace/events/timer.h>
56 56
57#include "timekeeping.h"
58
57/* 59/*
58 * The timer bases: 60 * The timer bases:
59 * 61 *
@@ -114,21 +116,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
114 */ 116 */
115static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) 117static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
116{ 118{
117 ktime_t xtim, mono, boot; 119 ktime_t xtim, mono, boot, tai;
118 struct timespec xts, tom, slp; 120 ktime_t off_real, off_boot, off_tai;
119 s32 tai_offset;
120 121
121 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); 122 mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
122 tai_offset = timekeeping_get_tai_offset(); 123 boot = ktime_add(mono, off_boot);
124 xtim = ktime_add(mono, off_real);
125 tai = ktime_add(xtim, off_tai);
123 126
124 xtim = timespec_to_ktime(xts);
125 mono = ktime_add(xtim, timespec_to_ktime(tom));
126 boot = ktime_add(mono, timespec_to_ktime(slp));
127 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; 127 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
128 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; 128 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
129 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; 129 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
130 base->clock_base[HRTIMER_BASE_TAI].softirq_time = 130 base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
131 ktime_add(xtim, ktime_set(tai_offset, 0));
132} 131}
133 132
134/* 133/*
@@ -264,60 +263,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
264 * too large for inlining: 263 * too large for inlining:
265 */ 264 */
266#if BITS_PER_LONG < 64 265#if BITS_PER_LONG < 64
267# ifndef CONFIG_KTIME_SCALAR
268/**
269 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
270 * @kt: addend
271 * @nsec: the scalar nsec value to add
272 *
273 * Returns the sum of kt and nsec in ktime_t format
274 */
275ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
276{
277 ktime_t tmp;
278
279 if (likely(nsec < NSEC_PER_SEC)) {
280 tmp.tv64 = nsec;
281 } else {
282 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
283
284 /* Make sure nsec fits into long */
285 if (unlikely(nsec > KTIME_SEC_MAX))
286 return (ktime_t){ .tv64 = KTIME_MAX };
287
288 tmp = ktime_set((long)nsec, rem);
289 }
290
291 return ktime_add(kt, tmp);
292}
293
294EXPORT_SYMBOL_GPL(ktime_add_ns);
295
296/**
297 * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable
298 * @kt: minuend
299 * @nsec: the scalar nsec value to subtract
300 *
301 * Returns the subtraction of @nsec from @kt in ktime_t format
302 */
303ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec)
304{
305 ktime_t tmp;
306
307 if (likely(nsec < NSEC_PER_SEC)) {
308 tmp.tv64 = nsec;
309 } else {
310 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
311
312 tmp = ktime_set((long)nsec, rem);
313 }
314
315 return ktime_sub(kt, tmp);
316}
317
318EXPORT_SYMBOL_GPL(ktime_sub_ns);
319# endif /* !CONFIG_KTIME_SCALAR */
320
321/* 266/*
322 * Divide a ktime value by a nanosecond value 267 * Divide a ktime value by a nanosecond value
323 */ 268 */
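The block removed above is the 32-bit, !CONFIG_KTIME_SCALAR implementation of ktime_add_ns()/ktime_sub_ns(). Once ktime_t is a plain 64-bit nanosecond count on every architecture, the do_div()-based carry handling has nothing left to do and both operations collapse to ordinary arithmetic. A hedged sketch of the scalar equivalent (the .tv64 layout comes from the surrounding code; the function name is ours):

/* What the removed helper reduces to with a scalar ktime_t (illustrative). */
static inline ktime_t ktime_add_ns_scalar(const ktime_t kt, u64 nsec)
{
	return (ktime_t){ .tv64 = kt.tv64 + (s64)nsec };
}
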
@@ -337,6 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div)
337 282
338 return dclc; 283 return dclc;
339} 284}
285EXPORT_SYMBOL_GPL(ktime_divns);
340#endif /* BITS_PER_LONG >= 64 */ 286#endif /* BITS_PER_LONG >= 64 */
341 287
342/* 288/*
@@ -602,6 +548,11 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
602 * timers, we have to check, whether it expires earlier than the timer for 548 * timers, we have to check, whether it expires earlier than the timer for
603 * which the clock event device was armed. 549 * which the clock event device was armed.
604 * 550 *
551 * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
552 * and no expiry check happens. The timer gets enqueued into the rbtree. The
553 * reprogramming and expiry check is done in the hrtimer_interrupt or in the
554 * softirq.
555 *
605 * Called with interrupts disabled and base->cpu_base.lock held 556 * Called with interrupts disabled and base->cpu_base.lock held
606 */ 557 */
607static int hrtimer_reprogram(struct hrtimer *timer, 558static int hrtimer_reprogram(struct hrtimer *timer,
@@ -662,25 +613,13 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
662 base->hres_active = 0; 613 base->hres_active = 0;
663} 614}
664 615
665/*
666 * When High resolution timers are active, try to reprogram. Note, that in case
667 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
668 * check happens. The timer gets enqueued into the rbtree. The reprogramming
669 * and expiry check is done in the hrtimer_interrupt or in the softirq.
670 */
671static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
672 struct hrtimer_clock_base *base)
673{
674 return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
675}
676
677static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) 616static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
678{ 617{
679 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; 618 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
680 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; 619 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
681 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; 620 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
682 621
683 return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); 622 return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
684} 623}
685 624
686/* 625/*
@@ -755,8 +694,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
755static inline int hrtimer_switch_to_hres(void) { return 0; } 694static inline int hrtimer_switch_to_hres(void) { return 0; }
756static inline void 695static inline void
757hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } 696hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
758static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 697static inline int hrtimer_reprogram(struct hrtimer *timer,
759 struct hrtimer_clock_base *base) 698 struct hrtimer_clock_base *base)
760{ 699{
761 return 0; 700 return 0;
762} 701}
@@ -1013,14 +952,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1013 952
1014 leftmost = enqueue_hrtimer(timer, new_base); 953 leftmost = enqueue_hrtimer(timer, new_base);
1015 954
1016 /* 955 if (!leftmost) {
1017 * Only allow reprogramming if the new base is on this CPU. 956 unlock_hrtimer_base(timer, &flags);
1018 * (it might still be on another CPU if the timer was pending) 957 return ret;
1019 * 958 }
1020 * XXX send_remote_softirq() ? 959
1021 */ 960 if (!hrtimer_is_hres_active(timer)) {
1022 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) 961 /*
1023 && hrtimer_enqueue_reprogram(timer, new_base)) { 962 * Kick to reschedule the next tick to handle the new timer
963 * on dynticks target.
964 */
965 wake_up_nohz_cpu(new_base->cpu_base->cpu);
966 } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) &&
967 hrtimer_reprogram(timer, new_base)) {
968 /*
969 * Only allow reprogramming if the new base is on this CPU.
970 * (it might still be on another CPU if the timer was pending)
971 *
972 * XXX send_remote_softirq() ?
973 */
1024 if (wakeup) { 974 if (wakeup) {
1025 /* 975 /*
1026 * We need to drop cpu_base->lock to avoid a 976 * We need to drop cpu_base->lock to avoid a
@@ -1680,6 +1630,7 @@ static void init_hrtimers_cpu(int cpu)
1680 timerqueue_init_head(&cpu_base->clock_base[i].active); 1630 timerqueue_init_head(&cpu_base->clock_base[i].active);
1681 } 1631 }
1682 1632
1633 cpu_base->cpu = cpu;
1683 hrtimer_init_hres(cpu_base); 1634 hrtimer_init_hres(cpu_base);
1684} 1635}
1685 1636
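The __hrtimer_start_range_ns() hunk reshapes the arming path: a timer that is not the new leftmost needs no (re)programming at all, and when the target base lives on a remote dynticks CPU the code now pokes that CPU's tick with wake_up_nohz_cpu() -- using the cpu field added to hrtimer_cpu_base in the last hunk -- rather than trying to touch a remote clockevent. Condensed into one hedged sketch (names taken from the hunk, softirq/wakeup details omitted; this is the shape of the flow, not the literal kernel function):

static void arm_or_kick(struct hrtimer *timer,
			struct hrtimer_clock_base *new_base, int leftmost)
{
	if (!leftmost)
		return;		/* an earlier timer already owns the next event */

	if (!hrtimer_is_hres_active(timer)) {
		/* Dynticks/low-res target: reschedule its next tick remotely. */
		wake_up_nohz_cpu(new_base->cpu_base->cpu);
	} else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) {
		/* High-res and local: reprogram the local clockevent. */
		hrtimer_reprogram(timer, new_base);
	}
}
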
diff --git a/kernel/itimer.c b/kernel/time/itimer.c
index 8d262b467573..8d262b467573 100644
--- a/kernel/itimer.c
+++ b/kernel/time/itimer.c
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 33db43a39515..87a346fd6d61 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -466,7 +466,8 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
466 466
467static void sync_cmos_clock(struct work_struct *work) 467static void sync_cmos_clock(struct work_struct *work)
468{ 468{
469 struct timespec now, next; 469 struct timespec64 now;
470 struct timespec next;
470 int fail = 1; 471 int fail = 1;
471 472
472 /* 473 /*
@@ -485,9 +486,9 @@ static void sync_cmos_clock(struct work_struct *work)
485 return; 486 return;
486 } 487 }
487 488
488 getnstimeofday(&now); 489 getnstimeofday64(&now);
489 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { 490 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
490 struct timespec adjust = now; 491 struct timespec adjust = timespec64_to_timespec(now);
491 492
492 fail = -ENODEV; 493 fail = -ENODEV;
493 if (persistent_clock_is_local) 494 if (persistent_clock_is_local)
@@ -531,7 +532,7 @@ void ntp_notify_cmos_timer(void) { }
531/* 532/*
532 * Propagate a new txc->status value into the NTP state: 533 * Propagate a new txc->status value into the NTP state:
533 */ 534 */
534static inline void process_adj_status(struct timex *txc, struct timespec *ts) 535static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
535{ 536{
536 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { 537 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
537 time_state = TIME_OK; 538 time_state = TIME_OK;
@@ -554,7 +555,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
554 555
555 556
556static inline void process_adjtimex_modes(struct timex *txc, 557static inline void process_adjtimex_modes(struct timex *txc,
557 struct timespec *ts, 558 struct timespec64 *ts,
558 s32 *time_tai) 559 s32 *time_tai)
559{ 560{
560 if (txc->modes & ADJ_STATUS) 561 if (txc->modes & ADJ_STATUS)
@@ -640,7 +641,7 @@ int ntp_validate_timex(struct timex *txc)
640 * adjtimex mainly allows reading (and writing, if superuser) of 641 * adjtimex mainly allows reading (and writing, if superuser) of
641 * kernel time-keeping variables. used by xntpd. 642 * kernel time-keeping variables. used by xntpd.
642 */ 643 */
643int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) 644int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
644{ 645{
645 int result; 646 int result;
646 647
@@ -684,7 +685,7 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
684 /* fill PPS status fields */ 685 /* fill PPS status fields */
685 pps_fill_timex(txc); 686 pps_fill_timex(txc);
686 687
687 txc->time.tv_sec = ts->tv_sec; 688 txc->time.tv_sec = (time_t)ts->tv_sec;
688 txc->time.tv_usec = ts->tv_nsec; 689 txc->time.tv_usec = ts->tv_nsec;
689 if (!(time_status & STA_NANO)) 690 if (!(time_status & STA_NANO))
690 txc->time.tv_usec /= NSEC_PER_USEC; 691 txc->time.tv_usec /= NSEC_PER_USEC;
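sync_cmos_clock() now keeps the current time in a timespec64 and only converts back to a legacy timespec where the RTC/persistent-clock helpers still expect one. The unchanged surrounding logic writes the hardware clock only when the wall time sits within a few ticks of the middle of a second (the usual rationale being that the RTC update itself completes on a second boundary). A standalone illustration of that window test; the HZ and timestamp values are made up for the example:

#include <stdio.h>
#include <stdlib.h>

#define NSEC_PER_SEC 1000000000LL

int main(void)
{
	long long tick_nsec = NSEC_PER_SEC / 250;	/* pretend HZ=250: 4 ms tick */
	long long tv_nsec   = 495000000;		/* 495 ms into the second */

	if (llabs(tv_nsec - NSEC_PER_SEC / 2) <= tick_nsec * 5)
		printf("within the RTC update window\n");
	else
		printf("outside the window, retry next second\n");
	return 0;
}
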
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 1950cb4ca2a4..bbd102ad9df7 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -7,6 +7,6 @@ extern void ntp_clear(void);
7extern u64 ntp_tick_length(void); 7extern u64 ntp_tick_length(void);
8extern int second_overflow(unsigned long secs); 8extern int second_overflow(unsigned long secs);
9extern int ntp_validate_timex(struct timex *); 9extern int ntp_validate_timex(struct timex *);
10extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); 10extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
11extern void __hardpps(const struct timespec *, const struct timespec *); 11extern void __hardpps(const struct timespec *, const struct timespec *);
12#endif /* _LINUX_NTP_INTERNAL_H */ 12#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b8946416a5f..3b8946416a5f 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
diff --git a/kernel/posix-timers.c b/kernel/time/posix-timers.c
index 424c2d4265c9..42b463ad90f2 100644
--- a/kernel/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -49,6 +49,8 @@
49#include <linux/export.h> 49#include <linux/export.h>
50#include <linux/hashtable.h> 50#include <linux/hashtable.h>
51 51
52#include "timekeeping.h"
53
52/* 54/*
53 * Management arrays for POSIX timers. Timers are now kept in static hash table 55 * Management arrays for POSIX timers. Timers are now kept in static hash table
54 * with 512 entries. 56 * with 512 entries.
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 445106d2c729..01d2d15aa662 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -191,7 +191,8 @@ void __init sched_clock_postinit(void)
191 191
192static int sched_clock_suspend(void) 192static int sched_clock_suspend(void)
193{ 193{
194 sched_clock_poll(&sched_clock_timer); 194 update_sched_clock();
195 hrtimer_cancel(&sched_clock_timer);
195 cd.suspended = true; 196 cd.suspended = true;
196 return 0; 197 return 0;
197} 198}
@@ -199,6 +200,7 @@ static int sched_clock_suspend(void)
199static void sched_clock_resume(void) 200static void sched_clock_resume(void)
200{ 201{
201 cd.epoch_cyc = read_sched_clock(); 202 cd.epoch_cyc = read_sched_clock();
203 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
202 cd.suspended = false; 204 cd.suspended = false;
203} 205}
204 206
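sched_clock_suspend() now takes a final update_sched_clock() sample and cancels the wrap-guard hrtimer, and sched_clock_resume() re-reads the raw counter for a fresh epoch before restarting that timer with the wrap period cd.wrap_kt. These two hooks are normally wired up as syscore operations; the registration below is an assumed sketch of that pattern, not part of this hunk:

#include <linux/init.h>
#include <linux/syscore_ops.h>

static struct syscore_ops sched_clock_ops_sketch = {
	.suspend = sched_clock_suspend,
	.resume  = sched_clock_resume,
};

static int __init sched_clock_syscore_init(void)
{
	register_syscore_ops(&sched_clock_ops_sketch);
	return 0;
}
device_initcall(sched_clock_syscore_init);
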
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 7ab92b19965a..c19c1d84b6f3 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -4,6 +4,8 @@
4#include <linux/hrtimer.h> 4#include <linux/hrtimer.h>
5#include <linux/tick.h> 5#include <linux/tick.h>
6 6
7#include "timekeeping.h"
8
7extern seqlock_t jiffies_lock; 9extern seqlock_t jiffies_lock;
8 10
9#define CS_NAME_LEN 32 11#define CS_NAME_LEN 32
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6558b7ac112d..99aa6ee3908f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -154,6 +154,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
154 154
155#ifdef CONFIG_NO_HZ_FULL 155#ifdef CONFIG_NO_HZ_FULL
156cpumask_var_t tick_nohz_full_mask; 156cpumask_var_t tick_nohz_full_mask;
157cpumask_var_t housekeeping_mask;
157bool tick_nohz_full_running; 158bool tick_nohz_full_running;
158 159
159static bool can_stop_full_tick(void) 160static bool can_stop_full_tick(void)
@@ -224,13 +225,15 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
224}; 225};
225 226
226/* 227/*
227 * Kick the current CPU if it's full dynticks in order to force it to 228 * Kick the CPU if it's full dynticks in order to force it to
228 * re-evaluate its dependency on the tick and restart it if necessary. 229 * re-evaluate its dependency on the tick and restart it if necessary.
229 */ 230 */
230void tick_nohz_full_kick(void) 231void tick_nohz_full_kick_cpu(int cpu)
231{ 232{
232 if (tick_nohz_full_cpu(smp_processor_id())) 233 if (!tick_nohz_full_cpu(cpu))
233 irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); 234 return;
235
236 irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
234} 237}
235 238
236static void nohz_full_kick_ipi(void *info) 239static void nohz_full_kick_ipi(void *info)
@@ -281,6 +284,7 @@ static int __init tick_nohz_full_setup(char *str)
281 int cpu; 284 int cpu;
282 285
283 alloc_bootmem_cpumask_var(&tick_nohz_full_mask); 286 alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
287 alloc_bootmem_cpumask_var(&housekeeping_mask);
284 if (cpulist_parse(str, tick_nohz_full_mask) < 0) { 288 if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
285 pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); 289 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
286 return 1; 290 return 1;
@@ -291,6 +295,8 @@ static int __init tick_nohz_full_setup(char *str)
291 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); 295 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
292 cpumask_clear_cpu(cpu, tick_nohz_full_mask); 296 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
293 } 297 }
298 cpumask_andnot(housekeeping_mask,
299 cpu_possible_mask, tick_nohz_full_mask);
294 tick_nohz_full_running = true; 300 tick_nohz_full_running = true;
295 301
296 return 1; 302 return 1;
@@ -332,9 +338,15 @@ static int tick_nohz_init_all(void)
332 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); 338 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
333 return err; 339 return err;
334 } 340 }
341 if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
342 pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n");
343 return err;
344 }
335 err = 0; 345 err = 0;
336 cpumask_setall(tick_nohz_full_mask); 346 cpumask_setall(tick_nohz_full_mask);
337 cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); 347 cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
348 cpumask_clear(housekeeping_mask);
349 cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
338 tick_nohz_full_running = true; 350 tick_nohz_full_running = true;
339#endif 351#endif
340 return err; 352 return err;
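Both setup paths now maintain housekeeping_mask next to tick_nohz_full_mask: when nohz_full= is parsed it is simply the complement of the full-dynticks set within cpu_possible_mask, and in the tick_nohz_init_all() path it ends up holding just the boot CPU that was cleared from the nohz set. Code that has to run periodic work can then aim it at a CPU which still takes the regular tick; a hedged sketch of such a consumer (the helper name is ours, not from this diff):

static int pick_housekeeping_cpu_sketch(void)
{
	int cpu = cpumask_any_and(housekeeping_mask, cpu_online_mask);

	return cpu < nr_cpu_ids ? cpu : smp_processor_id();
}
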
diff --git a/kernel/time.c b/kernel/time/time.c
index 7c7964c33ae7..f0294ba14634 100644
--- a/kernel/time.c
+++ b/kernel/time/time.c
@@ -42,6 +42,7 @@
42#include <asm/unistd.h> 42#include <asm/unistd.h>
43 43
44#include "timeconst.h" 44#include "timeconst.h"
45#include "timekeeping.h"
45 46
46/* 47/*
47 * The timezone where the local system is located. Used as a default by some 48 * The timezone where the local system is located. Used as a default by some
@@ -420,6 +421,68 @@ struct timeval ns_to_timeval(const s64 nsec)
420} 421}
421EXPORT_SYMBOL(ns_to_timeval); 422EXPORT_SYMBOL(ns_to_timeval);
422 423
424#if BITS_PER_LONG == 32
425/**
426 * set_normalized_timespec - set timespec sec and nsec parts and normalize
427 *
428 * @ts: pointer to timespec variable to be set
429 * @sec: seconds to set
430 * @nsec: nanoseconds to set
431 *
432 * Set seconds and nanoseconds field of a timespec variable and
433 * normalize to the timespec storage format
434 *
435 * Note: The tv_nsec part is always in the range of
436 * 0 <= tv_nsec < NSEC_PER_SEC
437 * For negative values only the tv_sec field is negative !
438 */
439void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
440{
441 while (nsec >= NSEC_PER_SEC) {
442 /*
443 * The following asm() prevents the compiler from
444 * optimising this loop into a modulo operation. See
445 * also __iter_div_u64_rem() in include/linux/time.h
446 */
447 asm("" : "+rm"(nsec));
448 nsec -= NSEC_PER_SEC;
449 ++sec;
450 }
451 while (nsec < 0) {
452 asm("" : "+rm"(nsec));
453 nsec += NSEC_PER_SEC;
454 --sec;
455 }
456 ts->tv_sec = sec;
457 ts->tv_nsec = nsec;
458}
459EXPORT_SYMBOL(set_normalized_timespec64);
460
461/**
462 * ns_to_timespec64 - Convert nanoseconds to timespec64
463 * @nsec: the nanoseconds value to be converted
464 *
465 * Returns the timespec64 representation of the nsec parameter.
466 */
467struct timespec64 ns_to_timespec64(const s64 nsec)
468{
469 struct timespec64 ts;
470 s32 rem;
471
472 if (!nsec)
473 return (struct timespec64) {0, 0};
474
475 ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
476 if (unlikely(rem < 0)) {
477 ts.tv_sec--;
478 rem += NSEC_PER_SEC;
479 }
480 ts.tv_nsec = rem;
481
482 return ts;
483}
484EXPORT_SYMBOL(ns_to_timespec64);
485#endif
423/* 486/*
424 * When we convert to jiffies then we interpret incoming values 487 * When we convert to jiffies then we interpret incoming values
425 * the following way: 488 * the following way:
@@ -694,6 +757,7 @@ unsigned long nsecs_to_jiffies(u64 n)
694{ 757{
695 return (unsigned long)nsecs_to_jiffies64(n); 758 return (unsigned long)nsecs_to_jiffies64(n);
696} 759}
760EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
697 761
698/* 762/*
699 * Add two timespec values and do a safety check for overflow. 763 * Add two timespec values and do a safety check for overflow.
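For 32-bit builds, kernel/time/time.c gains timespec64 flavours of the normalisation helpers (on 64-bit, timespec64 shares the timespec layout, so the existing functions already serve both -- hence the #if BITS_PER_LONG == 32 guard). A short usage sketch of their semantics, with the results worked out in comments (values chosen for illustration; assumes <linux/time64.h>):

static void timespec64_helpers_example(void)
{
	struct timespec64 ts;

	set_normalized_timespec64(&ts, 5, 2 * NSEC_PER_SEC + 100);
	/* -> ts.tv_sec == 7, ts.tv_nsec == 100 */

	ts = ns_to_timespec64(-1);
	/* -> ts.tv_sec == -1, ts.tv_nsec == NSEC_PER_SEC - 1: tv_nsec always
	 * stays in [0, NSEC_PER_SEC), only tv_sec carries the sign. */
}
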
diff --git a/kernel/timeconst.bc b/kernel/time/timeconst.bc
index 511bdf2cafda..511bdf2cafda 100644
--- a/kernel/timeconst.bc
+++ b/kernel/time/timeconst.bc
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 32d8d6aaedb8..f36b02838a47 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -32,11 +32,34 @@
32#define TK_MIRROR (1 << 1) 32#define TK_MIRROR (1 << 1)
33#define TK_CLOCK_WAS_SET (1 << 2) 33#define TK_CLOCK_WAS_SET (1 << 2)
34 34
35static struct timekeeper timekeeper; 35/*
36 * The most important data for readout fits into a single 64 byte
37 * cache line.
38 */
39static struct {
40 seqcount_t seq;
41 struct timekeeper timekeeper;
42} tk_core ____cacheline_aligned;
43
36static DEFINE_RAW_SPINLOCK(timekeeper_lock); 44static DEFINE_RAW_SPINLOCK(timekeeper_lock);
37static seqcount_t timekeeper_seq;
38static struct timekeeper shadow_timekeeper; 45static struct timekeeper shadow_timekeeper;
39 46
47/**
48 * struct tk_fast - NMI safe timekeeper
49 * @seq: Sequence counter for protecting updates. The lowest bit
50 * is the index for the tk_read_base array
51 * @base: tk_read_base array. Access is indexed by the lowest bit of
52 * @seq.
53 *
54 * See @update_fast_timekeeper() below.
55 */
56struct tk_fast {
57 seqcount_t seq;
58 struct tk_read_base base[2];
59};
60
61static struct tk_fast tk_fast_mono ____cacheline_aligned;
62
40/* flag for if timekeeping is suspended */ 63/* flag for if timekeeping is suspended */
41int __read_mostly timekeeping_suspended; 64int __read_mostly timekeeping_suspended;
42 65
@@ -45,49 +68,54 @@ bool __read_mostly persistent_clock_exist = false;
45 68
46static inline void tk_normalize_xtime(struct timekeeper *tk) 69static inline void tk_normalize_xtime(struct timekeeper *tk)
47{ 70{
48 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { 71 while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) {
49 tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; 72 tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift;
50 tk->xtime_sec++; 73 tk->xtime_sec++;
51 } 74 }
52} 75}
53 76
54static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) 77static inline struct timespec64 tk_xtime(struct timekeeper *tk)
78{
79 struct timespec64 ts;
80
81 ts.tv_sec = tk->xtime_sec;
82 ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
83 return ts;
84}
85
86static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
55{ 87{
56 tk->xtime_sec = ts->tv_sec; 88 tk->xtime_sec = ts->tv_sec;
57 tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; 89 tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift;
58} 90}
59 91
60static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) 92static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
61{ 93{
62 tk->xtime_sec += ts->tv_sec; 94 tk->xtime_sec += ts->tv_sec;
63 tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; 95 tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift;
64 tk_normalize_xtime(tk); 96 tk_normalize_xtime(tk);
65} 97}
66 98
67static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) 99static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
68{ 100{
69 struct timespec tmp; 101 struct timespec64 tmp;
70 102
71 /* 103 /*
72 * Verify consistency of: offset_real = -wall_to_monotonic 104 * Verify consistency of: offset_real = -wall_to_monotonic
73 * before modifying anything 105 * before modifying anything
74 */ 106 */
75 set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, 107 set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
76 -tk->wall_to_monotonic.tv_nsec); 108 -tk->wall_to_monotonic.tv_nsec);
77 WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); 109 WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64);
78 tk->wall_to_monotonic = wtm; 110 tk->wall_to_monotonic = wtm;
79 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); 111 set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
80 tk->offs_real = timespec_to_ktime(tmp); 112 tk->offs_real = timespec64_to_ktime(tmp);
81 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); 113 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
82} 114}
83 115
84static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) 116static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
85{ 117{
86 /* Verify consistency before modifying */ 118 tk->offs_boot = ktime_add(tk->offs_boot, delta);
87 WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
88
89 tk->total_sleep_time = t;
90 tk->offs_boot = timespec_to_ktime(t);
91} 119}
92 120
93/** 121/**
@@ -107,9 +135,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
107 u64 tmp, ntpinterval; 135 u64 tmp, ntpinterval;
108 struct clocksource *old_clock; 136 struct clocksource *old_clock;
109 137
110 old_clock = tk->clock; 138 old_clock = tk->tkr.clock;
111 tk->clock = clock; 139 tk->tkr.clock = clock;
112 tk->cycle_last = clock->cycle_last = clock->read(clock); 140 tk->tkr.read = clock->read;
141 tk->tkr.mask = clock->mask;
142 tk->tkr.cycle_last = tk->tkr.read(clock);
113 143
114 /* Do the ns -> cycle conversion first, using original mult */ 144 /* Do the ns -> cycle conversion first, using original mult */
115 tmp = NTP_INTERVAL_LENGTH; 145 tmp = NTP_INTERVAL_LENGTH;
@@ -133,78 +163,212 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
133 if (old_clock) { 163 if (old_clock) {
134 int shift_change = clock->shift - old_clock->shift; 164 int shift_change = clock->shift - old_clock->shift;
135 if (shift_change < 0) 165 if (shift_change < 0)
136 tk->xtime_nsec >>= -shift_change; 166 tk->tkr.xtime_nsec >>= -shift_change;
137 else 167 else
138 tk->xtime_nsec <<= shift_change; 168 tk->tkr.xtime_nsec <<= shift_change;
139 } 169 }
140 tk->shift = clock->shift; 170 tk->tkr.shift = clock->shift;
141 171
142 tk->ntp_error = 0; 172 tk->ntp_error = 0;
143 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; 173 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
174 tk->ntp_tick = ntpinterval << tk->ntp_error_shift;
144 175
145 /* 176 /*
146 * The timekeeper keeps its own mult values for the currently 177 * The timekeeper keeps its own mult values for the currently
147 * active clocksource. These value will be adjusted via NTP 178 * active clocksource. These value will be adjusted via NTP
148 * to counteract clock drifting. 179 * to counteract clock drifting.
149 */ 180 */
150 tk->mult = clock->mult; 181 tk->tkr.mult = clock->mult;
182 tk->ntp_err_mult = 0;
151} 183}
152 184
153/* Timekeeper helper functions. */ 185/* Timekeeper helper functions. */
154 186
155#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 187#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
156u32 (*arch_gettimeoffset)(void); 188static u32 default_arch_gettimeoffset(void) { return 0; }
157 189u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
158u32 get_arch_timeoffset(void)
159{
160 if (likely(arch_gettimeoffset))
161 return arch_gettimeoffset();
162 return 0;
163}
164#else 190#else
165static inline u32 get_arch_timeoffset(void) { return 0; } 191static inline u32 arch_gettimeoffset(void) { return 0; }
166#endif 192#endif
167 193
168static inline s64 timekeeping_get_ns(struct timekeeper *tk) 194static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
169{ 195{
170 cycle_t cycle_now, cycle_delta; 196 cycle_t cycle_now, delta;
171 struct clocksource *clock;
172 s64 nsec; 197 s64 nsec;
173 198
174 /* read clocksource: */ 199 /* read clocksource: */
175 clock = tk->clock; 200 cycle_now = tkr->read(tkr->clock);
176 cycle_now = clock->read(clock);
177 201
178 /* calculate the delta since the last update_wall_time: */ 202 /* calculate the delta since the last update_wall_time: */
179 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 203 delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
180 204
181 nsec = cycle_delta * tk->mult + tk->xtime_nsec; 205 nsec = delta * tkr->mult + tkr->xtime_nsec;
182 nsec >>= tk->shift; 206 nsec >>= tkr->shift;
183 207
184 /* If arch requires, add in get_arch_timeoffset() */ 208 /* If arch requires, add in get_arch_timeoffset() */
185 return nsec + get_arch_timeoffset(); 209 return nsec + arch_gettimeoffset();
186} 210}
187 211
188static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) 212static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
189{ 213{
190 cycle_t cycle_now, cycle_delta; 214 struct clocksource *clock = tk->tkr.clock;
191 struct clocksource *clock; 215 cycle_t cycle_now, delta;
192 s64 nsec; 216 s64 nsec;
193 217
194 /* read clocksource: */ 218 /* read clocksource: */
195 clock = tk->clock; 219 cycle_now = tk->tkr.read(clock);
196 cycle_now = clock->read(clock);
197 220
198 /* calculate the delta since the last update_wall_time: */ 221 /* calculate the delta since the last update_wall_time: */
199 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 222 delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
200 223
201 /* convert delta to nanoseconds. */ 224 /* convert delta to nanoseconds. */
202 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 225 nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
203 226
204 /* If arch requires, add in get_arch_timeoffset() */ 227 /* If arch requires, add in get_arch_timeoffset() */
205 return nsec + get_arch_timeoffset(); 228 return nsec + arch_gettimeoffset();
229}
230
231/**
232 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
233 * @tk: The timekeeper from which we take the update
234 * @tkf: The fast timekeeper to update
235 * @tbase: The time base for the fast timekeeper (mono/raw)
236 *
237 * We want to use this from any context including NMI and tracing /
238 * instrumenting the timekeeping code itself.
239 *
240 * So we handle this differently than the other timekeeping accessor
241 * functions which retry when the sequence count has changed. The
242 * update side does:
243 *
244 * smp_wmb(); <- Ensure that the last base[1] update is visible
245 * tkf->seq++;
246 * smp_wmb(); <- Ensure that the seqcount update is visible
247 * update(tkf->base[0], tk);
248 * smp_wmb(); <- Ensure that the base[0] update is visible
249 * tkf->seq++;
250 * smp_wmb(); <- Ensure that the seqcount update is visible
251 * update(tkf->base[1], tk);
252 *
253 * The reader side does:
254 *
255 * do {
256 * seq = tkf->seq;
257 * smp_rmb();
258 * idx = seq & 0x01;
259 * now = now(tkf->base[idx]);
260 * smp_rmb();
261 * } while (seq != tkf->seq)
262 *
263 * As long as we update base[0] readers are forced off to
264 * base[1]. Once base[0] is updated readers are redirected to base[0]
265 * and the base[1] update takes place.
266 *
267 * So if a NMI hits the update of base[0] then it will use base[1]
268 * which is still consistent. In the worst case this can result is a
269 * slightly wrong timestamp (a few nanoseconds). See
270 * @ktime_get_mono_fast_ns.
271 */
272static void update_fast_timekeeper(struct timekeeper *tk)
273{
274 struct tk_read_base *base = tk_fast_mono.base;
275
276 /* Force readers off to base[1] */
277 raw_write_seqcount_latch(&tk_fast_mono.seq);
278
279 /* Update base[0] */
280 memcpy(base, &tk->tkr, sizeof(*base));
281
282 /* Force readers back to base[0] */
283 raw_write_seqcount_latch(&tk_fast_mono.seq);
284
285 /* Update base[1] */
286 memcpy(base + 1, base, sizeof(*base));
206} 287}
207 288
289/**
290 * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
291 *
292 * This timestamp is not guaranteed to be monotonic across an update.
293 * The timestamp is calculated by:
294 *
295 * now = base_mono + clock_delta * slope
296 *
297 * So if the update lowers the slope, readers who are forced to the
298 * not yet updated second array are still using the old steeper slope.
299 *
300 * tmono
301 * ^
302 * | o n
303 * | o n
304 * | u
305 * | o
306 * |o
307 * |12345678---> reader order
308 *
309 * o = old slope
310 * u = update
311 * n = new slope
312 *
313 * So reader 6 will observe time going backwards versus reader 5.
314 *
315 * While other CPUs are likely to be able observe that, the only way
316 * for a CPU local observation is when an NMI hits in the middle of
317 * the update. Timestamps taken from that NMI context might be ahead
318 * of the following timestamps. Callers need to be aware of that and
319 * deal with it.
320 */
321u64 notrace ktime_get_mono_fast_ns(void)
322{
323 struct tk_read_base *tkr;
324 unsigned int seq;
325 u64 now;
326
327 do {
328 seq = raw_read_seqcount(&tk_fast_mono.seq);
329 tkr = tk_fast_mono.base + (seq & 0x01);
330 now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
331
332 } while (read_seqcount_retry(&tk_fast_mono.seq, seq));
333 return now;
334}
335EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
336
337#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
338
339static inline void update_vsyscall(struct timekeeper *tk)
340{
341 struct timespec xt;
342
343 xt = timespec64_to_timespec(tk_xtime(tk));
344 update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult,
345 tk->tkr.cycle_last);
346}
347
348static inline void old_vsyscall_fixup(struct timekeeper *tk)
349{
350 s64 remainder;
351
352 /*
353 * Store only full nanoseconds into xtime_nsec after rounding
354 * it up and add the remainder to the error difference.
355 * XXX - This is necessary to avoid small 1ns inconsistnecies caused
356 * by truncating the remainder in vsyscalls. However, it causes
357 * additional work to be done in timekeeping_adjust(). Once
358 * the vsyscall implementations are converted to use xtime_nsec
359 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
360 * users are removed, this can be killed.
361 */
362 remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1);
363 tk->tkr.xtime_nsec -= remainder;
364 tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift;
365 tk->ntp_error += remainder << tk->ntp_error_shift;
366 tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift;
367}
368#else
369#define old_vsyscall_fixup(tk)
370#endif
371
208static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); 372static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
209 373
210static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) 374static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
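The block above introduces the latch-protected fast timekeeper: update_fast_timekeeper() copies tk->tkr into base[0] and then base[1], bumping tk_fast_mono.seq before each copy, and readers index base[seq & 0x01], so an NMI landing in the middle of an update still sees one consistent copy. The documented cost is that a timestamp taken inside the update window may be slightly off and is not guaranteed monotonic against one taken right after. A hedged sketch of the kind of consumer this is aimed at (the hook below is illustrative, not part of the patch):

static u64 last_nmi_ns;

static void my_nmi_hook(void)
{
	u64 now = ktime_get_mono_fast_ns();	/* never spins on tk_core.seq */

	/* Tolerate the small backward steps the comment above allows for. */
	if (now > last_nmi_ns)
		last_nmi_ns = now;
}
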
@@ -217,7 +381,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
217 */ 381 */
218int pvclock_gtod_register_notifier(struct notifier_block *nb) 382int pvclock_gtod_register_notifier(struct notifier_block *nb)
219{ 383{
220 struct timekeeper *tk = &timekeeper; 384 struct timekeeper *tk = &tk_core.timekeeper;
221 unsigned long flags; 385 unsigned long flags;
222 int ret; 386 int ret;
223 387
@@ -247,6 +411,29 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
247} 411}
248EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); 412EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
249 413
414/*
415 * Update the ktime_t based scalar nsec members of the timekeeper
416 */
417static inline void tk_update_ktime_data(struct timekeeper *tk)
418{
419 s64 nsec;
420
421 /*
422 * The xtime based monotonic readout is:
423 * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
424 * The ktime based monotonic readout is:
425 * nsec = base_mono + now();
426 * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
427 */
428 nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
429 nsec *= NSEC_PER_SEC;
430 nsec += tk->wall_to_monotonic.tv_nsec;
431 tk->tkr.base_mono = ns_to_ktime(nsec);
432
433 /* Update the monotonic raw base */
434 tk->base_raw = timespec64_to_ktime(tk->raw_time);
435}
436
250/* must hold timekeeper_lock */ 437/* must hold timekeeper_lock */
251static void timekeeping_update(struct timekeeper *tk, unsigned int action) 438static void timekeeping_update(struct timekeeper *tk, unsigned int action)
252{ 439{
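tk_update_ktime_data() caches the ktime_t form of the monotonic base so readers no longer have to add xtime and wall_to_monotonic themselves. The invariant it maintains, spelled out as a hypothetical debug check against the fields used in this file (not part of the patch):

static void check_base_mono(struct timekeeper *tk)
{
	s64 nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec) * NSEC_PER_SEC
			+ tk->wall_to_monotonic.tv_nsec;

	WARN_ON_ONCE(ktime_to_ns(tk->tkr.base_mono) != nsec);
}
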
@@ -257,8 +444,13 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
257 update_vsyscall(tk); 444 update_vsyscall(tk);
258 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); 445 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
259 446
447 tk_update_ktime_data(tk);
448
260 if (action & TK_MIRROR) 449 if (action & TK_MIRROR)
261 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); 450 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
451 sizeof(tk_core.timekeeper));
452
453 update_fast_timekeeper(tk);
262} 454}
263 455
264/** 456/**
@@ -270,49 +462,48 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
270 */ 462 */
271static void timekeeping_forward_now(struct timekeeper *tk) 463static void timekeeping_forward_now(struct timekeeper *tk)
272{ 464{
273 cycle_t cycle_now, cycle_delta; 465 struct clocksource *clock = tk->tkr.clock;
274 struct clocksource *clock; 466 cycle_t cycle_now, delta;
275 s64 nsec; 467 s64 nsec;
276 468
277 clock = tk->clock; 469 cycle_now = tk->tkr.read(clock);
278 cycle_now = clock->read(clock); 470 delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
279 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 471 tk->tkr.cycle_last = cycle_now;
280 tk->cycle_last = clock->cycle_last = cycle_now;
281 472
282 tk->xtime_nsec += cycle_delta * tk->mult; 473 tk->tkr.xtime_nsec += delta * tk->tkr.mult;
283 474
284 /* If arch requires, add in get_arch_timeoffset() */ 475 /* If arch requires, add in get_arch_timeoffset() */
285 tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; 476 tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift;
286 477
287 tk_normalize_xtime(tk); 478 tk_normalize_xtime(tk);
288 479
289 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 480 nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
290 timespec_add_ns(&tk->raw_time, nsec); 481 timespec64_add_ns(&tk->raw_time, nsec);
291} 482}
292 483
293/** 484/**
294 * __getnstimeofday - Returns the time of day in a timespec. 485 * __getnstimeofday64 - Returns the time of day in a timespec64.
295 * @ts: pointer to the timespec to be set 486 * @ts: pointer to the timespec to be set
296 * 487 *
297 * Updates the time of day in the timespec. 488 * Updates the time of day in the timespec.
298 * Returns 0 on success, or -ve when suspended (timespec will be undefined). 489 * Returns 0 on success, or -ve when suspended (timespec will be undefined).
299 */ 490 */
300int __getnstimeofday(struct timespec *ts) 491int __getnstimeofday64(struct timespec64 *ts)
301{ 492{
302 struct timekeeper *tk = &timekeeper; 493 struct timekeeper *tk = &tk_core.timekeeper;
303 unsigned long seq; 494 unsigned long seq;
304 s64 nsecs = 0; 495 s64 nsecs = 0;
305 496
306 do { 497 do {
307 seq = read_seqcount_begin(&timekeeper_seq); 498 seq = read_seqcount_begin(&tk_core.seq);
308 499
309 ts->tv_sec = tk->xtime_sec; 500 ts->tv_sec = tk->xtime_sec;
310 nsecs = timekeeping_get_ns(tk); 501 nsecs = timekeeping_get_ns(&tk->tkr);
311 502
312 } while (read_seqcount_retry(&timekeeper_seq, seq)); 503 } while (read_seqcount_retry(&tk_core.seq, seq));
313 504
314 ts->tv_nsec = 0; 505 ts->tv_nsec = 0;
315 timespec_add_ns(ts, nsecs); 506 timespec64_add_ns(ts, nsecs);
316 507
317 /* 508 /*
318 * Do not bail out early, in case there were callers still using 509 * Do not bail out early, in case there were callers still using
@@ -322,116 +513,138 @@ int __getnstimeofday(struct timespec *ts)
322 return -EAGAIN; 513 return -EAGAIN;
323 return 0; 514 return 0;
324} 515}
325EXPORT_SYMBOL(__getnstimeofday); 516EXPORT_SYMBOL(__getnstimeofday64);
326 517
327/** 518/**
328 * getnstimeofday - Returns the time of day in a timespec. 519 * getnstimeofday64 - Returns the time of day in a timespec64.
329 * @ts: pointer to the timespec to be set 520 * @ts: pointer to the timespec to be set
330 * 521 *
331 * Returns the time of day in a timespec (WARN if suspended). 522 * Returns the time of day in a timespec (WARN if suspended).
332 */ 523 */
333void getnstimeofday(struct timespec *ts) 524void getnstimeofday64(struct timespec64 *ts)
334{ 525{
335 WARN_ON(__getnstimeofday(ts)); 526 WARN_ON(__getnstimeofday64(ts));
336} 527}
337EXPORT_SYMBOL(getnstimeofday); 528EXPORT_SYMBOL(getnstimeofday64);
338 529
339ktime_t ktime_get(void) 530ktime_t ktime_get(void)
340{ 531{
341 struct timekeeper *tk = &timekeeper; 532 struct timekeeper *tk = &tk_core.timekeeper;
342 unsigned int seq; 533 unsigned int seq;
343 s64 secs, nsecs; 534 ktime_t base;
535 s64 nsecs;
344 536
345 WARN_ON(timekeeping_suspended); 537 WARN_ON(timekeeping_suspended);
346 538
347 do { 539 do {
348 seq = read_seqcount_begin(&timekeeper_seq); 540 seq = read_seqcount_begin(&tk_core.seq);
349 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; 541 base = tk->tkr.base_mono;
350 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; 542 nsecs = timekeeping_get_ns(&tk->tkr);
351 543
352 } while (read_seqcount_retry(&timekeeper_seq, seq)); 544 } while (read_seqcount_retry(&tk_core.seq, seq));
353 /* 545
354 * Use ktime_set/ktime_add_ns to create a proper ktime on 546 return ktime_add_ns(base, nsecs);
355 * 32-bit architectures without CONFIG_KTIME_SCALAR.
356 */
357 return ktime_add_ns(ktime_set(secs, 0), nsecs);
358} 547}
359EXPORT_SYMBOL_GPL(ktime_get); 548EXPORT_SYMBOL_GPL(ktime_get);
360 549
361/** 550static ktime_t *offsets[TK_OFFS_MAX] = {
362 * ktime_get_ts - get the monotonic clock in timespec format 551 [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
363 * @ts: pointer to timespec variable 552 [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
364 * 553 [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai,
365 * The function calculates the monotonic clock from the realtime 554};
366 * clock and the wall_to_monotonic offset and stores the result 555
367 * in normalized timespec format in the variable pointed to by @ts. 556ktime_t ktime_get_with_offset(enum tk_offsets offs)
368 */
369void ktime_get_ts(struct timespec *ts)
370{ 557{
371 struct timekeeper *tk = &timekeeper; 558 struct timekeeper *tk = &tk_core.timekeeper;
372 struct timespec tomono;
373 s64 nsec;
374 unsigned int seq; 559 unsigned int seq;
560 ktime_t base, *offset = offsets[offs];
561 s64 nsecs;
375 562
376 WARN_ON(timekeeping_suspended); 563 WARN_ON(timekeeping_suspended);
377 564
378 do { 565 do {
379 seq = read_seqcount_begin(&timekeeper_seq); 566 seq = read_seqcount_begin(&tk_core.seq);
380 ts->tv_sec = tk->xtime_sec; 567 base = ktime_add(tk->tkr.base_mono, *offset);
381 nsec = timekeeping_get_ns(tk); 568 nsecs = timekeeping_get_ns(&tk->tkr);
382 tomono = tk->wall_to_monotonic;
383 569
384 } while (read_seqcount_retry(&timekeeper_seq, seq)); 570 } while (read_seqcount_retry(&tk_core.seq, seq));
385 571
386 ts->tv_sec += tomono.tv_sec; 572 return ktime_add_ns(base, nsecs);
387 ts->tv_nsec = 0;
388 timespec_add_ns(ts, nsec + tomono.tv_nsec);
389}
390EXPORT_SYMBOL_GPL(ktime_get_ts);
391 573
574}
575EXPORT_SYMBOL_GPL(ktime_get_with_offset);
392 576
393/** 577/**
394 * timekeeping_clocktai - Returns the TAI time of day in a timespec 578 * ktime_mono_to_any() - convert mononotic time to any other time
395 * @ts: pointer to the timespec to be set 579 * @tmono: time to convert.
396 * 580 * @offs: which offset to use
397 * Returns the time of day in a timespec.
398 */ 581 */
399void timekeeping_clocktai(struct timespec *ts) 582ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
400{ 583{
401 struct timekeeper *tk = &timekeeper; 584 ktime_t *offset = offsets[offs];
402 unsigned long seq; 585 unsigned long seq;
403 u64 nsecs; 586 ktime_t tconv;
404
405 WARN_ON(timekeeping_suspended);
406 587
407 do { 588 do {
408 seq = read_seqcount_begin(&timekeeper_seq); 589 seq = read_seqcount_begin(&tk_core.seq);
590 tconv = ktime_add(tmono, *offset);
591 } while (read_seqcount_retry(&tk_core.seq, seq));
409 592
410 ts->tv_sec = tk->xtime_sec + tk->tai_offset; 593 return tconv;
411 nsecs = timekeeping_get_ns(tk); 594}
595EXPORT_SYMBOL_GPL(ktime_mono_to_any);
412 596
413 } while (read_seqcount_retry(&timekeeper_seq, seq)); 597/**
598 * ktime_get_raw - Returns the raw monotonic time in ktime_t format
599 */
600ktime_t ktime_get_raw(void)
601{
602 struct timekeeper *tk = &tk_core.timekeeper;
603 unsigned int seq;
604 ktime_t base;
605 s64 nsecs;
414 606
415 ts->tv_nsec = 0; 607 do {
416 timespec_add_ns(ts, nsecs); 608 seq = read_seqcount_begin(&tk_core.seq);
609 base = tk->base_raw;
610 nsecs = timekeeping_get_ns_raw(tk);
417 611
418} 612 } while (read_seqcount_retry(&tk_core.seq, seq));
419EXPORT_SYMBOL(timekeeping_clocktai);
420 613
614 return ktime_add_ns(base, nsecs);
615}
616EXPORT_SYMBOL_GPL(ktime_get_raw);
421 617
422/** 618/**
423 * ktime_get_clocktai - Returns the TAI time of day in a ktime 619 * ktime_get_ts64 - get the monotonic clock in timespec64 format
620 * @ts: pointer to timespec variable
424 * 621 *
425 * Returns the time of day in a ktime. 622 * The function calculates the monotonic clock from the realtime
623 * clock and the wall_to_monotonic offset and stores the result
624 * in normalized timespec format in the variable pointed to by @ts.
426 */ 625 */
427ktime_t ktime_get_clocktai(void) 626void ktime_get_ts64(struct timespec64 *ts)
428{ 627{
429 struct timespec ts; 628 struct timekeeper *tk = &tk_core.timekeeper;
629 struct timespec64 tomono;
630 s64 nsec;
631 unsigned int seq;
632
633 WARN_ON(timekeeping_suspended);
430 634
431 timekeeping_clocktai(&ts); 635 do {
432 return timespec_to_ktime(ts); 636 seq = read_seqcount_begin(&tk_core.seq);
637 ts->tv_sec = tk->xtime_sec;
638 nsec = timekeeping_get_ns(&tk->tkr);
639 tomono = tk->wall_to_monotonic;
640
641 } while (read_seqcount_retry(&tk_core.seq, seq));
642
643 ts->tv_sec += tomono.tv_sec;
644 ts->tv_nsec = 0;
645 timespec64_add_ns(ts, nsec + tomono.tv_nsec);
433} 646}
434EXPORT_SYMBOL(ktime_get_clocktai); 647EXPORT_SYMBOL_GPL(ktime_get_ts64);
435 648
436#ifdef CONFIG_NTP_PPS 649#ifdef CONFIG_NTP_PPS
437 650
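Besides the slimmed-down ktime_get(), this hunk adds ktime_get_with_offset() -- one implementation behind the REALTIME/BOOTTIME/TAI readouts, driven by the offsets[] table -- plus ktime_mono_to_any() for shifting an existing monotonic timestamp into another clock domain, and ktime_get_raw()/ktime_get_ts64(). A short usage sketch of the conversion helper (illustrative only):

static ktime_t mono_to_boot_example(void)
{
	ktime_t mono = ktime_get();

	/* e.g. to compare against a CLOCK_BOOTTIME-based deadline */
	return ktime_mono_to_any(mono, TK_OFFS_BOOT);
}
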
@@ -446,23 +659,23 @@ EXPORT_SYMBOL(ktime_get_clocktai);
446 */ 659 */
447void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) 660void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
448{ 661{
449 struct timekeeper *tk = &timekeeper; 662 struct timekeeper *tk = &tk_core.timekeeper;
450 unsigned long seq; 663 unsigned long seq;
451 s64 nsecs_raw, nsecs_real; 664 s64 nsecs_raw, nsecs_real;
452 665
453 WARN_ON_ONCE(timekeeping_suspended); 666 WARN_ON_ONCE(timekeeping_suspended);
454 667
455 do { 668 do {
456 seq = read_seqcount_begin(&timekeeper_seq); 669 seq = read_seqcount_begin(&tk_core.seq);
457 670
458 *ts_raw = tk->raw_time; 671 *ts_raw = timespec64_to_timespec(tk->raw_time);
459 ts_real->tv_sec = tk->xtime_sec; 672 ts_real->tv_sec = tk->xtime_sec;
460 ts_real->tv_nsec = 0; 673 ts_real->tv_nsec = 0;
461 674
462 nsecs_raw = timekeeping_get_ns_raw(tk); 675 nsecs_raw = timekeeping_get_ns_raw(tk);
463 nsecs_real = timekeeping_get_ns(tk); 676 nsecs_real = timekeeping_get_ns(&tk->tkr);
464 677
465 } while (read_seqcount_retry(&timekeeper_seq, seq)); 678 } while (read_seqcount_retry(&tk_core.seq, seq));
466 679
467 timespec_add_ns(ts_raw, nsecs_raw); 680 timespec_add_ns(ts_raw, nsecs_raw);
468 timespec_add_ns(ts_real, nsecs_real); 681 timespec_add_ns(ts_real, nsecs_real);
@@ -479,9 +692,9 @@ EXPORT_SYMBOL(getnstime_raw_and_real);
479 */ 692 */
480void do_gettimeofday(struct timeval *tv) 693void do_gettimeofday(struct timeval *tv)
481{ 694{
482 struct timespec now; 695 struct timespec64 now;
483 696
484 getnstimeofday(&now); 697 getnstimeofday64(&now);
485 tv->tv_sec = now.tv_sec; 698 tv->tv_sec = now.tv_sec;
486 tv->tv_usec = now.tv_nsec/1000; 699 tv->tv_usec = now.tv_nsec/1000;
487} 700}
@@ -495,15 +708,15 @@ EXPORT_SYMBOL(do_gettimeofday);
495 */ 708 */
496int do_settimeofday(const struct timespec *tv) 709int do_settimeofday(const struct timespec *tv)
497{ 710{
498 struct timekeeper *tk = &timekeeper; 711 struct timekeeper *tk = &tk_core.timekeeper;
499 struct timespec ts_delta, xt; 712 struct timespec64 ts_delta, xt, tmp;
500 unsigned long flags; 713 unsigned long flags;
501 714
502 if (!timespec_valid_strict(tv)) 715 if (!timespec_valid_strict(tv))
503 return -EINVAL; 716 return -EINVAL;
504 717
505 raw_spin_lock_irqsave(&timekeeper_lock, flags); 718 raw_spin_lock_irqsave(&timekeeper_lock, flags);
506 write_seqcount_begin(&timekeeper_seq); 719 write_seqcount_begin(&tk_core.seq);
507 720
508 timekeeping_forward_now(tk); 721 timekeeping_forward_now(tk);
509 722
@@ -511,13 +724,14 @@ int do_settimeofday(const struct timespec *tv)
511 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; 724 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
512 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; 725 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
513 726
514 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); 727 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
515 728
516 tk_set_xtime(tk, tv); 729 tmp = timespec_to_timespec64(*tv);
730 tk_set_xtime(tk, &tmp);
517 731
518 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 732 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
519 733
520 write_seqcount_end(&timekeeper_seq); 734 write_seqcount_end(&tk_core.seq);
521 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 735 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
522 736
523 /* signal hrtimers about time change */ 737 /* signal hrtimers about time change */
@@ -535,33 +749,35 @@ EXPORT_SYMBOL(do_settimeofday);
535 */ 749 */
536int timekeeping_inject_offset(struct timespec *ts) 750int timekeeping_inject_offset(struct timespec *ts)
537{ 751{
538 struct timekeeper *tk = &timekeeper; 752 struct timekeeper *tk = &tk_core.timekeeper;
539 unsigned long flags; 753 unsigned long flags;
540 struct timespec tmp; 754 struct timespec64 ts64, tmp;
541 int ret = 0; 755 int ret = 0;
542 756
543 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 757 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
544 return -EINVAL; 758 return -EINVAL;
545 759
760 ts64 = timespec_to_timespec64(*ts);
761
546 raw_spin_lock_irqsave(&timekeeper_lock, flags); 762 raw_spin_lock_irqsave(&timekeeper_lock, flags);
547 write_seqcount_begin(&timekeeper_seq); 763 write_seqcount_begin(&tk_core.seq);
548 764
549 timekeeping_forward_now(tk); 765 timekeeping_forward_now(tk);
550 766
551 /* Make sure the proposed value is valid */ 767 /* Make sure the proposed value is valid */
552 tmp = timespec_add(tk_xtime(tk), *ts); 768 tmp = timespec64_add(tk_xtime(tk), ts64);
553 if (!timespec_valid_strict(&tmp)) { 769 if (!timespec64_valid_strict(&tmp)) {
554 ret = -EINVAL; 770 ret = -EINVAL;
555 goto error; 771 goto error;
556 } 772 }
557 773
558 tk_xtime_add(tk, ts); 774 tk_xtime_add(tk, &ts64);
559 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); 775 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64));
560 776
561error: /* even if we error out, we forwarded the time, so call update */ 777error: /* even if we error out, we forwarded the time, so call update */
562 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 778 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
563 779
564 write_seqcount_end(&timekeeper_seq); 780 write_seqcount_end(&tk_core.seq);
565 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 781 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
566 782
567 /* signal hrtimers about time change */ 783 /* signal hrtimers about time change */
@@ -578,14 +794,14 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
578 */ 794 */
579s32 timekeeping_get_tai_offset(void) 795s32 timekeeping_get_tai_offset(void)
580{ 796{
581 struct timekeeper *tk = &timekeeper; 797 struct timekeeper *tk = &tk_core.timekeeper;
582 unsigned int seq; 798 unsigned int seq;
583 s32 ret; 799 s32 ret;
584 800
585 do { 801 do {
586 seq = read_seqcount_begin(&timekeeper_seq); 802 seq = read_seqcount_begin(&tk_core.seq);
587 ret = tk->tai_offset; 803 ret = tk->tai_offset;
588 } while (read_seqcount_retry(&timekeeper_seq, seq)); 804 } while (read_seqcount_retry(&tk_core.seq, seq));
589 805
590 return ret; 806 return ret;
591} 807}
@@ -606,14 +822,14 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
606 */ 822 */
607void timekeeping_set_tai_offset(s32 tai_offset) 823void timekeeping_set_tai_offset(s32 tai_offset)
608{ 824{
609 struct timekeeper *tk = &timekeeper; 825 struct timekeeper *tk = &tk_core.timekeeper;
610 unsigned long flags; 826 unsigned long flags;
611 827
612 raw_spin_lock_irqsave(&timekeeper_lock, flags); 828 raw_spin_lock_irqsave(&timekeeper_lock, flags);
613 write_seqcount_begin(&timekeeper_seq); 829 write_seqcount_begin(&tk_core.seq);
614 __timekeeping_set_tai_offset(tk, tai_offset); 830 __timekeeping_set_tai_offset(tk, tai_offset);
615 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); 831 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
616 write_seqcount_end(&timekeeper_seq); 832 write_seqcount_end(&tk_core.seq);
617 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 833 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
618 clock_was_set(); 834 clock_was_set();
619} 835}
@@ -625,14 +841,14 @@ void timekeeping_set_tai_offset(s32 tai_offset)
625 */ 841 */
626static int change_clocksource(void *data) 842static int change_clocksource(void *data)
627{ 843{
628 struct timekeeper *tk = &timekeeper; 844 struct timekeeper *tk = &tk_core.timekeeper;
629 struct clocksource *new, *old; 845 struct clocksource *new, *old;
630 unsigned long flags; 846 unsigned long flags;
631 847
632 new = (struct clocksource *) data; 848 new = (struct clocksource *) data;
633 849
634 raw_spin_lock_irqsave(&timekeeper_lock, flags); 850 raw_spin_lock_irqsave(&timekeeper_lock, flags);
635 write_seqcount_begin(&timekeeper_seq); 851 write_seqcount_begin(&tk_core.seq);
636 852
637 timekeeping_forward_now(tk); 853 timekeeping_forward_now(tk);
638 /* 854 /*
@@ -641,7 +857,7 @@ static int change_clocksource(void *data)
641 */ 857 */
642 if (try_module_get(new->owner)) { 858 if (try_module_get(new->owner)) {
643 if (!new->enable || new->enable(new) == 0) { 859 if (!new->enable || new->enable(new) == 0) {
644 old = tk->clock; 860 old = tk->tkr.clock;
645 tk_setup_internals(tk, new); 861 tk_setup_internals(tk, new);
646 if (old->disable) 862 if (old->disable)
647 old->disable(old); 863 old->disable(old);
@@ -652,7 +868,7 @@ static int change_clocksource(void *data)
652 } 868 }
653 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 869 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
654 870
655 write_seqcount_end(&timekeeper_seq); 871 write_seqcount_end(&tk_core.seq);
656 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 872 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
657 873
658 return 0; 874 return 0;
@@ -667,29 +883,14 @@ static int change_clocksource(void *data)
667 */ 883 */
668int timekeeping_notify(struct clocksource *clock) 884int timekeeping_notify(struct clocksource *clock)
669{ 885{
670 struct timekeeper *tk = &timekeeper; 886 struct timekeeper *tk = &tk_core.timekeeper;
671 887
672 if (tk->clock == clock) 888 if (tk->tkr.clock == clock)
673 return 0; 889 return 0;
674 stop_machine(change_clocksource, clock, NULL); 890 stop_machine(change_clocksource, clock, NULL);
675 tick_clock_notify(); 891 tick_clock_notify();
676 return tk->clock == clock ? 0 : -1; 892 return tk->tkr.clock == clock ? 0 : -1;
677}
678
679/**
680 * ktime_get_real - get the real (wall-) time in ktime_t format
681 *
682 * returns the time in ktime_t format
683 */
684ktime_t ktime_get_real(void)
685{
686 struct timespec now;
687
688 getnstimeofday(&now);
689
690 return timespec_to_ktime(now);
691} 893}
692EXPORT_SYMBOL_GPL(ktime_get_real);
693 894
694/** 895/**
695 * getrawmonotonic - Returns the raw monotonic time in a timespec 896 * getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -699,18 +900,20 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
699 */ 900 */
700void getrawmonotonic(struct timespec *ts) 901void getrawmonotonic(struct timespec *ts)
701{ 902{
702 struct timekeeper *tk = &timekeeper; 903 struct timekeeper *tk = &tk_core.timekeeper;
904 struct timespec64 ts64;
703 unsigned long seq; 905 unsigned long seq;
704 s64 nsecs; 906 s64 nsecs;
705 907
706 do { 908 do {
707 seq = read_seqcount_begin(&timekeeper_seq); 909 seq = read_seqcount_begin(&tk_core.seq);
708 nsecs = timekeeping_get_ns_raw(tk); 910 nsecs = timekeeping_get_ns_raw(tk);
709 *ts = tk->raw_time; 911 ts64 = tk->raw_time;
710 912
711 } while (read_seqcount_retry(&timekeeper_seq, seq)); 913 } while (read_seqcount_retry(&tk_core.seq, seq));
712 914
713 timespec_add_ns(ts, nsecs); 915 timespec64_add_ns(&ts64, nsecs);
916 *ts = timespec64_to_timespec(ts64);
714} 917}
715EXPORT_SYMBOL(getrawmonotonic); 918EXPORT_SYMBOL(getrawmonotonic);
716 919
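The readers in this file (getrawmonotonic() above, timekeeping_valid_for_hres() and timekeeping_max_deferment() below) all follow the same lockless pattern against tk_core.seq: sample the sequence count, copy the fields, and retry if a writer ran in between. A minimal userspace sketch of that read/retry loop, using C11 atomics in place of the kernel seqcount API (all names here are illustrative, not kernel code):

/* Userspace sketch of the seqcount read/retry pattern used by the readers above. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint seq;                 /* even: stable, odd: write in progress */
static int64_t raw_sec, raw_nsec;       /* the data being published */

static void writer(int64_t sec, int64_t nsec)
{
        atomic_fetch_add_explicit(&seq, 1, memory_order_acquire);  /* seq goes odd */
        raw_sec = sec;
        raw_nsec = nsec;
        atomic_fetch_add_explicit(&seq, 1, memory_order_release);  /* even again */
}

static void reader(int64_t *sec, int64_t *nsec)
{
        unsigned int start;

        do {
                /* read_seqcount_begin(): wait until no write is in flight */
                do {
                        start = atomic_load_explicit(&seq, memory_order_acquire);
                } while (start & 1);

                *sec = raw_sec;
                *nsec = raw_nsec;

                /* read_seqcount_retry(): loop again if a writer ran meanwhile */
        } while (atomic_load_explicit(&seq, memory_order_acquire) != start);
}

int main(void)
{
        int64_t s, ns;

        writer(12, 500000000);
        reader(&s, &ns);
        printf("%lld.%09lld\n", (long long)s, (long long)ns);
        return 0;
}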
@@ -719,16 +922,16 @@ EXPORT_SYMBOL(getrawmonotonic);
719 */ 922 */
720int timekeeping_valid_for_hres(void) 923int timekeeping_valid_for_hres(void)
721{ 924{
722 struct timekeeper *tk = &timekeeper; 925 struct timekeeper *tk = &tk_core.timekeeper;
723 unsigned long seq; 926 unsigned long seq;
724 int ret; 927 int ret;
725 928
726 do { 929 do {
727 seq = read_seqcount_begin(&timekeeper_seq); 930 seq = read_seqcount_begin(&tk_core.seq);
728 931
729 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 932 ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
730 933
731 } while (read_seqcount_retry(&timekeeper_seq, seq)); 934 } while (read_seqcount_retry(&tk_core.seq, seq));
732 935
733 return ret; 936 return ret;
734} 937}
@@ -738,16 +941,16 @@ int timekeeping_valid_for_hres(void)
738 */ 941 */
739u64 timekeeping_max_deferment(void) 942u64 timekeeping_max_deferment(void)
740{ 943{
741 struct timekeeper *tk = &timekeeper; 944 struct timekeeper *tk = &tk_core.timekeeper;
742 unsigned long seq; 945 unsigned long seq;
743 u64 ret; 946 u64 ret;
744 947
745 do { 948 do {
746 seq = read_seqcount_begin(&timekeeper_seq); 949 seq = read_seqcount_begin(&tk_core.seq);
747 950
748 ret = tk->clock->max_idle_ns; 951 ret = tk->tkr.clock->max_idle_ns;
749 952
750 } while (read_seqcount_retry(&timekeeper_seq, seq)); 953 } while (read_seqcount_retry(&tk_core.seq, seq));
751 954
752 return ret; 955 return ret;
753} 956}
@@ -787,14 +990,15 @@ void __weak read_boot_clock(struct timespec *ts)
787 */ 990 */
788void __init timekeeping_init(void) 991void __init timekeeping_init(void)
789{ 992{
790 struct timekeeper *tk = &timekeeper; 993 struct timekeeper *tk = &tk_core.timekeeper;
791 struct clocksource *clock; 994 struct clocksource *clock;
792 unsigned long flags; 995 unsigned long flags;
793 struct timespec now, boot, tmp; 996 struct timespec64 now, boot, tmp;
794 997 struct timespec ts;
795 read_persistent_clock(&now);
796 998
797 if (!timespec_valid_strict(&now)) { 999 read_persistent_clock(&ts);
1000 now = timespec_to_timespec64(ts);
1001 if (!timespec64_valid_strict(&now)) {
798 pr_warn("WARNING: Persistent clock returned invalid value!\n" 1002 pr_warn("WARNING: Persistent clock returned invalid value!\n"
799 " Check your CMOS/BIOS settings.\n"); 1003 " Check your CMOS/BIOS settings.\n");
800 now.tv_sec = 0; 1004 now.tv_sec = 0;
@@ -802,8 +1006,9 @@ void __init timekeeping_init(void)
802 } else if (now.tv_sec || now.tv_nsec) 1006 } else if (now.tv_sec || now.tv_nsec)
803 persistent_clock_exist = true; 1007 persistent_clock_exist = true;
804 1008
805 read_boot_clock(&boot); 1009 read_boot_clock(&ts);
806 if (!timespec_valid_strict(&boot)) { 1010 boot = timespec_to_timespec64(ts);
1011 if (!timespec64_valid_strict(&boot)) {
807 pr_warn("WARNING: Boot clock returned invalid value!\n" 1012 pr_warn("WARNING: Boot clock returned invalid value!\n"
808 " Check your CMOS/BIOS settings.\n"); 1013 " Check your CMOS/BIOS settings.\n");
809 boot.tv_sec = 0; 1014 boot.tv_sec = 0;
@@ -811,7 +1016,7 @@ void __init timekeeping_init(void)
811 } 1016 }
812 1017
813 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1018 raw_spin_lock_irqsave(&timekeeper_lock, flags);
814 write_seqcount_begin(&timekeeper_seq); 1019 write_seqcount_begin(&tk_core.seq);
815 ntp_init(); 1020 ntp_init();
816 1021
817 clock = clocksource_default_clock(); 1022 clock = clocksource_default_clock();
@@ -822,24 +1027,21 @@ void __init timekeeping_init(void)
822 tk_set_xtime(tk, &now); 1027 tk_set_xtime(tk, &now);
823 tk->raw_time.tv_sec = 0; 1028 tk->raw_time.tv_sec = 0;
824 tk->raw_time.tv_nsec = 0; 1029 tk->raw_time.tv_nsec = 0;
1030 tk->base_raw.tv64 = 0;
825 if (boot.tv_sec == 0 && boot.tv_nsec == 0) 1031 if (boot.tv_sec == 0 && boot.tv_nsec == 0)
826 boot = tk_xtime(tk); 1032 boot = tk_xtime(tk);
827 1033
828 set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); 1034 set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
829 tk_set_wall_to_mono(tk, tmp); 1035 tk_set_wall_to_mono(tk, tmp);
830 1036
831 tmp.tv_sec = 0; 1037 timekeeping_update(tk, TK_MIRROR);
832 tmp.tv_nsec = 0;
833 tk_set_sleep_time(tk, tmp);
834
835 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
836 1038
837 write_seqcount_end(&timekeeper_seq); 1039 write_seqcount_end(&tk_core.seq);
838 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1040 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
839} 1041}
840 1042
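The set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec) call above seeds wall_to_monotonic with the negated boot stamp, so wall time plus wall_to_monotonic comes out as zero at the moment of boot (in the common case where read_boot_clock() reports nothing and boot is taken from tk_xtime()). A standalone sketch of that normalization, with made-up numbers:

/* Sketch: how wall_to_monotonic is seeded from the negated boot stamp. */
#include <stdio.h>

#define NSEC_PER_SEC 1000000000LL

struct ts64 { long long tv_sec; long long tv_nsec; };

static void set_normalized(struct ts64 *ts, long long sec, long long nsec)
{
        /* same normalization as set_normalized_timespec64() */
        while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; sec++; }
        while (nsec < 0)             { nsec += NSEC_PER_SEC; sec--; }
        ts->tv_sec = sec;
        ts->tv_nsec = nsec;
}

int main(void)
{
        struct ts64 boot = { 100, 250000000 };  /* illustrative wall time at init */
        struct ts64 wtm, sum;

        set_normalized(&wtm, -boot.tv_sec, -boot.tv_nsec);
        set_normalized(&sum, boot.tv_sec + wtm.tv_sec,
                             boot.tv_nsec + wtm.tv_nsec);

        /* wall time plus wall_to_monotonic is zero at the boot stamp */
        printf("wtm = {%lld, %lld}, wall + wtm = {%lld, %lld}\n",
               wtm.tv_sec, wtm.tv_nsec, sum.tv_sec, sum.tv_nsec);
        return 0;
}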
841/* time in seconds when suspend began */ 1043/* time in seconds when suspend began */
842static struct timespec timekeeping_suspend_time; 1044static struct timespec64 timekeeping_suspend_time;
843 1045
844/** 1046/**
845 * __timekeeping_inject_sleeptime - Internal function to add sleep interval 1047 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
@@ -849,17 +1051,17 @@ static struct timespec timekeeping_suspend_time;
849 * adds the sleep offset to the timekeeping variables. 1051 * adds the sleep offset to the timekeeping variables.
850 */ 1052 */
851static void __timekeeping_inject_sleeptime(struct timekeeper *tk, 1053static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
852 struct timespec *delta) 1054 struct timespec64 *delta)
853{ 1055{
854 if (!timespec_valid_strict(delta)) { 1056 if (!timespec64_valid_strict(delta)) {
855 printk_deferred(KERN_WARNING 1057 printk_deferred(KERN_WARNING
856 "__timekeeping_inject_sleeptime: Invalid " 1058 "__timekeeping_inject_sleeptime: Invalid "
857 "sleep delta value!\n"); 1059 "sleep delta value!\n");
858 return; 1060 return;
859 } 1061 }
860 tk_xtime_add(tk, delta); 1062 tk_xtime_add(tk, delta);
861 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); 1063 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
862 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); 1064 tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
863 tk_debug_account_sleep_time(delta); 1065 tk_debug_account_sleep_time(delta);
864} 1066}
865 1067
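__timekeeping_inject_sleeptime() moves the wall clock forward by the suspend interval, moves wall_to_monotonic back by the same amount so CLOCK_MONOTONIC does not jump, and folds the interval into the boot offset so CLOCK_BOOTTIME does advance. A small numeric sketch of that bookkeeping; the values are illustrative, and the helpers named in the comments are the ones called above:

/* Sketch of the sleep-time bookkeeping, with illustrative numbers. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
        long long xtime = 1000;          /* wall clock, seconds (illustrative) */
        long long wall_to_mono = -900;   /* monotonic = xtime + wall_to_mono   */
        long long boot_offset = 0;       /* accumulated sleep time             */
        long long delta = 30;            /* time spent suspended               */

        long long mono_before = xtime + wall_to_mono;

        /* what __timekeeping_inject_sleeptime() does, in spirit: */
        xtime        += delta;           /* tk_xtime_add()                     */
        wall_to_mono -= delta;           /* tk_set_wall_to_mono(wtm - delta)   */
        boot_offset  += delta;           /* tk_update_sleep_time()             */

        /* CLOCK_MONOTONIC is unchanged, CLOCK_BOOTTIME moved on by delta */
        assert(xtime + wall_to_mono == mono_before);
        printf("wall %+llds, monotonic %+llds, boottime %+llds\n",
               delta, xtime + wall_to_mono - mono_before, boot_offset);
        return 0;
}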
@@ -875,7 +1077,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
875 */ 1077 */
876void timekeeping_inject_sleeptime(struct timespec *delta) 1078void timekeeping_inject_sleeptime(struct timespec *delta)
877{ 1079{
878 struct timekeeper *tk = &timekeeper; 1080 struct timekeeper *tk = &tk_core.timekeeper;
1081 struct timespec64 tmp;
879 unsigned long flags; 1082 unsigned long flags;
880 1083
881 /* 1084 /*
@@ -886,15 +1089,16 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
886 return; 1089 return;
887 1090
888 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1091 raw_spin_lock_irqsave(&timekeeper_lock, flags);
889 write_seqcount_begin(&timekeeper_seq); 1092 write_seqcount_begin(&tk_core.seq);
890 1093
891 timekeeping_forward_now(tk); 1094 timekeeping_forward_now(tk);
892 1095
893 __timekeeping_inject_sleeptime(tk, delta); 1096 tmp = timespec_to_timespec64(*delta);
1097 __timekeeping_inject_sleeptime(tk, &tmp);
894 1098
895 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 1099 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
896 1100
897 write_seqcount_end(&timekeeper_seq); 1101 write_seqcount_end(&tk_core.seq);
898 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1102 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
899 1103
900 /* signal hrtimers about time change */ 1104 /* signal hrtimers about time change */
@@ -910,20 +1114,22 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
910 */ 1114 */
911static void timekeeping_resume(void) 1115static void timekeeping_resume(void)
912{ 1116{
913 struct timekeeper *tk = &timekeeper; 1117 struct timekeeper *tk = &tk_core.timekeeper;
914 struct clocksource *clock = tk->clock; 1118 struct clocksource *clock = tk->tkr.clock;
915 unsigned long flags; 1119 unsigned long flags;
916 struct timespec ts_new, ts_delta; 1120 struct timespec64 ts_new, ts_delta;
1121 struct timespec tmp;
917 cycle_t cycle_now, cycle_delta; 1122 cycle_t cycle_now, cycle_delta;
918 bool suspendtime_found = false; 1123 bool suspendtime_found = false;
919 1124
920 read_persistent_clock(&ts_new); 1125 read_persistent_clock(&tmp);
1126 ts_new = timespec_to_timespec64(tmp);
921 1127
922 clockevents_resume(); 1128 clockevents_resume();
923 clocksource_resume(); 1129 clocksource_resume();
924 1130
925 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1131 raw_spin_lock_irqsave(&timekeeper_lock, flags);
926 write_seqcount_begin(&timekeeper_seq); 1132 write_seqcount_begin(&tk_core.seq);
927 1133
928 /* 1134 /*
929 * After system resumes, we need to calculate the suspended time and 1135 * After system resumes, we need to calculate the suspended time and
@@ -937,15 +1143,16 @@ static void timekeeping_resume(void)
937 * The less preferred source will only be tried if there is no better 1143 * The less preferred source will only be tried if there is no better
938 * usable source. The rtc part is handled separately in rtc core code. 1144 * usable source. The rtc part is handled separately in rtc core code.
939 */ 1145 */
940 cycle_now = clock->read(clock); 1146 cycle_now = tk->tkr.read(clock);
941 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && 1147 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
942 cycle_now > clock->cycle_last) { 1148 cycle_now > tk->tkr.cycle_last) {
943 u64 num, max = ULLONG_MAX; 1149 u64 num, max = ULLONG_MAX;
944 u32 mult = clock->mult; 1150 u32 mult = clock->mult;
945 u32 shift = clock->shift; 1151 u32 shift = clock->shift;
946 s64 nsec = 0; 1152 s64 nsec = 0;
947 1153
948 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 1154 cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last,
1155 tk->tkr.mask);
949 1156
950 /* 1157 /*
951 * "cycle_delta * mutl" may cause 64 bits overflow, if the 1158 * "cycle_delta * mutl" may cause 64 bits overflow, if the
@@ -960,10 +1167,10 @@ static void timekeeping_resume(void)
960 } 1167 }
961 nsec += ((u64) cycle_delta * mult) >> shift; 1168 nsec += ((u64) cycle_delta * mult) >> shift;
962 1169
963 ts_delta = ns_to_timespec(nsec); 1170 ts_delta = ns_to_timespec64(nsec);
964 suspendtime_found = true; 1171 suspendtime_found = true;
965 } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { 1172 } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
966 ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); 1173 ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
967 suspendtime_found = true; 1174 suspendtime_found = true;
968 } 1175 }
969 1176
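The chunked conversion above exists because a suspend interval measured on a nonstop clocksource can be large enough that cycle_delta * mult overflows 64 bits, so the delta is first consumed in max-sized pieces whose product with mult is known to fit. A standalone userspace sketch of one way to do that split (the kernel's exact division helpers sit outside the hunks shown here), with illustrative mult/shift values:

/* Sketch: convert a large cycle delta to ns without overflowing 64 bits. */
#include <stdint.h>
#include <stdio.h>

static uint64_t cycles_to_ns_safe(uint64_t cycle_delta, uint32_t mult, uint32_t shift)
{
        uint64_t max = UINT64_MAX;
        uint64_t num;
        uint64_t nsec = 0;

        /* largest chunk whose product with mult still fits in 64 bits */
        max /= mult;                    /* do_div(max, mult) in the kernel */
        if (cycle_delta > max) {
                num = cycle_delta / max;
                nsec = (((uint64_t)max * mult) >> shift) * num;
                cycle_delta -= num * max;
        }
        nsec += ((uint64_t)cycle_delta * mult) >> shift;
        return nsec;
}

int main(void)
{
        /* illustrative clocksource parameters, not taken from real hardware */
        uint32_t mult = 4194304, shift = 22;    /* exactly 1 ns per cycle */
        uint64_t delta = 3ULL << 62;            /* huge suspend interval */

        printf("%llu ns\n", (unsigned long long)cycles_to_ns_safe(delta, mult, shift));
        return 0;
}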
@@ -971,11 +1178,11 @@ static void timekeeping_resume(void)
971 __timekeeping_inject_sleeptime(tk, &ts_delta); 1178 __timekeeping_inject_sleeptime(tk, &ts_delta);
972 1179
973 /* Re-base the last cycle value */ 1180 /* Re-base the last cycle value */
974 tk->cycle_last = clock->cycle_last = cycle_now; 1181 tk->tkr.cycle_last = cycle_now;
975 tk->ntp_error = 0; 1182 tk->ntp_error = 0;
976 timekeeping_suspended = 0; 1183 timekeeping_suspended = 0;
977 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); 1184 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
978 write_seqcount_end(&timekeeper_seq); 1185 write_seqcount_end(&tk_core.seq);
979 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1186 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
980 1187
981 touch_softlockup_watchdog(); 1188 touch_softlockup_watchdog();
@@ -988,12 +1195,14 @@ static void timekeeping_resume(void)
988 1195
989static int timekeeping_suspend(void) 1196static int timekeeping_suspend(void)
990{ 1197{
991 struct timekeeper *tk = &timekeeper; 1198 struct timekeeper *tk = &tk_core.timekeeper;
992 unsigned long flags; 1199 unsigned long flags;
993 struct timespec delta, delta_delta; 1200 struct timespec64 delta, delta_delta;
994 static struct timespec old_delta; 1201 static struct timespec64 old_delta;
1202 struct timespec tmp;
995 1203
996 read_persistent_clock(&timekeeping_suspend_time); 1204 read_persistent_clock(&tmp);
1205 timekeeping_suspend_time = timespec_to_timespec64(tmp);
997 1206
998 /* 1207 /*
999 * On some systems the persistent_clock can not be detected at 1208 * On some systems the persistent_clock can not be detected at
@@ -1004,7 +1213,7 @@ static int timekeeping_suspend(void)
1004 persistent_clock_exist = true; 1213 persistent_clock_exist = true;
1005 1214
1006 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1215 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1007 write_seqcount_begin(&timekeeper_seq); 1216 write_seqcount_begin(&tk_core.seq);
1008 timekeeping_forward_now(tk); 1217 timekeeping_forward_now(tk);
1009 timekeeping_suspended = 1; 1218 timekeeping_suspended = 1;
1010 1219
@@ -1014,8 +1223,8 @@ static int timekeeping_suspend(void)
1014 * try to compensate so the difference in system time 1223 * try to compensate so the difference in system time
1015 * and persistent_clock time stays close to constant. 1224 * and persistent_clock time stays close to constant.
1016 */ 1225 */
1017 delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); 1226 delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
1018 delta_delta = timespec_sub(delta, old_delta); 1227 delta_delta = timespec64_sub(delta, old_delta);
1019 if (abs(delta_delta.tv_sec) >= 2) { 1228 if (abs(delta_delta.tv_sec) >= 2) {
1020 /* 1229 /*
1021 * if delta_delta is too large, assume time correction 1230 * if delta_delta is too large, assume time correction
@@ -1025,11 +1234,11 @@ static int timekeeping_suspend(void)
1025 } else { 1234 } else {
1026 /* Otherwise try to adjust old_system to compensate */ 1235 /* Otherwise try to adjust old_system to compensate */
1027 timekeeping_suspend_time = 1236 timekeeping_suspend_time =
1028 timespec_add(timekeeping_suspend_time, delta_delta); 1237 timespec64_add(timekeeping_suspend_time, delta_delta);
1029 } 1238 }
1030 1239
1031 timekeeping_update(tk, TK_MIRROR); 1240 timekeeping_update(tk, TK_MIRROR);
1032 write_seqcount_end(&timekeeper_seq); 1241 write_seqcount_end(&tk_core.seq);
1033 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1242 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1034 1243
1035 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 1244 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
@@ -1050,125 +1259,34 @@ static int __init timekeeping_init_ops(void)
1050 register_syscore_ops(&timekeeping_syscore_ops); 1259 register_syscore_ops(&timekeeping_syscore_ops);
1051 return 0; 1260 return 0;
1052} 1261}
1053
1054device_initcall(timekeeping_init_ops); 1262device_initcall(timekeeping_init_ops);
1055 1263
1056/* 1264/*
1057 * If the error is already larger, we look ahead even further 1265 * Apply a multiplier adjustment to the timekeeper
1058 * to compensate for late or lost adjustments.
1059 */ 1266 */
1060static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, 1267static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
1061 s64 error, s64 *interval, 1268 s64 offset,
1062 s64 *offset) 1269 bool negative,
1270 int adj_scale)
1063{ 1271{
1064 s64 tick_error, i; 1272 s64 interval = tk->cycle_interval;
1065 u32 look_ahead, adj; 1273 s32 mult_adj = 1;
1066 s32 error2, mult;
1067
1068 /*
1069 * Use the current error value to determine how much to look ahead.
1070 * The larger the error the slower we adjust for it to avoid problems
1071 * with losing too many ticks, otherwise we would overadjust and
1072 * produce an even larger error. The smaller the adjustment the
1073 * faster we try to adjust for it, as lost ticks can do less harm
1074 * here. This is tuned so that an error of about 1 msec is adjusted
1075 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
1076 */
1077 error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
1078 error2 = abs(error2);
1079 for (look_ahead = 0; error2 > 0; look_ahead++)
1080 error2 >>= 2;
1081 1274
1082 /* 1275 if (negative) {
1083 * Now calculate the error in (1 << look_ahead) ticks, but first 1276 mult_adj = -mult_adj;
1084 * remove the single look ahead already included in the error. 1277 interval = -interval;
1085 */ 1278 offset = -offset;
1086 tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1);
1087 tick_error -= tk->xtime_interval >> 1;
1088 error = ((error - tick_error) >> look_ahead) + tick_error;
1089
1090 /* Finally calculate the adjustment shift value. */
1091 i = *interval;
1092 mult = 1;
1093 if (error < 0) {
1094 error = -error;
1095 *interval = -*interval;
1096 *offset = -*offset;
1097 mult = -1;
1098 } 1279 }
1099 for (adj = 0; error > i; adj++) 1280 mult_adj <<= adj_scale;
1100 error >>= 1; 1281 interval <<= adj_scale;
1101 1282 offset <<= adj_scale;
1102 *interval <<= adj;
1103 *offset <<= adj;
1104 return mult << adj;
1105}
1106
1107/*
1108 * Adjust the multiplier to reduce the error value,
1109 * this is optimized for the most common adjustments of -1,0,1,
1110 * for other values we can do a bit more work.
1111 */
1112static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1113{
1114 s64 error, interval = tk->cycle_interval;
1115 int adj;
1116 1283
1117 /* 1284 /*
1118 * The point of this is to check if the error is greater than half
1119 * an interval.
1120 *
1121 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
1122 *
1123 * Note we subtract one in the shift, so that error is really error*2.
1124 * This "saves" dividing(shifting) interval twice, but keeps the
1125 * (error > interval) comparison as still measuring if error is
1126 * larger than half an interval.
1127 *
1128 * Note: It does not "save" on aggravation when reading the code.
1129 */
1130 error = tk->ntp_error >> (tk->ntp_error_shift - 1);
1131 if (error > interval) {
1132 /*
1133 * We now divide error by 4(via shift), which checks if
1134 * the error is greater than twice the interval.
1135 * If it is greater, we need a bigadjust, if its smaller,
1136 * we can adjust by 1.
1137 */
1138 error >>= 2;
1139 if (likely(error <= interval))
1140 adj = 1;
1141 else
1142 adj = timekeeping_bigadjust(tk, error, &interval, &offset);
1143 } else {
1144 if (error < -interval) {
1145 /* See comment above, this is just switched for the negative */
1146 error >>= 2;
1147 if (likely(error >= -interval)) {
1148 adj = -1;
1149 interval = -interval;
1150 offset = -offset;
1151 } else {
1152 adj = timekeeping_bigadjust(tk, error, &interval, &offset);
1153 }
1154 } else {
1155 goto out_adjust;
1156 }
1157 }
1158
1159 if (unlikely(tk->clock->maxadj &&
1160 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
1161 printk_deferred_once(KERN_WARNING
1162 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1163 tk->clock->name, (long)tk->mult + adj,
1164 (long)tk->clock->mult + tk->clock->maxadj);
1165 }
1166 /*
1167 * So the following can be confusing. 1285 * So the following can be confusing.
1168 * 1286 *
1169 * To keep things simple, lets assume adj == 1 for now. 1287 * To keep things simple, lets assume mult_adj == 1 for now.
1170 * 1288 *
1171 * When adj != 1, remember that the interval and offset values 1289 * When mult_adj != 1, remember that the interval and offset values
1172 * have been appropriately scaled so the math is the same. 1290 * have been appropriately scaled so the math is the same.
1173 * 1291 *
1174 * The basic idea here is that we're increasing the multiplier 1292 * The basic idea here is that we're increasing the multiplier
@@ -1212,12 +1330,78 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1212 * 1330 *
1213 * XXX - TODO: Doc ntp_error calculation. 1331 * XXX - TODO: Doc ntp_error calculation.
1214 */ 1332 */
1215 tk->mult += adj; 1333 tk->tkr.mult += mult_adj;
1216 tk->xtime_interval += interval; 1334 tk->xtime_interval += interval;
1217 tk->xtime_nsec -= offset; 1335 tk->tkr.xtime_nsec -= offset;
1218 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; 1336 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
1337}
1338
1339/*
1340 * Calculate the multiplier adjustment needed to match the frequency
1341 * specified by NTP
1342 */
1343static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
1344 s64 offset)
1345{
1346 s64 interval = tk->cycle_interval;
1347 s64 xinterval = tk->xtime_interval;
1348 s64 tick_error;
1349 bool negative;
1350 u32 adj;
1351
1352 /* Remove any current error adj from freq calculation */
1353 if (tk->ntp_err_mult)
1354 xinterval -= tk->cycle_interval;
1355
1356 tk->ntp_tick = ntp_tick_length();
1357
1358 /* Calculate current error per tick */
1359 tick_error = ntp_tick_length() >> tk->ntp_error_shift;
1360 tick_error -= (xinterval + tk->xtime_remainder);
1361
1362 /* Don't worry about correcting it if it's small */
1363 if (likely((tick_error >= 0) && (tick_error <= interval)))
1364 return;
1365
1366 /* preserve the direction of correction */
1367 negative = (tick_error < 0);
1368
1369 /* Sort out the magnitude of the correction */
1370 tick_error = abs(tick_error);
1371 for (adj = 0; tick_error > interval; adj++)
1372 tick_error >>= 1;
1373
1374 /* scale the corrections */
1375 timekeeping_apply_adjustment(tk, offset, negative, adj);
1376}
1377
1378/*
1379 * Adjust the timekeeper's multiplier to the correct frequency
1380 * and also to reduce the accumulated error value.
1381 */
1382static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1383{
1384 /* Correct for the current frequency error */
1385 timekeeping_freqadjust(tk, offset);
1386
1387 /* Next make a small adjustment to fix any cumulative error */
1388 if (!tk->ntp_err_mult && (tk->ntp_error > 0)) {
1389 tk->ntp_err_mult = 1;
1390 timekeeping_apply_adjustment(tk, offset, 0, 0);
1391 } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) {
1392 /* Undo any existing error adjustment */
1393 timekeeping_apply_adjustment(tk, offset, 1, 0);
1394 tk->ntp_err_mult = 0;
1395 }
1396
1397 if (unlikely(tk->tkr.clock->maxadj &&
1398 (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) {
1399 printk_once(KERN_WARNING
1400 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1401 tk->tkr.clock->name, (long)tk->tkr.mult,
1402 (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj);
1403 }
1219 1404
1220out_adjust:
1221 /* 1405 /*
1222 * It may be possible that when we entered this function, xtime_nsec 1406 * It may be possible that when we entered this function, xtime_nsec
1223 * was very small. Further, if we're slightly speeding the clocksource 1407 * was very small. Further, if we're slightly speeding the clocksource
@@ -1232,12 +1416,11 @@ out_adjust:
1232 * We'll correct this error next time through this function, when 1416 * We'll correct this error next time through this function, when
1233 * xtime_nsec is not as small. 1417 * xtime_nsec is not as small.
1234 */ 1418 */
1235 if (unlikely((s64)tk->xtime_nsec < 0)) { 1419 if (unlikely((s64)tk->tkr.xtime_nsec < 0)) {
1236 s64 neg = -(s64)tk->xtime_nsec; 1420 s64 neg = -(s64)tk->tkr.xtime_nsec;
1237 tk->xtime_nsec = 0; 1421 tk->tkr.xtime_nsec = 0;
1238 tk->ntp_error += neg << tk->ntp_error_shift; 1422 tk->ntp_error += neg << tk->ntp_error_shift;
1239 } 1423 }
1240
1241} 1424}
1242 1425
1243/** 1426/**
@@ -1250,26 +1433,26 @@ out_adjust:
1250 */ 1433 */
1251static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) 1434static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1252{ 1435{
1253 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; 1436 u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift;
1254 unsigned int clock_set = 0; 1437 unsigned int clock_set = 0;
1255 1438
1256 while (tk->xtime_nsec >= nsecps) { 1439 while (tk->tkr.xtime_nsec >= nsecps) {
1257 int leap; 1440 int leap;
1258 1441
1259 tk->xtime_nsec -= nsecps; 1442 tk->tkr.xtime_nsec -= nsecps;
1260 tk->xtime_sec++; 1443 tk->xtime_sec++;
1261 1444
1262 /* Figure out if it's a leap sec and apply if needed */ 1445
1263 leap = second_overflow(tk->xtime_sec); 1446 leap = second_overflow(tk->xtime_sec);
1264 if (unlikely(leap)) { 1447 if (unlikely(leap)) {
1265 struct timespec ts; 1448 struct timespec64 ts;
1266 1449
1267 tk->xtime_sec += leap; 1450 tk->xtime_sec += leap;
1268 1451
1269 ts.tv_sec = leap; 1452 ts.tv_sec = leap;
1270 ts.tv_nsec = 0; 1453 ts.tv_nsec = 0;
1271 tk_set_wall_to_mono(tk, 1454 tk_set_wall_to_mono(tk,
1272 timespec_sub(tk->wall_to_monotonic, ts)); 1455 timespec64_sub(tk->wall_to_monotonic, ts));
1273 1456
1274 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); 1457 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1275 1458
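accumulate_nsecs_to_secs() operates on nanoseconds that are still left-shifted by the clocksource shift (tkr.xtime_nsec holds "shifted ns"), so one full second is NSEC_PER_SEC << shift. A compact userspace sketch of that carry loop, leaving out the leap-second handling:

/* Sketch: carrying shifted nanoseconds into whole seconds. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
        uint32_t shift = 8;                      /* illustrative clocksource shift */
        uint64_t nsecps = NSEC_PER_SEC << shift; /* one second, in shifted ns */

        uint64_t xtime_sec  = 100;
        uint64_t xtime_nsec = 5 * nsecps / 2;    /* 2.5 s worth of shifted ns */

        while (xtime_nsec >= nsecps) {
                xtime_nsec -= nsecps;
                xtime_sec++;
                /* the real code also checks second_overflow() for leap seconds here */
        }

        printf("sec=%llu, remaining ns=%llu\n",
               (unsigned long long)xtime_sec,
               (unsigned long long)(xtime_nsec >> shift));
        return 0;
}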
@@ -1301,9 +1484,9 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1301 1484
1302 /* Accumulate one shifted interval */ 1485 /* Accumulate one shifted interval */
1303 offset -= interval; 1486 offset -= interval;
1304 tk->cycle_last += interval; 1487 tk->tkr.cycle_last += interval;
1305 1488
1306 tk->xtime_nsec += tk->xtime_interval << shift; 1489 tk->tkr.xtime_nsec += tk->xtime_interval << shift;
1307 *clock_set |= accumulate_nsecs_to_secs(tk); 1490 *clock_set |= accumulate_nsecs_to_secs(tk);
1308 1491
1309 /* Accumulate raw time */ 1492 /* Accumulate raw time */
@@ -1317,48 +1500,20 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1317 tk->raw_time.tv_nsec = raw_nsecs; 1500 tk->raw_time.tv_nsec = raw_nsecs;
1318 1501
1319 /* Accumulate error between NTP and clock interval */ 1502 /* Accumulate error between NTP and clock interval */
1320 tk->ntp_error += ntp_tick_length() << shift; 1503 tk->ntp_error += tk->ntp_tick << shift;
1321 tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << 1504 tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
1322 (tk->ntp_error_shift + shift); 1505 (tk->ntp_error_shift + shift);
1323 1506
1324 return offset; 1507 return offset;
1325} 1508}
1326 1509
1327#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
1328static inline void old_vsyscall_fixup(struct timekeeper *tk)
1329{
1330 s64 remainder;
1331
1332 /*
1333 * Store only full nanoseconds into xtime_nsec after rounding
1334 * it up and add the remainder to the error difference.
1335 * XXX - This is necessary to avoid small 1ns inconsistnecies caused
1336 * by truncating the remainder in vsyscalls. However, it causes
1337 * additional work to be done in timekeeping_adjust(). Once
1338 * the vsyscall implementations are converted to use xtime_nsec
1339 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
1340 * users are removed, this can be killed.
1341 */
1342 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1343 tk->xtime_nsec -= remainder;
1344 tk->xtime_nsec += 1ULL << tk->shift;
1345 tk->ntp_error += remainder << tk->ntp_error_shift;
1346 tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
1347}
1348#else
1349#define old_vsyscall_fixup(tk)
1350#endif
1351
1352
1353
1354/** 1510/**
1355 * update_wall_time - Uses the current clocksource to increment the wall time 1511 * update_wall_time - Uses the current clocksource to increment the wall time
1356 * 1512 *
1357 */ 1513 */
1358void update_wall_time(void) 1514void update_wall_time(void)
1359{ 1515{
1360 struct clocksource *clock; 1516 struct timekeeper *real_tk = &tk_core.timekeeper;
1361 struct timekeeper *real_tk = &timekeeper;
1362 struct timekeeper *tk = &shadow_timekeeper; 1517 struct timekeeper *tk = &shadow_timekeeper;
1363 cycle_t offset; 1518 cycle_t offset;
1364 int shift = 0, maxshift; 1519 int shift = 0, maxshift;
@@ -1371,12 +1526,11 @@ void update_wall_time(void)
1371 if (unlikely(timekeeping_suspended)) 1526 if (unlikely(timekeeping_suspended))
1372 goto out; 1527 goto out;
1373 1528
1374 clock = real_tk->clock;
1375
1376#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 1529#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1377 offset = real_tk->cycle_interval; 1530 offset = real_tk->cycle_interval;
1378#else 1531#else
1379 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1532 offset = clocksource_delta(tk->tkr.read(tk->tkr.clock),
1533 tk->tkr.cycle_last, tk->tkr.mask);
1380#endif 1534#endif
1381 1535
1382 /* Check if there's really nothing to do */ 1536 /* Check if there's really nothing to do */
@@ -1418,9 +1572,7 @@ void update_wall_time(void)
1418 */ 1572 */
1419 clock_set |= accumulate_nsecs_to_secs(tk); 1573 clock_set |= accumulate_nsecs_to_secs(tk);
1420 1574
1421 write_seqcount_begin(&timekeeper_seq); 1575 write_seqcount_begin(&tk_core.seq);
1422 /* Update clock->cycle_last with the new value */
1423 clock->cycle_last = tk->cycle_last;
1424 /* 1576 /*
1425 * Update the real timekeeper. 1577 * Update the real timekeeper.
1426 * 1578 *
@@ -1428,12 +1580,12 @@ void update_wall_time(void)
1428 * requires changes to all other timekeeper usage sites as 1580 * requires changes to all other timekeeper usage sites as
1429 * well, i.e. move the timekeeper pointer getter into the 1581 * well, i.e. move the timekeeper pointer getter into the
1430 * spinlocked/seqcount protected sections. And we trade this 1582 * spinlocked/seqcount protected sections. And we trade this
1431 * memcpy under the timekeeper_seq against one before we start 1583 * memcpy under the tk_core.seq against one before we start
1432 * updating. 1584 * updating.
1433 */ 1585 */
1434 memcpy(real_tk, tk, sizeof(*tk)); 1586 memcpy(real_tk, tk, sizeof(*tk));
1435 timekeeping_update(real_tk, clock_set); 1587 timekeeping_update(real_tk, clock_set);
1436 write_seqcount_end(&timekeeper_seq); 1588 write_seqcount_end(&tk_core.seq);
1437out: 1589out:
1438 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1590 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1439 if (clock_set) 1591 if (clock_set)
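update_wall_time() does the whole accumulation on shadow_timekeeper with only timekeeper_lock held, and the tk_core.seq write-side section is reduced to the final memcpy() into the live timekeeper, keeping the window in which readers must retry short. A userspace model of that "work on a shadow copy, publish under the sequence count" pattern (all names illustrative):

/* Sketch: accumulate on a shadow copy, publish under the sequence count. */
#include <stdatomic.h>
#include <string.h>
#include <stdio.h>

struct tk { long long xtime_sec; long long xtime_nsec; };

static struct tk live_tk;       /* what readers see (the "real" timekeeper) */
static struct tk shadow_tk;     /* private to the updater */
static atomic_uint seq;

static void update_wall_time_model(long long ns)
{
        /* 1. expensive accumulation on the shadow copy; readers are unaffected */
        shadow_tk.xtime_nsec += ns;
        while (shadow_tk.xtime_nsec >= 1000000000LL) {
                shadow_tk.xtime_nsec -= 1000000000LL;
                shadow_tk.xtime_sec++;
        }

        /* 2. publish: only the memcpy sits inside the write-side critical section */
        atomic_fetch_add_explicit(&seq, 1, memory_order_acquire);
        memcpy(&live_tk, &shadow_tk, sizeof(live_tk));
        atomic_fetch_add_explicit(&seq, 1, memory_order_release);
}

int main(void)
{
        update_wall_time_model(2500000000LL);
        printf("%lld.%09lld\n", live_tk.xtime_sec, live_tk.xtime_nsec);
        return 0;
}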
@@ -1454,83 +1606,16 @@ out:
1454 */ 1606 */
1455void getboottime(struct timespec *ts) 1607void getboottime(struct timespec *ts)
1456{ 1608{
1457 struct timekeeper *tk = &timekeeper; 1609 struct timekeeper *tk = &tk_core.timekeeper;
1458 struct timespec boottime = { 1610 ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
1459 .tv_sec = tk->wall_to_monotonic.tv_sec +
1460 tk->total_sleep_time.tv_sec,
1461 .tv_nsec = tk->wall_to_monotonic.tv_nsec +
1462 tk->total_sleep_time.tv_nsec
1463 };
1464
1465 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
1466}
1467EXPORT_SYMBOL_GPL(getboottime);
1468
1469/**
1470 * get_monotonic_boottime - Returns monotonic time since boot
1471 * @ts: pointer to the timespec to be set
1472 *
1473 * Returns the monotonic time since boot in a timespec.
1474 *
1475 * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also
1476 * includes the time spent in suspend.
1477 */
1478void get_monotonic_boottime(struct timespec *ts)
1479{
1480 struct timekeeper *tk = &timekeeper;
1481 struct timespec tomono, sleep;
1482 s64 nsec;
1483 unsigned int seq;
1484
1485 WARN_ON(timekeeping_suspended);
1486
1487 do {
1488 seq = read_seqcount_begin(&timekeeper_seq);
1489 ts->tv_sec = tk->xtime_sec;
1490 nsec = timekeeping_get_ns(tk);
1491 tomono = tk->wall_to_monotonic;
1492 sleep = tk->total_sleep_time;
1493
1494 } while (read_seqcount_retry(&timekeeper_seq, seq));
1495
1496 ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
1497 ts->tv_nsec = 0;
1498 timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec);
1499}
1500EXPORT_SYMBOL_GPL(get_monotonic_boottime);
1501
1502/**
1503 * ktime_get_boottime - Returns monotonic time since boot in a ktime
1504 *
1505 * Returns the monotonic time since boot in a ktime
1506 *
1507 * This is similar to CLOCK_MONTONIC/ktime_get, but also
1508 * includes the time spent in suspend.
1509 */
1510ktime_t ktime_get_boottime(void)
1511{
1512 struct timespec ts;
1513
1514 get_monotonic_boottime(&ts);
1515 return timespec_to_ktime(ts);
1516}
1517EXPORT_SYMBOL_GPL(ktime_get_boottime);
1518
1519/**
1520 * monotonic_to_bootbased - Convert the monotonic time to boot based.
1521 * @ts: pointer to the timespec to be converted
1522 */
1523void monotonic_to_bootbased(struct timespec *ts)
1524{
1525 struct timekeeper *tk = &timekeeper;
1526 1611
1527 *ts = timespec_add(*ts, tk->total_sleep_time); 1612 *ts = ktime_to_timespec(t);
1528} 1613}
1529EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 1614EXPORT_SYMBOL_GPL(getboottime);
1530 1615
1531unsigned long get_seconds(void) 1616unsigned long get_seconds(void)
1532{ 1617{
1533 struct timekeeper *tk = &timekeeper; 1618 struct timekeeper *tk = &tk_core.timekeeper;
1534 1619
1535 return tk->xtime_sec; 1620 return tk->xtime_sec;
1536} 1621}
@@ -1538,43 +1623,44 @@ EXPORT_SYMBOL(get_seconds);
1538 1623
1539struct timespec __current_kernel_time(void) 1624struct timespec __current_kernel_time(void)
1540{ 1625{
1541 struct timekeeper *tk = &timekeeper; 1626 struct timekeeper *tk = &tk_core.timekeeper;
1542 1627
1543 return tk_xtime(tk); 1628 return timespec64_to_timespec(tk_xtime(tk));
1544} 1629}
1545 1630
1546struct timespec current_kernel_time(void) 1631struct timespec current_kernel_time(void)
1547{ 1632{
1548 struct timekeeper *tk = &timekeeper; 1633 struct timekeeper *tk = &tk_core.timekeeper;
1549 struct timespec now; 1634 struct timespec64 now;
1550 unsigned long seq; 1635 unsigned long seq;
1551 1636
1552 do { 1637 do {
1553 seq = read_seqcount_begin(&timekeeper_seq); 1638 seq = read_seqcount_begin(&tk_core.seq);
1554 1639
1555 now = tk_xtime(tk); 1640 now = tk_xtime(tk);
1556 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1641 } while (read_seqcount_retry(&tk_core.seq, seq));
1557 1642
1558 return now; 1643 return timespec64_to_timespec(now);
1559} 1644}
1560EXPORT_SYMBOL(current_kernel_time); 1645EXPORT_SYMBOL(current_kernel_time);
1561 1646
1562struct timespec get_monotonic_coarse(void) 1647struct timespec get_monotonic_coarse(void)
1563{ 1648{
1564 struct timekeeper *tk = &timekeeper; 1649 struct timekeeper *tk = &tk_core.timekeeper;
1565 struct timespec now, mono; 1650 struct timespec64 now, mono;
1566 unsigned long seq; 1651 unsigned long seq;
1567 1652
1568 do { 1653 do {
1569 seq = read_seqcount_begin(&timekeeper_seq); 1654 seq = read_seqcount_begin(&tk_core.seq);
1570 1655
1571 now = tk_xtime(tk); 1656 now = tk_xtime(tk);
1572 mono = tk->wall_to_monotonic; 1657 mono = tk->wall_to_monotonic;
1573 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1658 } while (read_seqcount_retry(&tk_core.seq, seq));
1574 1659
1575 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1660 set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
1576 now.tv_nsec + mono.tv_nsec); 1661 now.tv_nsec + mono.tv_nsec);
1577 return now; 1662
1663 return timespec64_to_timespec(now);
1578} 1664}
1579 1665
1580/* 1666/*
@@ -1587,29 +1673,38 @@ void do_timer(unsigned long ticks)
1587} 1673}
1588 1674
1589/** 1675/**
1590 * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, 1676 * ktime_get_update_offsets_tick - hrtimer helper
1591 * and sleep offsets. 1677 * @offs_real: pointer to storage for monotonic -> realtime offset
1592 * @xtim: pointer to timespec to be set with xtime 1678 * @offs_boot: pointer to storage for monotonic -> boottime offset
1593 * @wtom: pointer to timespec to be set with wall_to_monotonic 1679 * @offs_tai: pointer to storage for monotonic -> clock tai offset
1594 * @sleep: pointer to timespec to be set with time in suspend 1680 *
1681 * Returns monotonic time at last tick and various offsets
1595 */ 1682 */
1596void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, 1683ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
1597 struct timespec *wtom, struct timespec *sleep) 1684 ktime_t *offs_tai)
1598{ 1685{
1599 struct timekeeper *tk = &timekeeper; 1686 struct timekeeper *tk = &tk_core.timekeeper;
1600 unsigned long seq; 1687 unsigned int seq;
1688 ktime_t base;
1689 u64 nsecs;
1601 1690
1602 do { 1691 do {
1603 seq = read_seqcount_begin(&timekeeper_seq); 1692 seq = read_seqcount_begin(&tk_core.seq);
1604 *xtim = tk_xtime(tk); 1693
1605 *wtom = tk->wall_to_monotonic; 1694 base = tk->tkr.base_mono;
1606 *sleep = tk->total_sleep_time; 1695 nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift;
1607 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1696
1697 *offs_real = tk->offs_real;
1698 *offs_boot = tk->offs_boot;
1699 *offs_tai = tk->offs_tai;
1700 } while (read_seqcount_retry(&tk_core.seq, seq));
1701
1702 return ktime_add_ns(base, nsecs);
1608} 1703}
1609 1704
1610#ifdef CONFIG_HIGH_RES_TIMERS 1705#ifdef CONFIG_HIGH_RES_TIMERS
1611/** 1706/**
1612 * ktime_get_update_offsets - hrtimer helper 1707 * ktime_get_update_offsets_now - hrtimer helper
1613 * @offs_real: pointer to storage for monotonic -> realtime offset 1708 * @offs_real: pointer to storage for monotonic -> realtime offset
1614 * @offs_boot: pointer to storage for monotonic -> boottime offset 1709 * @offs_boot: pointer to storage for monotonic -> boottime offset
1615 * @offs_tai: pointer to storage for monotonic -> clock tai offset 1710 * @offs_tai: pointer to storage for monotonic -> clock tai offset
@@ -1617,57 +1712,37 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1617 * Returns current monotonic time and updates the offsets 1712 * Returns current monotonic time and updates the offsets
1618 * Called from hrtimer_interrupt() or retrigger_next_event() 1713 * Called from hrtimer_interrupt() or retrigger_next_event()
1619 */ 1714 */
1620ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, 1715ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
1621 ktime_t *offs_tai) 1716 ktime_t *offs_tai)
1622{ 1717{
1623 struct timekeeper *tk = &timekeeper; 1718 struct timekeeper *tk = &tk_core.timekeeper;
1624 ktime_t now;
1625 unsigned int seq; 1719 unsigned int seq;
1626 u64 secs, nsecs; 1720 ktime_t base;
1721 u64 nsecs;
1627 1722
1628 do { 1723 do {
1629 seq = read_seqcount_begin(&timekeeper_seq); 1724 seq = read_seqcount_begin(&tk_core.seq);
1630 1725
1631 secs = tk->xtime_sec; 1726 base = tk->tkr.base_mono;
1632 nsecs = timekeeping_get_ns(tk); 1727 nsecs = timekeeping_get_ns(&tk->tkr);
1633 1728
1634 *offs_real = tk->offs_real; 1729 *offs_real = tk->offs_real;
1635 *offs_boot = tk->offs_boot; 1730 *offs_boot = tk->offs_boot;
1636 *offs_tai = tk->offs_tai; 1731 *offs_tai = tk->offs_tai;
1637 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1732 } while (read_seqcount_retry(&tk_core.seq, seq));
1638 1733
1639 now = ktime_add_ns(ktime_set(secs, 0), nsecs); 1734 return ktime_add_ns(base, nsecs);
1640 now = ktime_sub(now, *offs_real);
1641 return now;
1642} 1735}
1643#endif 1736#endif
1644 1737
1645/** 1738/**
1646 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1647 */
1648ktime_t ktime_get_monotonic_offset(void)
1649{
1650 struct timekeeper *tk = &timekeeper;
1651 unsigned long seq;
1652 struct timespec wtom;
1653
1654 do {
1655 seq = read_seqcount_begin(&timekeeper_seq);
1656 wtom = tk->wall_to_monotonic;
1657 } while (read_seqcount_retry(&timekeeper_seq, seq));
1658
1659 return timespec_to_ktime(wtom);
1660}
1661EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1662
1663/**
1664 * do_adjtimex() - Accessor function to NTP __do_adjtimex function 1739 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
1665 */ 1740 */
1666int do_adjtimex(struct timex *txc) 1741int do_adjtimex(struct timex *txc)
1667{ 1742{
1668 struct timekeeper *tk = &timekeeper; 1743 struct timekeeper *tk = &tk_core.timekeeper;
1669 unsigned long flags; 1744 unsigned long flags;
1670 struct timespec ts; 1745 struct timespec64 ts;
1671 s32 orig_tai, tai; 1746 s32 orig_tai, tai;
1672 int ret; 1747 int ret;
1673 1748
@@ -1687,10 +1762,10 @@ int do_adjtimex(struct timex *txc)
1687 return ret; 1762 return ret;
1688 } 1763 }
1689 1764
1690 getnstimeofday(&ts); 1765 getnstimeofday64(&ts);
1691 1766
1692 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1767 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1693 write_seqcount_begin(&timekeeper_seq); 1768 write_seqcount_begin(&tk_core.seq);
1694 1769
1695 orig_tai = tai = tk->tai_offset; 1770 orig_tai = tai = tk->tai_offset;
1696 ret = __do_adjtimex(txc, &ts, &tai); 1771 ret = __do_adjtimex(txc, &ts, &tai);
@@ -1699,7 +1774,7 @@ int do_adjtimex(struct timex *txc)
1699 __timekeeping_set_tai_offset(tk, tai); 1774 __timekeeping_set_tai_offset(tk, tai);
1700 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); 1775 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
1701 } 1776 }
1702 write_seqcount_end(&timekeeper_seq); 1777 write_seqcount_end(&tk_core.seq);
1703 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1778 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1704 1779
1705 if (tai != orig_tai) 1780 if (tai != orig_tai)
@@ -1719,11 +1794,11 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
1719 unsigned long flags; 1794 unsigned long flags;
1720 1795
1721 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1796 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1722 write_seqcount_begin(&timekeeper_seq); 1797 write_seqcount_begin(&tk_core.seq);
1723 1798
1724 __hardpps(phase_ts, raw_ts); 1799 __hardpps(phase_ts, raw_ts);
1725 1800
1726 write_seqcount_end(&timekeeper_seq); 1801 write_seqcount_end(&tk_core.seq);
1727 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1802 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1728} 1803}
1729EXPORT_SYMBOL(hardpps); 1804EXPORT_SYMBOL(hardpps);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
new file mode 100644
index 000000000000..adc1fc98bde3
--- /dev/null
+++ b/kernel/time/timekeeping.h
@@ -0,0 +1,20 @@
1#ifndef _KERNEL_TIME_TIMEKEEPING_H
2#define _KERNEL_TIME_TIMEKEEPING_H
3/*
4 * Internal interfaces for kernel/time/
5 */
6extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real,
7 ktime_t *offs_boot,
8 ktime_t *offs_tai);
9extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
10 ktime_t *offs_boot,
11 ktime_t *offs_tai);
12
13extern int timekeeping_valid_for_hres(void);
14extern u64 timekeeping_max_deferment(void);
15extern int timekeeping_inject_offset(struct timespec *ts);
16extern s32 timekeeping_get_tai_offset(void);
17extern void timekeeping_set_tai_offset(s32 tai_offset);
18extern void timekeeping_clocktai(struct timespec *ts);
19
20#endif
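These two helpers hand hrtimer code a ktime_t monotonic base plus the three clock offsets from one seqcount-consistent snapshot. A hedged sketch of how a caller might derive the other clock bases from that return value; the helper signature is the one declared above, while the surrounding function is illustrative and not an in-tree API:

/* Illustrative caller: derive per-clock bases from one consistent snapshot. */
#include <linux/ktime.h>
#include "timekeeping.h"

static void example_update_bases(ktime_t *base_mono, ktime_t *base_real,
                                 ktime_t *base_boot, ktime_t *base_tai)
{
        ktime_t offs_real, offs_boot, offs_tai;

        *base_mono = ktime_get_update_offsets_tick(&offs_real, &offs_boot,
                                                   &offs_tai);

        /* realtime/boottime/tai are just the monotonic base plus an offset */
        *base_real = ktime_add(*base_mono, offs_real);
        *base_boot = ktime_add(*base_mono, offs_boot);
        *base_tai  = ktime_add(*base_mono, offs_tai);
}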
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 4d54f97558df..f6bd65236712 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -67,7 +67,7 @@ static int __init tk_debug_sleep_time_init(void)
67} 67}
68late_initcall(tk_debug_sleep_time_init); 68late_initcall(tk_debug_sleep_time_init);
69 69
70void tk_debug_account_sleep_time(struct timespec *t) 70void tk_debug_account_sleep_time(struct timespec64 *t)
71{ 71{
72 sleep_time_bin[fls(t->tv_sec)]++; 72 sleep_time_bin[fls(t->tv_sec)]++;
73} 73}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 13323ea08ffa..4ea005a7f9da 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -3,12 +3,27 @@
3/* 3/*
4 * timekeeping debug functions 4 * timekeeping debug functions
5 */ 5 */
6#include <linux/clocksource.h>
6#include <linux/time.h> 7#include <linux/time.h>
7 8
8#ifdef CONFIG_DEBUG_FS 9#ifdef CONFIG_DEBUG_FS
9extern void tk_debug_account_sleep_time(struct timespec *t); 10extern void tk_debug_account_sleep_time(struct timespec64 *t);
10#else 11#else
11#define tk_debug_account_sleep_time(x) 12#define tk_debug_account_sleep_time(x)
12#endif 13#endif
13 14
15#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
16static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
17{
18 cycle_t ret = (now - last) & mask;
19
20 return (s64) ret > 0 ? ret : 0;
21}
22#else
23static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
24{
25 return (now - last) & mask;
26}
27#endif
28
14#endif /* _TIMEKEEPING_INTERNAL_H */ 29#endif /* _TIMEKEEPING_INTERNAL_H */
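clocksource_delta() masks the subtraction so a counter that wrapped inside its mask still yields the correct forward delta, and the CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE variant additionally clamps a delta that comes out negative (a read behind cycle_last) to zero. A quick userspace check of both behaviours using the validating variant's arithmetic; note the clamp only helps when the mask covers the full 64 bits, since with a narrow mask a backwards read just looks like a small positive wraparound:

/* Userspace check of the clocksource_delta() arithmetic above. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t cycle_t;

static cycle_t delta_clamped(cycle_t now, cycle_t last, cycle_t mask)
{
        cycle_t ret = (now - last) & mask;

        return (int64_t)ret > 0 ? ret : 0;      /* VALIDATE_LAST_CYCLE variant */
}

int main(void)
{
        /* Case 1: a 32-bit counter that wrapped; masking recovers the delta. */
        cycle_t mask32 = 0xffffffffULL;
        assert(delta_clamped(10, 0xfffffffbULL, mask32) == 15);

        /* Case 2: full 64-bit mask and a read 5 cycles behind cycle_last;
         * the (s64) test sees a negative value and clamps it to 0. */
        cycle_t mask64 = ~0ULL;
        assert(delta_clamped(5, 10, mask64) == 0);

        puts("clocksource_delta sketch OK");
        return 0;
}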
diff --git a/kernel/timer.c b/kernel/time/timer.c
index 3bb01a323b2a..aca5dfe2fa3d 100644
--- a/kernel/timer.c
+++ b/kernel/time/timer.c
@@ -82,6 +82,7 @@ struct tvec_base {
82 unsigned long next_timer; 82 unsigned long next_timer;
83 unsigned long active_timers; 83 unsigned long active_timers;
84 unsigned long all_timers; 84 unsigned long all_timers;
85 int cpu;
85 struct tvec_root tv1; 86 struct tvec_root tv1;
86 struct tvec tv2; 87 struct tvec tv2;
87 struct tvec tv3; 88 struct tvec tv3;
@@ -409,6 +410,22 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
409 base->next_timer = timer->expires; 410 base->next_timer = timer->expires;
410 } 411 }
411 base->all_timers++; 412 base->all_timers++;
413
414 /*
415 * Check whether the other CPU is in dynticks mode and needs
416 * to be triggered to reevaluate the timer wheel.
417 * We are protected against the other CPU fiddling
418 * with the timer by holding the timer base lock. This also
419 * makes sure that a CPU on the way to stop its tick can not
420 * evaluate the timer wheel.
421 *
422 * Spare the IPI for deferrable timers on idle targets though.
423 * The next busy ticks will take care of it. Except full dynticks
424 * require special care against races with idle_cpu(), let's deal
425 * with that later.
426 */
427 if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu))
428 wake_up_nohz_cpu(base->cpu);
412} 429}
413 430
414#ifdef CONFIG_TIMER_STATS 431#ifdef CONFIG_TIMER_STATS
@@ -948,22 +965,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
948 timer_set_base(timer, base); 965 timer_set_base(timer, base);
949 debug_activate(timer, timer->expires); 966 debug_activate(timer, timer->expires);
950 internal_add_timer(base, timer); 967 internal_add_timer(base, timer);
951 /*
952 * Check whether the other CPU is in dynticks mode and needs
953 * to be triggered to reevaluate the timer wheel.
954 * We are protected against the other CPU fiddling
955 * with the timer by holding the timer base lock. This also
956 * makes sure that a CPU on the way to stop its tick can not
957 * evaluate the timer wheel.
958 *
959 * Spare the IPI for deferrable timers on idle targets though.
960 * The next busy ticks will take care of it. Except full dynticks
961 * require special care against races with idle_cpu(), lets deal
962 * with that later.
963 */
964 if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu))
965 wake_up_nohz_cpu(cpu);
966
967 spin_unlock_irqrestore(&base->lock, flags); 968 spin_unlock_irqrestore(&base->lock, flags);
968} 969}
969EXPORT_SYMBOL_GPL(add_timer_on); 970EXPORT_SYMBOL_GPL(add_timer_on);
@@ -1568,6 +1569,7 @@ static int init_timers_cpu(int cpu)
1568 } 1569 }
1569 spin_lock_init(&base->lock); 1570 spin_lock_init(&base->lock);
1570 tvec_base_done[cpu] = 1; 1571 tvec_base_done[cpu] = 1;
1572 base->cpu = cpu;
1571 } else { 1573 } else {
1572 base = per_cpu(tvec_bases, cpu); 1574 base = per_cpu(tvec_bases, cpu);
1573 } 1575 }
diff --git a/kernel/time/udelay_test.c b/kernel/time/udelay_test.c
new file mode 100644
index 000000000000..e622ba365a13
--- /dev/null
+++ b/kernel/time/udelay_test.c
@@ -0,0 +1,168 @@
1/*
2 * udelay() test kernel module
3 *
4 * Test is executed by writing and reading to /sys/kernel/debug/udelay_test
5 * Tests are configured by writing: USECS ITERATIONS
6 * Tests are executed by reading from the same file.
7 * Specifying usecs of 0 or negative values will run multiple tests.
8 *
9 * Copyright (C) 2014 Google, Inc.
10 *
11 * This software is licensed under the terms of the GNU General Public
12 * License version 2, as published by the Free Software Foundation, and
13 * may be copied, distributed, and modified under those terms.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 */
20
21#include <linux/debugfs.h>
22#include <linux/delay.h>
23#include <linux/ktime.h>
24#include <linux/module.h>
25#include <linux/uaccess.h>
26
27#define DEFAULT_ITERATIONS 100
28
29#define DEBUGFS_FILENAME "udelay_test"
30
31static DEFINE_MUTEX(udelay_test_lock);
32static struct dentry *udelay_test_debugfs_file;
33static int udelay_test_usecs;
34static int udelay_test_iterations = DEFAULT_ITERATIONS;
35
36static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters)
37{
38 int min = 0, max = 0, fail_count = 0;
39 uint64_t sum = 0;
40 uint64_t avg;
41 int i;
42 /* Allow udelay to be up to 0.5% fast */
43 int allowed_error_ns = usecs * 5;
44
45 for (i = 0; i < iters; ++i) {
46 struct timespec ts1, ts2;
47 int time_passed;
48
49 ktime_get_ts(&ts1);
50 udelay(usecs);
51 ktime_get_ts(&ts2);
52 time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1);
53
54 if (i == 0 || time_passed < min)
55 min = time_passed;
56 if (i == 0 || time_passed > max)
57 max = time_passed;
58 if ((time_passed + allowed_error_ns) / 1000 < usecs)
59 ++fail_count;
60 WARN_ON(time_passed < 0);
61 sum += time_passed;
62 }
63
64 avg = sum;
65 do_div(avg, iters);
66 seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d",
67 usecs, iters, usecs * 1000,
68 (usecs * 1000) - allowed_error_ns, min, avg, max);
69 if (fail_count)
70 seq_printf(s, " FAIL=%d", fail_count);
71 seq_puts(s, "\n");
72
73 return 0;
74}
75
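The 0.5% fast-allowance in udelay_test_single() falls out of the units: the requested delay is usecs * 1000 ns and 0.5% of that is usecs * 5 ns, which is exactly the allowed_error_ns initializer; an iteration only counts as a failure if, even after adding that allowance, the measured time still converts back to fewer whole microseconds than requested. A quick numeric check of the boundary:

/* Check of the udelay_test tolerance arithmetic (values illustrative). */
#include <stdio.h>

int main(void)
{
        int usecs = 100;
        int allowed_error_ns = usecs * 5;       /* 0.5% of usecs * 1000 ns */

        int just_fast_enough = usecs * 1000 - allowed_error_ns;   /* 99500 ns */
        int too_fast         = just_fast_enough - 1;              /* 99499 ns */

        printf("%d ns -> %s\n", just_fast_enough,
               (just_fast_enough + allowed_error_ns) / 1000 < usecs ? "FAIL" : "ok");
        printf("%d ns -> %s\n", too_fast,
               (too_fast + allowed_error_ns) / 1000 < usecs ? "FAIL" : "ok");
        return 0;
}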
76static int udelay_test_show(struct seq_file *s, void *v)
77{
78 int usecs;
79 int iters;
80 int ret = 0;
81
82 mutex_lock(&udelay_test_lock);
83 usecs = udelay_test_usecs;
84 iters = udelay_test_iterations;
85 mutex_unlock(&udelay_test_lock);
86
87 if (usecs > 0 && iters > 0) {
88 return udelay_test_single(s, usecs, iters);
89 } else if (usecs == 0) {
90 struct timespec ts;
91
92 ktime_get_ts(&ts);
93 seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n",
94 loops_per_jiffy, ts.tv_sec, ts.tv_nsec);
95 seq_puts(s, "usage:\n");
96 seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n");
97 seq_puts(s, "cat " DEBUGFS_FILENAME "\n");
98 }
99
100 return ret;
101}
102
103static int udelay_test_open(struct inode *inode, struct file *file)
104{
105 return single_open(file, udelay_test_show, inode->i_private);
106}
107
108static ssize_t udelay_test_write(struct file *file, const char __user *buf,
109 size_t count, loff_t *pos)
110{
111 char lbuf[32];
112 int ret;
113 int usecs;
114 int iters;
115
116 if (count >= sizeof(lbuf))
117 return -EINVAL;
118
119 if (copy_from_user(lbuf, buf, count))
120 return -EFAULT;
121 lbuf[count] = '\0';
122
123 ret = sscanf(lbuf, "%d %d", &usecs, &iters);
124 if (ret < 1)
125 return -EINVAL;
126 else if (ret < 2)
127 iters = DEFAULT_ITERATIONS;
128
129 mutex_lock(&udelay_test_lock);
130 udelay_test_usecs = usecs;
131 udelay_test_iterations = iters;
132 mutex_unlock(&udelay_test_lock);
133
134 return count;
135}
136
137static const struct file_operations udelay_test_debugfs_ops = {
138 .owner = THIS_MODULE,
139 .open = udelay_test_open,
140 .read = seq_read,
141 .write = udelay_test_write,
142 .llseek = seq_lseek,
143 .release = single_release,
144};
145
146static int __init udelay_test_init(void)
147{
148 mutex_lock(&udelay_test_lock);
149 udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME,
150 S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops);
151 mutex_unlock(&udelay_test_lock);
152
153 return 0;
154}
155
156module_init(udelay_test_init);
157
158static void __exit udelay_test_exit(void)
159{
160 mutex_lock(&udelay_test_lock);
161 debugfs_remove(udelay_test_debugfs_file);
162 mutex_unlock(&udelay_test_lock);
163}
164
165module_exit(udelay_test_exit);
166
167MODULE_AUTHOR("David Riley <davidriley@chromium.org>");
168MODULE_LICENSE("GPL");
diff --git a/kernel/torture.c b/kernel/torture.c
index 40bb511cca48..d600af21f022 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -708,7 +708,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
708 int ret = 0; 708 int ret = 0;
709 709
710 VERBOSE_TOROUT_STRING(m); 710 VERBOSE_TOROUT_STRING(m);
711 *tp = kthread_run(fn, arg, s); 711 *tp = kthread_run(fn, arg, "%s", s);
712 if (IS_ERR(*tp)) { 712 if (IS_ERR(*tp)) {
713 ret = PTR_ERR(*tp); 713 ret = PTR_ERR(*tp);
714 VERBOSE_TOROUT_ERRSTRING(f); 714 VERBOSE_TOROUT_ERRSTRING(f);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d4409356f40d..a5da09c899dd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -29,11 +29,6 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
29 help 29 help
30 See Documentation/trace/ftrace-design.txt 30 See Documentation/trace/ftrace-design.txt
31 31
32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
33 bool
34 help
35 See Documentation/trace/ftrace-design.txt
36
37config HAVE_DYNAMIC_FTRACE 32config HAVE_DYNAMIC_FTRACE
38 bool 33 bool
39 help 34 help
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2611613f14f1..67d6369ddf83 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
28 28
29obj-$(CONFIG_TRACING) += trace.o 29obj-$(CONFIG_TRACING) += trace.o
30obj-$(CONFIG_TRACING) += trace_output.o 30obj-$(CONFIG_TRACING) += trace_output.o
31obj-$(CONFIG_TRACING) += trace_seq.o
31obj-$(CONFIG_TRACING) += trace_stat.o 32obj-$(CONFIG_TRACING) += trace_stat.o
32obj-$(CONFIG_TRACING) += trace_printk.o 33obj-$(CONFIG_TRACING) += trace_printk.o
33obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 34obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ac9d1dad630b..1654b12c891a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -80,9 +80,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly = {
80int ftrace_enabled __read_mostly; 80int ftrace_enabled __read_mostly;
81static int last_ftrace_enabled; 81static int last_ftrace_enabled;
82 82
83/* Quick disabling of function tracer. */
84int function_trace_stop __read_mostly;
85
86/* Current function tracing op */ 83/* Current function tracing op */
87struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; 84struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
88/* What to set function_trace_op to */ 85/* What to set function_trace_op to */
@@ -1042,6 +1039,8 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1042 1039
1043#ifdef CONFIG_DYNAMIC_FTRACE 1040#ifdef CONFIG_DYNAMIC_FTRACE
1044 1041
1042static struct ftrace_ops *removed_ops;
1043
1045#ifndef CONFIG_FTRACE_MCOUNT_RECORD 1044#ifndef CONFIG_FTRACE_MCOUNT_RECORD
1046# error Dynamic ftrace depends on MCOUNT_RECORD 1045# error Dynamic ftrace depends on MCOUNT_RECORD
1047#endif 1046#endif
@@ -1304,25 +1303,15 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1304 struct ftrace_hash *new_hash; 1303 struct ftrace_hash *new_hash;
1305 int size = src->count; 1304 int size = src->count;
1306 int bits = 0; 1305 int bits = 0;
1307 int ret;
1308 int i; 1306 int i;
1309 1307
1310 /* 1308 /*
1311 * Remove the current set, update the hash and add
1312 * them back.
1313 */
1314 ftrace_hash_rec_disable(ops, enable);
1315
1316 /*
1317 * If the new source is empty, just free dst and assign it 1309 * If the new source is empty, just free dst and assign it
1318 * the empty_hash. 1310 * the empty_hash.
1319 */ 1311 */
1320 if (!src->count) { 1312 if (!src->count) {
1321 free_ftrace_hash_rcu(*dst); 1313 new_hash = EMPTY_HASH;
1322 rcu_assign_pointer(*dst, EMPTY_HASH); 1314 goto update;
1323 /* still need to update the function records */
1324 ret = 0;
1325 goto out;
1326 } 1315 }
1327 1316
1328 /* 1317 /*
@@ -1335,10 +1324,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1335 if (bits > FTRACE_HASH_MAX_BITS) 1324 if (bits > FTRACE_HASH_MAX_BITS)
1336 bits = FTRACE_HASH_MAX_BITS; 1325 bits = FTRACE_HASH_MAX_BITS;
1337 1326
1338 ret = -ENOMEM;
1339 new_hash = alloc_ftrace_hash(bits); 1327 new_hash = alloc_ftrace_hash(bits);
1340 if (!new_hash) 1328 if (!new_hash)
1341 goto out; 1329 return -ENOMEM;
1342 1330
1343 size = 1 << src->size_bits; 1331 size = 1 << src->size_bits;
1344 for (i = 0; i < size; i++) { 1332 for (i = 0; i < size; i++) {
@@ -1349,20 +1337,20 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1349 } 1337 }
1350 } 1338 }
1351 1339
1340update:
1341 /*
1342 * Remove the current set, update the hash and add
1343 * them back.
1344 */
1345 ftrace_hash_rec_disable(ops, enable);
1346
1352 old_hash = *dst; 1347 old_hash = *dst;
1353 rcu_assign_pointer(*dst, new_hash); 1348 rcu_assign_pointer(*dst, new_hash);
1354 free_ftrace_hash_rcu(old_hash); 1349 free_ftrace_hash_rcu(old_hash);
1355 1350
1356 ret = 0;
1357 out:
1358 /*
1359 * Enable regardless of ret:
1360 * On success, we enable the new hash.
1361 * On failure, we re-enable the original hash.
1362 */
1363 ftrace_hash_rec_enable(ops, enable); 1351 ftrace_hash_rec_enable(ops, enable);
1364 1352
1365 return ret; 1353 return 0;
1366} 1354}
1367 1355
1368/* 1356/*
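The interesting part of the ftrace_hash_move() hunk above is the reordering: the replacement hash is now built in full before any records are disabled, so a failed allocation returns -ENOMEM with the live hash untouched, and the disable/swap/enable sequence only runs once success is guaranteed. A minimal userspace sketch of that build-then-swap error handling, with made-up names (table_replace() is not an ftrace function):

#include <stdlib.h>
#include <string.h>

struct table { size_t n; int *slots; };

/* Build the replacement first; only on success tear down and swap.
 * On allocation failure the caller's table is left untouched. */
static int table_replace(struct table *live, const int *src, size_t n)
{
	int *slots = NULL;

	if (n) {
		slots = malloc(n * sizeof(*slots));
		if (!slots)
			return -1;	/* nothing disabled, nothing lost */
		memcpy(slots, src, n * sizeof(*slots));
	}

	/* Point of no return: the moral equivalent of the code after
	 * the "update:" label in ftrace_hash_move(). */
	free(live->slots);
	live->slots = slots;
	live->n = n;
	return 0;
}

int main(void)
{
	struct table live = { 0, NULL };
	int vals[] = { 1, 2, 3 };

	return table_replace(&live, vals, 3) ? 1 : 0;
}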
@@ -1492,6 +1480,53 @@ int ftrace_text_reserved(const void *start, const void *end)
1492 return (int)!!ret; 1480 return (int)!!ret;
1493} 1481}
1494 1482
1483/* Test if ops registered to this rec needs regs */
1484static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
1485{
1486 struct ftrace_ops *ops;
1487 bool keep_regs = false;
1488
1489 for (ops = ftrace_ops_list;
1490 ops != &ftrace_list_end; ops = ops->next) {
1491 /* pass rec in as regs to have non-NULL val */
1492 if (ftrace_ops_test(ops, rec->ip, rec)) {
1493 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
1494 keep_regs = true;
1495 break;
1496 }
1497 }
1498 }
1499
1500 return keep_regs;
1501}
1502
1503static void ftrace_remove_tramp(struct ftrace_ops *ops,
1504 struct dyn_ftrace *rec)
1505{
1506 struct ftrace_func_entry *entry;
1507
1508 entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip);
1509 if (!entry)
1510 return;
1511
1512 /*
1513 * The tramp_hash entry will be removed at time
1514 * of update.
1515 */
1516 ops->nr_trampolines--;
1517 rec->flags &= ~FTRACE_FL_TRAMP;
1518}
1519
1520static void ftrace_clear_tramps(struct dyn_ftrace *rec)
1521{
1522 struct ftrace_ops *op;
1523
1524 do_for_each_ftrace_op(op, ftrace_ops_list) {
1525 if (op->nr_trampolines)
1526 ftrace_remove_tramp(op, rec);
1527 } while_for_each_ftrace_op(op);
1528}
1529
1495static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1530static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1496 int filter_hash, 1531 int filter_hash,
1497 bool inc) 1532 bool inc)
@@ -1572,8 +1607,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1572 1607
1573 if (inc) { 1608 if (inc) {
1574 rec->flags++; 1609 rec->flags++;
1575 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) 1610 if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX))
1576 return; 1611 return;
1612
1613 /*
1614 * If there's only a single callback registered to a
1615 * function, and the ops has a trampoline registered
1616 * for it, then we can call it directly.
1617 */
1618 if (ftrace_rec_count(rec) == 1 && ops->trampoline) {
1619 rec->flags |= FTRACE_FL_TRAMP;
1620 ops->nr_trampolines++;
1621 } else {
1622 /*
1623 * If we are adding another function callback
1624 * to this function, and the previous had a
1625 * trampoline used, then we need to go back to
1626 * the default trampoline.
1627 */
1628 rec->flags &= ~FTRACE_FL_TRAMP;
1629
1630 /* remove trampolines from any ops for this rec */
1631 ftrace_clear_tramps(rec);
1632 }
1633
1577 /* 1634 /*
1578 * If any ops wants regs saved for this function 1635 * If any ops wants regs saved for this function
1579 * then all ops will get saved regs. 1636 * then all ops will get saved regs.
@@ -1581,9 +1638,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1581 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) 1638 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
1582 rec->flags |= FTRACE_FL_REGS; 1639 rec->flags |= FTRACE_FL_REGS;
1583 } else { 1640 } else {
1584 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) 1641 if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0))
1585 return; 1642 return;
1586 rec->flags--; 1643 rec->flags--;
1644
1645 if (ops->trampoline && !ftrace_rec_count(rec))
1646 ftrace_remove_tramp(ops, rec);
1647
1648 /*
1649 * If the rec had REGS enabled and the ops that is
1650 * being removed had REGS set, then see if there is
1651 * still any ops for this record that wants regs.
1652 * If not, we can stop recording them.
1653 */
1654 if (ftrace_rec_count(rec) > 0 &&
1655 rec->flags & FTRACE_FL_REGS &&
1656 ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
1657 if (!test_rec_ops_needs_regs(rec))
1658 rec->flags &= ~FTRACE_FL_REGS;
1659 }
1660
1661 /*
1662 * flags will be cleared in ftrace_check_record()
1663 * if rec count is zero.
1664 */
1587 } 1665 }
1588 count++; 1666 count++;
1589 /* Shortcut, if we handled all records, we are done. */ 1667 /* Shortcut, if we handled all records, we are done. */
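The repeated switch from open-coded masking to ftrace_rec_count() in this hunk leans on how dyn_ftrace stores state: the low bits of rec->flags count the callbacks attached to the function (capped at FTRACE_REF_MAX), and the status bits, ENABLED, REGS, the new TRAMP, sit above them. A hedged illustration of that packing with invented bit positions (the real layout lives in include/linux/ftrace.h):

#include <stdio.h>

/* Illustrative layout only: low 16 bits count attached callbacks,
 * the bits above them are status flags. */
#define REC_REF_MAX	((1UL << 16) - 1)
#define REC_FL_ENABLED	(1UL << 16)
#define REC_FL_TRAMP	(1UL << 17)

static unsigned long rec_count(unsigned long flags)
{
	return flags & REC_REF_MAX;	/* mirrors ftrace_rec_count() */
}

int main(void)
{
	unsigned long flags = 0;

	flags++;			/* first ops attaches */
	flags |= REC_FL_TRAMP;		/* and it brought a trampoline */
	flags++;			/* second ops attaches */
	flags &= ~REC_FL_TRAMP;		/* shared record: back to the list caller */

	printf("callbacks=%lu tramp=%d\n",
	       rec_count(flags), !!(flags & REC_FL_TRAMP));
	return 0;
}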
@@ -1668,17 +1746,23 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1668 * If we are disabling calls, then disable all records that 1746 * If we are disabling calls, then disable all records that
1669 * are enabled. 1747 * are enabled.
1670 */ 1748 */
1671 if (enable && (rec->flags & ~FTRACE_FL_MASK)) 1749 if (enable && ftrace_rec_count(rec))
1672 flag = FTRACE_FL_ENABLED; 1750 flag = FTRACE_FL_ENABLED;
1673 1751
1674 /* 1752 /*
1675 * If enabling and the REGS flag does not match the REGS_EN, then 1753 * If enabling and the REGS flag does not match the REGS_EN, or
1676 * do not ignore this record. Set flags to fail the compare against 1754 * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore
1677 * ENABLED. 1755 * this record. Set flags to fail the compare against ENABLED.
1678 */ 1756 */
1679 if (flag && 1757 if (flag) {
1680 (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) 1758 if (!(rec->flags & FTRACE_FL_REGS) !=
1681 flag |= FTRACE_FL_REGS; 1759 !(rec->flags & FTRACE_FL_REGS_EN))
1760 flag |= FTRACE_FL_REGS;
1761
1762 if (!(rec->flags & FTRACE_FL_TRAMP) !=
1763 !(rec->flags & FTRACE_FL_TRAMP_EN))
1764 flag |= FTRACE_FL_TRAMP;
1765 }
1682 1766
1683 /* If the state of this record hasn't changed, then do nothing */ 1767 /* If the state of this record hasn't changed, then do nothing */
1684 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1768 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
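The added comparisons rely on a small C idiom: the ! operator collapses any bit test to exactly 0 or 1, so !(f & FLAG) != !(f & FLAG_EN) is true precisely when the two flags disagree, no matter which bits they occupy. A standalone check of that idiom with arbitrary flag values:

#include <assert.h>

#define FL_REGS		0x1
#define FL_REGS_EN	0x8	/* deliberately a different bit */

/* Same shape as the FTRACE_FL_REGS / FTRACE_FL_REGS_EN test above. */
static int regs_mismatch(unsigned long f)
{
	return !(f & FL_REGS) != !(f & FL_REGS_EN);
}

int main(void)
{
	assert(!regs_mismatch(0));			/* both clear */
	assert(!regs_mismatch(FL_REGS | FL_REGS_EN));	/* both set */
	assert(regs_mismatch(FL_REGS));			/* wanted, not yet enabled */
	assert(regs_mismatch(FL_REGS_EN));		/* enabled, no longer wanted */
	return 0;
}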
@@ -1696,6 +1780,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1696 else 1780 else
1697 rec->flags &= ~FTRACE_FL_REGS_EN; 1781 rec->flags &= ~FTRACE_FL_REGS_EN;
1698 } 1782 }
1783 if (flag & FTRACE_FL_TRAMP) {
1784 if (rec->flags & FTRACE_FL_TRAMP)
1785 rec->flags |= FTRACE_FL_TRAMP_EN;
1786 else
1787 rec->flags &= ~FTRACE_FL_TRAMP_EN;
1788 }
1699 } 1789 }
1700 1790
1701 /* 1791 /*
@@ -1704,7 +1794,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1704 * Otherwise, 1794 * Otherwise,
1705 * return UPDATE_MODIFY_CALL to tell the caller to convert 1795 * return UPDATE_MODIFY_CALL to tell the caller to convert
1706 * from the save regs, to a non-save regs function or 1796 * from the save regs, to a non-save regs function or
1707 * vice versa. 1797 * vice versa, or from a trampoline call.
1708 */ 1798 */
1709 if (flag & FTRACE_FL_ENABLED) 1799 if (flag & FTRACE_FL_ENABLED)
1710 return FTRACE_UPDATE_MAKE_CALL; 1800 return FTRACE_UPDATE_MAKE_CALL;
@@ -1714,7 +1804,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1714 1804
1715 if (update) { 1805 if (update) {
1716 /* If there's no more users, clear all flags */ 1806 /* If there's no more users, clear all flags */
1717 if (!(rec->flags & ~FTRACE_FL_MASK)) 1807 if (!ftrace_rec_count(rec))
1718 rec->flags = 0; 1808 rec->flags = 0;
1719 else 1809 else
1720 /* Just disable the record (keep REGS state) */ 1810 /* Just disable the record (keep REGS state) */
@@ -1751,6 +1841,43 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1751 return ftrace_check_record(rec, enable, 0); 1841 return ftrace_check_record(rec, enable, 0);
1752} 1842}
1753 1843
1844static struct ftrace_ops *
1845ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
1846{
1847 struct ftrace_ops *op;
1848
1849 /* Removed ops need to be tested first */
1850 if (removed_ops && removed_ops->tramp_hash) {
1851 if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip))
1852 return removed_ops;
1853 }
1854
1855 do_for_each_ftrace_op(op, ftrace_ops_list) {
1856 if (!op->tramp_hash)
1857 continue;
1858
1859 if (ftrace_lookup_ip(op->tramp_hash, rec->ip))
1860 return op;
1861
1862 } while_for_each_ftrace_op(op);
1863
1864 return NULL;
1865}
1866
1867static struct ftrace_ops *
1868ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
1869{
1870 struct ftrace_ops *op;
1871
1872 do_for_each_ftrace_op(op, ftrace_ops_list) {
1873 /* pass rec in as regs to have non-NULL val */
1874 if (ftrace_ops_test(op, rec->ip, rec))
1875 return op;
1876 } while_for_each_ftrace_op(op);
1877
1878 return NULL;
1879}
1880
1754/** 1881/**
1755 * ftrace_get_addr_new - Get the call address to set to 1882 * ftrace_get_addr_new - Get the call address to set to
1756 * @rec: The ftrace record descriptor 1883 * @rec: The ftrace record descriptor
@@ -1763,6 +1890,20 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1763 */ 1890 */
1764unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) 1891unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
1765{ 1892{
1893 struct ftrace_ops *ops;
1894
1895 /* Trampolines take precedence over regs */
1896 if (rec->flags & FTRACE_FL_TRAMP) {
1897 ops = ftrace_find_tramp_ops_new(rec);
1898 if (FTRACE_WARN_ON(!ops || !ops->trampoline)) {
1899 pr_warning("Bad trampoline accounting at: %p (%pS)\n",
1900 (void *)rec->ip, (void *)rec->ip);
1901 /* Ftrace is shutting down, return anything */
1902 return (unsigned long)FTRACE_ADDR;
1903 }
1904 return ops->trampoline;
1905 }
1906
1766 if (rec->flags & FTRACE_FL_REGS) 1907 if (rec->flags & FTRACE_FL_REGS)
1767 return (unsigned long)FTRACE_REGS_ADDR; 1908 return (unsigned long)FTRACE_REGS_ADDR;
1768 else 1909 else
@@ -1781,6 +1922,20 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
1781 */ 1922 */
1782unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) 1923unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
1783{ 1924{
1925 struct ftrace_ops *ops;
1926
1927 /* Trampolines take precedence over regs */
1928 if (rec->flags & FTRACE_FL_TRAMP_EN) {
1929 ops = ftrace_find_tramp_ops_curr(rec);
1930 if (FTRACE_WARN_ON(!ops)) {
1931 pr_warning("Bad trampoline accounting at: %p (%pS)\n",
1932 (void *)rec->ip, (void *)rec->ip);
1933 /* Ftrace is shutting down, return anything */
1934 return (unsigned long)FTRACE_ADDR;
1935 }
1936 return ops->trampoline;
1937 }
1938
1784 if (rec->flags & FTRACE_FL_REGS_EN) 1939 if (rec->flags & FTRACE_FL_REGS_EN)
1785 return (unsigned long)FTRACE_REGS_ADDR; 1940 return (unsigned long)FTRACE_REGS_ADDR;
1786 else 1941 else
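Both ftrace_get_addr_new() and ftrace_get_addr_curr() now apply the same precedence when picking the call target for a record: a private trampoline beats the regs-saving caller, which beats the default list caller. A compressed sketch of that decision with placeholder addresses (the real functions also warn and fall back to FTRACE_ADDR when the ops lookup fails):

#include <stdio.h>

#define FL_TRAMP	0x1
#define FL_REGS		0x2

#define ADDR_DEFAULT	0x1000UL	/* stand-in for FTRACE_ADDR */
#define ADDR_REGS	0x2000UL	/* stand-in for FTRACE_REGS_ADDR */

/* Pick the call target for one record, trampoline first. */
static unsigned long get_addr(unsigned long flags, unsigned long trampoline)
{
	if (flags & FL_TRAMP)
		return trampoline;
	if (flags & FL_REGS)
		return ADDR_REGS;
	return ADDR_DEFAULT;
}

int main(void)
{
	printf("%#lx\n", get_addr(FL_TRAMP | FL_REGS, 0x3000UL));	/* trampoline wins */
	printf("%#lx\n", get_addr(FL_REGS, 0));				/* regs caller */
	printf("%#lx\n", get_addr(0, 0));				/* default caller */
	return 0;
}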
@@ -2023,6 +2178,89 @@ void __weak arch_ftrace_update_code(int command)
2023 ftrace_run_stop_machine(command); 2178 ftrace_run_stop_machine(command);
2024} 2179}
2025 2180
2181static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops)
2182{
2183 struct ftrace_page *pg;
2184 struct dyn_ftrace *rec;
2185 int size, bits;
2186 int ret;
2187
2188 size = ops->nr_trampolines;
2189 bits = 0;
2190 /*
2191 * Make the hash size about 1/2 the # found
2192 */
2193 for (size /= 2; size; size >>= 1)
2194 bits++;
2195
2196 ops->tramp_hash = alloc_ftrace_hash(bits);
2197 /*
2198 * TODO: a failed allocation is going to screw up
 2199 * the accounting of what needs to be modified
 2200 * and what does not. For now, we kill ftrace if we fail
 2201 * to allocate here. There are ways around this,
 2202 * but that will take a little more work.
2203 */
2204 if (!ops->tramp_hash)
2205 return -ENOMEM;
2206
2207 do_for_each_ftrace_rec(pg, rec) {
2208 if (ftrace_rec_count(rec) == 1 &&
2209 ftrace_ops_test(ops, rec->ip, rec)) {
2210
2211 /*
2212 * If another ops adds to a rec, the rec will
2213 * lose its trampoline and never get it back
2214 * until all ops are off of it.
2215 */
2216 if (!(rec->flags & FTRACE_FL_TRAMP))
2217 continue;
2218
2219 /* This record had better have a trampoline */
2220 if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN)))
2221 return -1;
2222
2223 ret = add_hash_entry(ops->tramp_hash, rec->ip);
2224 if (ret < 0)
2225 return ret;
2226 }
2227 } while_for_each_ftrace_rec();
2228
2229 /* The number of recs in the hash must match nr_trampolines */
2230 FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines);
2231
2232 return 0;
2233}
2234
2235static int ftrace_save_tramp_hashes(void)
2236{
2237 struct ftrace_ops *op;
2238 int ret;
2239
2240 /*
2241 * Now that any trampoline is being used, we need to save the
2242 * hashes for the ops that have them. This allows the mapping
2243 * back from the record to the ops that has the trampoline to
2244 * know what code is being replaced. Modifying code must always
2245 * verify what it is changing.
2246 */
2247 do_for_each_ftrace_op(op, ftrace_ops_list) {
2248
2249 /* The tramp_hash is recreated each time. */
2250 free_ftrace_hash(op->tramp_hash);
2251 op->tramp_hash = NULL;
2252
2253 if (op->nr_trampolines) {
2254 ret = ftrace_save_ops_tramp_hash(op);
2255 if (ret)
2256 return ret;
2257 }
2258
2259 } while_for_each_ftrace_op(op);
2260
2261 return 0;
2262}
2263
2026static void ftrace_run_update_code(int command) 2264static void ftrace_run_update_code(int command)
2027{ 2265{
2028 int ret; 2266 int ret;
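ftrace_save_ops_tramp_hash() above sizes its hash from nr_trampolines with the halve-then-count-shifts loop, which ends up allocating the largest power-of-two bucket count that does not exceed the number of entries; the hash itself is thrown away and rebuilt on every update in ftrace_save_tramp_hashes(). A standalone run of the same arithmetic (hash_bits() is just a name for this sketch):

#include <stdio.h>

/* Same arithmetic as the sizing loop in ftrace_save_ops_tramp_hash(). */
static int hash_bits(int nr_entries)
{
	int size = nr_entries;
	int bits = 0;

	for (size /= 2; size; size >>= 1)
		bits++;
	return bits;
}

int main(void)
{
	int samples[] = { 1, 2, 3, 10, 16, 17, 64 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%2d entries -> %d bits (%d buckets)\n",
		       samples[i], hash_bits(samples[i]),
		       1 << hash_bits(samples[i]));
	return 0;
}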
@@ -2031,11 +2269,6 @@ static void ftrace_run_update_code(int command)
2031 FTRACE_WARN_ON(ret); 2269 FTRACE_WARN_ON(ret);
2032 if (ret) 2270 if (ret)
2033 return; 2271 return;
2034 /*
2035 * Do not call function tracer while we update the code.
2036 * We are in stop machine.
2037 */
2038 function_trace_stop++;
2039 2272
2040 /* 2273 /*
2041 * By default we use stop_machine() to modify the code. 2274 * By default we use stop_machine() to modify the code.
@@ -2045,15 +2278,15 @@ static void ftrace_run_update_code(int command)
2045 */ 2278 */
2046 arch_ftrace_update_code(command); 2279 arch_ftrace_update_code(command);
2047 2280
2048 function_trace_stop--;
2049
2050 ret = ftrace_arch_code_modify_post_process(); 2281 ret = ftrace_arch_code_modify_post_process();
2051 FTRACE_WARN_ON(ret); 2282 FTRACE_WARN_ON(ret);
2283
2284 ret = ftrace_save_tramp_hashes();
2285 FTRACE_WARN_ON(ret);
2052} 2286}
2053 2287
2054static ftrace_func_t saved_ftrace_func; 2288static ftrace_func_t saved_ftrace_func;
2055static int ftrace_start_up; 2289static int ftrace_start_up;
2056static int global_start_up;
2057 2290
2058static void control_ops_free(struct ftrace_ops *ops) 2291static void control_ops_free(struct ftrace_ops *ops)
2059{ 2292{
@@ -2117,8 +2350,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2117 2350
2118 ftrace_hash_rec_disable(ops, 1); 2351 ftrace_hash_rec_disable(ops, 1);
2119 2352
2120 if (!global_start_up) 2353 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2121 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2122 2354
2123 command |= FTRACE_UPDATE_CALLS; 2355 command |= FTRACE_UPDATE_CALLS;
2124 2356
@@ -2139,8 +2371,16 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2139 return 0; 2371 return 0;
2140 } 2372 }
2141 2373
2374 /*
2375 * If the ops uses a trampoline, then it needs to be
2376 * tested first on update.
2377 */
2378 removed_ops = ops;
2379
2142 ftrace_run_update_code(command); 2380 ftrace_run_update_code(command);
2143 2381
2382 removed_ops = NULL;
2383
2144 /* 2384 /*
2145 * Dynamic ops may be freed, we must make sure that all 2385 * Dynamic ops may be freed, we must make sure that all
2146 * callers are done before leaving this function. 2386 * callers are done before leaving this function.
@@ -2398,7 +2638,8 @@ ftrace_allocate_pages(unsigned long num_to_init)
2398 return start_pg; 2638 return start_pg;
2399 2639
2400 free_pages: 2640 free_pages:
2401 while (start_pg) { 2641 pg = start_pg;
2642 while (pg) {
2402 order = get_count_order(pg->size / ENTRIES_PER_PAGE); 2643 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
2403 free_pages((unsigned long)pg->records, order); 2644 free_pages((unsigned long)pg->records, order);
2404 start_pg = pg->next; 2645 start_pg = pg->next;
@@ -2595,8 +2836,10 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2595 * off, we can short cut and just print out that all 2836 * off, we can short cut and just print out that all
2596 * functions are enabled. 2837 * functions are enabled.
2597 */ 2838 */
2598 if (iter->flags & FTRACE_ITER_FILTER && 2839 if ((iter->flags & FTRACE_ITER_FILTER &&
2599 ftrace_hash_empty(ops->filter_hash)) { 2840 ftrace_hash_empty(ops->filter_hash)) ||
2841 (iter->flags & FTRACE_ITER_NOTRACE &&
2842 ftrace_hash_empty(ops->notrace_hash))) {
2600 if (*pos > 0) 2843 if (*pos > 0)
2601 return t_hash_start(m, pos); 2844 return t_hash_start(m, pos);
2602 iter->flags |= FTRACE_ITER_PRINTALL; 2845 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2641,7 +2884,10 @@ static int t_show(struct seq_file *m, void *v)
2641 return t_hash_show(m, iter); 2884 return t_hash_show(m, iter);
2642 2885
2643 if (iter->flags & FTRACE_ITER_PRINTALL) { 2886 if (iter->flags & FTRACE_ITER_PRINTALL) {
2644 seq_printf(m, "#### all functions enabled ####\n"); 2887 if (iter->flags & FTRACE_ITER_NOTRACE)
2888 seq_printf(m, "#### no functions disabled ####\n");
2889 else
2890 seq_printf(m, "#### all functions enabled ####\n");
2645 return 0; 2891 return 0;
2646 } 2892 }
2647 2893
@@ -2651,10 +2897,22 @@ static int t_show(struct seq_file *m, void *v)
2651 return 0; 2897 return 0;
2652 2898
2653 seq_printf(m, "%ps", (void *)rec->ip); 2899 seq_printf(m, "%ps", (void *)rec->ip);
2654 if (iter->flags & FTRACE_ITER_ENABLED) 2900 if (iter->flags & FTRACE_ITER_ENABLED) {
2655 seq_printf(m, " (%ld)%s", 2901 seq_printf(m, " (%ld)%s",
2656 rec->flags & ~FTRACE_FL_MASK, 2902 ftrace_rec_count(rec),
2657 rec->flags & FTRACE_FL_REGS ? " R" : ""); 2903 rec->flags & FTRACE_FL_REGS ? " R" : " ");
2904 if (rec->flags & FTRACE_FL_TRAMP_EN) {
2905 struct ftrace_ops *ops;
2906
2907 ops = ftrace_find_tramp_ops_curr(rec);
2908 if (ops && ops->trampoline)
2909 seq_printf(m, "\ttramp: %pS",
2910 (void *)ops->trampoline);
2911 else
2912 seq_printf(m, "\ttramp: ERROR!");
2913 }
2914 }
2915
2658 seq_printf(m, "\n"); 2916 seq_printf(m, "\n");
2659 2917
2660 return 0; 2918 return 0;
@@ -2702,13 +2960,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
2702 return iter ? 0 : -ENOMEM; 2960 return iter ? 0 : -ENOMEM;
2703} 2961}
2704 2962
2705static void ftrace_filter_reset(struct ftrace_hash *hash)
2706{
2707 mutex_lock(&ftrace_lock);
2708 ftrace_hash_clear(hash);
2709 mutex_unlock(&ftrace_lock);
2710}
2711
2712/** 2963/**
2713 * ftrace_regex_open - initialize function tracer filter files 2964 * ftrace_regex_open - initialize function tracer filter files
2714 * @ops: The ftrace_ops that hold the hash filters 2965 * @ops: The ftrace_ops that hold the hash filters
@@ -2758,7 +3009,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2758 hash = ops->filter_hash; 3009 hash = ops->filter_hash;
2759 3010
2760 if (file->f_mode & FMODE_WRITE) { 3011 if (file->f_mode & FMODE_WRITE) {
2761 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); 3012 const int size_bits = FTRACE_HASH_DEFAULT_BITS;
3013
3014 if (file->f_flags & O_TRUNC)
3015 iter->hash = alloc_ftrace_hash(size_bits);
3016 else
3017 iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
3018
2762 if (!iter->hash) { 3019 if (!iter->hash) {
2763 trace_parser_put(&iter->parser); 3020 trace_parser_put(&iter->parser);
2764 kfree(iter); 3021 kfree(iter);
@@ -2767,10 +3024,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2767 } 3024 }
2768 } 3025 }
2769 3026
2770 if ((file->f_mode & FMODE_WRITE) &&
2771 (file->f_flags & O_TRUNC))
2772 ftrace_filter_reset(iter->hash);
2773
2774 if (file->f_mode & FMODE_READ) { 3027 if (file->f_mode & FMODE_READ) {
2775 iter->pg = ftrace_pages_start; 3028 iter->pg = ftrace_pages_start;
2776 3029
@@ -3471,14 +3724,16 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3471 else 3724 else
3472 orig_hash = &ops->notrace_hash; 3725 orig_hash = &ops->notrace_hash;
3473 3726
3474 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); 3727 if (reset)
3728 hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
3729 else
3730 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
3731
3475 if (!hash) { 3732 if (!hash) {
3476 ret = -ENOMEM; 3733 ret = -ENOMEM;
3477 goto out_regex_unlock; 3734 goto out_regex_unlock;
3478 } 3735 }
3479 3736
3480 if (reset)
3481 ftrace_filter_reset(hash);
3482 if (buf && !ftrace_match_records(hash, buf, len)) { 3737 if (buf && !ftrace_match_records(hash, buf, len)) {
3483 ret = -EINVAL; 3738 ret = -EINVAL;
3484 goto out_regex_unlock; 3739 goto out_regex_unlock;
@@ -3630,6 +3885,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
3630 3885
3631#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3886#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3632static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 3887static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
3888static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
3633static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); 3889static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
3634 3890
3635static int __init set_graph_function(char *str) 3891static int __init set_graph_function(char *str)
@@ -3639,16 +3895,29 @@ static int __init set_graph_function(char *str)
3639} 3895}
3640__setup("ftrace_graph_filter=", set_graph_function); 3896__setup("ftrace_graph_filter=", set_graph_function);
3641 3897
3642static void __init set_ftrace_early_graph(char *buf) 3898static int __init set_graph_notrace_function(char *str)
3899{
3900 strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
3901 return 1;
3902}
3903__setup("ftrace_graph_notrace=", set_graph_notrace_function);
3904
3905static void __init set_ftrace_early_graph(char *buf, int enable)
3643{ 3906{
3644 int ret; 3907 int ret;
3645 char *func; 3908 char *func;
3909 unsigned long *table = ftrace_graph_funcs;
3910 int *count = &ftrace_graph_count;
3911
3912 if (!enable) {
3913 table = ftrace_graph_notrace_funcs;
3914 count = &ftrace_graph_notrace_count;
3915 }
3646 3916
3647 while (buf) { 3917 while (buf) {
3648 func = strsep(&buf, ","); 3918 func = strsep(&buf, ",");
3649 /* we allow only one expression at a time */ 3919 /* we allow only one expression at a time */
3650 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 3920 ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func);
3651 FTRACE_GRAPH_MAX_FUNCS, func);
3652 if (ret) 3921 if (ret)
3653 printk(KERN_DEBUG "ftrace: function %s not " 3922 printk(KERN_DEBUG "ftrace: function %s not "
3654 "traceable\n", func); 3923 "traceable\n", func);
@@ -3677,7 +3946,9 @@ static void __init set_ftrace_early_filters(void)
3677 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); 3946 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
3678#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3947#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3679 if (ftrace_graph_buf[0]) 3948 if (ftrace_graph_buf[0])
3680 set_ftrace_early_graph(ftrace_graph_buf); 3949 set_ftrace_early_graph(ftrace_graph_buf, 1);
3950 if (ftrace_graph_notrace_buf[0])
3951 set_ftrace_early_graph(ftrace_graph_notrace_buf, 0);
3681#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3952#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3682} 3953}
3683 3954
@@ -3819,7 +4090,12 @@ static int g_show(struct seq_file *m, void *v)
3819 return 0; 4090 return 0;
3820 4091
3821 if (ptr == (unsigned long *)1) { 4092 if (ptr == (unsigned long *)1) {
3822 seq_printf(m, "#### all functions enabled ####\n"); 4093 struct ftrace_graph_data *fgd = m->private;
4094
4095 if (fgd->table == ftrace_graph_funcs)
4096 seq_printf(m, "#### all functions enabled ####\n");
4097 else
4098 seq_printf(m, "#### no functions disabled ####\n");
3823 return 0; 4099 return 0;
3824 } 4100 }
3825 4101
@@ -4447,9 +4723,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4447 struct ftrace_ops *op; 4723 struct ftrace_ops *op;
4448 int bit; 4724 int bit;
4449 4725
4450 if (function_trace_stop)
4451 return;
4452
4453 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); 4726 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
4454 if (bit < 0) 4727 if (bit < 0)
4455 return; 4728 return;
@@ -4461,9 +4734,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4461 preempt_disable_notrace(); 4734 preempt_disable_notrace();
4462 do_for_each_ftrace_op(op, ftrace_ops_list) { 4735 do_for_each_ftrace_op(op, ftrace_ops_list) {
4463 if (ftrace_ops_test(op, ip, regs)) { 4736 if (ftrace_ops_test(op, ip, regs)) {
4464 if (WARN_ON(!op->func)) { 4737 if (FTRACE_WARN_ON(!op->func)) {
4465 function_trace_stop = 1; 4738 pr_warn("op=%p %pS\n", op, op);
4466 printk("op=%p %pS\n", op, op);
4467 goto out; 4739 goto out;
4468 } 4740 }
4469 op->func(ip, parent_ip, op, regs); 4741 op->func(ip, parent_ip, op, regs);
@@ -5084,6 +5356,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5084 /* Function graph doesn't use the .func field of global_ops */ 5356 /* Function graph doesn't use the .func field of global_ops */
5085 global_ops.flags |= FTRACE_OPS_FL_STUB; 5357 global_ops.flags |= FTRACE_OPS_FL_STUB;
5086 5358
5359#ifdef CONFIG_DYNAMIC_FTRACE
5360 /* Optimize function graph calling (if implemented by arch) */
5361 if (FTRACE_GRAPH_TRAMP_ADDR != 0)
5362 global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR;
5363#endif
5364
5087 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); 5365 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
5088 5366
5089out: 5367out:
@@ -5104,6 +5382,10 @@ void unregister_ftrace_graph(void)
5104 __ftrace_graph_entry = ftrace_graph_entry_stub; 5382 __ftrace_graph_entry = ftrace_graph_entry_stub;
5105 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); 5383 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
5106 global_ops.flags &= ~FTRACE_OPS_FL_STUB; 5384 global_ops.flags &= ~FTRACE_OPS_FL_STUB;
5385#ifdef CONFIG_DYNAMIC_FTRACE
5386 if (FTRACE_GRAPH_TRAMP_ADDR != 0)
5387 global_ops.trampoline = 0;
5388#endif
5107 unregister_pm_notifier(&ftrace_suspend_notifier); 5389 unregister_pm_notifier(&ftrace_suspend_notifier);
5108 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5390 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5109 5391
@@ -5183,9 +5465,4 @@ void ftrace_graph_exit_task(struct task_struct *t)
5183 5465
5184 kfree(ret_stack); 5466 kfree(ret_stack);
5185} 5467}
5186
5187void ftrace_graph_stop(void)
5188{
5189 ftrace_stop();
5190}
5191#endif 5468#endif
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ff7027199a9a..925f629658d6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1689,22 +1689,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1689 if (!cpu_buffer->nr_pages_to_update) 1689 if (!cpu_buffer->nr_pages_to_update)
1690 continue; 1690 continue;
1691 1691
1692 /* The update must run on the CPU that is being updated. */ 1692 /* Can't run something on an offline CPU. */
1693 preempt_disable(); 1693 if (!cpu_online(cpu)) {
1694 if (cpu == smp_processor_id() || !cpu_online(cpu)) {
1695 rb_update_pages(cpu_buffer); 1694 rb_update_pages(cpu_buffer);
1696 cpu_buffer->nr_pages_to_update = 0; 1695 cpu_buffer->nr_pages_to_update = 0;
1697 } else { 1696 } else {
1698 /*
1699 * Can not disable preemption for schedule_work_on()
1700 * on PREEMPT_RT.
1701 */
1702 preempt_enable();
1703 schedule_work_on(cpu, 1697 schedule_work_on(cpu,
1704 &cpu_buffer->update_pages_work); 1698 &cpu_buffer->update_pages_work);
1705 preempt_disable();
1706 } 1699 }
1707 preempt_enable();
1708 } 1700 }
1709 1701
1710 /* wait for all the updates to complete */ 1702 /* wait for all the updates to complete */
@@ -1742,22 +1734,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1742 1734
1743 get_online_cpus(); 1735 get_online_cpus();
1744 1736
1745 preempt_disable(); 1737 /* Can't run something on an offline CPU. */
1746 /* The update must run on the CPU that is being updated. */ 1738 if (!cpu_online(cpu_id))
1747 if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
1748 rb_update_pages(cpu_buffer); 1739 rb_update_pages(cpu_buffer);
1749 else { 1740 else {
1750 /*
1751 * Can not disable preemption for schedule_work_on()
1752 * on PREEMPT_RT.
1753 */
1754 preempt_enable();
1755 schedule_work_on(cpu_id, 1741 schedule_work_on(cpu_id,
1756 &cpu_buffer->update_pages_work); 1742 &cpu_buffer->update_pages_work);
1757 wait_for_completion(&cpu_buffer->update_done); 1743 wait_for_completion(&cpu_buffer->update_done);
1758 preempt_disable();
1759 } 1744 }
1760 preempt_enable();
1761 1745
1762 cpu_buffer->nr_pages_to_update = 0; 1746 cpu_buffer->nr_pages_to_update = 0;
1763 put_online_cpus(); 1747 put_online_cpus();
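With the preempt_disable() juggling gone, both resize paths follow one rule: if the target CPU is offline there is nothing running there, so update the pages directly; otherwise queue the update with schedule_work_on() and wait for its completion. A loose pthread analogue of that inline-or-hand-off-and-wait shape (the kernel uses workqueues and completions, not threads spawned per call):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static void update_pages(int cpu)
{
	printf("updating buffer pages for cpu %d\n", cpu);
}

struct job { int cpu; };

static void *worker(void *arg)
{
	struct job *j = arg;

	update_pages(j->cpu);	/* stands in for the queued work item */
	return NULL;
}

static void resize_one(int cpu, bool online)
{
	if (!online) {
		/* Nothing can be running there: safe to update directly. */
		update_pages(cpu);
		return;
	}

	/* Hand the update off and wait, as with schedule_work_on()
	 * followed by wait_for_completion(). */
	pthread_t tid;
	struct job j = { .cpu = cpu };

	pthread_create(&tid, NULL, worker, &j);
	pthread_join(&tid, NULL);
}

int main(void)
{
	resize_one(0, true);	/* online: deferred to the worker */
	resize_one(1, false);	/* offline: done inline */
	return 0;
}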
@@ -3775,7 +3759,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3775 if (rb_per_cpu_empty(cpu_buffer)) 3759 if (rb_per_cpu_empty(cpu_buffer))
3776 return NULL; 3760 return NULL;
3777 3761
3778 if (iter->head >= local_read(&iter->head_page->page->commit)) { 3762 if (iter->head >= rb_page_size(iter->head_page)) {
3779 rb_inc_iter(iter); 3763 rb_inc_iter(iter);
3780 goto again; 3764 goto again;
3781 } 3765 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bda9621638cc..8a528392b1f4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -820,11 +820,12 @@ static struct {
820 const char *name; 820 const char *name;
821 int in_ns; /* is this clock in nanoseconds? */ 821 int in_ns; /* is this clock in nanoseconds? */
822} trace_clocks[] = { 822} trace_clocks[] = {
823 { trace_clock_local, "local", 1 }, 823 { trace_clock_local, "local", 1 },
824 { trace_clock_global, "global", 1 }, 824 { trace_clock_global, "global", 1 },
825 { trace_clock_counter, "counter", 0 }, 825 { trace_clock_counter, "counter", 0 },
826 { trace_clock_jiffies, "uptime", 1 }, 826 { trace_clock_jiffies, "uptime", 0 },
827 { trace_clock, "perf", 1 }, 827 { trace_clock, "perf", 1 },
828 { ktime_get_mono_fast_ns, "mono", 1 },
828 ARCH_TRACE_CLOCKS 829 ARCH_TRACE_CLOCKS
829}; 830};
830 831
@@ -937,30 +938,6 @@ out:
937 return ret; 938 return ret;
938} 939}
939 940
940ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
941{
942 int len;
943 int ret;
944
945 if (!cnt)
946 return 0;
947
948 if (s->len <= s->readpos)
949 return -EBUSY;
950
951 len = s->len - s->readpos;
952 if (cnt > len)
953 cnt = len;
954 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
955 if (ret == cnt)
956 return -EFAULT;
957
958 cnt -= ret;
959
960 s->readpos += cnt;
961 return cnt;
962}
963
964static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) 941static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
965{ 942{
966 int len; 943 int len;
@@ -3699,6 +3676,7 @@ static const char readme_msg[] =
3699#endif 3676#endif
3700#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3677#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3701 " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" 3678 " set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
3679 " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n"
3702 " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" 3680 " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
3703#endif 3681#endif
3704#ifdef CONFIG_TRACER_SNAPSHOT 3682#ifdef CONFIG_TRACER_SNAPSHOT
@@ -4238,10 +4216,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
4238} 4216}
4239 4217
4240static ssize_t 4218static ssize_t
4241tracing_max_lat_read(struct file *filp, char __user *ubuf, 4219tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
4242 size_t cnt, loff_t *ppos) 4220 size_t cnt, loff_t *ppos)
4243{ 4221{
4244 unsigned long *ptr = filp->private_data;
4245 char buf[64]; 4222 char buf[64];
4246 int r; 4223 int r;
4247 4224
@@ -4253,10 +4230,9 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf,
4253} 4230}
4254 4231
4255static ssize_t 4232static ssize_t
4256tracing_max_lat_write(struct file *filp, const char __user *ubuf, 4233tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
4257 size_t cnt, loff_t *ppos) 4234 size_t cnt, loff_t *ppos)
4258{ 4235{
4259 unsigned long *ptr = filp->private_data;
4260 unsigned long val; 4236 unsigned long val;
4261 int ret; 4237 int ret;
4262 4238
@@ -4269,6 +4245,52 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
4269 return cnt; 4245 return cnt;
4270} 4246}
4271 4247
4248static ssize_t
4249tracing_thresh_read(struct file *filp, char __user *ubuf,
4250 size_t cnt, loff_t *ppos)
4251{
4252 return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos);
4253}
4254
4255static ssize_t
4256tracing_thresh_write(struct file *filp, const char __user *ubuf,
4257 size_t cnt, loff_t *ppos)
4258{
4259 struct trace_array *tr = filp->private_data;
4260 int ret;
4261
4262 mutex_lock(&trace_types_lock);
4263 ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos);
4264 if (ret < 0)
4265 goto out;
4266
4267 if (tr->current_trace->update_thresh) {
4268 ret = tr->current_trace->update_thresh(tr);
4269 if (ret < 0)
4270 goto out;
4271 }
4272
4273 ret = cnt;
4274out:
4275 mutex_unlock(&trace_types_lock);
4276
4277 return ret;
4278}
4279
4280static ssize_t
4281tracing_max_lat_read(struct file *filp, char __user *ubuf,
4282 size_t cnt, loff_t *ppos)
4283{
4284 return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos);
4285}
4286
4287static ssize_t
4288tracing_max_lat_write(struct file *filp, const char __user *ubuf,
4289 size_t cnt, loff_t *ppos)
4290{
4291 return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos);
4292}
4293
4272static int tracing_open_pipe(struct inode *inode, struct file *filp) 4294static int tracing_open_pipe(struct inode *inode, struct file *filp)
4273{ 4295{
4274 struct trace_array *tr = inode->i_private; 4296 struct trace_array *tr = inode->i_private;
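The tracing_thresh file now shares the generic nanosecond read/write helpers with tracing_max_latency and, under trace_types_lock, gives the current tracer a chance to react through the new optional update_thresh() callback. A small sketch of that optional-hook shape with illustrative names (only update_thresh itself comes from the diff):

#include <stdio.h>

struct tracer_ops {
	/* optional: tracers that cache the threshold implement this */
	int (*update_thresh)(void *tr);
};

static unsigned long tracing_thresh_ns;

static int set_thresh(struct tracer_ops *ops, void *tr, unsigned long ns)
{
	tracing_thresh_ns = ns;		/* the shared nsecs write */

	if (ops->update_thresh)		/* hook is optional */
		return ops->update_thresh(tr);
	return 0;
}

static int demo_update(void *tr)
{
	(void)tr;
	printf("tracer picked up thresh=%lu ns\n", tracing_thresh_ns);
	return 0;
}

int main(void)
{
	struct tracer_ops with_hook = { .update_thresh = demo_update };
	struct tracer_ops without_hook = { 0 };

	set_thresh(&with_hook, NULL, 1000);
	set_thresh(&without_hook, NULL, 2000);	/* accepted silently */
	return 0;
}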
@@ -5170,6 +5192,13 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp)
5170#endif /* CONFIG_TRACER_SNAPSHOT */ 5192#endif /* CONFIG_TRACER_SNAPSHOT */
5171 5193
5172 5194
5195static const struct file_operations tracing_thresh_fops = {
5196 .open = tracing_open_generic,
5197 .read = tracing_thresh_read,
5198 .write = tracing_thresh_write,
5199 .llseek = generic_file_llseek,
5200};
5201
5173static const struct file_operations tracing_max_lat_fops = { 5202static const struct file_operations tracing_max_lat_fops = {
5174 .open = tracing_open_generic, 5203 .open = tracing_open_generic,
5175 .read = tracing_max_lat_read, 5204 .read = tracing_max_lat_read,
@@ -6107,10 +6136,8 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
6107 if (!topts) 6136 if (!topts)
6108 return; 6137 return;
6109 6138
6110 for (cnt = 0; topts[cnt].opt; cnt++) { 6139 for (cnt = 0; topts[cnt].opt; cnt++)
6111 if (topts[cnt].entry) 6140 debugfs_remove(topts[cnt].entry);
6112 debugfs_remove(topts[cnt].entry);
6113 }
6114 6141
6115 kfree(topts); 6142 kfree(topts);
6116} 6143}
@@ -6533,7 +6560,7 @@ static __init int tracer_init_debugfs(void)
6533 init_tracer_debugfs(&global_trace, d_tracer); 6560 init_tracer_debugfs(&global_trace, d_tracer);
6534 6561
6535 trace_create_file("tracing_thresh", 0644, d_tracer, 6562 trace_create_file("tracing_thresh", 0644, d_tracer,
6536 &tracing_thresh, &tracing_max_lat_fops); 6563 &global_trace, &tracing_thresh_fops);
6537 6564
6538 trace_create_file("README", 0444, d_tracer, 6565 trace_create_file("README", 0444, d_tracer,
6539 NULL, &tracing_readme_fops); 6566 NULL, &tracing_readme_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9258f5a815db..385391fb1d3b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -339,6 +339,7 @@ struct tracer_flags {
339 * @reset: called when one switches to another tracer 339 * @reset: called when one switches to another tracer
340 * @start: called when tracing is unpaused (echo 1 > tracing_enabled) 340 * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
341 * @stop: called when tracing is paused (echo 0 > tracing_enabled) 341 * @stop: called when tracing is paused (echo 0 > tracing_enabled)
342 * @update_thresh: called when tracing_thresh is updated
342 * @open: called when the trace file is opened 343 * @open: called when the trace file is opened
343 * @pipe_open: called when the trace_pipe file is opened 344 * @pipe_open: called when the trace_pipe file is opened
344 * @close: called when the trace file is released 345 * @close: called when the trace file is released
@@ -357,6 +358,7 @@ struct tracer {
357 void (*reset)(struct trace_array *tr); 358 void (*reset)(struct trace_array *tr);
358 void (*start)(struct trace_array *tr); 359 void (*start)(struct trace_array *tr);
359 void (*stop)(struct trace_array *tr); 360 void (*stop)(struct trace_array *tr);
361 int (*update_thresh)(struct trace_array *tr);
360 void (*open)(struct trace_iterator *iter); 362 void (*open)(struct trace_iterator *iter);
361 void (*pipe_open)(struct trace_iterator *iter); 363 void (*pipe_open)(struct trace_iterator *iter);
362 void (*close)(struct trace_iterator *iter); 364 void (*close)(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 26dc348332b7..57b67b1f24d1 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -59,13 +59,14 @@ u64 notrace trace_clock(void)
59 59
60/* 60/*
61 * trace_jiffy_clock(): Simply use jiffies as a clock counter. 61 * trace_jiffy_clock(): Simply use jiffies as a clock counter.
62 * Note that this use of jiffies_64 is not completely safe on
63 * 32-bit systems. But the window is tiny, and the effect if
64 * we are affected is that we will have an obviously bogus
65 * timestamp on a trace event - i.e. not life threatening.
62 */ 66 */
63u64 notrace trace_clock_jiffies(void) 67u64 notrace trace_clock_jiffies(void)
64{ 68{
65 u64 jiffy = jiffies - INITIAL_JIFFIES; 69 return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
66
67 /* Return nsecs */
68 return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
69} 70}
70 71
71/* 72/*
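The uptime clock stops pretending to be in nanoseconds: trace_clock_jiffies() now returns clock_t ticks derived from jiffies_64, and the trace_clocks table earlier in this diff flips its in_ns field to 0 to match. A rough userspace model of the before and after, assuming HZ=1000 and USER_HZ=100 (jiffies_64_to_clock_t() handles more configurations than this):

#include <stdint.h>
#include <stdio.h>

#define HZ	1000	/* assumed kernel tick rate */
#define USER_HZ	100	/* assumed clock_t rate */

/* Old behaviour: jiffies -> usecs -> a value labelled nanoseconds. */
static uint64_t jiffies_as_ns(uint64_t jiffies)
{
	return jiffies * (1000000ULL / HZ) * 1000ULL;
}

/* New behaviour, simplified to the HZ-divisible-by-USER_HZ case. */
static uint64_t jiffies_as_clock_t(uint64_t jiffies)
{
	return jiffies / (HZ / USER_HZ);
}

int main(void)
{
	uint64_t j = 2500;	/* 2.5 s of uptime at HZ=1000 */

	printf("old: %llu\n", (unsigned long long)jiffies_as_ns(j));
	printf("new: %llu\n", (unsigned long long)jiffies_as_clock_t(j));
	return 0;
}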
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 5d12bb407b44..4b9c114ee9de 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -30,6 +30,18 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
30 return ret; 30 return ret;
31 } 31 }
32 32
33 /*
34 * We checked and allowed to create parent,
35 * allow children without checking.
36 */
37 if (p_event->parent)
38 return 0;
39
40 /*
41 * It's ok to check current process (owner) permissions in here,
42 * because code below is called only via perf_event_open syscall.
43 */
44
33 /* The ftrace function trace is allowed only for root. */ 45 /* The ftrace function trace is allowed only for root. */
34 if (ftrace_event_is_function(tp_event)) { 46 if (ftrace_event_is_function(tp_event)) {
35 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) 47 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2de53628689f..ef06ce7e9cf8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,6 +8,8 @@
8 * 8 *
9 */ 9 */
10 10
11#define pr_fmt(fmt) fmt
12
11#include <linux/workqueue.h> 13#include <linux/workqueue.h>
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
13#include <linux/kthread.h> 15#include <linux/kthread.h>
@@ -1491,7 +1493,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1491 1493
1492 dir->entry = debugfs_create_dir(name, parent); 1494 dir->entry = debugfs_create_dir(name, parent);
1493 if (!dir->entry) { 1495 if (!dir->entry) {
1494 pr_warning("Failed to create system directory %s\n", name); 1496 pr_warn("Failed to create system directory %s\n", name);
1495 __put_system(system); 1497 __put_system(system);
1496 goto out_free; 1498 goto out_free;
1497 } 1499 }
@@ -1507,7 +1509,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1507 if (!entry) { 1509 if (!entry) {
1508 kfree(system->filter); 1510 kfree(system->filter);
1509 system->filter = NULL; 1511 system->filter = NULL;
1510 pr_warning("Could not create debugfs '%s/filter' entry\n", name); 1512 pr_warn("Could not create debugfs '%s/filter' entry\n", name);
1511 } 1513 }
1512 1514
1513 trace_create_file("enable", 0644, dir->entry, dir, 1515 trace_create_file("enable", 0644, dir->entry, dir,
@@ -1522,8 +1524,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1522 out_fail: 1524 out_fail:
1523 /* Only print this message if failed on memory allocation */ 1525 /* Only print this message if failed on memory allocation */
1524 if (!dir || !system) 1526 if (!dir || !system)
1525 pr_warning("No memory to create event subsystem %s\n", 1527 pr_warn("No memory to create event subsystem %s\n", name);
1526 name);
1527 return NULL; 1528 return NULL;
1528} 1529}
1529 1530
@@ -1551,8 +1552,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1551 name = ftrace_event_name(call); 1552 name = ftrace_event_name(call);
1552 file->dir = debugfs_create_dir(name, d_events); 1553 file->dir = debugfs_create_dir(name, d_events);
1553 if (!file->dir) { 1554 if (!file->dir) {
1554 pr_warning("Could not create debugfs '%s' directory\n", 1555 pr_warn("Could not create debugfs '%s' directory\n", name);
1555 name);
1556 return -1; 1556 return -1;
1557 } 1557 }
1558 1558
@@ -1575,8 +1575,8 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1575 if (list_empty(head)) { 1575 if (list_empty(head)) {
1576 ret = call->class->define_fields(call); 1576 ret = call->class->define_fields(call);
1577 if (ret < 0) { 1577 if (ret < 0) {
1578 pr_warning("Could not initialize trace point" 1578 pr_warn("Could not initialize trace point events/%s\n",
1579 " events/%s\n", name); 1579 name);
1580 return -1; 1580 return -1;
1581 } 1581 }
1582 } 1582 }
@@ -1621,7 +1621,6 @@ static void event_remove(struct ftrace_event_call *call)
1621 if (file->event_call != call) 1621 if (file->event_call != call)
1622 continue; 1622 continue;
1623 ftrace_event_enable_disable(file, 0); 1623 ftrace_event_enable_disable(file, 0);
1624 destroy_preds(file);
1625 /* 1624 /*
1626 * The do_for_each_event_file() is 1625 * The do_for_each_event_file() is
1627 * a double loop. After finding the call for this 1626 * a double loop. After finding the call for this
@@ -1649,8 +1648,7 @@ static int event_init(struct ftrace_event_call *call)
1649 if (call->class->raw_init) { 1648 if (call->class->raw_init) {
1650 ret = call->class->raw_init(call); 1649 ret = call->class->raw_init(call);
1651 if (ret < 0 && ret != -ENOSYS) 1650 if (ret < 0 && ret != -ENOSYS)
1652 pr_warn("Could not initialize trace events/%s\n", 1651 pr_warn("Could not initialize trace events/%s\n", name);
1653 name);
1654 } 1652 }
1655 1653
1656 return ret; 1654 return ret;
@@ -1749,7 +1747,8 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
1749{ 1747{
1750 event_remove(call); 1748 event_remove(call);
1751 trace_destroy_fields(call); 1749 trace_destroy_fields(call);
1752 destroy_call_preds(call); 1750 free_event_filter(call->filter);
1751 call->filter = NULL;
1753} 1752}
1754 1753
1755static int probe_remove_event_call(struct ftrace_event_call *call) 1754static int probe_remove_event_call(struct ftrace_event_call *call)
@@ -1895,8 +1894,8 @@ __trace_add_event_dirs(struct trace_array *tr)
1895 list_for_each_entry(call, &ftrace_events, list) { 1894 list_for_each_entry(call, &ftrace_events, list) {
1896 ret = __trace_add_new_event(call, tr); 1895 ret = __trace_add_new_event(call, tr);
1897 if (ret < 0) 1896 if (ret < 0)
1898 pr_warning("Could not create directory for event %s\n", 1897 pr_warn("Could not create directory for event %s\n",
1899 ftrace_event_name(call)); 1898 ftrace_event_name(call));
1900 } 1899 }
1901} 1900}
1902 1901
@@ -2208,8 +2207,8 @@ __trace_early_add_event_dirs(struct trace_array *tr)
2208 list_for_each_entry(file, &tr->events, list) { 2207 list_for_each_entry(file, &tr->events, list) {
2209 ret = event_create_dir(tr->event_dir, file); 2208 ret = event_create_dir(tr->event_dir, file);
2210 if (ret < 0) 2209 if (ret < 0)
2211 pr_warning("Could not create directory for event %s\n", 2210 pr_warn("Could not create directory for event %s\n",
2212 ftrace_event_name(file->event_call)); 2211 ftrace_event_name(file->event_call));
2213 } 2212 }
2214} 2213}
2215 2214
@@ -2232,8 +2231,8 @@ __trace_early_add_events(struct trace_array *tr)
2232 2231
2233 ret = __trace_early_add_new_event(call, tr); 2232 ret = __trace_early_add_new_event(call, tr);
2234 if (ret < 0) 2233 if (ret < 0)
2235 pr_warning("Could not create early event %s\n", 2234 pr_warn("Could not create early event %s\n",
2236 ftrace_event_name(call)); 2235 ftrace_event_name(call));
2237 } 2236 }
2238} 2237}
2239 2238
@@ -2280,13 +2279,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
2280 entry = debugfs_create_file("set_event", 0644, parent, 2279 entry = debugfs_create_file("set_event", 0644, parent,
2281 tr, &ftrace_set_event_fops); 2280 tr, &ftrace_set_event_fops);
2282 if (!entry) { 2281 if (!entry) {
2283 pr_warning("Could not create debugfs 'set_event' entry\n"); 2282 pr_warn("Could not create debugfs 'set_event' entry\n");
2284 return -ENOMEM; 2283 return -ENOMEM;
2285 } 2284 }
2286 2285
2287 d_events = debugfs_create_dir("events", parent); 2286 d_events = debugfs_create_dir("events", parent);
2288 if (!d_events) { 2287 if (!d_events) {
2289 pr_warning("Could not create debugfs 'events' directory\n"); 2288 pr_warn("Could not create debugfs 'events' directory\n");
2290 return -ENOMEM; 2289 return -ENOMEM;
2291 } 2290 }
2292 2291
@@ -2462,11 +2461,10 @@ static __init int event_trace_init(void)
2462 entry = debugfs_create_file("available_events", 0444, d_tracer, 2461 entry = debugfs_create_file("available_events", 0444, d_tracer,
2463 tr, &ftrace_avail_fops); 2462 tr, &ftrace_avail_fops);
2464 if (!entry) 2463 if (!entry)
2465 pr_warning("Could not create debugfs " 2464 pr_warn("Could not create debugfs 'available_events' entry\n");
2466 "'available_events' entry\n");
2467 2465
2468 if (trace_define_common_fields()) 2466 if (trace_define_common_fields())
2469 pr_warning("tracing: Failed to allocate common fields"); 2467 pr_warn("tracing: Failed to allocate common fields");
2470 2468
2471 ret = early_event_add_tracer(d_tracer, tr); 2469 ret = early_event_add_tracer(d_tracer, tr);
2472 if (ret) 2470 if (ret)
@@ -2475,7 +2473,7 @@ static __init int event_trace_init(void)
2475#ifdef CONFIG_MODULES 2473#ifdef CONFIG_MODULES
2476 ret = register_module_notifier(&trace_module_nb); 2474 ret = register_module_notifier(&trace_module_nb);
2477 if (ret) 2475 if (ret)
2478 pr_warning("Failed to register trace events module notifier\n"); 2476 pr_warn("Failed to register trace events module notifier\n");
2479#endif 2477#endif
2480 return 0; 2478 return 0;
2481} 2479}
@@ -2579,7 +2577,7 @@ static __init void event_trace_self_tests(void)
2579 * it and the self test should not be on. 2577 * it and the self test should not be on.
2580 */ 2578 */
2581 if (file->flags & FTRACE_EVENT_FL_ENABLED) { 2579 if (file->flags & FTRACE_EVENT_FL_ENABLED) {
2582 pr_warning("Enabled event during self test!\n"); 2580 pr_warn("Enabled event during self test!\n");
2583 WARN_ON_ONCE(1); 2581 WARN_ON_ONCE(1);
2584 continue; 2582 continue;
2585 } 2583 }
@@ -2607,8 +2605,8 @@ static __init void event_trace_self_tests(void)
2607 2605
2608 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); 2606 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
2609 if (WARN_ON_ONCE(ret)) { 2607 if (WARN_ON_ONCE(ret)) {
2610 pr_warning("error enabling system %s\n", 2608 pr_warn("error enabling system %s\n",
2611 system->name); 2609 system->name);
2612 continue; 2610 continue;
2613 } 2611 }
2614 2612
@@ -2616,8 +2614,8 @@ static __init void event_trace_self_tests(void)
2616 2614
2617 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); 2615 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
2618 if (WARN_ON_ONCE(ret)) { 2616 if (WARN_ON_ONCE(ret)) {
2619 pr_warning("error disabling system %s\n", 2617 pr_warn("error disabling system %s\n",
2620 system->name); 2618 system->name);
2621 continue; 2619 continue;
2622 } 2620 }
2623 2621
@@ -2631,7 +2629,7 @@ static __init void event_trace_self_tests(void)
2631 2629
2632 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); 2630 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
2633 if (WARN_ON_ONCE(ret)) { 2631 if (WARN_ON_ONCE(ret)) {
2634 pr_warning("error enabling all events\n"); 2632 pr_warn("error enabling all events\n");
2635 return; 2633 return;
2636 } 2634 }
2637 2635
@@ -2640,7 +2638,7 @@ static __init void event_trace_self_tests(void)
2640 /* reset sysname */ 2638 /* reset sysname */
2641 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); 2639 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
2642 if (WARN_ON_ONCE(ret)) { 2640 if (WARN_ON_ONCE(ret)) {
2643 pr_warning("error disabling all events\n"); 2641 pr_warn("error disabling all events\n");
2644 return; 2642 return;
2645 } 2643 }
2646 2644
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8a8631926a07..7a8c1528e141 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -774,17 +774,12 @@ static void __free_preds(struct event_filter *filter)
774 filter->n_preds = 0; 774 filter->n_preds = 0;
775} 775}
776 776
777static void call_filter_disable(struct ftrace_event_call *call)
778{
779 call->flags &= ~TRACE_EVENT_FL_FILTERED;
780}
781
782static void filter_disable(struct ftrace_event_file *file) 777static void filter_disable(struct ftrace_event_file *file)
783{ 778{
784 struct ftrace_event_call *call = file->event_call; 779 struct ftrace_event_call *call = file->event_call;
785 780
786 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) 781 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
787 call_filter_disable(call); 782 call->flags &= ~TRACE_EVENT_FL_FILTERED;
788 else 783 else
789 file->flags &= ~FTRACE_EVENT_FL_FILTERED; 784 file->flags &= ~FTRACE_EVENT_FL_FILTERED;
790} 785}
@@ -804,32 +799,6 @@ void free_event_filter(struct event_filter *filter)
804 __free_filter(filter); 799 __free_filter(filter);
805} 800}
806 801
807void destroy_call_preds(struct ftrace_event_call *call)
808{
809 __free_filter(call->filter);
810 call->filter = NULL;
811}
812
813static void destroy_file_preds(struct ftrace_event_file *file)
814{
815 __free_filter(file->filter);
816 file->filter = NULL;
817}
818
819/*
820 * Called when destroying the ftrace_event_file.
821 * The file is being freed, so we do not need to worry about
822 * the file being currently used. This is for module code removing
823 * the tracepoints from within it.
824 */
825void destroy_preds(struct ftrace_event_file *file)
826{
827 if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
828 destroy_call_preds(file->event_call);
829 else
830 destroy_file_preds(file);
831}
832
833static struct event_filter *__alloc_filter(void) 802static struct event_filter *__alloc_filter(void)
834{ 803{
835 struct event_filter *filter; 804 struct event_filter *filter;
@@ -873,17 +842,14 @@ static inline void __remove_filter(struct ftrace_event_file *file)
873 remove_filter_string(file->filter); 842 remove_filter_string(file->filter);
874} 843}
875 844
876static void filter_free_subsystem_preds(struct event_subsystem *system, 845static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
877 struct trace_array *tr) 846 struct trace_array *tr)
878{ 847{
879 struct ftrace_event_file *file; 848 struct ftrace_event_file *file;
880 struct ftrace_event_call *call;
881 849
882 list_for_each_entry(file, &tr->events, list) { 850 list_for_each_entry(file, &tr->events, list) {
883 call = file->event_call; 851 if (file->system != dir)
884 if (strcmp(call->class->system, system->name) != 0)
885 continue; 852 continue;
886
887 __remove_filter(file); 853 __remove_filter(file);
888 } 854 }
889} 855}
@@ -901,15 +867,13 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file)
901 } 867 }
902} 868}
903 869
904static void filter_free_subsystem_filters(struct event_subsystem *system, 870static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir,
905 struct trace_array *tr) 871 struct trace_array *tr)
906{ 872{
907 struct ftrace_event_file *file; 873 struct ftrace_event_file *file;
908 struct ftrace_event_call *call;
909 874
910 list_for_each_entry(file, &tr->events, list) { 875 list_for_each_entry(file, &tr->events, list) {
911 call = file->event_call; 876 if (file->system != dir)
912 if (strcmp(call->class->system, system->name) != 0)
913 continue; 877 continue;
914 __free_subsystem_filter(file); 878 __free_subsystem_filter(file);
915 } 879 }
@@ -1582,7 +1546,6 @@ static int fold_pred_tree(struct event_filter *filter,
1582static int replace_preds(struct ftrace_event_call *call, 1546static int replace_preds(struct ftrace_event_call *call,
1583 struct event_filter *filter, 1547 struct event_filter *filter,
1584 struct filter_parse_state *ps, 1548 struct filter_parse_state *ps,
1585 char *filter_string,
1586 bool dry_run) 1549 bool dry_run)
1587{ 1550{
1588 char *operand1 = NULL, *operand2 = NULL; 1551 char *operand1 = NULL, *operand2 = NULL;
@@ -1755,13 +1718,12 @@ struct filter_list {
1755 struct event_filter *filter; 1718 struct event_filter *filter;
1756}; 1719};
1757 1720
1758static int replace_system_preds(struct event_subsystem *system, 1721static int replace_system_preds(struct ftrace_subsystem_dir *dir,
1759 struct trace_array *tr, 1722 struct trace_array *tr,
1760 struct filter_parse_state *ps, 1723 struct filter_parse_state *ps,
1761 char *filter_string) 1724 char *filter_string)
1762{ 1725{
1763 struct ftrace_event_file *file; 1726 struct ftrace_event_file *file;
1764 struct ftrace_event_call *call;
1765 struct filter_list *filter_item; 1727 struct filter_list *filter_item;
1766 struct filter_list *tmp; 1728 struct filter_list *tmp;
1767 LIST_HEAD(filter_list); 1729 LIST_HEAD(filter_list);
@@ -1769,15 +1731,14 @@ static int replace_system_preds(struct event_subsystem *system,
1769 int err; 1731 int err;
1770 1732
1771 list_for_each_entry(file, &tr->events, list) { 1733 list_for_each_entry(file, &tr->events, list) {
1772 call = file->event_call; 1734 if (file->system != dir)
1773 if (strcmp(call->class->system, system->name) != 0)
1774 continue; 1735 continue;
1775 1736
1776 /* 1737 /*
1777 * Try to see if the filter can be applied 1738 * Try to see if the filter can be applied
1778 * (filter arg is ignored on dry_run) 1739 * (filter arg is ignored on dry_run)
1779 */ 1740 */
1780 err = replace_preds(call, NULL, ps, filter_string, true); 1741 err = replace_preds(file->event_call, NULL, ps, true);
1781 if (err) 1742 if (err)
1782 event_set_no_set_filter_flag(file); 1743 event_set_no_set_filter_flag(file);
1783 else 1744 else
@@ -1787,9 +1748,7 @@ static int replace_system_preds(struct event_subsystem *system,
1787 list_for_each_entry(file, &tr->events, list) { 1748 list_for_each_entry(file, &tr->events, list) {
1788 struct event_filter *filter; 1749 struct event_filter *filter;
1789 1750
1790 call = file->event_call; 1751 if (file->system != dir)
1791
1792 if (strcmp(call->class->system, system->name) != 0)
1793 continue; 1752 continue;
1794 1753
1795 if (event_no_set_filter_flag(file)) 1754 if (event_no_set_filter_flag(file))
@@ -1811,7 +1770,7 @@ static int replace_system_preds(struct event_subsystem *system,
1811 if (err) 1770 if (err)
1812 goto fail_mem; 1771 goto fail_mem;
1813 1772
1814 err = replace_preds(call, filter, ps, filter_string, false); 1773 err = replace_preds(file->event_call, filter, ps, false);
1815 if (err) { 1774 if (err) {
1816 filter_disable(file); 1775 filter_disable(file);
1817 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1776 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
@@ -1933,7 +1892,7 @@ static int create_filter(struct ftrace_event_call *call,
1933 1892
1934 err = create_filter_start(filter_str, set_str, &ps, &filter); 1893 err = create_filter_start(filter_str, set_str, &ps, &filter);
1935 if (!err) { 1894 if (!err) {
1936 err = replace_preds(call, filter, ps, filter_str, false); 1895 err = replace_preds(call, filter, ps, false);
1937 if (err && set_str) 1896 if (err && set_str)
1938 append_filter_err(ps, filter); 1897 append_filter_err(ps, filter);
1939 } 1898 }
@@ -1959,7 +1918,7 @@ int create_event_filter(struct ftrace_event_call *call,
1959 * Identical to create_filter() except that it creates a subsystem filter 1918 * Identical to create_filter() except that it creates a subsystem filter
1960 * and always remembers @filter_str. 1919 * and always remembers @filter_str.
1961 */ 1920 */
1962static int create_system_filter(struct event_subsystem *system, 1921static int create_system_filter(struct ftrace_subsystem_dir *dir,
1963 struct trace_array *tr, 1922 struct trace_array *tr,
1964 char *filter_str, struct event_filter **filterp) 1923 char *filter_str, struct event_filter **filterp)
1965{ 1924{
@@ -1969,7 +1928,7 @@ static int create_system_filter(struct event_subsystem *system,
1969 1928
1970 err = create_filter_start(filter_str, true, &ps, &filter); 1929 err = create_filter_start(filter_str, true, &ps, &filter);
1971 if (!err) { 1930 if (!err) {
1972 err = replace_system_preds(system, tr, ps, filter_str); 1931 err = replace_system_preds(dir, tr, ps, filter_str);
1973 if (!err) { 1932 if (!err) {
1974 /* System filters just show a default message */ 1933 /* System filters just show a default message */
1975 kfree(filter->filter_string); 1934 kfree(filter->filter_string);
@@ -2053,18 +2012,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
2053 } 2012 }
2054 2013
2055 if (!strcmp(strstrip(filter_string), "0")) { 2014 if (!strcmp(strstrip(filter_string), "0")) {
2056 filter_free_subsystem_preds(system, tr); 2015 filter_free_subsystem_preds(dir, tr);
2057 remove_filter_string(system->filter); 2016 remove_filter_string(system->filter);
2058 filter = system->filter; 2017 filter = system->filter;
2059 system->filter = NULL; 2018 system->filter = NULL;
2060 /* Ensure all filters are no longer used */ 2019 /* Ensure all filters are no longer used */
2061 synchronize_sched(); 2020 synchronize_sched();
2062 filter_free_subsystem_filters(system, tr); 2021 filter_free_subsystem_filters(dir, tr);
2063 __free_filter(filter); 2022 __free_filter(filter);
2064 goto out_unlock; 2023 goto out_unlock;
2065 } 2024 }
2066 2025
2067 err = create_system_filter(system, tr, filter_string, &filter); 2026 err = create_system_filter(dir, tr, filter_string, &filter);
2068 if (filter) { 2027 if (filter) {
2069 /* 2028 /*
2070 * No event actually uses the system filter 2029 * No event actually uses the system filter
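For context: the trace_events_filter.c hunks above drop the per-event strcmp() on the subsystem name and instead compare the event file's subsystem directory pointer against the one being configured. A minimal stand-alone sketch of that pattern (types and names here are hypothetical, reduced to just the fields the comparison needs):

/* Match events to a subsystem by pointer identity instead of by name. */
struct subsys_dir { const char *name; };

struct event_file {
	struct subsys_dir *system;	/* directory this event was registered under */
	struct event_file *next;
};

static int count_subsys_events(struct event_file *head, struct subsys_dir *dir)
{
	struct event_file *file;
	int n = 0;

	for (file = head; file; file = file->next) {
		if (file->system != dir)	/* was: strcmp() on the system name */
			continue;
		n++;
	}
	return n;
}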
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4de3e57f723c..f0a0c982cde3 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,6 +15,33 @@
15#include "trace.h" 15#include "trace.h"
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18static bool kill_ftrace_graph;
19
20/**
21 * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
22 *
23 * ftrace_graph_stop() is called when a severe error is detected in
 24 * function graph tracing. This function is called by the critical
25 * paths of function graph to keep those paths from doing any more harm.
26 */
27bool ftrace_graph_is_dead(void)
28{
29 return kill_ftrace_graph;
30}
31
32/**
 33 * ftrace_graph_stop - set to permanently disable function graph tracing
34 *
 35 * In case of an error in function graph tracing, this is called
36 * to try to keep function graph tracing from causing any more harm.
37 * Usually this is pretty severe and this is called to try to at least
38 * get a warning out to the user.
39 */
40void ftrace_graph_stop(void)
41{
42 kill_ftrace_graph = true;
43}
44
18/* When set, irq functions will be ignored */ 45/* When set, irq functions will be ignored */
19static int ftrace_graph_skip_irqs; 46static int ftrace_graph_skip_irqs;
20 47
@@ -92,6 +119,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
92 unsigned long long calltime; 119 unsigned long long calltime;
93 int index; 120 int index;
94 121
122 if (unlikely(ftrace_graph_is_dead()))
123 return -EBUSY;
124
95 if (!current->ret_stack) 125 if (!current->ret_stack)
96 return -EBUSY; 126 return -EBUSY;
97 127
@@ -323,7 +353,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
323 return ret; 353 return ret;
324} 354}
325 355
326int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) 356static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
327{ 357{
328 if (tracing_thresh) 358 if (tracing_thresh)
329 return 1; 359 return 1;
@@ -412,7 +442,7 @@ void set_graph_array(struct trace_array *tr)
412 smp_mb(); 442 smp_mb();
413} 443}
414 444
415void trace_graph_thresh_return(struct ftrace_graph_ret *trace) 445static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
416{ 446{
417 if (tracing_thresh && 447 if (tracing_thresh &&
418 (trace->rettime - trace->calltime < tracing_thresh)) 448 (trace->rettime - trace->calltime < tracing_thresh))
@@ -445,6 +475,12 @@ static void graph_trace_reset(struct trace_array *tr)
445 unregister_ftrace_graph(); 475 unregister_ftrace_graph();
446} 476}
447 477
478static int graph_trace_update_thresh(struct trace_array *tr)
479{
480 graph_trace_reset(tr);
481 return graph_trace_init(tr);
482}
483
448static int max_bytes_for_cpu; 484static int max_bytes_for_cpu;
449 485
450static enum print_line_t 486static enum print_line_t
@@ -1399,7 +1435,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1399 seq_printf(s, " | | | |\n"); 1435 seq_printf(s, " | | | |\n");
1400} 1436}
1401 1437
1402void print_graph_headers(struct seq_file *s) 1438static void print_graph_headers(struct seq_file *s)
1403{ 1439{
1404 print_graph_headers_flags(s, tracer_flags.val); 1440 print_graph_headers_flags(s, tracer_flags.val);
1405} 1441}
@@ -1495,6 +1531,7 @@ static struct trace_event graph_trace_ret_event = {
1495 1531
1496static struct tracer graph_trace __tracer_data = { 1532static struct tracer graph_trace __tracer_data = {
1497 .name = "function_graph", 1533 .name = "function_graph",
1534 .update_thresh = graph_trace_update_thresh,
1498 .open = graph_trace_open, 1535 .open = graph_trace_open,
1499 .pipe_open = graph_trace_open, 1536 .pipe_open = graph_trace_open,
1500 .close = graph_trace_close, 1537 .close = graph_trace_close,
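The new kill_ftrace_graph flag added above is a one-way kill switch: ftrace_graph_stop() latches it on a fatal error, and ftrace_push_return_trace() refuses further work with -EBUSY once ftrace_graph_is_dead() reports it. A self-contained sketch of the same pattern outside the kernel (names are hypothetical):

#include <errno.h>
#include <stdbool.h>

static bool tracer_dead;		/* latched on fatal error, never cleared */

static bool tracer_is_dead(void)
{
	return tracer_dead;
}

static void tracer_stop(void)
{
	tracer_dead = true;		/* permanently disable the hot path */
}

/* Hot path: bail out early once the tracer has been killed. */
static int push_trace_entry(void)
{
	if (tracer_is_dead())
		return -EBUSY;
	/* ... record the function entry ... */
	return 0;
}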
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index f3dad80c20b2..c6977d5a9b12 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -20,23 +20,6 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
20 20
21static int next_event_type = __TRACE_LAST_TYPE + 1; 21static int next_event_type = __TRACE_LAST_TYPE + 1;
22 22
23int trace_print_seq(struct seq_file *m, struct trace_seq *s)
24{
25 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
26 int ret;
27
28 ret = seq_write(m, s->buffer, len);
29
30 /*
31 * Only reset this buffer if we successfully wrote to the
32 * seq_file buffer.
33 */
34 if (!ret)
35 trace_seq_init(s);
36
37 return ret;
38}
39
40enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) 23enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
41{ 24{
42 struct trace_seq *s = &iter->seq; 25 struct trace_seq *s = &iter->seq;
@@ -85,257 +68,6 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
85 return TRACE_TYPE_HANDLED; 68 return TRACE_TYPE_HANDLED;
86} 69}
87 70
88/**
89 * trace_seq_printf - sequence printing of trace information
90 * @s: trace sequence descriptor
91 * @fmt: printf format string
92 *
93 * It returns 0 if the trace oversizes the buffer's free
94 * space, 1 otherwise.
95 *
96 * The tracer may use either sequence operations or its own
97 * copy to user routines. To simplify formating of a trace
98 * trace_seq_printf is used to store strings into a special
99 * buffer (@s). Then the output may be either used by
100 * the sequencer or pulled into another buffer.
101 */
102int
103trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
104{
105 int len = (PAGE_SIZE - 1) - s->len;
106 va_list ap;
107 int ret;
108
109 if (s->full || !len)
110 return 0;
111
112 va_start(ap, fmt);
113 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
114 va_end(ap);
115
116 /* If we can't write it all, don't bother writing anything */
117 if (ret >= len) {
118 s->full = 1;
119 return 0;
120 }
121
122 s->len += ret;
123
124 return 1;
125}
126EXPORT_SYMBOL_GPL(trace_seq_printf);
127
128/**
129 * trace_seq_bitmask - put a list of longs as a bitmask print output
130 * @s: trace sequence descriptor
131 * @maskp: points to an array of unsigned longs that represent a bitmask
132 * @nmaskbits: The number of bits that are valid in @maskp
133 *
134 * It returns 0 if the trace oversizes the buffer's free
135 * space, 1 otherwise.
136 *
137 * Writes a ASCII representation of a bitmask string into @s.
138 */
139int
140trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
141 int nmaskbits)
142{
143 int len = (PAGE_SIZE - 1) - s->len;
144 int ret;
145
146 if (s->full || !len)
147 return 0;
148
149 ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
150 s->len += ret;
151
152 return 1;
153}
154EXPORT_SYMBOL_GPL(trace_seq_bitmask);
155
156/**
157 * trace_seq_vprintf - sequence printing of trace information
158 * @s: trace sequence descriptor
159 * @fmt: printf format string
160 *
161 * The tracer may use either sequence operations or its own
162 * copy to user routines. To simplify formating of a trace
163 * trace_seq_printf is used to store strings into a special
164 * buffer (@s). Then the output may be either used by
165 * the sequencer or pulled into another buffer.
166 */
167int
168trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
169{
170 int len = (PAGE_SIZE - 1) - s->len;
171 int ret;
172
173 if (s->full || !len)
174 return 0;
175
176 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
177
178 /* If we can't write it all, don't bother writing anything */
179 if (ret >= len) {
180 s->full = 1;
181 return 0;
182 }
183
184 s->len += ret;
185
186 return len;
187}
188EXPORT_SYMBOL_GPL(trace_seq_vprintf);
189
190int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
191{
192 int len = (PAGE_SIZE - 1) - s->len;
193 int ret;
194
195 if (s->full || !len)
196 return 0;
197
198 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
199
200 /* If we can't write it all, don't bother writing anything */
201 if (ret >= len) {
202 s->full = 1;
203 return 0;
204 }
205
206 s->len += ret;
207
208 return len;
209}
210
211/**
212 * trace_seq_puts - trace sequence printing of simple string
213 * @s: trace sequence descriptor
214 * @str: simple string to record
215 *
216 * The tracer may use either the sequence operations or its own
217 * copy to user routines. This function records a simple string
218 * into a special buffer (@s) for later retrieval by a sequencer
219 * or other mechanism.
220 */
221int trace_seq_puts(struct trace_seq *s, const char *str)
222{
223 int len = strlen(str);
224
225 if (s->full)
226 return 0;
227
228 if (len > ((PAGE_SIZE - 1) - s->len)) {
229 s->full = 1;
230 return 0;
231 }
232
233 memcpy(s->buffer + s->len, str, len);
234 s->len += len;
235
236 return len;
237}
238
239int trace_seq_putc(struct trace_seq *s, unsigned char c)
240{
241 if (s->full)
242 return 0;
243
244 if (s->len >= (PAGE_SIZE - 1)) {
245 s->full = 1;
246 return 0;
247 }
248
249 s->buffer[s->len++] = c;
250
251 return 1;
252}
253EXPORT_SYMBOL(trace_seq_putc);
254
255int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
256{
257 if (s->full)
258 return 0;
259
260 if (len > ((PAGE_SIZE - 1) - s->len)) {
261 s->full = 1;
262 return 0;
263 }
264
265 memcpy(s->buffer + s->len, mem, len);
266 s->len += len;
267
268 return len;
269}
270
271int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
272{
273 unsigned char hex[HEX_CHARS];
274 const unsigned char *data = mem;
275 int i, j;
276
277 if (s->full)
278 return 0;
279
280#ifdef __BIG_ENDIAN
281 for (i = 0, j = 0; i < len; i++) {
282#else
283 for (i = len-1, j = 0; i >= 0; i--) {
284#endif
285 hex[j++] = hex_asc_hi(data[i]);
286 hex[j++] = hex_asc_lo(data[i]);
287 }
288 hex[j++] = ' ';
289
290 return trace_seq_putmem(s, hex, j);
291}
292
293void *trace_seq_reserve(struct trace_seq *s, size_t len)
294{
295 void *ret;
296
297 if (s->full)
298 return NULL;
299
300 if (len > ((PAGE_SIZE - 1) - s->len)) {
301 s->full = 1;
302 return NULL;
303 }
304
305 ret = s->buffer + s->len;
306 s->len += len;
307
308 return ret;
309}
310
311int trace_seq_path(struct trace_seq *s, const struct path *path)
312{
313 unsigned char *p;
314
315 if (s->full)
316 return 0;
317
318 if (s->len >= (PAGE_SIZE - 1)) {
319 s->full = 1;
320 return 0;
321 }
322
323 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
324 if (!IS_ERR(p)) {
325 p = mangle_path(s->buffer + s->len, p, "\n");
326 if (p) {
327 s->len = p - s->buffer;
328 return 1;
329 }
330 } else {
331 s->buffer[s->len++] = '?';
332 return 1;
333 }
334
335 s->full = 1;
336 return 0;
337}
338
339const char * 71const char *
340ftrace_print_flags_seq(struct trace_seq *p, const char *delim, 72ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
341 unsigned long flags, 73 unsigned long flags,
@@ -343,7 +75,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
343{ 75{
344 unsigned long mask; 76 unsigned long mask;
345 const char *str; 77 const char *str;
346 const char *ret = p->buffer + p->len; 78 const char *ret = trace_seq_buffer_ptr(p);
347 int i, first = 1; 79 int i, first = 1;
348 80
349 for (i = 0; flag_array[i].name && flags; i++) { 81 for (i = 0; flag_array[i].name && flags; i++) {
@@ -379,7 +111,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
379 const struct trace_print_flags *symbol_array) 111 const struct trace_print_flags *symbol_array)
380{ 112{
381 int i; 113 int i;
382 const char *ret = p->buffer + p->len; 114 const char *ret = trace_seq_buffer_ptr(p);
383 115
384 for (i = 0; symbol_array[i].name; i++) { 116 for (i = 0; symbol_array[i].name; i++) {
385 117
@@ -390,7 +122,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
390 break; 122 break;
391 } 123 }
392 124
393 if (ret == (const char *)(p->buffer + p->len)) 125 if (ret == (const char *)(trace_seq_buffer_ptr(p)))
394 trace_seq_printf(p, "0x%lx", val); 126 trace_seq_printf(p, "0x%lx", val);
395 127
396 trace_seq_putc(p, 0); 128 trace_seq_putc(p, 0);
@@ -405,7 +137,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
405 const struct trace_print_flags_u64 *symbol_array) 137 const struct trace_print_flags_u64 *symbol_array)
406{ 138{
407 int i; 139 int i;
408 const char *ret = p->buffer + p->len; 140 const char *ret = trace_seq_buffer_ptr(p);
409 141
410 for (i = 0; symbol_array[i].name; i++) { 142 for (i = 0; symbol_array[i].name; i++) {
411 143
@@ -416,7 +148,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
416 break; 148 break;
417 } 149 }
418 150
419 if (ret == (const char *)(p->buffer + p->len)) 151 if (ret == (const char *)(trace_seq_buffer_ptr(p)))
420 trace_seq_printf(p, "0x%llx", val); 152 trace_seq_printf(p, "0x%llx", val);
421 153
422 trace_seq_putc(p, 0); 154 trace_seq_putc(p, 0);
@@ -430,7 +162,7 @@ const char *
430ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, 162ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
431 unsigned int bitmask_size) 163 unsigned int bitmask_size)
432{ 164{
433 const char *ret = p->buffer + p->len; 165 const char *ret = trace_seq_buffer_ptr(p);
434 166
435 trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); 167 trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
436 trace_seq_putc(p, 0); 168 trace_seq_putc(p, 0);
@@ -443,7 +175,7 @@ const char *
443ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) 175ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
444{ 176{
445 int i; 177 int i;
446 const char *ret = p->buffer + p->len; 178 const char *ret = trace_seq_buffer_ptr(p);
447 179
448 for (i = 0; i < buf_len; i++) 180 for (i = 0; i < buf_len; i++)
449 trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); 181 trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
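The ftrace_print_*_seq() hunks above swap the open-coded p->buffer + p->len for trace_seq_buffer_ptr(p). Judging by the one-to-one replacement, the helper (presumably introduced alongside this series in include/linux/trace_seq.h) amounts to an accessor for the current write position; a sketch:

/* Sketch of the accessor used above: where the next trace_seq write lands. */
static inline unsigned char *trace_seq_buffer_ptr(struct trace_seq *s)
{
	return s->buffer + s->len;
}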
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 127a9d8c8357..80b25b585a70 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,9 +35,6 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
35extern int __unregister_ftrace_event(struct trace_event *event); 35extern int __unregister_ftrace_event(struct trace_event *event);
36extern struct rw_semaphore trace_event_sem; 36extern struct rw_semaphore trace_event_sem;
37 37
38#define MAX_MEMHEX_BYTES 8
39#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
40
41#define SEQ_PUT_FIELD_RET(s, x) \ 38#define SEQ_PUT_FIELD_RET(s, x) \
42do { \ 39do { \
43 if (!trace_seq_putmem(s, &(x), sizeof(x))) \ 40 if (!trace_seq_putmem(s, &(x), sizeof(x))) \
@@ -46,7 +43,6 @@ do { \
46 43
47#define SEQ_PUT_HEX_FIELD_RET(s, x) \ 44#define SEQ_PUT_HEX_FIELD_RET(s, x) \
48do { \ 45do { \
49 BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
50 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ 46 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
51 return TRACE_TYPE_PARTIAL_LINE; \ 47 return TRACE_TYPE_PARTIAL_LINE; \
52} while (0) 48} while (0)
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
new file mode 100644
index 000000000000..1f24ed99dca2
--- /dev/null
+++ b/kernel/trace/trace_seq.c
@@ -0,0 +1,428 @@
1/*
2 * trace_seq.c
3 *
4 * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
5 *
6 * The trace_seq is a handy tool that allows you to pass a descriptor around
7 * to a buffer that other functions can write to. It is similar to the
8 * seq_file functionality but has some differences.
9 *
10 * To use it, the trace_seq must be initialized with trace_seq_init().
11 * This will set up the counters within the descriptor. You can call
12 * trace_seq_init() more than once to reset the trace_seq to start
13 * from scratch.
14 *
15 * The buffer size is currently PAGE_SIZE, although it may become dynamic
16 * in the future.
17 *
 18 * A write to the buffer will either succeed or fail. That is, unlike
19 * sprintf() there will not be a partial write (well it may write into
 20 * the buffer but it won't update the pointers). This allows users to
21 * try to write something into the trace_seq buffer and if it fails
22 * they can flush it and try again.
23 *
24 */
25#include <linux/uaccess.h>
26#include <linux/seq_file.h>
27#include <linux/trace_seq.h>
28
29/* How much buffer is left on the trace_seq? */
30#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len)
31
32/* How much buffer is written? */
33#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1))
34
35/**
36 * trace_print_seq - move the contents of trace_seq into a seq_file
37 * @m: the seq_file descriptor that is the destination
38 * @s: the trace_seq descriptor that is the source.
39 *
 40 * Returns 0 on success and non-zero on error. If it succeeds in
 41 * writing to the seq_file it will reset the trace_seq, otherwise
42 * it does not modify the trace_seq to let the caller try again.
43 */
44int trace_print_seq(struct seq_file *m, struct trace_seq *s)
45{
46 unsigned int len = TRACE_SEQ_BUF_USED(s);
47 int ret;
48
49 ret = seq_write(m, s->buffer, len);
50
51 /*
52 * Only reset this buffer if we successfully wrote to the
53 * seq_file buffer. This lets the caller try again or
54 * do something else with the contents.
55 */
56 if (!ret)
57 trace_seq_init(s);
58
59 return ret;
60}
61
62/**
63 * trace_seq_printf - sequence printing of trace information
64 * @s: trace sequence descriptor
65 * @fmt: printf format string
66 *
67 * The tracer may use either sequence operations or its own
 68 * copy to user routines. To simplify formatting of a trace
69 * trace_seq_printf() is used to store strings into a special
70 * buffer (@s). Then the output may be either used by
71 * the sequencer or pulled into another buffer.
72 *
 73 * Returns 1 if we successfully wrote all the contents to
74 * the buffer.
 75 * Returns 0 if the length to write is bigger than the
76 * reserved buffer space. In this case, nothing gets written.
77 */
78int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
79{
80 unsigned int len = TRACE_SEQ_BUF_LEFT(s);
81 va_list ap;
82 int ret;
83
84 if (s->full || !len)
85 return 0;
86
87 va_start(ap, fmt);
88 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
89 va_end(ap);
90
91 /* If we can't write it all, don't bother writing anything */
92 if (ret >= len) {
93 s->full = 1;
94 return 0;
95 }
96
97 s->len += ret;
98
99 return 1;
100}
101EXPORT_SYMBOL_GPL(trace_seq_printf);
102
103/**
104 * trace_seq_bitmask - write a bitmask array in its ASCII representation
105 * @s: trace sequence descriptor
106 * @maskp: points to an array of unsigned longs that represent a bitmask
107 * @nmaskbits: The number of bits that are valid in @maskp
108 *
109 * Writes an ASCII representation of a bitmask string into @s.
110 *
111 * Returns 1 if we successfully wrote all the contents to
112 * the buffer.
113 * Returns 0 if the length to write is bigger than the
114 * reserved buffer space. In this case, nothing gets written.
115 */
116int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
117 int nmaskbits)
118{
119 unsigned int len = TRACE_SEQ_BUF_LEFT(s);
120 int ret;
121
122 if (s->full || !len)
123 return 0;
124
125 ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
126 s->len += ret;
127
128 return 1;
129}
130EXPORT_SYMBOL_GPL(trace_seq_bitmask);
131
132/**
133 * trace_seq_vprintf - sequence printing of trace information
134 * @s: trace sequence descriptor
135 * @fmt: printf format string
136 *
137 * The tracer may use either sequence operations or its own
139 * copy to user routines. To simplify formatting of a trace
139 * trace_seq_printf is used to store strings into a special
140 * buffer (@s). Then the output may be either used by
141 * the sequencer or pulled into another buffer.
142 *
143 * Returns how much it wrote to the buffer.
144 */
145int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
146{
147 unsigned int len = TRACE_SEQ_BUF_LEFT(s);
148 int ret;
149
150 if (s->full || !len)
151 return 0;
152
153 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
154
155 /* If we can't write it all, don't bother writing anything */
156 if (ret >= len) {
157 s->full = 1;
158 return 0;
159 }
160
161 s->len += ret;
162
163 return len;
164}
165EXPORT_SYMBOL_GPL(trace_seq_vprintf);
166
167/**
168 * trace_seq_bprintf - Write the printf string from binary arguments
169 * @s: trace sequence descriptor
170 * @fmt: The format string for the @binary arguments
171 * @binary: The binary arguments for @fmt.
172 *
173 * When recording in a fast path, a printf may be recorded with just
174 * saving the format and the arguments as they were passed to the
175 * function, instead of wasting cycles converting the arguments into
176 * ASCII characters. Instead, the arguments are saved in a 32 bit
177 * word array that is defined by the format string constraints.
178 *
179 * This function will take the format and the binary array and finish
180 * the conversion into the ASCII string within the buffer.
181 *
182 * Returns how much it wrote to the buffer.
183 */
184int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
185{
186 unsigned int len = TRACE_SEQ_BUF_LEFT(s);
187 int ret;
188
189 if (s->full || !len)
190 return 0;
191
192 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
193
194 /* If we can't write it all, don't bother writing anything */
195 if (ret >= len) {
196 s->full = 1;
197 return 0;
198 }
199
200 s->len += ret;
201
202 return len;
203}
204EXPORT_SYMBOL_GPL(trace_seq_bprintf);
205
206/**
207 * trace_seq_puts - trace sequence printing of simple string
208 * @s: trace sequence descriptor
209 * @str: simple string to record
210 *
211 * The tracer may use either the sequence operations or its own
212 * copy to user routines. This function records a simple string
213 * into a special buffer (@s) for later retrieval by a sequencer
214 * or other mechanism.
215 *
216 * Returns how much it wrote to the buffer.
217 */
218int trace_seq_puts(struct trace_seq *s, const char *str)
219{
220 unsigned int len = strlen(str);
221
222 if (s->full)
223 return 0;
224
225 if (len > TRACE_SEQ_BUF_LEFT(s)) {
226 s->full = 1;
227 return 0;
228 }
229
230 memcpy(s->buffer + s->len, str, len);
231 s->len += len;
232
233 return len;
234}
235EXPORT_SYMBOL_GPL(trace_seq_puts);
236
237/**
238 * trace_seq_putc - trace sequence printing of simple character
239 * @s: trace sequence descriptor
240 * @c: simple character to record
241 *
242 * The tracer may use either the sequence operations or its own
243 * copy to user routines. This function records a simple character
244 * into a special buffer (@s) for later retrieval by a sequencer
245 * or other mechanism.
246 *
247 * Returns how much it wrote to the buffer.
248 */
249int trace_seq_putc(struct trace_seq *s, unsigned char c)
250{
251 if (s->full)
252 return 0;
253
254 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
255 s->full = 1;
256 return 0;
257 }
258
259 s->buffer[s->len++] = c;
260
261 return 1;
262}
263EXPORT_SYMBOL_GPL(trace_seq_putc);
264
265/**
266 * trace_seq_putmem - write raw data into the trace_seq buffer
267 * @s: trace sequence descriptor
268 * @mem: The raw memory to copy into the buffer
269 * @len: The length of the raw memory to copy (in bytes)
270 *
271 * There may be cases where raw memory needs to be written into the
272 * buffer and a strcpy() would not work. Using this function allows
273 * for such cases.
274 *
275 * Returns how much it wrote to the buffer.
276 */
277int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
278{
279 if (s->full)
280 return 0;
281
282 if (len > TRACE_SEQ_BUF_LEFT(s)) {
283 s->full = 1;
284 return 0;
285 }
286
287 memcpy(s->buffer + s->len, mem, len);
288 s->len += len;
289
290 return len;
291}
292EXPORT_SYMBOL_GPL(trace_seq_putmem);
293
294#define MAX_MEMHEX_BYTES 8U
295#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
296
297/**
298 * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
299 * @s: trace sequence descriptor
300 * @mem: The raw memory to write its hex ASCII representation of
301 * @len: The length of the raw memory to copy (in bytes)
302 *
303 * This is similar to trace_seq_putmem() except instead of just copying the
304 * raw memory into the buffer, it writes its ASCII representation of it
305 * in hex characters.
306 *
307 * Returns how much it wrote to the buffer.
308 */
309int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
310 unsigned int len)
311{
312 unsigned char hex[HEX_CHARS];
313 const unsigned char *data = mem;
314 unsigned int start_len;
315 int i, j;
316 int cnt = 0;
317
318 if (s->full)
319 return 0;
320
321 while (len) {
322 start_len = min(len, HEX_CHARS - 1);
323#ifdef __BIG_ENDIAN
324 for (i = 0, j = 0; i < start_len; i++) {
325#else
326 for (i = start_len-1, j = 0; i >= 0; i--) {
327#endif
328 hex[j++] = hex_asc_hi(data[i]);
329 hex[j++] = hex_asc_lo(data[i]);
330 }
331 if (WARN_ON_ONCE(j == 0 || j/2 > len))
332 break;
333
334 /* j increments twice per loop */
335 len -= j / 2;
336 hex[j++] = ' ';
337
338 cnt += trace_seq_putmem(s, hex, j);
339 }
340 return cnt;
341}
342EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
343
344/**
345 * trace_seq_path - copy a path into the sequence buffer
346 * @s: trace sequence descriptor
347 * @path: path to write into the sequence buffer.
348 *
349 * Write a path name into the sequence buffer.
350 *
351 * Returns 1 if we successfully wrote all the contents to
352 * the buffer.
353 * Returns 0 if the length to write is bigger than the
354 * reserved buffer space. In this case, nothing gets written.
355 */
356int trace_seq_path(struct trace_seq *s, const struct path *path)
357{
358 unsigned char *p;
359
360 if (s->full)
361 return 0;
362
363 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
364 s->full = 1;
365 return 0;
366 }
367
368 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
369 if (!IS_ERR(p)) {
370 p = mangle_path(s->buffer + s->len, p, "\n");
371 if (p) {
372 s->len = p - s->buffer;
373 return 1;
374 }
375 } else {
376 s->buffer[s->len++] = '?';
377 return 1;
378 }
379
380 s->full = 1;
381 return 0;
382}
383EXPORT_SYMBOL_GPL(trace_seq_path);
384
385/**
386 * trace_seq_to_user - copy the sequence buffer to user space
387 * @s: trace sequence descriptor
388 * @ubuf: The userspace memory location to copy to
389 * @cnt: The amount to copy
390 *
391 * Copies the sequence buffer into the userspace memory pointed to
392 * by @ubuf. It starts from the last read position (@s->readpos)
393 * and writes up to @cnt characters or until it reaches the end of
394 * the content in the buffer (@s->len), whichever comes first.
395 *
396 * On success, it returns the number of bytes
397 * it copied.
398 *
399 * On failure it returns -EBUSY if all of the content in the
400 * sequence has already been read, which includes the case of an empty
401 * sequence (@s->len == @s->readpos).
402 *
403 * Returns -EFAULT if the copy to userspace fails.
404 */
405int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
406{
407 int len;
408 int ret;
409
410 if (!cnt)
411 return 0;
412
413 if (s->len <= s->readpos)
414 return -EBUSY;
415
416 len = s->len - s->readpos;
417 if (cnt > len)
418 cnt = len;
419 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
420 if (ret == cnt)
421 return -EFAULT;
422
423 cnt -= ret;
424
425 s->readpos += cnt;
426 return cnt;
427}
428EXPORT_SYMBOL_GPL(trace_seq_to_user);
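Taken together, the new trace_seq.c implements a small write-or-fail string buffer: trace_seq_init() resets it, the trace_seq_* writers either append whole or set ->full and write nothing, and trace_print_seq() flushes it into a seq_file. A minimal usage sketch against this API (the demo_show() hook and its values are hypothetical; error handling trimmed):

#include <linux/seq_file.h>
#include <linux/trace_seq.h>

/* Static only to keep the sketch short; a real caller would use a
 * per-iterator trace_seq rather than shared state. */
static struct trace_seq demo_seq;

static int demo_show(struct seq_file *m, void *v)
{
	struct trace_seq *s = &demo_seq;

	trace_seq_init(s);			/* start from scratch */

	/* Each write either fits completely or sets s->full and is dropped. */
	trace_seq_printf(s, "pid=%d ", 42);
	trace_seq_puts(s, "state=running\n");

	/* Flush into the seq_file; the trace_seq is reset only on success. */
	return trace_print_seq(m, s);
}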
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 3c9b97e6b1f4..33ff6a24b802 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -265,7 +265,6 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
265 if (is_ret) 265 if (is_ret)
266 tu->consumer.ret_handler = uretprobe_dispatcher; 266 tu->consumer.ret_handler = uretprobe_dispatcher;
267 init_trace_uprobe_filter(&tu->filter); 267 init_trace_uprobe_filter(&tu->filter);
268 tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;
269 return tu; 268 return tu;
270 269
271error: 270error:
@@ -1292,7 +1291,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
1292 kfree(call->print_fmt); 1291 kfree(call->print_fmt);
1293 return -ENODEV; 1292 return -ENODEV;
1294 } 1293 }
1295 call->flags = 0; 1294
1296 call->class->reg = trace_uprobe_register; 1295 call->class->reg = trace_uprobe_register;
1297 call->data = tu; 1296 call->data = tu;
1298 ret = trace_add_event_call(call); 1297 ret = trace_add_event_call(call);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index a1dd9a1b1327..975cb49e32bf 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -31,20 +31,19 @@ void bacct_add_tsk(struct user_namespace *user_ns,
31 struct taskstats *stats, struct task_struct *tsk) 31 struct taskstats *stats, struct task_struct *tsk)
32{ 32{
33 const struct cred *tcred; 33 const struct cred *tcred;
34 struct timespec uptime, ts;
35 cputime_t utime, stime, utimescaled, stimescaled; 34 cputime_t utime, stime, utimescaled, stimescaled;
36 u64 ac_etime; 35 u64 delta;
37 36
38 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); 37 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
39 38
40 /* calculate task elapsed time in timespec */ 39 /* calculate task elapsed time in nsec */
41 do_posix_clock_monotonic_gettime(&uptime); 40 delta = ktime_get_ns() - tsk->start_time;
42 ts = timespec_sub(uptime, tsk->start_time); 41 /* Convert to micro seconds */
43 /* rebase elapsed time to usec (should never be negative) */ 42 do_div(delta, NSEC_PER_USEC);
44 ac_etime = timespec_to_ns(&ts); 43 stats->ac_etime = delta;
45 do_div(ac_etime, NSEC_PER_USEC); 44 /* Convert to seconds for btime */
46 stats->ac_etime = ac_etime; 45 do_div(delta, USEC_PER_SEC);
47 stats->ac_btime = get_seconds() - ts.tv_sec; 46 stats->ac_btime = get_seconds() - delta;
48 if (thread_group_leader(tsk)) { 47 if (thread_group_leader(tsk)) {
49 stats->ac_exitcode = tsk->exit_code; 48 stats->ac_exitcode = tsk->exit_code;
50 if (tsk->flags & PF_FORKNOEXEC) 49 if (tsk->flags & PF_FORKNOEXEC)
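The rewritten bacct_add_tsk() block above carries one running delta through two unit conversions: nanoseconds since the task started become microseconds for ac_etime, then whole seconds to rebase ac_btime against the current wall-clock time. A user-space sketch of the same arithmetic (sample values are hypothetical; plain 64-bit division stands in for do_div()):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC	1000ULL
#define USEC_PER_SEC	1000000ULL

int main(void)
{
	uint64_t now_ns   = 9000000000ULL;	/* stand-in for ktime_get_ns() */
	uint64_t start_ns = 1250000000ULL;	/* stand-in for tsk->start_time */
	uint64_t wall_now = 1700000000ULL;	/* stand-in for get_seconds() */

	uint64_t delta = now_ns - start_ns;	/* elapsed time in ns */
	delta /= NSEC_PER_USEC;			/* ac_etime: elapsed usec */
	uint64_t ac_etime = delta;
	delta /= USEC_PER_SEC;			/* elapsed whole seconds */
	uint64_t ac_btime = wall_now - delta;	/* task begin time, in seconds */

	printf("ac_etime=%" PRIu64 " us, ac_btime=%" PRIu64 "\n",
	       ac_etime, ac_btime);
	return 0;
}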
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 35974ac69600..5dbe22aa3efd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -265,7 +265,6 @@ struct workqueue_struct {
265 265
266static struct kmem_cache *pwq_cache; 266static struct kmem_cache *pwq_cache;
267 267
268static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
269static cpumask_var_t *wq_numa_possible_cpumask; 268static cpumask_var_t *wq_numa_possible_cpumask;
270 /* possible CPUs of each node */ 269 /* possible CPUs of each node */
271 270
@@ -758,13 +757,6 @@ static bool too_many_workers(struct worker_pool *pool)
758 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ 757 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
759 int nr_busy = pool->nr_workers - nr_idle; 758 int nr_busy = pool->nr_workers - nr_idle;
760 759
761 /*
762 * nr_idle and idle_list may disagree if idle rebinding is in
763 * progress. Never return %true if idle_list is empty.
764 */
765 if (list_empty(&pool->idle_list))
766 return false;
767
768 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 760 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
769} 761}
770 762
@@ -850,7 +842,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
850 pool = worker->pool; 842 pool = worker->pool;
851 843
852 /* this can only happen on the local cpu */ 844 /* this can only happen on the local cpu */
853 if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) 845 if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
854 return NULL; 846 return NULL;
855 847
856 /* 848 /*
@@ -874,35 +866,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
874 * worker_set_flags - set worker flags and adjust nr_running accordingly 866 * worker_set_flags - set worker flags and adjust nr_running accordingly
875 * @worker: self 867 * @worker: self
876 * @flags: flags to set 868 * @flags: flags to set
877 * @wakeup: wakeup an idle worker if necessary
878 * 869 *
879 * Set @flags in @worker->flags and adjust nr_running accordingly. If 870 * Set @flags in @worker->flags and adjust nr_running accordingly.
880 * nr_running becomes zero and @wakeup is %true, an idle worker is
881 * woken up.
882 * 871 *
883 * CONTEXT: 872 * CONTEXT:
884 * spin_lock_irq(pool->lock) 873 * spin_lock_irq(pool->lock)
885 */ 874 */
886static inline void worker_set_flags(struct worker *worker, unsigned int flags, 875static inline void worker_set_flags(struct worker *worker, unsigned int flags)
887 bool wakeup)
888{ 876{
889 struct worker_pool *pool = worker->pool; 877 struct worker_pool *pool = worker->pool;
890 878
891 WARN_ON_ONCE(worker->task != current); 879 WARN_ON_ONCE(worker->task != current);
892 880
893 /* 881 /* If transitioning into NOT_RUNNING, adjust nr_running. */
894 * If transitioning into NOT_RUNNING, adjust nr_running and
895 * wake up an idle worker as necessary if requested by
896 * @wakeup.
897 */
898 if ((flags & WORKER_NOT_RUNNING) && 882 if ((flags & WORKER_NOT_RUNNING) &&
899 !(worker->flags & WORKER_NOT_RUNNING)) { 883 !(worker->flags & WORKER_NOT_RUNNING)) {
900 if (wakeup) { 884 atomic_dec(&pool->nr_running);
901 if (atomic_dec_and_test(&pool->nr_running) &&
902 !list_empty(&pool->worklist))
903 wake_up_worker(pool);
904 } else
905 atomic_dec(&pool->nr_running);
906 } 885 }
907 886
908 worker->flags |= flags; 887 worker->flags |= flags;
@@ -1232,7 +1211,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1232 pwq_activate_delayed_work(work); 1211 pwq_activate_delayed_work(work);
1233 1212
1234 list_del_init(&work->entry); 1213 list_del_init(&work->entry);
1235 pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work)); 1214 pwq_dec_nr_in_flight(pwq, get_work_color(work));
1236 1215
1237 /* work->data points to pwq iff queued, point to pool */ 1216 /* work->data points to pwq iff queued, point to pool */
1238 set_work_pool_and_keep_pending(work, pool->id); 1217 set_work_pool_and_keep_pending(work, pool->id);
@@ -1560,7 +1539,7 @@ static void worker_enter_idle(struct worker *worker)
1560 (worker->hentry.next || worker->hentry.pprev))) 1539 (worker->hentry.next || worker->hentry.pprev)))
1561 return; 1540 return;
1562 1541
1563 /* can't use worker_set_flags(), also called from start_worker() */ 1542 /* can't use worker_set_flags(), also called from create_worker() */
1564 worker->flags |= WORKER_IDLE; 1543 worker->flags |= WORKER_IDLE;
1565 pool->nr_idle++; 1544 pool->nr_idle++;
1566 worker->last_active = jiffies; 1545 worker->last_active = jiffies;
@@ -1602,11 +1581,11 @@ static void worker_leave_idle(struct worker *worker)
1602 list_del_init(&worker->entry); 1581 list_del_init(&worker->entry);
1603} 1582}
1604 1583
1605static struct worker *alloc_worker(void) 1584static struct worker *alloc_worker(int node)
1606{ 1585{
1607 struct worker *worker; 1586 struct worker *worker;
1608 1587
1609 worker = kzalloc(sizeof(*worker), GFP_KERNEL); 1588 worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
1610 if (worker) { 1589 if (worker) {
1611 INIT_LIST_HEAD(&worker->entry); 1590 INIT_LIST_HEAD(&worker->entry);
1612 INIT_LIST_HEAD(&worker->scheduled); 1591 INIT_LIST_HEAD(&worker->scheduled);
@@ -1670,6 +1649,9 @@ static void worker_detach_from_pool(struct worker *worker,
1670 detach_completion = pool->detach_completion; 1649 detach_completion = pool->detach_completion;
1671 mutex_unlock(&pool->attach_mutex); 1650 mutex_unlock(&pool->attach_mutex);
1672 1651
1652 /* clear leftover flags without pool->lock after it is detached */
1653 worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
1654
1673 if (detach_completion) 1655 if (detach_completion)
1674 complete(detach_completion); 1656 complete(detach_completion);
1675} 1657}
@@ -1678,8 +1660,7 @@ static void worker_detach_from_pool(struct worker *worker,
1678 * create_worker - create a new workqueue worker 1660 * create_worker - create a new workqueue worker
1679 * @pool: pool the new worker will belong to 1661 * @pool: pool the new worker will belong to
1680 * 1662 *
1681 * Create a new worker which is attached to @pool. The new worker must be 1663 * Create and start a new worker which is attached to @pool.
1682 * started by start_worker().
1683 * 1664 *
1684 * CONTEXT: 1665 * CONTEXT:
1685 * Might sleep. Does GFP_KERNEL allocations. 1666 * Might sleep. Does GFP_KERNEL allocations.
@@ -1698,7 +1679,7 @@ static struct worker *create_worker(struct worker_pool *pool)
1698 if (id < 0) 1679 if (id < 0)
1699 goto fail; 1680 goto fail;
1700 1681
1701 worker = alloc_worker(); 1682 worker = alloc_worker(pool->node);
1702 if (!worker) 1683 if (!worker)
1703 goto fail; 1684 goto fail;
1704 1685
@@ -1724,6 +1705,13 @@ static struct worker *create_worker(struct worker_pool *pool)
1724 /* successful, attach the worker to the pool */ 1705 /* successful, attach the worker to the pool */
1725 worker_attach_to_pool(worker, pool); 1706 worker_attach_to_pool(worker, pool);
1726 1707
1708 /* start the newly created worker */
1709 spin_lock_irq(&pool->lock);
1710 worker->pool->nr_workers++;
1711 worker_enter_idle(worker);
1712 wake_up_process(worker->task);
1713 spin_unlock_irq(&pool->lock);
1714
1727 return worker; 1715 return worker;
1728 1716
1729fail: 1717fail:
@@ -1734,44 +1722,6 @@ fail:
1734} 1722}
1735 1723
1736/** 1724/**
1737 * start_worker - start a newly created worker
1738 * @worker: worker to start
1739 *
1740 * Make the pool aware of @worker and start it.
1741 *
1742 * CONTEXT:
1743 * spin_lock_irq(pool->lock).
1744 */
1745static void start_worker(struct worker *worker)
1746{
1747 worker->pool->nr_workers++;
1748 worker_enter_idle(worker);
1749 wake_up_process(worker->task);
1750}
1751
1752/**
1753 * create_and_start_worker - create and start a worker for a pool
1754 * @pool: the target pool
1755 *
1756 * Grab the managership of @pool and create and start a new worker for it.
1757 *
1758 * Return: 0 on success. A negative error code otherwise.
1759 */
1760static int create_and_start_worker(struct worker_pool *pool)
1761{
1762 struct worker *worker;
1763
1764 worker = create_worker(pool);
1765 if (worker) {
1766 spin_lock_irq(&pool->lock);
1767 start_worker(worker);
1768 spin_unlock_irq(&pool->lock);
1769 }
1770
1771 return worker ? 0 : -ENOMEM;
1772}
1773
1774/**
1775 * destroy_worker - destroy a workqueue worker 1725 * destroy_worker - destroy a workqueue worker
1776 * @worker: worker to be destroyed 1726 * @worker: worker to be destroyed
1777 * 1727 *
@@ -1909,23 +1859,10 @@ restart:
1909 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1859 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1910 1860
1911 while (true) { 1861 while (true) {
1912 struct worker *worker; 1862 if (create_worker(pool) || !need_to_create_worker(pool))
1913
1914 worker = create_worker(pool);
1915 if (worker) {
1916 del_timer_sync(&pool->mayday_timer);
1917 spin_lock_irq(&pool->lock);
1918 start_worker(worker);
1919 if (WARN_ON_ONCE(need_to_create_worker(pool)))
1920 goto restart;
1921 return true;
1922 }
1923
1924 if (!need_to_create_worker(pool))
1925 break; 1863 break;
1926 1864
1927 __set_current_state(TASK_INTERRUPTIBLE); 1865 schedule_timeout_interruptible(CREATE_COOLDOWN);
1928 schedule_timeout(CREATE_COOLDOWN);
1929 1866
1930 if (!need_to_create_worker(pool)) 1867 if (!need_to_create_worker(pool))
1931 break; 1868 break;
@@ -1933,6 +1870,11 @@ restart:
1933 1870
1934 del_timer_sync(&pool->mayday_timer); 1871 del_timer_sync(&pool->mayday_timer);
1935 spin_lock_irq(&pool->lock); 1872 spin_lock_irq(&pool->lock);
1873 /*
1874 * This is necessary even after a new worker was just successfully
1875 * created as @pool->lock was dropped and the new worker might have
1876 * already become busy.
1877 */
1936 if (need_to_create_worker(pool)) 1878 if (need_to_create_worker(pool))
1937 goto restart; 1879 goto restart;
1938 return true; 1880 return true;
@@ -2020,13 +1962,8 @@ __acquires(&pool->lock)
2020 1962
2021 lockdep_copy_map(&lockdep_map, &work->lockdep_map); 1963 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
2022#endif 1964#endif
2023 /* 1965 /* ensure we're on the correct CPU */
2024 * Ensure we're on the correct CPU. DISASSOCIATED test is 1966 WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
2025 * necessary to avoid spurious warnings from rescuers servicing the
2026 * unbound or a disassociated pool.
2027 */
2028 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2029 !(pool->flags & POOL_DISASSOCIATED) &&
2030 raw_smp_processor_id() != pool->cpu); 1967 raw_smp_processor_id() != pool->cpu);
2031 1968
2032 /* 1969 /*
@@ -2052,17 +1989,22 @@ __acquires(&pool->lock)
2052 list_del_init(&work->entry); 1989 list_del_init(&work->entry);
2053 1990
2054 /* 1991 /*
2055 * CPU intensive works don't participate in concurrency 1992 * CPU intensive works don't participate in concurrency management.
2056 * management. They're the scheduler's responsibility. 1993 * They're the scheduler's responsibility. This takes @worker out
1994 * of concurrency management and the next code block will chain
1995 * execution of the pending work items.
2057 */ 1996 */
2058 if (unlikely(cpu_intensive)) 1997 if (unlikely(cpu_intensive))
2059 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 1998 worker_set_flags(worker, WORKER_CPU_INTENSIVE);
2060 1999
2061 /* 2000 /*
2062 * Unbound pool isn't concurrency managed and work items should be 2001 * Wake up another worker if necessary. The condition is always
2063 * executed ASAP. Wake up another worker if necessary. 2002 * false for normal per-cpu workers since nr_running would always
2003 * be >= 1 at this point. This is used to chain execution of the
2004 * pending work items for WORKER_NOT_RUNNING workers such as the
2005 * UNBOUND and CPU_INTENSIVE ones.
2064 */ 2006 */
2065 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) 2007 if (need_more_worker(pool))
2066 wake_up_worker(pool); 2008 wake_up_worker(pool);
2067 2009
2068 /* 2010 /*
@@ -2218,7 +2160,7 @@ recheck:
2218 } 2160 }
2219 } while (keep_working(pool)); 2161 } while (keep_working(pool));
2220 2162
2221 worker_set_flags(worker, WORKER_PREP, false); 2163 worker_set_flags(worker, WORKER_PREP);
2222sleep: 2164sleep:
2223 /* 2165 /*
2224 * pool->lock is held and there's no work to process and no need to 2166 * pool->lock is held and there's no work to process and no need to
@@ -2311,29 +2253,27 @@ repeat:
2311 move_linked_works(work, scheduled, &n); 2253 move_linked_works(work, scheduled, &n);
2312 2254
2313 process_scheduled_works(rescuer); 2255 process_scheduled_works(rescuer);
2314 spin_unlock_irq(&pool->lock);
2315
2316 worker_detach_from_pool(rescuer, pool);
2317
2318 spin_lock_irq(&pool->lock);
2319 2256
2320 /* 2257 /*
2321 * Put the reference grabbed by send_mayday(). @pool won't 2258 * Put the reference grabbed by send_mayday(). @pool won't
2322 * go away while we're holding its lock. 2259 * go away while we're still attached to it.
2323 */ 2260 */
2324 put_pwq(pwq); 2261 put_pwq(pwq);
2325 2262
2326 /* 2263 /*
2327 * Leave this pool. If keep_working() is %true, notify a 2264 * Leave this pool. If need_more_worker() is %true, notify a
2328 * regular worker; otherwise, we end up with 0 concurrency 2265 * regular worker; otherwise, we end up with 0 concurrency
2329 * and stalling the execution. 2266 * and stalling the execution.
2330 */ 2267 */
2331 if (keep_working(pool)) 2268 if (need_more_worker(pool))
2332 wake_up_worker(pool); 2269 wake_up_worker(pool);
2333 2270
2334 rescuer->pool = NULL; 2271 rescuer->pool = NULL;
2335 spin_unlock(&pool->lock); 2272 spin_unlock_irq(&pool->lock);
2336 spin_lock(&wq_mayday_lock); 2273
2274 worker_detach_from_pool(rescuer, pool);
2275
2276 spin_lock_irq(&wq_mayday_lock);
2337 } 2277 }
2338 2278
2339 spin_unlock_irq(&wq_mayday_lock); 2279 spin_unlock_irq(&wq_mayday_lock);
@@ -3458,7 +3398,7 @@ static void put_unbound_pool(struct worker_pool *pool)
3458 return; 3398 return;
3459 3399
3460 /* sanity checks */ 3400 /* sanity checks */
3461 if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || 3401 if (WARN_ON(!(pool->cpu < 0)) ||
3462 WARN_ON(!list_empty(&pool->worklist))) 3402 WARN_ON(!list_empty(&pool->worklist)))
3463 return; 3403 return;
3464 3404
@@ -3524,7 +3464,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3524 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { 3464 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
3525 if (wqattrs_equal(pool->attrs, attrs)) { 3465 if (wqattrs_equal(pool->attrs, attrs)) {
3526 pool->refcnt++; 3466 pool->refcnt++;
3527 goto out_unlock; 3467 return pool;
3528 } 3468 }
3529 } 3469 }
3530 3470
@@ -3557,12 +3497,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3557 goto fail; 3497 goto fail;
3558 3498
3559 /* create and start the initial worker */ 3499 /* create and start the initial worker */
3560 if (create_and_start_worker(pool) < 0) 3500 if (!create_worker(pool))
3561 goto fail; 3501 goto fail;
3562 3502
3563 /* install */ 3503 /* install */
3564 hash_add(unbound_pool_hash, &pool->hash_node, hash); 3504 hash_add(unbound_pool_hash, &pool->hash_node, hash);
3565out_unlock: 3505
3566 return pool; 3506 return pool;
3567fail: 3507fail:
3568 if (pool) 3508 if (pool)
@@ -3591,11 +3531,6 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
3591 if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) 3531 if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
3592 return; 3532 return;
3593 3533
3594 /*
3595 * Unlink @pwq. Synchronization against wq->mutex isn't strictly
3596 * necessary on release but do it anyway. It's easier to verify
3597 * and consistent with the linking path.
3598 */
3599 mutex_lock(&wq->mutex); 3534 mutex_lock(&wq->mutex);
3600 list_del_rcu(&pwq->pwqs_node); 3535 list_del_rcu(&pwq->pwqs_node);
3601 is_last = list_empty(&wq->pwqs); 3536 is_last = list_empty(&wq->pwqs);
@@ -3692,10 +3627,7 @@ static void link_pwq(struct pool_workqueue *pwq)
3692 if (!list_empty(&pwq->pwqs_node)) 3627 if (!list_empty(&pwq->pwqs_node))
3693 return; 3628 return;
3694 3629
3695 /* 3630 /* set the matching work_color */
3696 * Set the matching work_color. This is synchronized with
3697 * wq->mutex to avoid confusing flush_workqueue().
3698 */
3699 pwq->work_color = wq->work_color; 3631 pwq->work_color = wq->work_color;
3700 3632
3701 /* sync max_active to the current setting */ 3633 /* sync max_active to the current setting */
@@ -3832,7 +3764,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
3832 if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) 3764 if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
3833 return -EINVAL; 3765 return -EINVAL;
3834 3766
3835 pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL); 3767 pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
3836 new_attrs = alloc_workqueue_attrs(GFP_KERNEL); 3768 new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3837 tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); 3769 tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3838 if (!pwq_tbl || !new_attrs || !tmp_attrs) 3770 if (!pwq_tbl || !new_attrs || !tmp_attrs)
@@ -4080,7 +4012,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4080 4012
4081 /* allocate wq and format name */ 4013 /* allocate wq and format name */
4082 if (flags & WQ_UNBOUND) 4014 if (flags & WQ_UNBOUND)
4083 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); 4015 tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
4084 4016
4085 wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); 4017 wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
4086 if (!wq) 4018 if (!wq)
@@ -4122,7 +4054,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4122 if (flags & WQ_MEM_RECLAIM) { 4054 if (flags & WQ_MEM_RECLAIM) {
4123 struct worker *rescuer; 4055 struct worker *rescuer;
4124 4056
4125 rescuer = alloc_worker(); 4057 rescuer = alloc_worker(NUMA_NO_NODE);
4126 if (!rescuer) 4058 if (!rescuer)
4127 goto err_destroy; 4059 goto err_destroy;
4128 4060
@@ -4470,8 +4402,6 @@ static void wq_unbind_fn(struct work_struct *work)
4470 struct worker *worker; 4402 struct worker *worker;
4471 4403
4472 for_each_cpu_worker_pool(pool, cpu) { 4404 for_each_cpu_worker_pool(pool, cpu) {
4473 WARN_ON_ONCE(cpu != smp_processor_id());
4474
4475 mutex_lock(&pool->attach_mutex); 4405 mutex_lock(&pool->attach_mutex);
4476 spin_lock_irq(&pool->lock); 4406 spin_lock_irq(&pool->lock);
4477 4407
@@ -4543,6 +4473,7 @@ static void rebind_workers(struct worker_pool *pool)
4543 pool->attrs->cpumask) < 0); 4473 pool->attrs->cpumask) < 0);
4544 4474
4545 spin_lock_irq(&pool->lock); 4475 spin_lock_irq(&pool->lock);
4476 pool->flags &= ~POOL_DISASSOCIATED;
4546 4477
4547 for_each_pool_worker(worker, pool) { 4478 for_each_pool_worker(worker, pool) {
4548 unsigned int worker_flags = worker->flags; 4479 unsigned int worker_flags = worker->flags;
@@ -4632,7 +4563,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4632 for_each_cpu_worker_pool(pool, cpu) { 4563 for_each_cpu_worker_pool(pool, cpu) {
4633 if (pool->nr_workers) 4564 if (pool->nr_workers)
4634 continue; 4565 continue;
4635 if (create_and_start_worker(pool) < 0) 4566 if (!create_worker(pool))
4636 return NOTIFY_BAD; 4567 return NOTIFY_BAD;
4637 } 4568 }
4638 break; 4569 break;
@@ -4644,15 +4575,10 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4644 for_each_pool(pool, pi) { 4575 for_each_pool(pool, pi) {
4645 mutex_lock(&pool->attach_mutex); 4576 mutex_lock(&pool->attach_mutex);
4646 4577
4647 if (pool->cpu == cpu) { 4578 if (pool->cpu == cpu)
4648 spin_lock_irq(&pool->lock);
4649 pool->flags &= ~POOL_DISASSOCIATED;
4650 spin_unlock_irq(&pool->lock);
4651
4652 rebind_workers(pool); 4579 rebind_workers(pool);
4653 } else if (pool->cpu < 0) { 4580 else if (pool->cpu < 0)
4654 restore_unbound_workers_cpumask(pool, cpu); 4581 restore_unbound_workers_cpumask(pool, cpu);
4655 }
4656 4582
4657 mutex_unlock(&pool->attach_mutex); 4583 mutex_unlock(&pool->attach_mutex);
4658 } 4584 }
@@ -4856,10 +4782,6 @@ static void __init wq_numa_init(void)
4856 cpumask_var_t *tbl; 4782 cpumask_var_t *tbl;
4857 int node, cpu; 4783 int node, cpu;
4858 4784
4859 /* determine NUMA pwq table len - highest node id + 1 */
4860 for_each_node(node)
4861 wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
4862
4863 if (num_possible_nodes() <= 1) 4785 if (num_possible_nodes() <= 1)
4864 return; 4786 return;
4865 4787
@@ -4876,7 +4798,7 @@ static void __init wq_numa_init(void)
4876 * available. Build one from cpu_to_node() which should have been 4798 * available. Build one from cpu_to_node() which should have been
4877 * fully initialized by now. 4799 * fully initialized by now.
4878 */ 4800 */
4879 tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL); 4801 tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
4880 BUG_ON(!tbl); 4802 BUG_ON(!tbl);
4881 4803
4882 for_each_node(node) 4804 for_each_node(node)
@@ -4936,7 +4858,7 @@ static int __init init_workqueues(void)
4936 4858
4937 for_each_cpu_worker_pool(pool, cpu) { 4859 for_each_cpu_worker_pool(pool, cpu) {
4938 pool->flags &= ~POOL_DISASSOCIATED; 4860 pool->flags &= ~POOL_DISASSOCIATED;
4939 BUG_ON(create_and_start_worker(pool) < 0); 4861 BUG_ON(!create_worker(pool));
4940 } 4862 }
4941 } 4863 }
4942 4864