Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 321
-rw-r--r--  kernel/audit.h | 7
-rw-r--r--  kernel/audit_fsnotify.c | 12
-rw-r--r--  kernel/audit_tree.c | 87
-rw-r--r--  kernel/audit_watch.c | 21
-rw-r--r--  kernel/auditfilter.c | 18
-rw-r--r--  kernel/auditsc.c | 11
-rw-r--r--  kernel/bpf/Makefile | 2
-rw-r--r--  kernel/bpf/arraymap.c | 137
-rw-r--r--  kernel/bpf/bpf_lru_list.c | 2
-rw-r--r--  kernel/bpf/cgroup.c | 5
-rw-r--r--  kernel/bpf/core.c | 21
-rw-r--r--  kernel/bpf/hashtab.c | 185
-rw-r--r--  kernel/bpf/inode.c | 2
-rw-r--r--  kernel/bpf/lpm_trie.c | 14
-rw-r--r--  kernel/bpf/map_in_map.c | 97
-rw-r--r--  kernel/bpf/map_in_map.h | 23
-rw-r--r--  kernel/bpf/stackmap.c | 14
-rw-r--r--  kernel/bpf/syscall.c | 194
-rw-r--r--  kernel/bpf/verifier.c | 364
-rw-r--r--  kernel/cgroup/cgroup-internal.h | 7
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 20
-rw-r--r--  kernel/cgroup/cgroup.c | 40
-rw-r--r--  kernel/cgroup/cpuset.c | 11
-rw-r--r--  kernel/cgroup/namespace.c | 2
-rw-r--r--  kernel/compat.c | 10
-rw-r--r--  kernel/cpu.c | 6
-rw-r--r--  kernel/crash_core.c | 439
-rw-r--r--  kernel/events/callchain.c | 6
-rw-r--r--  kernel/events/core.c | 139
-rw-r--r--  kernel/events/ring_buffer.c | 34
-rw-r--r--  kernel/fork.c | 52
-rw-r--r--  kernel/futex.c | 518
-rw-r--r--  kernel/gcov/base.c | 6
-rw-r--r--  kernel/gcov/gcc_4_7.c | 4
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/hung_task.c | 8
-rw-r--r--  kernel/irq/chip.c | 7
-rw-r--r--  kernel/irq/manage.c | 21
-rw-r--r--  kernel/kcov.c | 9
-rw-r--r--  kernel/kexec_core.c | 431
-rw-r--r--  kernel/kprobes.c | 84
-rw-r--r--  kernel/ksysfs.c | 8
-rw-r--r--  kernel/livepatch/Makefile | 2
-rw-r--r--  kernel/livepatch/core.c | 450
-rw-r--r--  kernel/livepatch/core.h | 6
-rw-r--r--  kernel/livepatch/patch.c | 272
-rw-r--r--  kernel/livepatch/patch.h | 33
-rw-r--r--  kernel/livepatch/transition.c | 553
-rw-r--r--  kernel/livepatch/transition.h | 14
-rw-r--r--  kernel/locking/lockdep.c | 341
-rw-r--r--  kernel/locking/rtmutex-debug.c | 18
-rw-r--r--  kernel/locking/rtmutex-debug.h | 3
-rw-r--r--  kernel/locking/rtmutex.c | 390
-rw-r--r--  kernel/locking/rtmutex.h | 2
-rw-r--r--  kernel/locking/rtmutex_common.h | 25
-rw-r--r--  kernel/locking/rwsem.c | 6
-rw-r--r--  kernel/locking/test-ww_mutex.c | 29
-rw-r--r--  kernel/memremap.c | 22
-rw-r--r--  kernel/module.c | 51
-rw-r--r--  kernel/nsproxy.c | 3
-rw-r--r--  kernel/padata.c | 15
-rw-r--r--  kernel/params.c | 52
-rw-r--r--  kernel/pid.c | 4
-rw-r--r--  kernel/pid_namespace.c | 36
-rw-r--r--  kernel/power/process.c | 2
-rw-r--r--  kernel/power/snapshot.c | 5
-rw-r--r--  kernel/power/suspend.c | 29
-rw-r--r--  kernel/printk/braille.c | 15
-rw-r--r--  kernel/printk/braille.h | 13
-rw-r--r--  kernel/printk/printk.c | 111
-rw-r--r--  kernel/rcu/Makefile | 5
-rw-r--r--  kernel/rcu/rcu.h | 153
-rw-r--r--  kernel/rcu/rcu_segcblist.c | 505
-rw-r--r--  kernel/rcu/rcu_segcblist.h | 164
-rw-r--r--  kernel/rcu/rcutorture.c | 43
-rw-r--r--  kernel/rcu/srcu.c | 12
-rw-r--r--  kernel/rcu/srcutiny.c | 216
-rw-r--r--  kernel/rcu/srcutree.c | 1155
-rw-r--r--  kernel/rcu/tiny.c | 20
-rw-r--r--  kernel/rcu/tiny_plugin.h | 13
-rw-r--r--  kernel/rcu/tree.c | 772
-rw-r--r--  kernel/rcu/tree.h | 163
-rw-r--r--  kernel/rcu/tree_exp.h | 25
-rw-r--r--  kernel/rcu/tree_plugin.h | 64
-rw-r--r--  kernel/rcu/tree_trace.c | 26
-rw-r--r--  kernel/rcu/update.c | 53
-rw-r--r--  kernel/relay.c | 1
-rw-r--r--  kernel/sched/core.c | 294
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 85
-rw-r--r--  kernel/sched/cputime.c | 27
-rw-r--r--  kernel/sched/fair.c | 420
-rw-r--r--  kernel/sched/features.h | 7
-rw-r--r--  kernel/sched/idle.c | 6
-rw-r--r--  kernel/sched/rt.c | 81
-rw-r--r--  kernel/sched/sched-pelt.h | 13
-rw-r--r--  kernel/sched/sched.h | 76
-rw-r--r--  kernel/signal.c | 4
-rw-r--r--  kernel/softirq.c | 2
-rw-r--r--  kernel/stacktrace.c | 12
-rw-r--r--  kernel/sys.c | 33
-rw-r--r--  kernel/sysctl.c | 4
-rw-r--r--  kernel/taskstats.c | 14
-rw-r--r--  kernel/time/alarmtimer.c | 27
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/hrtimer.c | 17
-rw-r--r--  kernel/time/posix-clock.c | 10
-rw-r--r--  kernel/time/posix-cpu-timers.c | 77
-rw-r--r--  kernel/time/posix-stubs.c | 20
-rw-r--r--  kernel/time/posix-timers.c | 97
-rw-r--r--  kernel/time/sched_clock.c | 5
-rw-r--r--  kernel/time/tick-sched.c | 12
-rw-r--r--  kernel/time/time.c | 18
-rw-r--r--  kernel/time/timekeeping.c | 3
-rw-r--r--  kernel/time/timer.c | 4
-rw-r--r--  kernel/time/timer_list.c | 6
-rw-r--r--  kernel/trace/Kconfig | 5
-rw-r--r--  kernel/trace/blktrace.c | 39
-rw-r--r--  kernel/trace/bpf_trace.c | 32
-rw-r--r--  kernel/trace/ftrace.c | 1024
-rw-r--r--  kernel/trace/ring_buffer.c | 40
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 2
-rw-r--r--  kernel/trace/trace.c | 284
-rw-r--r--  kernel/trace/trace.h | 82
-rw-r--r--  kernel/trace/trace_benchmark.c | 14
-rw-r--r--  kernel/trace/trace_entries.h | 6
-rw-r--r--  kernel/trace/trace_events.c | 151
-rw-r--r--  kernel/trace/trace_functions.c | 227
-rw-r--r--  kernel/trace/trace_hwlat.c | 14
-rw-r--r--  kernel/trace/trace_kprobe.c | 53
-rw-r--r--  kernel/trace/trace_output.c | 9
-rw-r--r--  kernel/trace/trace_stack.c | 35
-rw-r--r--  kernel/workqueue.c | 28
134 files changed, 8915 insertions, 3800 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index b302b4731d16..72aa080f91f0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -59,6 +59,7 @@ obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_MODULE_SIG) += module_signing.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_CRASH_CORE) += crash_core.o
 obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
diff --git a/kernel/audit.c b/kernel/audit.c
index a871bf80fde1..4b7d49868ce1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -58,6 +58,8 @@
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
 #include <linux/gfp.h>
+#include <linux/pid.h>
+#include <linux/slab.h>
 
 #include <linux/audit.h>
 
@@ -110,18 +112,19 @@ struct audit_net {
  * @pid: auditd PID
  * @portid: netlink portid
  * @net: the associated network namespace
- * @lock: spinlock to protect write access
+ * @rcu: RCU head
  *
  * Description:
  * This struct is RCU protected; you must either hold the RCU lock for reading
- * or the included spinlock for writing.
+ * or the associated spinlock for writing.
  */
 static struct auditd_connection {
-        int pid;
+        struct pid *pid;
         u32 portid;
         struct net *net;
-        spinlock_t lock;
-} auditd_conn;
+        struct rcu_head rcu;
+} *auditd_conn = NULL;
+static DEFINE_SPINLOCK(auditd_conn_lock);
 
 /* If audit_rate_limit is non-zero, limit the rate of sending audit records
  * to that number per second. This prevents DoS attacks, but results in
@@ -151,12 +154,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
 /* Hash for inode-based rules */
 struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
 
-/* The audit_freelist is a list of pre-allocated audit buffers (if more
- * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
- * being placed on the freelist). */
-static DEFINE_SPINLOCK(audit_freelist_lock);
-static int audit_freelist_count;
-static LIST_HEAD(audit_freelist);
+static struct kmem_cache *audit_buffer_cache;
 
 /* queue msgs to send via kauditd_task */
 static struct sk_buff_head audit_queue;
@@ -191,17 +189,12 @@ DEFINE_MUTEX(audit_cmd_mutex);
  * should be at least that large. */
 #define AUDIT_BUFSIZ 1024
 
-/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
- * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */
-#define AUDIT_MAXFREE (2*NR_CPUS)
-
 /* The audit_buffer is used when formatting an audit record. The caller
  * locks briefly to get the record off the freelist or to allocate the
  * buffer, and locks briefly to send the buffer to the netlink layer or
  * to place it on a transmit queue. Multiple audit_buffers can be in
  * use simultaneously. */
 struct audit_buffer {
-        struct list_head list;
         struct sk_buff *skb;    /* formatted skb ready to send */
         struct audit_context *ctx;      /* NULL or associated context */
         gfp_t gfp_mask;
@@ -220,18 +213,42 @@ struct audit_reply {
  * Description:
  * Return 1 if the task is a registered audit daemon, 0 otherwise.
  */
-int auditd_test_task(const struct task_struct *task)
+int auditd_test_task(struct task_struct *task)
 {
         int rc;
+        struct auditd_connection *ac;
 
         rcu_read_lock();
-        rc = (auditd_conn.pid && task->tgid == auditd_conn.pid ? 1 : 0);
+        ac = rcu_dereference(auditd_conn);
+        rc = (ac && ac->pid == task_tgid(task) ? 1 : 0);
         rcu_read_unlock();
 
         return rc;
 }
 
 /**
+ * auditd_pid_vnr - Return the auditd PID relative to the namespace
+ *
+ * Description:
+ * Returns the PID in relation to the namespace, 0 on failure.
+ */
+static pid_t auditd_pid_vnr(void)
+{
+        pid_t pid;
+        const struct auditd_connection *ac;
+
+        rcu_read_lock();
+        ac = rcu_dereference(auditd_conn);
+        if (!ac || !ac->pid)
+                pid = 0;
+        else
+                pid = pid_vnr(ac->pid);
+        rcu_read_unlock();
+
+        return pid;
+}
+
+/**
  * audit_get_sk - Return the audit socket for the given network namespace
  * @net: the destination network namespace
  *
@@ -250,14 +267,6 @@ static struct sock *audit_get_sk(const struct net *net)
         return aunet->sk;
 }
 
-static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
-{
-        if (ab) {
-                struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
-                nlh->nlmsg_pid = portid;
-        }
-}
-
 void audit_panic(const char *message)
 {
         switch (audit_failure) {
@@ -427,6 +436,24 @@ static int audit_set_failure(u32 state)
 }
 
 /**
+ * auditd_conn_free - RCU helper to release an auditd connection struct
+ * @rcu: RCU head
+ *
+ * Description:
+ * Drop any references inside the auditd connection tracking struct and free
+ * the memory.
+ */
+static void auditd_conn_free(struct rcu_head *rcu)
+{
+        struct auditd_connection *ac;
+
+        ac = container_of(rcu, struct auditd_connection, rcu);
+        put_pid(ac->pid);
+        put_net(ac->net);
+        kfree(ac);
+}
+
+/**
  * auditd_set - Set/Reset the auditd connection state
  * @pid: auditd PID
  * @portid: auditd netlink portid
@@ -434,22 +461,33 @@ static int audit_set_failure(u32 state)
  *
  * Description:
  * This function will obtain and drop network namespace references as
- * necessary.
+ * necessary. Returns zero on success, negative values on failure.
  */
-static void auditd_set(int pid, u32 portid, struct net *net)
+static int auditd_set(struct pid *pid, u32 portid, struct net *net)
 {
         unsigned long flags;
+        struct auditd_connection *ac_old, *ac_new;
 
-        spin_lock_irqsave(&auditd_conn.lock, flags);
-        auditd_conn.pid = pid;
-        auditd_conn.portid = portid;
-        if (auditd_conn.net)
-                put_net(auditd_conn.net);
-        if (net)
-                auditd_conn.net = get_net(net);
-        else
-                auditd_conn.net = NULL;
-        spin_unlock_irqrestore(&auditd_conn.lock, flags);
+        if (!pid || !net)
+                return -EINVAL;
+
+        ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL);
+        if (!ac_new)
+                return -ENOMEM;
+        ac_new->pid = get_pid(pid);
+        ac_new->portid = portid;
+        ac_new->net = get_net(net);
+
+        spin_lock_irqsave(&auditd_conn_lock, flags);
+        ac_old = rcu_dereference_protected(auditd_conn,
+                                           lockdep_is_held(&auditd_conn_lock));
+        rcu_assign_pointer(auditd_conn, ac_new);
+        spin_unlock_irqrestore(&auditd_conn_lock, flags);
+
+        if (ac_old)
+                call_rcu(&ac_old->rcu, auditd_conn_free);
+
+        return 0;
 }
 
 /**
@@ -544,13 +582,19 @@ static void kauditd_retry_skb(struct sk_buff *skb)
  */
 static void auditd_reset(void)
 {
+        unsigned long flags;
         struct sk_buff *skb;
+        struct auditd_connection *ac_old;
 
         /* if it isn't already broken, break the connection */
-        rcu_read_lock();
-        if (auditd_conn.pid)
-                auditd_set(0, 0, NULL);
-        rcu_read_unlock();
+        spin_lock_irqsave(&auditd_conn_lock, flags);
+        ac_old = rcu_dereference_protected(auditd_conn,
+                                           lockdep_is_held(&auditd_conn_lock));
+        rcu_assign_pointer(auditd_conn, NULL);
+        spin_unlock_irqrestore(&auditd_conn_lock, flags);
+
+        if (ac_old)
+                call_rcu(&ac_old->rcu, auditd_conn_free);
 
         /* flush all of the main and retry queues to the hold queue */
         while ((skb = skb_dequeue(&audit_retry_queue)))
@@ -576,6 +620,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
         u32 portid;
         struct net *net;
         struct sock *sk;
+        struct auditd_connection *ac;
 
         /* NOTE: we can't call netlink_unicast while in the RCU section so
          * take a reference to the network namespace and grab local
@@ -585,15 +630,15 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
          * section netlink_unicast() should safely return an error */
 
         rcu_read_lock();
-        if (!auditd_conn.pid) {
+        ac = rcu_dereference(auditd_conn);
+        if (!ac) {
                 rcu_read_unlock();
                 rc = -ECONNREFUSED;
                 goto err;
         }
-        net = auditd_conn.net;
-        get_net(net);
+        net = get_net(ac->net);
         sk = audit_get_sk(net);
-        portid = auditd_conn.portid;
+        portid = ac->portid;
         rcu_read_unlock();
 
         rc = netlink_unicast(sk, skb, portid, 0);
@@ -728,6 +773,7 @@ static int kauditd_thread(void *dummy)
         u32 portid = 0;
         struct net *net = NULL;
         struct sock *sk = NULL;
+        struct auditd_connection *ac;
 
 #define UNICAST_RETRIES 5
 
@@ -735,14 +781,14 @@ static int kauditd_thread(void *dummy)
         while (!kthread_should_stop()) {
                 /* NOTE: see the lock comments in auditd_send_unicast_skb() */
                 rcu_read_lock();
-                if (!auditd_conn.pid) {
+                ac = rcu_dereference(auditd_conn);
+                if (!ac) {
                         rcu_read_unlock();
                         goto main_queue;
                 }
-                net = auditd_conn.net;
-                get_net(net);
+                net = get_net(ac->net);
                 sk = audit_get_sk(net);
-                portid = auditd_conn.portid;
+                portid = ac->portid;
                 rcu_read_unlock();
 
                 /* attempt to flush the hold queue */
@@ -816,7 +862,7 @@ int audit_send_list(void *_dest)
         return 0;
 }
 
-struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
+struct sk_buff *audit_make_reply(int seq, int type, int done,
                                  int multi, const void *payload, int size)
 {
         struct sk_buff *skb;
@@ -829,7 +875,7 @@ struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
         if (!skb)
                 return NULL;
 
-        nlh = nlmsg_put(skb, portid, seq, t, size, flags);
+        nlh = nlmsg_put(skb, 0, seq, t, size, flags);
         if (!nlh)
                 goto out_kfree_skb;
         data = nlmsg_data(nlh);
@@ -873,7 +919,6 @@ static int audit_send_reply_thread(void *arg)
 static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
                              int multi, const void *payload, int size)
 {
-        u32 portid = NETLINK_CB(request_skb).portid;
         struct net *net = sock_net(NETLINK_CB(request_skb).sk);
         struct sk_buff *skb;
         struct task_struct *tsk;
@@ -883,12 +928,12 @@ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int
         if (!reply)
                 return;
 
-        skb = audit_make_reply(portid, seq, type, done, multi, payload, size);
+        skb = audit_make_reply(seq, type, done, multi, payload, size);
         if (!skb)
                 goto out;
 
         reply->net = get_net(net);
-        reply->portid = portid;
+        reply->portid = NETLINK_CB(request_skb).portid;
         reply->skb = skb;
 
         tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
@@ -1068,11 +1113,13 @@ static int audit_set_feature(struct sk_buff *skb)
         return 0;
 }
 
-static int audit_replace(pid_t pid)
+static int audit_replace(struct pid *pid)
 {
+        pid_t pvnr;
         struct sk_buff *skb;
 
-        skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, &pid, sizeof(pid));
+        pvnr = pid_vnr(pid);
+        skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr));
         if (!skb)
                 return -ENOMEM;
         return auditd_send_unicast_skb(skb);
@@ -1102,9 +1149,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                 memset(&s, 0, sizeof(s));
                 s.enabled = audit_enabled;
                 s.failure = audit_failure;
-                rcu_read_lock();
-                s.pid = auditd_conn.pid;
-                rcu_read_unlock();
+                /* NOTE: use pid_vnr() so the PID is relative to the current
+                 *       namespace */
+                s.pid = auditd_pid_vnr();
                 s.rate_limit = audit_rate_limit;
                 s.backlog_limit = audit_backlog_limit;
                 s.lost = atomic_read(&audit_lost);
@@ -1130,51 +1177,61 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                         return err;
                 }
                 if (s.mask & AUDIT_STATUS_PID) {
-                        /* NOTE: we are using task_tgid_vnr() below because
-                         *       the s.pid value is relative to the namespace
-                         *       of the caller; at present this doesn't matter
-                         *       much since you can really only run auditd
-                         *       from the initial pid namespace, but something
-                         *       to keep in mind if this changes */
-                        int new_pid = s.pid;
+                        /* NOTE: we are using the vnr PID functions below
+                         *       because the s.pid value is relative to the
+                         *       namespace of the caller; at present this
+                         *       doesn't matter much since you can really only
+                         *       run auditd from the initial pid namespace, but
+                         *       something to keep in mind if this changes */
+                        pid_t new_pid = s.pid;
                         pid_t auditd_pid;
-                        pid_t requesting_pid = task_tgid_vnr(current);
+                        struct pid *req_pid = task_tgid(current);
+
+                        /* sanity check - PID values must match */
+                        if (new_pid != pid_vnr(req_pid))
+                                return -EINVAL;
 
                         /* test the auditd connection */
-                        audit_replace(requesting_pid);
+                        audit_replace(req_pid);
 
-                        rcu_read_lock();
-                        auditd_pid = auditd_conn.pid;
+                        auditd_pid = auditd_pid_vnr();
                         /* only the current auditd can unregister itself */
-                        if ((!new_pid) && (requesting_pid != auditd_pid)) {
-                                rcu_read_unlock();
+                        if ((!new_pid) && (new_pid != auditd_pid)) {
                                 audit_log_config_change("audit_pid", new_pid,
                                                         auditd_pid, 0);
                                 return -EACCES;
                         }
                         /* replacing a healthy auditd is not allowed */
                         if (auditd_pid && new_pid) {
-                                rcu_read_unlock();
                                 audit_log_config_change("audit_pid", new_pid,
                                                         auditd_pid, 0);
                                 return -EEXIST;
                         }
-                        rcu_read_unlock();
-
-                        if (audit_enabled != AUDIT_OFF)
-                                audit_log_config_change("audit_pid", new_pid,
-                                                        auditd_pid, 1);
 
                         if (new_pid) {
                                 /* register a new auditd connection */
-                                auditd_set(new_pid,
+                                err = auditd_set(req_pid,
                                            NETLINK_CB(skb).portid,
                                            sock_net(NETLINK_CB(skb).sk));
+                                if (audit_enabled != AUDIT_OFF)
+                                        audit_log_config_change("audit_pid",
+                                                                new_pid,
+                                                                auditd_pid,
+                                                                err ? 0 : 1);
+                                if (err)
+                                        return err;
+
                                 /* try to process any backlog */
                                 wake_up_interruptible(&kauditd_wait);
-                        } else
+                        } else {
+                                if (audit_enabled != AUDIT_OFF)
+                                        audit_log_config_change("audit_pid",
+                                                                new_pid,
+                                                                auditd_pid, 1);
+
                                 /* unregister the auditd connection */
                                 auditd_reset();
+                        }
                 }
                 if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
                         err = audit_set_rate_limit(s.rate_limit);
@@ -1242,7 +1299,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                         size--;
                         audit_log_n_untrustedstring(ab, data, size);
                 }
-                audit_set_portid(ab, NETLINK_CB(skb).portid);
                 audit_log_end(ab);
         }
         break;
@@ -1256,8 +1312,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                         audit_log_end(ab);
                         return -EPERM;
                 }
-                err = audit_rule_change(msg_type, NETLINK_CB(skb).portid,
-                                        seq, data, nlmsg_len(nlh));
+                err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh));
                 break;
         case AUDIT_LIST_RULES:
                 err = audit_list_rules_send(skb, seq);
@@ -1378,11 +1433,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
         return err < 0 ? err : 0;
 }
 
-/*
- * Get message from skb. Each message is processed by audit_receive_msg.
- * Malformed skbs with wrong length are discarded silently.
+/**
+ * audit_receive - receive messages from a netlink control socket
+ * @skb: the message buffer
+ *
+ * Parse the provided skb and deal with any messages that may be present,
+ * malformed skbs are discarded.
  */
-static void audit_receive_skb(struct sk_buff *skb)
+static void audit_receive(struct sk_buff *skb)
 {
         struct nlmsghdr *nlh;
         /*
@@ -1395,21 +1453,15 @@ static void audit_receive_skb(struct sk_buff *skb)
         nlh = nlmsg_hdr(skb);
         len = skb->len;
 
+        mutex_lock(&audit_cmd_mutex);
         while (nlmsg_ok(nlh, len)) {
                 err = audit_receive_msg(skb, nlh);
                 /* if err or if this message says it wants a response */
                 if (err || (nlh->nlmsg_flags & NLM_F_ACK))
-                        netlink_ack(skb, nlh, err);
+                        netlink_ack(skb, nlh, err, NULL);
 
                 nlh = nlmsg_next(nlh, &len);
         }
-}
-
-/* Receive messages from netlink socket. */
-static void audit_receive(struct sk_buff *skb)
-{
-        mutex_lock(&audit_cmd_mutex);
-        audit_receive_skb(skb);
         mutex_unlock(&audit_cmd_mutex);
 }
 
@@ -1447,10 +1499,11 @@ static void __net_exit audit_net_exit(struct net *net)
 {
         struct audit_net *aunet = net_generic(net, audit_net_id);
 
-        rcu_read_lock();
-        if (net == auditd_conn.net)
-                auditd_reset();
-        rcu_read_unlock();
+        /* NOTE: you would think that we would want to check the auditd
+         * connection and potentially reset it here if it lives in this
+         * namespace, but since the auditd connection tracking struct holds a
+         * reference to this namespace (see auditd_set()) we are only ever
+         * going to get here after that connection has been released */
 
         netlink_kernel_release(aunet->sk);
 }
@@ -1470,8 +1523,9 @@ static int __init audit_init(void)
         if (audit_initialized == AUDIT_DISABLED)
                 return 0;
 
-        memset(&auditd_conn, 0, sizeof(auditd_conn));
-        spin_lock_init(&auditd_conn.lock);
+        audit_buffer_cache = kmem_cache_create("audit_buffer",
+                                               sizeof(struct audit_buffer),
+                                               0, SLAB_PANIC, NULL);
 
         skb_queue_head_init(&audit_queue);
         skb_queue_head_init(&audit_retry_queue);
@@ -1538,60 +1592,33 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set);
 
 static void audit_buffer_free(struct audit_buffer *ab)
 {
-        unsigned long flags;
-
         if (!ab)
                 return;
 
         kfree_skb(ab->skb);
-        spin_lock_irqsave(&audit_freelist_lock, flags);
-        if (audit_freelist_count > AUDIT_MAXFREE)
-                kfree(ab);
-        else {
-                audit_freelist_count++;
-                list_add(&ab->list, &audit_freelist);
-        }
-        spin_unlock_irqrestore(&audit_freelist_lock, flags);
+        kmem_cache_free(audit_buffer_cache, ab);
 }
 
-static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
+static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx,
                                                 gfp_t gfp_mask, int type)
 {
-        unsigned long flags;
-        struct audit_buffer *ab = NULL;
-        struct nlmsghdr *nlh;
-
-        spin_lock_irqsave(&audit_freelist_lock, flags);
-        if (!list_empty(&audit_freelist)) {
-                ab = list_entry(audit_freelist.next,
-                                struct audit_buffer, list);
-                list_del(&ab->list);
-                --audit_freelist_count;
-        }
-        spin_unlock_irqrestore(&audit_freelist_lock, flags);
-
-        if (!ab) {
-                ab = kmalloc(sizeof(*ab), gfp_mask);
-                if (!ab)
-                        goto err;
-        }
+        struct audit_buffer *ab;
 
-        ab->ctx = ctx;
-        ab->gfp_mask = gfp_mask;
+        ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask);
+        if (!ab)
+                return NULL;
 
         ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
         if (!ab->skb)
                 goto err;
+        if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
+                goto err;
 
-        nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
-        if (!nlh)
-                goto out_kfree_skb;
+        ab->ctx = ctx;
+        ab->gfp_mask = gfp_mask;
 
         return ab;
 
-out_kfree_skb:
-        kfree_skb(ab->skb);
-        ab->skb = NULL;
 err:
         audit_buffer_free(ab);
         return NULL;
@@ -1622,10 +1649,10 @@ unsigned int audit_serial(void)
 }
 
 static inline void audit_get_stamp(struct audit_context *ctx,
-                                   struct timespec *t, unsigned int *serial)
+                                   struct timespec64 *t, unsigned int *serial)
 {
         if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
-                *t = CURRENT_TIME;
+                ktime_get_real_ts64(t);
                 *serial = audit_serial();
         }
 }
@@ -1649,7 +1676,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
                                      int type)
 {
         struct audit_buffer *ab;
-        struct timespec t;
+        struct timespec64 t;
         unsigned int uninitialized_var(serial);
 
         if (audit_initialized != AUDIT_INITIALIZED)
@@ -1702,8 +1729,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
         }
 
         audit_get_stamp(ab->ctx, &t, &serial);
-        audit_log_format(ab, "audit(%lu.%03lu:%u): ",
-                         t.tv_sec, t.tv_nsec/1000000, serial);
+        audit_log_format(ab, "audit(%llu.%03lu:%u): ",
+                         (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);
 
         return ab;
 }
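The audit.c hunks above replace the spinlock-protected static auditd_conn struct with an RCU-managed pointer: readers use rcu_read_lock()/rcu_dereference(), the writer publishes a replacement under a dedicated spinlock with rcu_assign_pointer(), and the old object is released through call_rcu(). Below is a minimal, self-contained sketch of that pattern; the names (my_conn, conn_lock, conn_set, conn_portid) are illustrative only and are not part of the patch.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct conn {
        u32 portid;
        struct rcu_head rcu;
};

static struct conn __rcu *my_conn;      /* RCU-managed singleton */
static DEFINE_SPINLOCK(conn_lock);      /* serializes writers only */

static void conn_free(struct rcu_head *rcu)
{
        kfree(container_of(rcu, struct conn, rcu));
}

/* writer: publish a new object and defer freeing the old one */
static int conn_set(u32 portid)
{
        struct conn *ac_new, *ac_old;

        ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL);
        if (!ac_new)
                return -ENOMEM;
        ac_new->portid = portid;

        spin_lock(&conn_lock);
        ac_old = rcu_dereference_protected(my_conn,
                                           lockdep_is_held(&conn_lock));
        rcu_assign_pointer(my_conn, ac_new);
        spin_unlock(&conn_lock);

        if (ac_old)
                call_rcu(&ac_old->rcu, conn_free);
        return 0;
}

/* reader: dereference only inside an RCU read-side critical section */
static u32 conn_portid(void)
{
        struct conn *c;
        u32 portid = 0;

        rcu_read_lock();
        c = rcu_dereference(my_conn);
        if (c)
                portid = c->portid;
        rcu_read_unlock();
        return portid;
}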
diff --git a/kernel/audit.h b/kernel/audit.h
index 0d87f8ab8778..ddfce2ea4891 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -112,7 +112,7 @@ struct audit_context {
         enum audit_state state, current_state;
         unsigned int serial;    /* serial number for record */
         int major;              /* syscall number */
-        struct timespec ctime;  /* time of syscall entry */
+        struct timespec64 ctime;        /* time of syscall entry */
         unsigned long argv[4];  /* syscall arguments */
         long return_code;/* syscall return code */
         u64 prio;
@@ -218,7 +218,7 @@ extern void audit_log_name(struct audit_context *context,
                            struct audit_names *n, const struct path *path,
                            int record_num, int *call_panic);
 
-extern int auditd_test_task(const struct task_struct *task);
+extern int auditd_test_task(struct task_struct *task);
 
 #define AUDIT_INODE_BUCKETS 32
 extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -237,8 +237,7 @@ extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
 extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
 extern int parent_len(const char *path);
 extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
-extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
-                                        int done, int multi,
+extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi,
                                         const void *payload, int size);
 extern void audit_panic(const char *message);
 
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 7ea57e516029..52f368b6561e 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -103,15 +103,15 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
                 goto out;
         }
 
-        fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark);
+        fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_group);
         audit_mark->mark.mask = AUDIT_FS_EVENTS;
         audit_mark->path = pathname;
         audit_update_mark(audit_mark, dentry->d_inode);
         audit_mark->rule = krule;
 
-        ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true);
+        ret = fsnotify_add_mark(&audit_mark->mark, inode, NULL, true);
         if (ret < 0) {
-                audit_fsnotify_mark_free(audit_mark);
+                fsnotify_put_mark(&audit_mark->mark);
                 audit_mark = ERR_PTR(ret);
         }
 out:
@@ -168,7 +168,8 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
                                    struct fsnotify_mark *inode_mark,
                                    struct fsnotify_mark *vfsmount_mark,
                                    u32 mask, const void *data, int data_type,
-                                   const unsigned char *dname, u32 cookie)
+                                   const unsigned char *dname, u32 cookie,
+                                   struct fsnotify_iter_info *iter_info)
 {
         struct audit_fsnotify_mark *audit_mark;
         const struct inode *inode = NULL;
@@ -187,7 +188,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
         default:
                 BUG();
                 return 0;
-        };
+        }
 
         if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
                 if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
@@ -201,6 +202,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
 
 static const struct fsnotify_ops audit_mark_fsnotify_ops = {
         .handle_event = audit_mark_handle_event,
+        .free_mark = audit_fsnotify_free_mark,
 };
 
 static int __init audit_fsnotify_init(void)
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 7b44195da81b..011d46e5f73f 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -3,13 +3,14 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/kthread.h>
+#include <linux/refcount.h>
 #include <linux/slab.h>
 
 struct audit_tree;
 struct audit_chunk;
 
 struct audit_tree {
-        atomic_t count;
+        refcount_t count;
         int goner;
         struct audit_chunk *root;
         struct list_head chunks;
@@ -77,7 +78,7 @@ static struct audit_tree *alloc_tree(const char *s)
 
         tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
         if (tree) {
-                atomic_set(&tree->count, 1);
+                refcount_set(&tree->count, 1);
                 tree->goner = 0;
                 INIT_LIST_HEAD(&tree->chunks);
                 INIT_LIST_HEAD(&tree->rules);
@@ -91,12 +92,12 @@ static struct audit_tree *alloc_tree(const char *s)
 
 static inline void get_tree(struct audit_tree *tree)
 {
-        atomic_inc(&tree->count);
+        refcount_inc(&tree->count);
 }
 
 static inline void put_tree(struct audit_tree *tree)
 {
-        if (atomic_dec_and_test(&tree->count))
+        if (refcount_dec_and_test(&tree->count))
                 kfree_rcu(tree, head);
 }
 
@@ -154,7 +155,7 @@ static struct audit_chunk *alloc_chunk(int count)
                 INIT_LIST_HEAD(&chunk->owners[i].list);
                 chunk->owners[i].index = i;
         }
-        fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
+        fsnotify_init_mark(&chunk->mark, audit_tree_group);
         chunk->mark.mask = FS_IN_IGNORED;
         return chunk;
 }
@@ -163,33 +164,54 @@ enum {HASH_SIZE = 128};
 static struct list_head chunk_hash_heads[HASH_SIZE];
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
 
-static inline struct list_head *chunk_hash(const struct inode *inode)
+/* Function to return search key in our hash from inode. */
+static unsigned long inode_to_key(const struct inode *inode)
 {
-        unsigned long n = (unsigned long)inode / L1_CACHE_BYTES;
+        return (unsigned long)inode;
+}
+
+/*
+ * Function to return search key in our hash from chunk. Key 0 is special and
+ * should never be present in the hash.
+ */
+static unsigned long chunk_to_key(struct audit_chunk *chunk)
+{
+        /*
+         * We have a reference to the mark so it should be attached to a
+         * connector.
+         */
+        if (WARN_ON_ONCE(!chunk->mark.connector))
+                return 0;
+        return (unsigned long)chunk->mark.connector->inode;
+}
+
+static inline struct list_head *chunk_hash(unsigned long key)
+{
+        unsigned long n = key / L1_CACHE_BYTES;
         return chunk_hash_heads + n % HASH_SIZE;
 }
 
 /* hash_lock & entry->lock is held by caller */
 static void insert_hash(struct audit_chunk *chunk)
 {
-        struct fsnotify_mark *entry = &chunk->mark;
+        unsigned long key = chunk_to_key(chunk);
         struct list_head *list;
 
-        if (!entry->inode)
+        if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED))
                 return;
-        list = chunk_hash(entry->inode);
+        list = chunk_hash(key);
         list_add_rcu(&chunk->hash, list);
 }
 
 /* called under rcu_read_lock */
 struct audit_chunk *audit_tree_lookup(const struct inode *inode)
 {
-        struct list_head *list = chunk_hash(inode);
+        unsigned long key = inode_to_key(inode);
+        struct list_head *list = chunk_hash(key);
         struct audit_chunk *p;
 
         list_for_each_entry_rcu(p, list, hash) {
-                /* mark.inode may have gone NULL, but who cares? */
-                if (p->mark.inode == inode) {
+                if (chunk_to_key(p) == key) {
                         atomic_long_inc(&p->refs);
                         return p;
                 }
@@ -233,11 +255,15 @@ static void untag_chunk(struct node *p)
 
         mutex_lock(&entry->group->mark_mutex);
         spin_lock(&entry->lock);
-        if (chunk->dead || !entry->inode) {
+        /*
+         * mark_mutex protects mark from getting detached and thus also from
+         * mark->connector->inode getting NULL.
+         */
+        if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
                 spin_unlock(&entry->lock);
                 mutex_unlock(&entry->group->mark_mutex);
                 if (new)
-                        free_chunk(new);
+                        fsnotify_put_mark(&new->mark);
                 goto out;
         }
 
@@ -261,7 +287,7 @@ static void untag_chunk(struct node *p)
         if (!new)
                 goto Fallback;
 
-        if (fsnotify_add_mark_locked(&new->mark, entry->group, entry->inode,
+        if (fsnotify_add_mark_locked(&new->mark, entry->connector->inode,
                                      NULL, 1)) {
                 fsnotify_put_mark(&new->mark);
                 goto Fallback;
@@ -327,7 +353,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
                 return -ENOMEM;
 
         entry = &chunk->mark;
-        if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
+        if (fsnotify_add_mark(entry, inode, NULL, 0)) {
                 fsnotify_put_mark(entry);
                 return -ENOSPC;
         }
@@ -366,7 +392,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
         struct node *p;
         int n;
 
-        old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
+        old_entry = fsnotify_find_mark(&inode->i_fsnotify_marks,
+                                       audit_tree_group);
         if (!old_entry)
                 return create_chunk(inode, tree);
 
@@ -393,17 +420,21 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 
         mutex_lock(&old_entry->group->mark_mutex);
         spin_lock(&old_entry->lock);
-        if (!old_entry->inode) {
+        /*
+         * mark_mutex protects mark from getting detached and thus also from
+         * mark->connector->inode getting NULL.
+         */
+        if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
                 /* old_entry is being shot, lets just lie */
                 spin_unlock(&old_entry->lock);
                 mutex_unlock(&old_entry->group->mark_mutex);
                 fsnotify_put_mark(old_entry);
-                free_chunk(chunk);
+                fsnotify_put_mark(&chunk->mark);
                 return -ENOENT;
         }
 
-        if (fsnotify_add_mark_locked(chunk_entry, old_entry->group,
-                                     old_entry->inode, NULL, 1)) {
+        if (fsnotify_add_mark_locked(chunk_entry,
+                                     old_entry->connector->inode, NULL, 1)) {
                 spin_unlock(&old_entry->lock);
                 mutex_unlock(&old_entry->group->mark_mutex);
                 fsnotify_put_mark(chunk_entry);
@@ -588,7 +619,8 @@ int audit_remove_tree_rule(struct audit_krule *rule)
 
 static int compare_root(struct vfsmount *mnt, void *arg)
 {
-        return d_backing_inode(mnt->mnt_root) == arg;
+        return inode_to_key(d_backing_inode(mnt->mnt_root)) ==
+               (unsigned long)arg;
 }
 
 void audit_trim_trees(void)
@@ -623,9 +655,10 @@ void audit_trim_trees(void)
                 list_for_each_entry(node, &tree->chunks, list) {
                         struct audit_chunk *chunk = find_chunk(node);
                         /* this could be NULL if the watch is dying else where... */
-                        struct inode *inode = chunk->mark.inode;
                         node->index |= 1U<<31;
-                        if (iterate_mounts(compare_root, inode, root_mnt))
+                        if (iterate_mounts(compare_root,
+                                           (void *)chunk_to_key(chunk),
+                                           root_mnt))
                                 node->index &= ~(1U<<31);
                 }
                 spin_unlock(&hash_lock);
@@ -958,7 +991,8 @@ static int audit_tree_handle_event(struct fsnotify_group *group,
                                    struct fsnotify_mark *inode_mark,
                                    struct fsnotify_mark *vfsmount_mark,
                                    u32 mask, const void *data, int data_type,
-                                   const unsigned char *file_name, u32 cookie)
+                                   const unsigned char *file_name, u32 cookie,
+                                   struct fsnotify_iter_info *iter_info)
 {
         return 0;
 }
@@ -979,6 +1013,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
 static const struct fsnotify_ops audit_tree_ops = {
         .handle_event = audit_tree_handle_event,
         .freeing_mark = audit_tree_freeing_mark,
+        .free_mark = audit_tree_destroy_watch,
 };
 
 static int __init audit_tree_init(void)
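The audit_tree.c hunks above (and the audit_watch.c hunks below) convert open-coded atomic_t reference counts to refcount_t, which saturates on overflow instead of wrapping. A tiny sketch of the conversion on an illustrative object type (not taken from the patch):

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
        refcount_t count;       /* was: atomic_t count */
};

static struct obj *obj_alloc(void)
{
        struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

        if (o)
                refcount_set(&o->count, 1);     /* was: atomic_set() */
        return o;
}

static void obj_get(struct obj *o)
{
        refcount_inc(&o->count);                /* was: atomic_inc() */
}

static void obj_put(struct obj *o)
{
        if (refcount_dec_and_test(&o->count))   /* was: atomic_dec_and_test() */
                kfree(o);
}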
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f79e4658433d..62d686d96581 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -28,6 +28,7 @@
 #include <linux/fsnotify_backend.h>
 #include <linux/namei.h>
 #include <linux/netlink.h>
+#include <linux/refcount.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/security.h>
@@ -46,7 +47,7 @@
  */
 
 struct audit_watch {
-        atomic_t count;         /* reference count */
+        refcount_t count;       /* reference count */
         dev_t dev;              /* associated superblock device */
         char *path;             /* insertion path */
         unsigned long ino;      /* associated inode number */
@@ -102,7 +103,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode)
         struct audit_parent *parent = NULL;
         struct fsnotify_mark *entry;
 
-        entry = fsnotify_find_inode_mark(audit_watch_group, inode);
+        entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_watch_group);
         if (entry)
                 parent = container_of(entry, struct audit_parent, mark);
 
@@ -111,12 +112,12 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode)
 
 void audit_get_watch(struct audit_watch *watch)
 {
-        atomic_inc(&watch->count);
+        refcount_inc(&watch->count);
 }
 
 void audit_put_watch(struct audit_watch *watch)
 {
-        if (atomic_dec_and_test(&watch->count)) {
+        if (refcount_dec_and_test(&watch->count)) {
                 WARN_ON(watch->parent);
                 WARN_ON(!list_empty(&watch->rules));
                 kfree(watch->path);
@@ -157,9 +158,9 @@ static struct audit_parent *audit_init_parent(struct path *path)
 
         INIT_LIST_HEAD(&parent->watches);
 
-        fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
+        fsnotify_init_mark(&parent->mark, audit_watch_group);
         parent->mark.mask = AUDIT_FS_WATCH;
-        ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
+        ret = fsnotify_add_mark(&parent->mark, inode, NULL, 0);
         if (ret < 0) {
                 audit_free_parent(parent);
                 return ERR_PTR(ret);
@@ -178,7 +179,7 @@ static struct audit_watch *audit_init_watch(char *path)
                 return ERR_PTR(-ENOMEM);
 
         INIT_LIST_HEAD(&watch->rules);
-        atomic_set(&watch->count, 1);
+        refcount_set(&watch->count, 1);
         watch->path = path;
         watch->dev = AUDIT_DEV_UNSET;
         watch->ino = AUDIT_INO_UNSET;
@@ -472,7 +473,8 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
                                     struct fsnotify_mark *inode_mark,
                                     struct fsnotify_mark *vfsmount_mark,
                                     u32 mask, const void *data, int data_type,
-                                    const unsigned char *dname, u32 cookie)
+                                    const unsigned char *dname, u32 cookie,
+                                    struct fsnotify_iter_info *iter_info)
 {
         const struct inode *inode;
         struct audit_parent *parent;
@@ -492,7 +494,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
                 BUG();
                 inode = NULL;
                 break;
-        };
+        }
 
         if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
                 audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
@@ -506,6 +508,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
 
 static const struct fsnotify_ops audit_watch_fsnotify_ops = {
         .handle_event = audit_watch_handle_event,
+        .free_mark = audit_watch_free_mark,
 };
 
 static int __init audit_watch_init(void)
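The audit_fsnotify.c, audit_tree.c and audit_watch.c hunks all track the same fsnotify API change: fsnotify_init_mark() now takes the group instead of a destructor, fsnotify_add_mark() loses its group argument, and the destructor moves into the group's fsnotify_ops as .free_mark. A condensed sketch of the new calling convention, with hypothetical example_* names (group creation and event handling omitted):

#include <linux/fsnotify_backend.h>
#include <linux/slab.h>

/* assumed created elsewhere, e.g. fsnotify_alloc_group(&example_ops) */
static struct fsnotify_group *example_group;

static void example_free_mark(struct fsnotify_mark *mark)
{
        kfree(mark);
}

static const struct fsnotify_ops example_ops = {
        /* destructor now lives in the ops table, not in fsnotify_init_mark() */
        .free_mark = example_free_mark,
};

static int example_watch_inode(struct inode *inode)
{
        struct fsnotify_mark *mark;
        int ret;

        mark = kzalloc(sizeof(*mark), GFP_KERNEL);
        if (!mark)
                return -ENOMEM;

        fsnotify_init_mark(mark, example_group);        /* group, not a destructor */
        ret = fsnotify_add_mark(mark, inode, NULL, 0);  /* no group argument */
        if (ret < 0)
                fsnotify_put_mark(mark);        /* ->free_mark runs once the last ref drops */
        return ret;
}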
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 880519d6cf2a..0b0aa5854dac 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -338,7 +338,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
                     entry->rule.listnr != AUDIT_FILTER_USER)
                         return -EINVAL;
                 break;
-        };
+        }
 
         switch(f->type) {
         default:
@@ -412,7 +412,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
                 if (entry->rule.listnr != AUDIT_FILTER_EXIT)
                         return -EINVAL;
                 break;
-        };
+        }
         return 0;
 }
 
@@ -1033,7 +1033,7 @@ out:
 }
 
 /* List rules using struct audit_rule_data. */
-static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
+static void audit_list_rules(int seq, struct sk_buff_head *q)
 {
         struct sk_buff *skb;
         struct audit_krule *r;
@@ -1048,15 +1048,15 @@ static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
                         data = audit_krule_to_data(r);
                         if (unlikely(!data))
                                 break;
-                        skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES,
-                                               0, 1, data,
+                        skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
+                                               data,
                                                sizeof(*data) + data->buflen);
                         if (skb)
                                 skb_queue_tail(q, skb);
                         kfree(data);
                 }
         }
-        skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+        skb = audit_make_reply(seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
         if (skb)
                 skb_queue_tail(q, skb);
 }
@@ -1085,13 +1085,11 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
 /**
  * audit_rule_change - apply all rules to the specified message type
  * @type: audit message type
- * @portid: target port id for netlink audit messages
  * @seq: netlink audit message sequence (serial) number
  * @data: payload data
  * @datasz: size of payload data
  */
-int audit_rule_change(int type, __u32 portid, int seq, void *data,
-                      size_t datasz)
+int audit_rule_change(int type, int seq, void *data, size_t datasz)
 {
         int err = 0;
         struct audit_entry *entry;
@@ -1150,7 +1148,7 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq)
         skb_queue_head_init(&dest->q);
 
         mutex_lock(&audit_filter_mutex);
-        audit_list_rules(portid, seq, &dest->q);
+        audit_list_rules(seq, &dest->q);
         mutex_unlock(&audit_filter_mutex);
 
         tsk = kthread_run(audit_send_list, dest, "audit_send_list");
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1c2333155893..bb724baa7ac9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -73,6 +73,7 @@
 #include <linux/ctype.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
+#include <linux/fsnotify_backend.h>
 #include <uapi/linux/limits.h>
 
 #include "audit.h"
@@ -1532,7 +1533,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
                 return;
 
         context->serial = 0;
-        context->ctime = CURRENT_TIME;
+        ktime_get_real_ts64(&context->ctime);
         context->in_syscall = 1;
         context->current_state = state;
         context->ppid = 0;
@@ -1596,7 +1597,7 @@ static inline void handle_one(const struct inode *inode)
         struct audit_tree_refs *p;
         struct audit_chunk *chunk;
         int count;
-        if (likely(hlist_empty(&inode->i_fsnotify_marks)))
+        if (likely(!inode->i_fsnotify_marks))
                 return;
         context = current->audit_context;
         p = context->trees;
@@ -1639,7 +1640,7 @@ retry:
1639 seq = read_seqbegin(&rename_lock); 1640 seq = read_seqbegin(&rename_lock);
1640 for(;;) { 1641 for(;;) {
1641 struct inode *inode = d_backing_inode(d); 1642 struct inode *inode = d_backing_inode(d);
1642 if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { 1643 if (inode && unlikely(inode->i_fsnotify_marks)) {
1643 struct audit_chunk *chunk; 1644 struct audit_chunk *chunk;
1644 chunk = audit_tree_lookup(inode); 1645 chunk = audit_tree_lookup(inode);
1645 if (chunk) { 1646 if (chunk) {
@@ -1941,13 +1942,13 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
1941/** 1942/**
1942 * auditsc_get_stamp - get local copies of audit_context values 1943 * auditsc_get_stamp - get local copies of audit_context values
1943 * @ctx: audit_context for the task 1944 * @ctx: audit_context for the task
1944 * @t: timespec to store time recorded in the audit_context 1945 * @t: timespec64 to store time recorded in the audit_context
1945 * @serial: serial value that is recorded in the audit_context 1946 * @serial: serial value that is recorded in the audit_context
1946 * 1947 *
1947 * Also sets the context as auditable. 1948 * Also sets the context as auditable.
1948 */ 1949 */
1949int auditsc_get_stamp(struct audit_context *ctx, 1950int auditsc_get_stamp(struct audit_context *ctx,
1950 struct timespec *t, unsigned int *serial) 1951 struct timespec64 *t, unsigned int *serial)
1951{ 1952{
1952 if (!ctx->in_syscall) 1953 if (!ctx->in_syscall)
1953 return 0; 1954 return 0;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e1ce4f4fd7fd..e1e5e658f2db 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
1obj-y := core.o 1obj-y := core.o
2 2
3obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o 3obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
4obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o 4obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
5ifeq ($(CONFIG_PERF_EVENTS),y) 5ifeq ($(CONFIG_PERF_EVENTS),y)
6obj-$(CONFIG_BPF_SYSCALL) += stackmap.o 6obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
7endif 7endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 6b6f41f0b211..5e00b2333c26 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1,4 +1,5 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 * Copyright (c) 2016,2017 Facebook
2 * 3 *
3 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public 5 * modify it under the terms of version 2 of the GNU General Public
@@ -16,6 +17,8 @@
16#include <linux/filter.h> 17#include <linux/filter.h>
17#include <linux/perf_event.h> 18#include <linux/perf_event.h>
18 19
20#include "map_in_map.h"
21
19static void bpf_array_free_percpu(struct bpf_array *array) 22static void bpf_array_free_percpu(struct bpf_array *array)
20{ 23{
21 int i; 24 int i;
@@ -113,6 +116,30 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
113 return array->value + array->elem_size * index; 116 return array->value + array->elem_size * index;
114} 117}
115 118
119/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
120static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
121{
122 struct bpf_insn *insn = insn_buf;
123 u32 elem_size = round_up(map->value_size, 8);
124 const int ret = BPF_REG_0;
125 const int map_ptr = BPF_REG_1;
126 const int index = BPF_REG_2;
127
128 *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
129 *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
130 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
131
132 if (is_power_of_2(elem_size)) {
133 *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
134 } else {
135 *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
136 }
137 *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
138 *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
139 *insn++ = BPF_MOV64_IMM(ret, 0);
140 return insn - insn_buf;
141}
142
116/* Called from eBPF program */ 143/* Called from eBPF program */
117static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) 144static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
118{ 145{
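
The generated sequence above mirrors array_map_lookup_elem() so the verifier can later splice it straight into the program instead of emitting a helper call. A minimal C rendering of what those instructions compute (illustration only, not part of the patch; "base" stands for the map_ptr register after the offsetof(struct bpf_array, value) adjustment):

#include <stdint.h>
#include <stddef.h>

static void *inlined_array_lookup(char *base, uint32_t max_entries,
				  uint32_t elem_size, const uint32_t *key)
{
	uint32_t index = *key;		/* BPF_LDX_MEM(BPF_W, ret, index, 0) */

	if (index >= max_entries)	/* BPF_JMP_IMM(BPF_JGE, ...) jumps    */
		return NULL;		/* ... to BPF_MOV64_IMM(ret, 0)       */

	/* power-of-2 element sizes take the LSH branch, others the MUL */
	return base + (uint64_t)index * elem_size;	/* BPF_ADD of map_ptr */
}

int main(void)
{
	char storage[4 * 8] = { 0 };
	uint32_t key = 2;

	return inlined_array_lookup(storage, 4, 8, &key) ? 0 : 1;
}
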
@@ -155,7 +182,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
155static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) 182static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
156{ 183{
157 struct bpf_array *array = container_of(map, struct bpf_array, map); 184 struct bpf_array *array = container_of(map, struct bpf_array, map);
158 u32 index = *(u32 *)key; 185 u32 index = key ? *(u32 *)key : U32_MAX;
159 u32 *next = (u32 *)next_key; 186 u32 *next = (u32 *)next_key;
160 187
161 if (index >= array->map.max_entries) { 188 if (index >= array->map.max_entries) {
@@ -260,21 +287,17 @@ static void array_map_free(struct bpf_map *map)
260 bpf_map_area_free(array); 287 bpf_map_area_free(array);
261} 288}
262 289
263static const struct bpf_map_ops array_ops = { 290const struct bpf_map_ops array_map_ops = {
264 .map_alloc = array_map_alloc, 291 .map_alloc = array_map_alloc,
265 .map_free = array_map_free, 292 .map_free = array_map_free,
266 .map_get_next_key = array_map_get_next_key, 293 .map_get_next_key = array_map_get_next_key,
267 .map_lookup_elem = array_map_lookup_elem, 294 .map_lookup_elem = array_map_lookup_elem,
268 .map_update_elem = array_map_update_elem, 295 .map_update_elem = array_map_update_elem,
269 .map_delete_elem = array_map_delete_elem, 296 .map_delete_elem = array_map_delete_elem,
297 .map_gen_lookup = array_map_gen_lookup,
270}; 298};
271 299
272static struct bpf_map_type_list array_type __ro_after_init = { 300const struct bpf_map_ops percpu_array_map_ops = {
273 .ops = &array_ops,
274 .type = BPF_MAP_TYPE_ARRAY,
275};
276
277static const struct bpf_map_ops percpu_array_ops = {
278 .map_alloc = array_map_alloc, 301 .map_alloc = array_map_alloc,
279 .map_free = array_map_free, 302 .map_free = array_map_free,
280 .map_get_next_key = array_map_get_next_key, 303 .map_get_next_key = array_map_get_next_key,
@@ -283,19 +306,6 @@ static const struct bpf_map_ops percpu_array_ops = {
283 .map_delete_elem = array_map_delete_elem, 306 .map_delete_elem = array_map_delete_elem,
284}; 307};
285 308
286static struct bpf_map_type_list percpu_array_type __ro_after_init = {
287 .ops = &percpu_array_ops,
288 .type = BPF_MAP_TYPE_PERCPU_ARRAY,
289};
290
291static int __init register_array_map(void)
292{
293 bpf_register_map_type(&array_type);
294 bpf_register_map_type(&percpu_array_type);
295 return 0;
296}
297late_initcall(register_array_map);
298
299static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) 309static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
300{ 310{
301 /* only file descriptors can be stored in this type of map */ 311 /* only file descriptors can be stored in this type of map */
@@ -399,7 +409,7 @@ void bpf_fd_array_map_clear(struct bpf_map *map)
399 fd_array_map_delete_elem(map, &i); 409 fd_array_map_delete_elem(map, &i);
400} 410}
401 411
402static const struct bpf_map_ops prog_array_ops = { 412const struct bpf_map_ops prog_array_map_ops = {
403 .map_alloc = fd_array_map_alloc, 413 .map_alloc = fd_array_map_alloc,
404 .map_free = fd_array_map_free, 414 .map_free = fd_array_map_free,
405 .map_get_next_key = array_map_get_next_key, 415 .map_get_next_key = array_map_get_next_key,
@@ -409,18 +419,6 @@ static const struct bpf_map_ops prog_array_ops = {
409 .map_fd_put_ptr = prog_fd_array_put_ptr, 419 .map_fd_put_ptr = prog_fd_array_put_ptr,
410}; 420};
411 421
412static struct bpf_map_type_list prog_array_type __ro_after_init = {
413 .ops = &prog_array_ops,
414 .type = BPF_MAP_TYPE_PROG_ARRAY,
415};
416
417static int __init register_prog_array_map(void)
418{
419 bpf_register_map_type(&prog_array_type);
420 return 0;
421}
422late_initcall(register_prog_array_map);
423
424static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, 422static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
425 struct file *map_file) 423 struct file *map_file)
426{ 424{
@@ -511,7 +509,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,
511 rcu_read_unlock(); 509 rcu_read_unlock();
512} 510}
513 511
514static const struct bpf_map_ops perf_event_array_ops = { 512const struct bpf_map_ops perf_event_array_map_ops = {
515 .map_alloc = fd_array_map_alloc, 513 .map_alloc = fd_array_map_alloc,
516 .map_free = fd_array_map_free, 514 .map_free = fd_array_map_free,
517 .map_get_next_key = array_map_get_next_key, 515 .map_get_next_key = array_map_get_next_key,
@@ -522,18 +520,6 @@ static const struct bpf_map_ops perf_event_array_ops = {
522 .map_release = perf_event_fd_array_release, 520 .map_release = perf_event_fd_array_release,
523}; 521};
524 522
525static struct bpf_map_type_list perf_event_array_type __ro_after_init = {
526 .ops = &perf_event_array_ops,
527 .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
528};
529
530static int __init register_perf_event_array_map(void)
531{
532 bpf_register_map_type(&perf_event_array_type);
533 return 0;
534}
535late_initcall(register_perf_event_array_map);
536
537#ifdef CONFIG_CGROUPS 523#ifdef CONFIG_CGROUPS
538static void *cgroup_fd_array_get_ptr(struct bpf_map *map, 524static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
539 struct file *map_file /* not used */, 525 struct file *map_file /* not used */,
@@ -554,7 +540,7 @@ static void cgroup_fd_array_free(struct bpf_map *map)
554 fd_array_map_free(map); 540 fd_array_map_free(map);
555} 541}
556 542
557static const struct bpf_map_ops cgroup_array_ops = { 543const struct bpf_map_ops cgroup_array_map_ops = {
558 .map_alloc = fd_array_map_alloc, 544 .map_alloc = fd_array_map_alloc,
559 .map_free = cgroup_fd_array_free, 545 .map_free = cgroup_fd_array_free,
560 .map_get_next_key = array_map_get_next_key, 546 .map_get_next_key = array_map_get_next_key,
@@ -563,16 +549,53 @@ static const struct bpf_map_ops cgroup_array_ops = {
563 .map_fd_get_ptr = cgroup_fd_array_get_ptr, 549 .map_fd_get_ptr = cgroup_fd_array_get_ptr,
564 .map_fd_put_ptr = cgroup_fd_array_put_ptr, 550 .map_fd_put_ptr = cgroup_fd_array_put_ptr,
565}; 551};
552#endif
566 553
567static struct bpf_map_type_list cgroup_array_type __ro_after_init = { 554static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
568 .ops = &cgroup_array_ops, 555{
569 .type = BPF_MAP_TYPE_CGROUP_ARRAY, 556 struct bpf_map *map, *inner_map_meta;
570}; 557
558 inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
559 if (IS_ERR(inner_map_meta))
560 return inner_map_meta;
571 561
572static int __init register_cgroup_array_map(void) 562 map = fd_array_map_alloc(attr);
563 if (IS_ERR(map)) {
564 bpf_map_meta_free(inner_map_meta);
565 return map;
566 }
567
568 map->inner_map_meta = inner_map_meta;
569
570 return map;
571}
572
573static void array_of_map_free(struct bpf_map *map)
573{ 574{
574 bpf_register_map_type(&cgroup_array_type); 575 /* map->inner_map_meta is only accessed by syscall which
575 return 0; 576 * is protected by fdget/fdput.
577 */
578 bpf_map_meta_free(map->inner_map_meta);
579 bpf_fd_array_map_clear(map);
580 fd_array_map_free(map);
576} 581}
577late_initcall(register_cgroup_array_map); 582
578#endif 583static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
584{
585 struct bpf_map **inner_map = array_map_lookup_elem(map, key);
586
587 if (!inner_map)
588 return NULL;
589
590 return READ_ONCE(*inner_map);
591}
592
593const struct bpf_map_ops array_of_maps_map_ops = {
594 .map_alloc = array_of_map_alloc,
595 .map_free = array_of_map_free,
596 .map_get_next_key = array_map_get_next_key,
597 .map_lookup_elem = array_of_map_lookup_elem,
598 .map_delete_elem = fd_array_map_delete_elem,
599 .map_fd_get_ptr = bpf_map_fd_get_ptr,
600 .map_fd_put_ptr = bpf_map_fd_put_ptr,
601};
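
array_of_map_alloc() consumes the new attr->inner_map_fd to capture the inner map's metadata before the outer fd array is sized. A minimal user-space sketch of creating such a map through the bpf(2) syscall (assumes uapi headers that already carry BPF_MAP_TYPE_ARRAY_OF_MAPS and the inner_map_fd attribute from this series; error handling trimmed):

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static int bpf_sys(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

static int create_array_of_maps(void)
{
	union bpf_attr attr;
	int inner_fd, outer_fd;

	/* the inner map whose shape all later elements must match */
	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_ARRAY;
	attr.key_size    = sizeof(int);
	attr.value_size  = sizeof(long long);
	attr.max_entries = 16;
	inner_fd = bpf_sys(BPF_MAP_CREATE, &attr);

	/* outer map: user space sees 4-byte fds, the kernel stores pointers */
	memset(&attr, 0, sizeof(attr));
	attr.map_type     = BPF_MAP_TYPE_ARRAY_OF_MAPS;
	attr.key_size     = sizeof(int);
	attr.value_size   = sizeof(int);
	attr.max_entries  = 8;
	attr.inner_map_fd = inner_fd;	/* only consulted at create time */
	outer_fd = bpf_sys(BPF_MAP_CREATE, &attr);

	return outer_fd;
}

int main(void)
{
	return create_array_of_maps() < 0;
}

Elements are then installed with BPF_MAP_UPDATE_ELEM, passing an inner map fd as the 4-byte value, which bpf_map_fd_get_ptr() converts into a held map reference.
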
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index f62d1d56f41d..e6ef4401a138 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -13,7 +13,7 @@
13#define LOCAL_FREE_TARGET (128) 13#define LOCAL_FREE_TARGET (128)
14#define LOCAL_NR_SCANS LOCAL_FREE_TARGET 14#define LOCAL_NR_SCANS LOCAL_FREE_TARGET
15 15
16#define PERCPU_FREE_TARGET (16) 16#define PERCPU_FREE_TARGET (4)
17#define PERCPU_NR_SCANS PERCPU_FREE_TARGET 17#define PERCPU_NR_SCANS PERCPU_FREE_TARGET
18 18
19/* Helpers to get the local list index */ 19/* Helpers to get the local list index */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index da0f53690295..ea6033cba947 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -154,7 +154,7 @@ int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
154 154
155/** 155/**
156 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering 156 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
157 * @sk: The socken sending or receiving traffic 157 * @sk: The socket sending or receiving traffic
158 * @skb: The skb that is being sent or received 158 * @skb: The skb that is being sent or received
159 * @type: The type of program to be exectuted 159 * @type: The type of program to be exectuted
159 * @type: The type of program to be executed 159 * @type: The type of program to be executed
160 * 160 *
@@ -189,10 +189,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
189 prog = rcu_dereference(cgrp->bpf.effective[type]); 189 prog = rcu_dereference(cgrp->bpf.effective[type]);
190 if (prog) { 190 if (prog) {
191 unsigned int offset = skb->data - skb_network_header(skb); 191 unsigned int offset = skb->data - skb_network_header(skb);
192 struct sock *save_sk = skb->sk;
192 193
194 skb->sk = sk;
193 __skb_push(skb, offset); 195 __skb_push(skb, offset);
194 ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; 196 ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
195 __skb_pull(skb, offset); 197 __skb_pull(skb, offset);
198 skb->sk = save_sk;
196 } 199 }
197 200
198 rcu_read_unlock(); 201 rcu_read_unlock();
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b4f1cb0c5ac7..dedf367f59bb 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -76,8 +76,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
76 76
77struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) 77struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
78{ 78{
79 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | 79 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
80 gfp_extra_flags;
81 struct bpf_prog_aux *aux; 80 struct bpf_prog_aux *aux;
82 struct bpf_prog *fp; 81 struct bpf_prog *fp;
83 82
@@ -107,8 +106,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_alloc);
107struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, 106struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
108 gfp_t gfp_extra_flags) 107 gfp_t gfp_extra_flags)
109{ 108{
110 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | 109 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
111 gfp_extra_flags;
112 struct bpf_prog *fp; 110 struct bpf_prog *fp;
113 u32 pages, delta; 111 u32 pages, delta;
114 int ret; 112 int ret;
@@ -394,27 +392,23 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
394 392
395void bpf_prog_kallsyms_add(struct bpf_prog *fp) 393void bpf_prog_kallsyms_add(struct bpf_prog *fp)
396{ 394{
397 unsigned long flags;
398
399 if (!bpf_prog_kallsyms_candidate(fp) || 395 if (!bpf_prog_kallsyms_candidate(fp) ||
400 !capable(CAP_SYS_ADMIN)) 396 !capable(CAP_SYS_ADMIN))
401 return; 397 return;
402 398
403 spin_lock_irqsave(&bpf_lock, flags); 399 spin_lock_bh(&bpf_lock);
404 bpf_prog_ksym_node_add(fp->aux); 400 bpf_prog_ksym_node_add(fp->aux);
405 spin_unlock_irqrestore(&bpf_lock, flags); 401 spin_unlock_bh(&bpf_lock);
406} 402}
407 403
408void bpf_prog_kallsyms_del(struct bpf_prog *fp) 404void bpf_prog_kallsyms_del(struct bpf_prog *fp)
409{ 405{
410 unsigned long flags;
411
412 if (!bpf_prog_kallsyms_candidate(fp)) 406 if (!bpf_prog_kallsyms_candidate(fp))
413 return; 407 return;
414 408
415 spin_lock_irqsave(&bpf_lock, flags); 409 spin_lock_bh(&bpf_lock);
416 bpf_prog_ksym_node_del(fp->aux); 410 bpf_prog_ksym_node_del(fp->aux);
417 spin_unlock_irqrestore(&bpf_lock, flags); 411 spin_unlock_bh(&bpf_lock);
418} 412}
419 413
420static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) 414static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr)
@@ -659,8 +653,7 @@ out:
659static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, 653static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
660 gfp_t gfp_extra_flags) 654 gfp_t gfp_extra_flags)
661{ 655{
662 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | 656 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
663 gfp_extra_flags;
664 struct bpf_prog *fp; 657 struct bpf_prog *fp;
665 658
666 fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); 659 fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 361a69dfe543..004334ea13ba 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -16,6 +16,7 @@
16#include <linux/rculist_nulls.h> 16#include <linux/rculist_nulls.h>
17#include "percpu_freelist.h" 17#include "percpu_freelist.h"
18#include "bpf_lru_list.h" 18#include "bpf_lru_list.h"
19#include "map_in_map.h"
19 20
20struct bucket { 21struct bucket {
21 struct hlist_nulls_head head; 22 struct hlist_nulls_head head;
@@ -86,6 +87,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size
86 return *(void __percpu **)(l->key + key_size); 87 return *(void __percpu **)(l->key + key_size);
87} 88}
88 89
90static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
91{
92 return *(void **)(l->key + roundup(map->key_size, 8));
93}
94
89static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) 95static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
90{ 96{
91 return (struct htab_elem *) (htab->elems + i * htab->elem_size); 97 return (struct htab_elem *) (htab->elems + i * htab->elem_size);
@@ -426,7 +432,11 @@ again:
426 return NULL; 432 return NULL;
427} 433}
428 434
429/* Called from syscall or from eBPF program */ 435/* Called from syscall or from eBPF program directly, so
436 * arguments have to match bpf_map_lookup_elem() exactly.
437 * The return value is adjusted by BPF instructions
438 * in htab_map_gen_lookup().
439 */
430static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) 440static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
431{ 441{
432 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 442 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
@@ -458,6 +468,30 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
458 return NULL; 468 return NULL;
459} 469}
460 470
471/* inline bpf_map_lookup_elem() call.
472 * Instead of:
473 * bpf_prog
474 * bpf_map_lookup_elem
475 * map->ops->map_lookup_elem
476 * htab_map_lookup_elem
477 * __htab_map_lookup_elem
478 * do:
479 * bpf_prog
480 * __htab_map_lookup_elem
481 */
482static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
483{
484 struct bpf_insn *insn = insn_buf;
485 const int ret = BPF_REG_0;
486
487 *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
488 *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
489 *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
490 offsetof(struct htab_elem, key) +
491 round_up(map->key_size, 8));
492 return insn - insn_buf;
493}
494
461static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) 495static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
462{ 496{
463 struct htab_elem *l = __htab_map_lookup_elem(map, key); 497 struct htab_elem *l = __htab_map_lookup_elem(map, key);
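
htab_map_gen_lookup() lets the verifier turn the helper call into a direct call to __htab_map_lookup_elem() followed by one constant add that converts the returned element pointer into a value pointer. A small standalone sketch of that offset arithmetic (the demo struct only imitates the real htab_elem layout, where bookkeeping precedes key[] and the value follows the key rounded up to 8 bytes):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_htab_elem {
	uint64_t bookkeeping[2];	/* list/rcu state in the real struct */
	char key[];			/* value bytes follow, 8-byte aligned */
};

#define ROUND_UP8(x) (((x) + 7u) & ~7u)

int main(void)
{
	uint32_t key_size = 5;

	/* the constant added by the patched BPF_ALU64_IMM(BPF_ADD, ret, ...)
	 * whenever the call returns a non-NULL element */
	printf("value offset = %zu\n",
	       offsetof(struct demo_htab_elem, key) + ROUND_UP8(key_size));
	return 0;
}
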
@@ -506,12 +540,15 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
506 struct hlist_nulls_head *head; 540 struct hlist_nulls_head *head;
507 struct htab_elem *l, *next_l; 541 struct htab_elem *l, *next_l;
508 u32 hash, key_size; 542 u32 hash, key_size;
509 int i; 543 int i = 0;
510 544
511 WARN_ON_ONCE(!rcu_read_lock_held()); 545 WARN_ON_ONCE(!rcu_read_lock_held());
512 546
513 key_size = map->key_size; 547 key_size = map->key_size;
514 548
549 if (!key)
550 goto find_first_elem;
551
515 hash = htab_map_hash(key, key_size); 552 hash = htab_map_hash(key, key_size);
516 553
517 head = select_bucket(htab, hash); 554 head = select_bucket(htab, hash);
@@ -519,10 +556,8 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
519 /* lookup the key */ 556 /* lookup the key */
520 l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); 557 l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
521 558
522 if (!l) { 559 if (!l)
523 i = 0;
524 goto find_first_elem; 560 goto find_first_elem;
525 }
526 561
527 /* key was found, get next key in the same bucket */ 562 /* key was found, get next key in the same bucket */
528 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), 563 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
@@ -582,6 +617,14 @@ static void htab_elem_free_rcu(struct rcu_head *head)
582 617
583static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) 618static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
584{ 619{
620 struct bpf_map *map = &htab->map;
621
622 if (map->ops->map_fd_put_ptr) {
623 void *ptr = fd_htab_map_get_ptr(map, l);
624
625 map->ops->map_fd_put_ptr(ptr);
626 }
627
585 if (htab_is_prealloc(htab)) { 628 if (htab_is_prealloc(htab)) {
586 pcpu_freelist_push(&htab->freelist, &l->fnode); 629 pcpu_freelist_push(&htab->freelist, &l->fnode);
587 } else { 630 } else {
@@ -1027,6 +1070,7 @@ static void delete_all_elements(struct bpf_htab *htab)
1027 } 1070 }
1028 } 1071 }
1029} 1072}
1073
1030/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ 1074/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1031static void htab_map_free(struct bpf_map *map) 1075static void htab_map_free(struct bpf_map *map)
1032{ 1076{
@@ -1053,21 +1097,17 @@ static void htab_map_free(struct bpf_map *map)
1053 kfree(htab); 1097 kfree(htab);
1054} 1098}
1055 1099
1056static const struct bpf_map_ops htab_ops = { 1100const struct bpf_map_ops htab_map_ops = {
1057 .map_alloc = htab_map_alloc, 1101 .map_alloc = htab_map_alloc,
1058 .map_free = htab_map_free, 1102 .map_free = htab_map_free,
1059 .map_get_next_key = htab_map_get_next_key, 1103 .map_get_next_key = htab_map_get_next_key,
1060 .map_lookup_elem = htab_map_lookup_elem, 1104 .map_lookup_elem = htab_map_lookup_elem,
1061 .map_update_elem = htab_map_update_elem, 1105 .map_update_elem = htab_map_update_elem,
1062 .map_delete_elem = htab_map_delete_elem, 1106 .map_delete_elem = htab_map_delete_elem,
1107 .map_gen_lookup = htab_map_gen_lookup,
1063}; 1108};
1064 1109
1065static struct bpf_map_type_list htab_type __ro_after_init = { 1110const struct bpf_map_ops htab_lru_map_ops = {
1066 .ops = &htab_ops,
1067 .type = BPF_MAP_TYPE_HASH,
1068};
1069
1070static const struct bpf_map_ops htab_lru_ops = {
1071 .map_alloc = htab_map_alloc, 1111 .map_alloc = htab_map_alloc,
1072 .map_free = htab_map_free, 1112 .map_free = htab_map_free,
1073 .map_get_next_key = htab_map_get_next_key, 1113 .map_get_next_key = htab_map_get_next_key,
@@ -1076,11 +1116,6 @@ static const struct bpf_map_ops htab_lru_ops = {
1076 .map_delete_elem = htab_lru_map_delete_elem, 1116 .map_delete_elem = htab_lru_map_delete_elem,
1077}; 1117};
1078 1118
1079static struct bpf_map_type_list htab_lru_type __ro_after_init = {
1080 .ops = &htab_lru_ops,
1081 .type = BPF_MAP_TYPE_LRU_HASH,
1082};
1083
1084/* Called from eBPF program */ 1119/* Called from eBPF program */
1085static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) 1120static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
1086{ 1121{
@@ -1154,7 +1189,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
1154 return ret; 1189 return ret;
1155} 1190}
1156 1191
1157static const struct bpf_map_ops htab_percpu_ops = { 1192const struct bpf_map_ops htab_percpu_map_ops = {
1158 .map_alloc = htab_map_alloc, 1193 .map_alloc = htab_map_alloc,
1159 .map_free = htab_map_free, 1194 .map_free = htab_map_free,
1160 .map_get_next_key = htab_map_get_next_key, 1195 .map_get_next_key = htab_map_get_next_key,
@@ -1163,12 +1198,7 @@ static const struct bpf_map_ops htab_percpu_ops = {
1163 .map_delete_elem = htab_map_delete_elem, 1198 .map_delete_elem = htab_map_delete_elem,
1164}; 1199};
1165 1200
1166static struct bpf_map_type_list htab_percpu_type __ro_after_init = { 1201const struct bpf_map_ops htab_lru_percpu_map_ops = {
1167 .ops = &htab_percpu_ops,
1168 .type = BPF_MAP_TYPE_PERCPU_HASH,
1169};
1170
1171static const struct bpf_map_ops htab_lru_percpu_ops = {
1172 .map_alloc = htab_map_alloc, 1202 .map_alloc = htab_map_alloc,
1173 .map_free = htab_map_free, 1203 .map_free = htab_map_free,
1174 .map_get_next_key = htab_map_get_next_key, 1204 .map_get_next_key = htab_map_get_next_key,
@@ -1177,17 +1207,102 @@ static const struct bpf_map_ops htab_lru_percpu_ops = {
1177 .map_delete_elem = htab_lru_map_delete_elem, 1207 .map_delete_elem = htab_lru_map_delete_elem,
1178}; 1208};
1179 1209
1180static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { 1210static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
1181 .ops = &htab_lru_percpu_ops, 1211{
1182 .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, 1212 struct bpf_map *map;
1183}; 1213
1214 if (attr->value_size != sizeof(u32))
1215 return ERR_PTR(-EINVAL);
1216
1217 /* pointer is stored internally */
1218 attr->value_size = sizeof(void *);
1219 map = htab_map_alloc(attr);
1220 attr->value_size = sizeof(u32);
1184 1221
1185static int __init register_htab_map(void) 1222 return map;
1223}
1224
1225static void fd_htab_map_free(struct bpf_map *map)
1186{ 1226{
1187 bpf_register_map_type(&htab_type); 1227 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1188 bpf_register_map_type(&htab_percpu_type); 1228 struct hlist_nulls_node *n;
1189 bpf_register_map_type(&htab_lru_type); 1229 struct hlist_nulls_head *head;
1190 bpf_register_map_type(&htab_lru_percpu_type); 1230 struct htab_elem *l;
1191 return 0; 1231 int i;
1232
1233 for (i = 0; i < htab->n_buckets; i++) {
1234 head = select_bucket(htab, i);
1235
1236 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1237 void *ptr = fd_htab_map_get_ptr(map, l);
1238
1239 map->ops->map_fd_put_ptr(ptr);
1240 }
1241 }
1242
1243 htab_map_free(map);
1244}
1245
1246/* only called from syscall */
1247int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
1248 void *key, void *value, u64 map_flags)
1249{
1250 void *ptr;
1251 int ret;
1252 u32 ufd = *(u32 *)value;
1253
1254 ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
1255 if (IS_ERR(ptr))
1256 return PTR_ERR(ptr);
1257
1258 ret = htab_map_update_elem(map, key, &ptr, map_flags);
1259 if (ret)
1260 map->ops->map_fd_put_ptr(ptr);
1261
1262 return ret;
1263}
1264
1265static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
1266{
1267 struct bpf_map *map, *inner_map_meta;
1268
1269 inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
1270 if (IS_ERR(inner_map_meta))
1271 return inner_map_meta;
1272
1273 map = fd_htab_map_alloc(attr);
1274 if (IS_ERR(map)) {
1275 bpf_map_meta_free(inner_map_meta);
1276 return map;
1277 }
1278
1279 map->inner_map_meta = inner_map_meta;
1280
1281 return map;
1192} 1282}
1193late_initcall(register_htab_map); 1283
1284static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
1285{
1286 struct bpf_map **inner_map = htab_map_lookup_elem(map, key);
1287
1288 if (!inner_map)
1289 return NULL;
1290
1291 return READ_ONCE(*inner_map);
1292}
1293
1294static void htab_of_map_free(struct bpf_map *map)
1295{
1296 bpf_map_meta_free(map->inner_map_meta);
1297 fd_htab_map_free(map);
1298}
1299
1300const struct bpf_map_ops htab_of_maps_map_ops = {
1301 .map_alloc = htab_of_map_alloc,
1302 .map_free = htab_of_map_free,
1303 .map_get_next_key = htab_map_get_next_key,
1304 .map_lookup_elem = htab_of_map_lookup_elem,
1305 .map_delete_elem = htab_map_delete_elem,
1306 .map_fd_get_ptr = bpf_map_fd_get_ptr,
1307 .map_fd_put_ptr = bpf_map_fd_put_ptr,
1308};
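
With htab_of_map_lookup_elem() handing back the inner map pointer, a program chains two bpf_map_lookup_elem() calls: the first on the outer BPF_MAP_TYPE_HASH_OF_MAPS map, the second on whatever inner map that key selects. A restricted-C sketch in the style of the samples/bpf programs of this period (the bpf_map_def layout, the inner_map_idx convention and the section names are loader-specific assumptions; only the helper and the new map type come from the kernel side):

/* clang -O2 -target bpf -c hash_of_maps_kern.c */
#include <linux/bpf.h>

/* helper stub as declared by the samples' bpf_helpers.h of the time */
static void *(*bpf_map_lookup_elem)(void *map, void *key) =
	(void *) BPF_FUNC_map_lookup_elem;

struct bpf_map_def {
	unsigned int type;
	unsigned int key_size;
	unsigned int value_size;
	unsigned int max_entries;
	unsigned int map_flags;
	unsigned int inner_map_idx;	/* which ELF map supplies the shape */
};

struct bpf_map_def outer_map __attribute__((section("maps"), used)) = {
	.type          = BPF_MAP_TYPE_HASH_OF_MAPS,
	.key_size      = sizeof(int),
	.value_size    = sizeof(int),	/* an inner map fd, per the patch */
	.max_entries   = 16,
	.inner_map_idx = 0,
};

__attribute__((section("socket"), used))
int two_level_lookup(struct __sk_buff *skb)
{
	int outer_key = 1, inner_key = 0;
	void *inner_map;
	long *value;

	/* first lookup resolves to the inner map pointer (or NULL) */
	inner_map = bpf_map_lookup_elem(&outer_map, &outer_key);
	if (!inner_map)
		return 0;

	/* second lookup is checked using the inner map's saved metadata */
	value = bpf_map_lookup_elem(inner_map, &inner_key);
	return value ? *value != 0 : 0;
}
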
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index fddcae801724..9bbd33497d3d 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -429,7 +429,7 @@ static int bpf_parse_options(char *data, struct bpf_mount_opts *opts)
429 429
430static int bpf_fill_super(struct super_block *sb, void *data, int silent) 430static int bpf_fill_super(struct super_block *sb, void *data, int silent)
431{ 431{
432 static struct tree_descr bpf_rfiles[] = { { "" } }; 432 static const struct tree_descr bpf_rfiles[] = { { "" } };
433 struct bpf_mount_opts opts; 433 struct bpf_mount_opts opts;
434 struct inode *inode; 434 struct inode *inode;
435 int ret; 435 int ret;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index b37bd9ab7f57..39cfafd895b8 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -505,7 +505,7 @@ static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
505 return -ENOTSUPP; 505 return -ENOTSUPP;
506} 506}
507 507
508static const struct bpf_map_ops trie_ops = { 508const struct bpf_map_ops trie_map_ops = {
509 .map_alloc = trie_alloc, 509 .map_alloc = trie_alloc,
510 .map_free = trie_free, 510 .map_free = trie_free,
511 .map_get_next_key = trie_get_next_key, 511 .map_get_next_key = trie_get_next_key,
@@ -513,15 +513,3 @@ static const struct bpf_map_ops trie_ops = {
513 .map_update_elem = trie_update_elem, 513 .map_update_elem = trie_update_elem,
514 .map_delete_elem = trie_delete_elem, 514 .map_delete_elem = trie_delete_elem,
515}; 515};
516
517static struct bpf_map_type_list trie_type __ro_after_init = {
518 .ops = &trie_ops,
519 .type = BPF_MAP_TYPE_LPM_TRIE,
520};
521
522static int __init register_trie_map(void)
523{
524 bpf_register_map_type(&trie_type);
525 return 0;
526}
527late_initcall(register_trie_map);
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
new file mode 100644
index 000000000000..59bcdf821ae4
--- /dev/null
+++ b/kernel/bpf/map_in_map.c
@@ -0,0 +1,97 @@
1/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/slab.h>
8#include <linux/bpf.h>
9
10#include "map_in_map.h"
11
12struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
13{
14 struct bpf_map *inner_map, *inner_map_meta;
15 struct fd f;
16
17 f = fdget(inner_map_ufd);
18 inner_map = __bpf_map_get(f);
19 if (IS_ERR(inner_map))
20 return inner_map;
21
22 /* prog_array->owner_prog_type and owner_jited
23 * is a runtime binding. Doing static check alone
24 * in the verifier is not enough.
25 */
26 if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
27 fdput(f);
28 return ERR_PTR(-ENOTSUPP);
29 }
30
31 /* Does not support >1 level map-in-map */
32 if (inner_map->inner_map_meta) {
33 fdput(f);
34 return ERR_PTR(-EINVAL);
35 }
36
37 inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER);
38 if (!inner_map_meta) {
39 fdput(f);
40 return ERR_PTR(-ENOMEM);
41 }
42
43 inner_map_meta->map_type = inner_map->map_type;
44 inner_map_meta->key_size = inner_map->key_size;
45 inner_map_meta->value_size = inner_map->value_size;
46 inner_map_meta->map_flags = inner_map->map_flags;
47 inner_map_meta->ops = inner_map->ops;
48 inner_map_meta->max_entries = inner_map->max_entries;
49
50 fdput(f);
51 return inner_map_meta;
52}
53
54void bpf_map_meta_free(struct bpf_map *map_meta)
55{
56 kfree(map_meta);
57}
58
59bool bpf_map_meta_equal(const struct bpf_map *meta0,
60 const struct bpf_map *meta1)
61{
62 /* No need to compare ops because it is covered by map_type */
63 return meta0->map_type == meta1->map_type &&
64 meta0->key_size == meta1->key_size &&
65 meta0->value_size == meta1->value_size &&
66 meta0->map_flags == meta1->map_flags &&
67 meta0->max_entries == meta1->max_entries;
68}
69
70void *bpf_map_fd_get_ptr(struct bpf_map *map,
71 struct file *map_file /* not used */,
72 int ufd)
73{
74 struct bpf_map *inner_map;
75 struct fd f;
76
77 f = fdget(ufd);
78 inner_map = __bpf_map_get(f);
79 if (IS_ERR(inner_map))
80 return inner_map;
81
82 if (bpf_map_meta_equal(map->inner_map_meta, inner_map))
83 inner_map = bpf_map_inc(inner_map, false);
84 else
85 inner_map = ERR_PTR(-EINVAL);
86
87 fdput(f);
88 return inner_map;
89}
90
91void bpf_map_fd_put_ptr(void *ptr)
92{
93 /* ptr->ops->map_free() has to go through one
94 * rcu grace period by itself.
95 */
96 bpf_map_put(ptr);
97}
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
new file mode 100644
index 000000000000..177fadb689dc
--- /dev/null
+++ b/kernel/bpf/map_in_map.h
@@ -0,0 +1,23 @@
1/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#ifndef __MAP_IN_MAP_H__
8#define __MAP_IN_MAP_H__
9
10#include <linux/types.h>
11
12struct file;
13struct bpf_map;
14
15struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
16void bpf_map_meta_free(struct bpf_map *map_meta);
17bool bpf_map_meta_equal(const struct bpf_map *meta0,
18 const struct bpf_map *meta1);
19void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
20 int ufd);
21void bpf_map_fd_put_ptr(void *ptr);
22
23#endif
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 22aa45cd0324..4dfd6f2ec2f9 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -264,7 +264,7 @@ static void stack_map_free(struct bpf_map *map)
264 put_callchain_buffers(); 264 put_callchain_buffers();
265} 265}
266 266
267static const struct bpf_map_ops stack_map_ops = { 267const struct bpf_map_ops stack_map_ops = {
268 .map_alloc = stack_map_alloc, 268 .map_alloc = stack_map_alloc,
269 .map_free = stack_map_free, 269 .map_free = stack_map_free,
270 .map_get_next_key = stack_map_get_next_key, 270 .map_get_next_key = stack_map_get_next_key,
@@ -272,15 +272,3 @@ static const struct bpf_map_ops stack_map_ops = {
272 .map_update_elem = stack_map_update_elem, 272 .map_update_elem = stack_map_update_elem,
273 .map_delete_elem = stack_map_delete_elem, 273 .map_delete_elem = stack_map_delete_elem,
274}; 274};
275
276static struct bpf_map_type_list stack_map_type __ro_after_init = {
277 .ops = &stack_map_ops,
278 .type = BPF_MAP_TYPE_STACK_TRACE,
279};
280
281static int __init register_stack_map(void)
282{
283 bpf_register_map_type(&stack_map_type);
284 return 0;
285}
286late_initcall(register_stack_map);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 821f9e807de5..265a0d854e33 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -27,30 +27,29 @@ DEFINE_PER_CPU(int, bpf_prog_active);
27 27
28int sysctl_unprivileged_bpf_disabled __read_mostly; 28int sysctl_unprivileged_bpf_disabled __read_mostly;
29 29
30static LIST_HEAD(bpf_map_types); 30static const struct bpf_map_ops * const bpf_map_types[] = {
31#define BPF_PROG_TYPE(_id, _ops)
32#define BPF_MAP_TYPE(_id, _ops) \
33 [_id] = &_ops,
34#include <linux/bpf_types.h>
35#undef BPF_PROG_TYPE
36#undef BPF_MAP_TYPE
37};
31 38
32static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) 39static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
33{ 40{
34 struct bpf_map_type_list *tl;
35 struct bpf_map *map; 41 struct bpf_map *map;
36 42
37 list_for_each_entry(tl, &bpf_map_types, list_node) { 43 if (attr->map_type >= ARRAY_SIZE(bpf_map_types) ||
38 if (tl->type == attr->map_type) { 44 !bpf_map_types[attr->map_type])
39 map = tl->ops->map_alloc(attr); 45 return ERR_PTR(-EINVAL);
40 if (IS_ERR(map))
41 return map;
42 map->ops = tl->ops;
43 map->map_type = attr->map_type;
44 return map;
45 }
46 }
47 return ERR_PTR(-EINVAL);
48}
49 46
50/* boot time registration of different map implementations */ 47 map = bpf_map_types[attr->map_type]->map_alloc(attr);
51void bpf_register_map_type(struct bpf_map_type_list *tl) 48 if (IS_ERR(map))
52{ 49 return map;
53 list_add(&tl->list_node, &bpf_map_types); 50 map->ops = bpf_map_types[attr->map_type];
51 map->map_type = attr->map_type;
52 return map;
54} 53}
55 54
56void *bpf_map_area_alloc(size_t size) 55void *bpf_map_area_alloc(size_t size)
@@ -68,8 +67,7 @@ void *bpf_map_area_alloc(size_t size)
68 return area; 67 return area;
69 } 68 }
70 69
71 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags, 70 return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL);
72 PAGE_KERNEL);
73} 71}
74 72
75void bpf_map_area_free(void *area) 73void bpf_map_area_free(void *area)
@@ -215,7 +213,7 @@ int bpf_map_new_fd(struct bpf_map *map)
215 offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ 213 offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
216 sizeof(attr->CMD##_LAST_FIELD)) != NULL 214 sizeof(attr->CMD##_LAST_FIELD)) != NULL
217 215
218#define BPF_MAP_CREATE_LAST_FIELD map_flags 216#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd
219/* called via syscall */ 217/* called via syscall */
220static int map_create(union bpf_attr *attr) 218static int map_create(union bpf_attr *attr)
221{ 219{
@@ -352,6 +350,9 @@ static int map_lookup_elem(union bpf_attr *attr)
352 err = bpf_percpu_array_copy(map, key, value); 350 err = bpf_percpu_array_copy(map, key, value);
353 } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { 351 } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
354 err = bpf_stackmap_copy(map, key, value); 352 err = bpf_stackmap_copy(map, key, value);
353 } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
354 map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
355 err = -ENOTSUPP;
355 } else { 356 } else {
356 rcu_read_lock(); 357 rcu_read_lock();
357 ptr = map->ops->map_lookup_elem(map, key); 358 ptr = map->ops->map_lookup_elem(map, key);
@@ -438,11 +439,17 @@ static int map_update_elem(union bpf_attr *attr)
438 err = bpf_percpu_array_update(map, key, value, attr->flags); 439 err = bpf_percpu_array_update(map, key, value, attr->flags);
439 } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || 440 } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
440 map->map_type == BPF_MAP_TYPE_PROG_ARRAY || 441 map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
441 map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { 442 map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY ||
443 map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {
442 rcu_read_lock(); 444 rcu_read_lock();
443 err = bpf_fd_array_map_update_elem(map, f.file, key, value, 445 err = bpf_fd_array_map_update_elem(map, f.file, key, value,
444 attr->flags); 446 attr->flags);
445 rcu_read_unlock(); 447 rcu_read_unlock();
448 } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
449 rcu_read_lock();
450 err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
451 attr->flags);
452 rcu_read_unlock();
446 } else { 453 } else {
447 rcu_read_lock(); 454 rcu_read_lock();
448 err = map->ops->map_update_elem(map, key, value, attr->flags); 455 err = map->ops->map_update_elem(map, key, value, attr->flags);
@@ -528,14 +535,18 @@ static int map_get_next_key(union bpf_attr *attr)
528 if (IS_ERR(map)) 535 if (IS_ERR(map))
529 return PTR_ERR(map); 536 return PTR_ERR(map);
530 537
531 err = -ENOMEM; 538 if (ukey) {
532 key = kmalloc(map->key_size, GFP_USER); 539 err = -ENOMEM;
533 if (!key) 540 key = kmalloc(map->key_size, GFP_USER);
534 goto err_put; 541 if (!key)
542 goto err_put;
535 543
536 err = -EFAULT; 544 err = -EFAULT;
537 if (copy_from_user(key, ukey, map->key_size) != 0) 545 if (copy_from_user(key, ukey, map->key_size) != 0)
538 goto free_key; 546 goto free_key;
547 } else {
548 key = NULL;
549 }
539 550
540 err = -ENOMEM; 551 err = -ENOMEM;
541 next_key = kmalloc(map->key_size, GFP_USER); 552 next_key = kmalloc(map->key_size, GFP_USER);
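
Allowing ukey == NULL means user space no longer has to guess an existing key to start walking a map: a NULL key asks for the first key. A sketch of the resulting iteration loop (a fragment that assumes map_fd refers to a live map with 4-byte keys and headers from this series' uapi):

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static __u64 ptr_to_u64(const void *p)
{
	return (__u64)(unsigned long)p;
}

static void dump_keys(int map_fd)
{
	union bpf_attr attr;
	__u32 key, next_key;
	int have_key = 0;			/* first call: key == NULL */

	for (;;) {
		memset(&attr, 0, sizeof(attr));
		attr.map_fd   = map_fd;
		attr.key      = have_key ? ptr_to_u64(&key) : 0;
		attr.next_key = ptr_to_u64(&next_key);

		if (syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr,
			    sizeof(attr)) < 0)
			break;			/* ENOENT: map exhausted */

		printf("key %u\n", next_key);
		key = next_key;
		have_key = 1;
	}
}

The hashtab and arraymap hunks above supply the kernel half: a NULL key falls through to the find-first-element path (hash) or is treated as U32_MAX (array).
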
@@ -564,87 +575,23 @@ err_put:
564 return err; 575 return err;
565} 576}
566 577
567static LIST_HEAD(bpf_prog_types); 578static const struct bpf_verifier_ops * const bpf_prog_types[] = {
579#define BPF_PROG_TYPE(_id, _ops) \
580 [_id] = &_ops,
581#define BPF_MAP_TYPE(_id, _ops)
582#include <linux/bpf_types.h>
583#undef BPF_PROG_TYPE
584#undef BPF_MAP_TYPE
585};
568 586
569static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) 587static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
570{ 588{
571 struct bpf_prog_type_list *tl; 589 if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
572 590 return -EINVAL;
573 list_for_each_entry(tl, &bpf_prog_types, list_node) {
574 if (tl->type == type) {
575 prog->aux->ops = tl->ops;
576 prog->type = type;
577 return 0;
578 }
579 }
580
581 return -EINVAL;
582}
583
584void bpf_register_prog_type(struct bpf_prog_type_list *tl)
585{
586 list_add(&tl->list_node, &bpf_prog_types);
587}
588
589/* fixup insn->imm field of bpf_call instructions:
590 * if (insn->imm == BPF_FUNC_map_lookup_elem)
591 * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
592 * else if (insn->imm == BPF_FUNC_map_update_elem)
593 * insn->imm = bpf_map_update_elem - __bpf_call_base;
594 * else ...
595 *
596 * this function is called after eBPF program passed verification
597 */
598static void fixup_bpf_calls(struct bpf_prog *prog)
599{
600 const struct bpf_func_proto *fn;
601 int i;
602 591
603 for (i = 0; i < prog->len; i++) { 592 prog->aux->ops = bpf_prog_types[type];
604 struct bpf_insn *insn = &prog->insnsi[i]; 593 prog->type = type;
605 594 return 0;
606 if (insn->code == (BPF_JMP | BPF_CALL)) {
607 /* we reach here when program has bpf_call instructions
608 * and it passed bpf_check(), means that
609 * ops->get_func_proto must have been supplied, check it
610 */
611 BUG_ON(!prog->aux->ops->get_func_proto);
612
613 if (insn->imm == BPF_FUNC_get_route_realm)
614 prog->dst_needed = 1;
615 if (insn->imm == BPF_FUNC_get_prandom_u32)
616 bpf_user_rnd_init_once();
617 if (insn->imm == BPF_FUNC_xdp_adjust_head)
618 prog->xdp_adjust_head = 1;
619 if (insn->imm == BPF_FUNC_tail_call) {
620 /* If we tail call into other programs, we
621 * cannot make any assumptions since they
622 * can be replaced dynamically during runtime
623 * in the program array.
624 */
625 prog->cb_access = 1;
626 prog->xdp_adjust_head = 1;
627
628 /* mark bpf_tail_call as different opcode
629 * to avoid conditional branch in
630 * interpeter for every normal call
630 * interpreter for every normal call
631 * and to prevent accidental JITing by
632 * JIT compiler that doesn't support
633 * bpf_tail_call yet
634 */
635 insn->imm = 0;
636 insn->code |= BPF_X;
637 continue;
638 }
639
640 fn = prog->aux->ops->get_func_proto(insn->imm);
641 /* all functions that have prototype and verifier allowed
642 * programs to call them, must be real in-kernel functions
643 */
644 BUG_ON(!fn->func);
645 insn->imm = fn->func - __bpf_call_base;
646 }
647 }
648} 595}
649 596
650/* drop refcnt on maps used by eBPF program and free auxilary data */ 597/* drop refcnt on maps used by eBPF program and free auxilary data */
@@ -836,7 +783,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
836EXPORT_SYMBOL_GPL(bpf_prog_get_type); 783EXPORT_SYMBOL_GPL(bpf_prog_get_type);
837 784
838/* last field in 'union bpf_attr' used by this command */ 785/* last field in 'union bpf_attr' used by this command */
839#define BPF_PROG_LOAD_LAST_FIELD kern_version 786#define BPF_PROG_LOAD_LAST_FIELD prog_flags
840 787
841static int bpf_prog_load(union bpf_attr *attr) 788static int bpf_prog_load(union bpf_attr *attr)
842{ 789{
@@ -849,6 +796,9 @@ static int bpf_prog_load(union bpf_attr *attr)
849 if (CHECK_ATTR(BPF_PROG_LOAD)) 796 if (CHECK_ATTR(BPF_PROG_LOAD))
850 return -EINVAL; 797 return -EINVAL;
851 798
799 if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
800 return -EINVAL;
801
852 /* copy eBPF program license from user space */ 802 /* copy eBPF program license from user space */
853 if (strncpy_from_user(license, u64_to_user_ptr(attr->license), 803 if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
854 sizeof(license) - 1) < 0) 804 sizeof(license) - 1) < 0)
@@ -900,9 +850,6 @@ static int bpf_prog_load(union bpf_attr *attr)
900 if (err < 0) 850 if (err < 0)
901 goto free_used_maps; 851 goto free_used_maps;
902 852
903 /* fixup BPF_CALL->imm field */
904 fixup_bpf_calls(prog);
905
906 /* eBPF program is ready to be JITed */ 853 /* eBPF program is ready to be JITed */
907 prog = bpf_prog_select_runtime(prog, &err); 854 prog = bpf_prog_select_runtime(prog, &err);
908 if (err < 0) 855 if (err < 0)
@@ -1028,6 +975,28 @@ static int bpf_prog_detach(const union bpf_attr *attr)
1028} 975}
1029#endif /* CONFIG_CGROUP_BPF */ 976#endif /* CONFIG_CGROUP_BPF */
1030 977
978#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
979
980static int bpf_prog_test_run(const union bpf_attr *attr,
981 union bpf_attr __user *uattr)
982{
983 struct bpf_prog *prog;
984 int ret = -ENOTSUPP;
985
986 if (CHECK_ATTR(BPF_PROG_TEST_RUN))
987 return -EINVAL;
988
989 prog = bpf_prog_get(attr->test.prog_fd);
990 if (IS_ERR(prog))
991 return PTR_ERR(prog);
992
993 if (prog->aux->ops->test_run)
994 ret = prog->aux->ops->test_run(prog, attr, uattr);
995
996 bpf_prog_put(prog);
997 return ret;
998}
999
1031SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) 1000SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
1032{ 1001{
1033 union bpf_attr attr = {}; 1002 union bpf_attr attr = {};
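
bpf_prog_test_run() only forwards to a per-program-type test_run callback (wired up elsewhere in this series for the networking program types), so the interesting part for user space is filling in the new attribute block. A fragment showing one invocation (assumes prog_fd is an already-loaded program of a type that implements test_run and that pkt_out is large enough for the possibly rewritten packet; field names follow the matching uapi change):

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static __u64 ptr_to_u64(const void *p)
{
	return (__u64)(unsigned long)p;
}

static int test_run_once(int prog_fd, void *pkt_in, __u32 pkt_len,
			 void *pkt_out)
{
	union bpf_attr attr;
	int err;

	memset(&attr, 0, sizeof(attr));
	attr.test.prog_fd      = prog_fd;
	attr.test.data_in      = ptr_to_u64(pkt_in);
	attr.test.data_size_in = pkt_len;
	attr.test.data_out     = ptr_to_u64(pkt_out);
	attr.test.repeat       = 1;		/* run it once */

	err = syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr));
	if (!err)
		printf("retval=%u duration=%uns out=%u bytes\n",
		       attr.test.retval, attr.test.duration,
		       attr.test.data_size_out);
	return err;
}
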
@@ -1094,7 +1063,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
1094 case BPF_OBJ_GET: 1063 case BPF_OBJ_GET:
1095 err = bpf_obj_get(&attr); 1064 err = bpf_obj_get(&attr);
1096 break; 1065 break;
1097
1098#ifdef CONFIG_CGROUP_BPF 1066#ifdef CONFIG_CGROUP_BPF
1099 case BPF_PROG_ATTACH: 1067 case BPF_PROG_ATTACH:
1100 err = bpf_prog_attach(&attr); 1068 err = bpf_prog_attach(&attr);
@@ -1103,7 +1071,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
1103 err = bpf_prog_detach(&attr); 1071 err = bpf_prog_detach(&attr);
1104 break; 1072 break;
1105#endif 1073#endif
1106 1074 case BPF_PROG_TEST_RUN:
1075 err = bpf_prog_test_run(&attr, uattr);
1076 break;
1107 default: 1077 default:
1108 err = -EINVAL; 1078 err = -EINVAL;
1109 break; 1079 break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a834068a400e..1eddb713b815 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -140,9 +140,11 @@ struct bpf_verifier_stack_elem {
140 struct bpf_verifier_stack_elem *next; 140 struct bpf_verifier_stack_elem *next;
141}; 141};
142 142
143#define BPF_COMPLEXITY_LIMIT_INSNS 65536 143#define BPF_COMPLEXITY_LIMIT_INSNS 98304
144#define BPF_COMPLEXITY_LIMIT_STACK 1024 144#define BPF_COMPLEXITY_LIMIT_STACK 1024
145 145
146#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
147
146struct bpf_call_arg_meta { 148struct bpf_call_arg_meta {
147 struct bpf_map *map_ptr; 149 struct bpf_map *map_ptr;
148 bool raw_mode; 150 bool raw_mode;
@@ -239,6 +241,12 @@ static void print_verifier_state(struct bpf_verifier_state *state)
239 if (reg->max_value != BPF_REGISTER_MAX_RANGE) 241 if (reg->max_value != BPF_REGISTER_MAX_RANGE)
240 verbose(",max_value=%llu", 242 verbose(",max_value=%llu",
241 (unsigned long long)reg->max_value); 243 (unsigned long long)reg->max_value);
244 if (reg->min_align)
245 verbose(",min_align=%u", reg->min_align);
246 if (reg->aux_off)
247 verbose(",aux_off=%u", reg->aux_off);
248 if (reg->aux_off_align)
249 verbose(",aux_off_align=%u", reg->aux_off_align);
242 } 250 }
243 for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { 251 for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
244 if (state->stack_slot_type[i] == STACK_SPILL) 252 if (state->stack_slot_type[i] == STACK_SPILL)
@@ -296,7 +304,8 @@ static const char *const bpf_jmp_string[16] = {
296 [BPF_EXIT >> 4] = "exit", 304 [BPF_EXIT >> 4] = "exit",
297}; 305};
298 306
299static void print_bpf_insn(struct bpf_insn *insn) 307static void print_bpf_insn(const struct bpf_verifier_env *env,
308 const struct bpf_insn *insn)
300{ 309{
301 u8 class = BPF_CLASS(insn->code); 310 u8 class = BPF_CLASS(insn->code);
302 311
@@ -360,9 +369,19 @@ static void print_bpf_insn(struct bpf_insn *insn)
360 insn->code, 369 insn->code,
361 bpf_ldst_string[BPF_SIZE(insn->code) >> 3], 370 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
362 insn->src_reg, insn->imm); 371 insn->src_reg, insn->imm);
363 } else if (BPF_MODE(insn->code) == BPF_IMM) { 372 } else if (BPF_MODE(insn->code) == BPF_IMM &&
364 verbose("(%02x) r%d = 0x%x\n", 373 BPF_SIZE(insn->code) == BPF_DW) {
365 insn->code, insn->dst_reg, insn->imm); 374 /* At this point, we already made sure that the second
375 * part of the ldimm64 insn is accessible.
376 */
377 u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
378 bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
379
380 if (map_ptr && !env->allow_ptr_leaks)
381 imm = 0;
382
383 verbose("(%02x) r%d = 0x%llx\n", insn->code,
384 insn->dst_reg, (unsigned long long)imm);
366 } else { 385 } else {
367 verbose("BUG_ld_%02x\n", insn->code); 386 verbose("BUG_ld_%02x\n", insn->code);
368 return; 387 return;
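
A BPF_LD | BPF_IMM | BPF_DW instruction carries its 64-bit immediate split across two consecutive struct bpf_insn slots, which is why the printer above stitches insn->imm and (insn + 1)->imm back together (and blanks map pointers when pointer leaks are not allowed). A standalone check of that reconstruction (a sketch using the uapi struct bpf_insn; the sample value is arbitrary):

#include <linux/bpf.h>
#include <stdio.h>

int main(void)
{
	/* encode a ldimm64 pair by hand: low 32 bits in the first slot,
	 * high 32 bits in the second, as loader and verifier expect */
	__u64 value = 0x1122334475667788ULL;
	struct bpf_insn pair[2] = {
		{ .code = BPF_LD | BPF_DW | BPF_IMM, .dst_reg = BPF_REG_1,
		  .imm = (__s32)(value & 0xffffffff) },
		{ .imm = (__s32)(value >> 32) },
	};

	/* same reassembly as print_bpf_insn(); a real map reference would
	 * additionally set pair[0].src_reg = BPF_PSEUDO_MAP_FD */
	__u64 imm = ((__u64)pair[1].imm << 32) | (__u32)pair[0].imm;

	printf("0x%llx\n", (unsigned long long)imm);
	return imm != value;
}
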
@@ -453,6 +472,9 @@ static void init_reg_state(struct bpf_reg_state *regs)
453 regs[i].imm = 0; 472 regs[i].imm = 0;
454 regs[i].min_value = BPF_REGISTER_MIN_RANGE; 473 regs[i].min_value = BPF_REGISTER_MIN_RANGE;
455 regs[i].max_value = BPF_REGISTER_MAX_RANGE; 474 regs[i].max_value = BPF_REGISTER_MAX_RANGE;
475 regs[i].min_align = 0;
476 regs[i].aux_off = 0;
477 regs[i].aux_off_align = 0;
456 } 478 }
457 479
458 /* frame pointer */ 480 /* frame pointer */
@@ -479,6 +501,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
479{ 501{
480 regs[regno].min_value = BPF_REGISTER_MIN_RANGE; 502 regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
481 regs[regno].max_value = BPF_REGISTER_MAX_RANGE; 503 regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
504 regs[regno].min_align = 0;
482} 505}
483 506
484static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, 507static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs,
@@ -766,17 +789,33 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
766} 789}
767 790
768static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, 791static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
769 int off, int size) 792 int off, int size, bool strict)
770{ 793{
771 if (reg->id && size != 1) { 794 int ip_align;
772 verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n"); 795 int reg_off;
773 return -EACCES; 796
797 /* Byte size accesses are always allowed. */
798 if (!strict || size == 1)
799 return 0;
800
801 reg_off = reg->off;
802 if (reg->id) {
803 if (reg->aux_off_align % size) {
804 verbose("Packet access is only %u byte aligned, %d byte access not allowed\n",
805 reg->aux_off_align, size);
806 return -EACCES;
807 }
808 reg_off += reg->aux_off;
774 } 809 }
775 810
776 /* skb->data is NET_IP_ALIGN-ed */ 811 /* skb->data is NET_IP_ALIGN-ed, but for strict alignment checking
777 if ((NET_IP_ALIGN + reg->off + off) % size != 0) { 812 * we force this to 2 which is universally what architectures use
813 * when they don't set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
814 */
815 ip_align = strict ? 2 : NET_IP_ALIGN;
816 if ((ip_align + reg_off + off) % size != 0) {
778 verbose("misaligned packet access off %d+%d+%d size %d\n", 817 verbose("misaligned packet access off %d+%d+%d size %d\n",
779 NET_IP_ALIGN, reg->off, off, size); 818 ip_align, reg_off, off, size);
780 return -EACCES; 819 return -EACCES;
781 } 820 }
782 821
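
Under strict mode the check no longer leans on NET_IP_ALIGN (which is 0 on architectures with efficient unaligned access) but models the 2-byte skb->data offset explicitly, and it folds in the variable part tracked in aux_off/aux_off_align. A standalone rendering of the arithmetic (a sketch; names mirror the function above):

#include <stdio.h>

/* mirrors check_pkt_ptr_alignment() in strict mode */
static int pkt_access_misaligned(int known_off, int aux_off,
				 unsigned int aux_off_align,
				 int access_off, int size, int has_var_off)
{
	int ip_align = 2;	/* strict stand-in for NET_IP_ALIGN */
	int reg_off = known_off;

	if (size == 1)
		return 0;			/* byte loads always OK */
	if (has_var_off) {
		if (aux_off_align % size)	/* variable part too weak */
			return 1;
		reg_off += aux_off;
	}
	return (ip_align + reg_off + access_off) % size != 0;
}

int main(void)
{
	/* e.g. a u16 load at skb->data + 14 (just past the Ethernet header)
	 * passes: 2 + 0 + 14 is a multiple of 2 */
	printf("%s\n", pkt_access_misaligned(0, 0, 0, 14, 2, 0) ?
	       "misaligned" : "ok");
	return 0;
}
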
@@ -784,9 +823,9 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
784} 823}
785 824
786static int check_val_ptr_alignment(const struct bpf_reg_state *reg, 825static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
787 int size) 826 int size, bool strict)
788{ 827{
789 if (size != 1) { 828 if (strict && size != 1) {
790 verbose("Unknown alignment. Only byte-sized access allowed in value access.\n"); 829 verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
791 return -EACCES; 830 return -EACCES;
792 } 831 }
@@ -794,16 +833,20 @@ static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
794 return 0; 833 return 0;
795} 834}
796 835
797static int check_ptr_alignment(const struct bpf_reg_state *reg, 836static int check_ptr_alignment(struct bpf_verifier_env *env,
837 const struct bpf_reg_state *reg,
798 int off, int size) 838 int off, int size)
799{ 839{
840 bool strict = env->strict_alignment;
841
842 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
843 strict = true;
844
800 switch (reg->type) { 845 switch (reg->type) {
801 case PTR_TO_PACKET: 846 case PTR_TO_PACKET:
802 return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : 847 return check_pkt_ptr_alignment(reg, off, size, strict);
803 check_pkt_ptr_alignment(reg, off, size);
804 case PTR_TO_MAP_VALUE_ADJ: 848 case PTR_TO_MAP_VALUE_ADJ:
805 return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : 849 return check_val_ptr_alignment(reg, size, strict);
806 check_val_ptr_alignment(reg, size);
807 default: 850 default:
808 if (off % size != 0) { 851 if (off % size != 0) {
809 verbose("misaligned access off %d size %d\n", 852 verbose("misaligned access off %d size %d\n",
@@ -836,7 +879,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
836 if (size < 0) 879 if (size < 0)
837 return size; 880 return size;
838 881
839 err = check_ptr_alignment(reg, off, size); 882 err = check_ptr_alignment(env, reg, off, size);
840 if (err) 883 if (err)
841 return err; 884 return err;
842 885
@@ -870,6 +913,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
870 value_regno); 913 value_regno);
871 /* note that reg.[id|off|range] == 0 */ 914 /* note that reg.[id|off|range] == 0 */
872 state->regs[value_regno].type = reg_type; 915 state->regs[value_regno].type = reg_type;
916 state->regs[value_regno].aux_off = 0;
917 state->regs[value_regno].aux_off_align = 0;
873 } 918 }
874 919
875 } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { 920 } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
@@ -1215,6 +1260,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
1215 func_id != BPF_FUNC_current_task_under_cgroup) 1260 func_id != BPF_FUNC_current_task_under_cgroup)
1216 goto error; 1261 goto error;
1217 break; 1262 break;
1263 case BPF_MAP_TYPE_ARRAY_OF_MAPS:
1264 case BPF_MAP_TYPE_HASH_OF_MAPS:
1265 if (func_id != BPF_FUNC_map_lookup_elem)
1266 goto error;
1218 default: 1267 default:
1219 break; 1268 break;
1220 } 1269 }
@@ -1291,7 +1340,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
1291 } 1340 }
1292} 1341}
1293 1342
1294static int check_call(struct bpf_verifier_env *env, int func_id) 1343static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1295{ 1344{
1296 struct bpf_verifier_state *state = &env->cur_state; 1345 struct bpf_verifier_state *state = &env->cur_state;
1297 const struct bpf_func_proto *fn = NULL; 1346 const struct bpf_func_proto *fn = NULL;
@@ -1375,6 +1424,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
1375 } else if (fn->ret_type == RET_VOID) { 1424 } else if (fn->ret_type == RET_VOID) {
1376 regs[BPF_REG_0].type = NOT_INIT; 1425 regs[BPF_REG_0].type = NOT_INIT;
1377 } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { 1426 } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
1427 struct bpf_insn_aux_data *insn_aux;
1428
1378 regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; 1429 regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
1379 regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; 1430 regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;
1380 /* remember map_ptr, so that check_map_access() 1431 /* remember map_ptr, so that check_map_access()
@@ -1387,6 +1438,11 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
1387 } 1438 }
1388 regs[BPF_REG_0].map_ptr = meta.map_ptr; 1439 regs[BPF_REG_0].map_ptr = meta.map_ptr;
1389 regs[BPF_REG_0].id = ++env->id_gen; 1440 regs[BPF_REG_0].id = ++env->id_gen;
1441 insn_aux = &env->insn_aux_data[insn_idx];
1442 if (!insn_aux->map_ptr)
1443 insn_aux->map_ptr = meta.map_ptr;
1444 else if (insn_aux->map_ptr != meta.map_ptr)
1445 insn_aux->map_ptr = BPF_MAP_PTR_POISON;
1390 } else { 1446 } else {
1391 verbose("unknown return type %d of func %s#%d\n", 1447 verbose("unknown return type %d of func %s#%d\n",
1392 fn->ret_type, func_id_name(func_id), func_id); 1448 fn->ret_type, func_id_name(func_id), func_id);
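The insn_aux_data bookkeeping added here is what later lets fixup_bpf_calls() inline a lookup: a call site may only be specialised for a single map. A small illustrative sketch of the rule (the helper is hypothetical; the field names follow the diff):

static void record_lookup_site(struct bpf_insn_aux_data *aux,
                               struct bpf_map *map)
{
        if (!aux->map_ptr)
                aux->map_ptr = map;                     /* first map seen here */
        else if (aux->map_ptr != map)
                aux->map_ptr = BPF_MAP_PTR_POISON;      /* two different maps reach
                                                         * this call: never inline */
}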
@@ -1431,6 +1487,8 @@ add_imm:
1431 */ 1487 */
1432 dst_reg->off += imm; 1488 dst_reg->off += imm;
1433 } else { 1489 } else {
1490 bool had_id;
1491
1434 if (src_reg->type == PTR_TO_PACKET) { 1492 if (src_reg->type == PTR_TO_PACKET) {
1435 /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ 1493 /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */
1436 tmp_reg = *dst_reg; /* save r7 state */ 1494 tmp_reg = *dst_reg; /* save r7 state */
@@ -1464,14 +1522,23 @@ add_imm:
1464 src_reg->imm); 1522 src_reg->imm);
1465 return -EACCES; 1523 return -EACCES;
1466 } 1524 }
1525
1526 had_id = (dst_reg->id != 0);
1527
1467 /* dst_reg stays as pkt_ptr type and since some positive 1528 /* dst_reg stays as pkt_ptr type and since some positive
1468 * integer value was added to the pointer, increment its 'id' 1529 * integer value was added to the pointer, increment its 'id'
1469 */ 1530 */
1470 dst_reg->id = ++env->id_gen; 1531 dst_reg->id = ++env->id_gen;
1471 1532
1472 /* something was added to pkt_ptr, set range and off to zero */ 1533 /* something was added to pkt_ptr, set range to zero */
1534 dst_reg->aux_off += dst_reg->off;
1473 dst_reg->off = 0; 1535 dst_reg->off = 0;
1474 dst_reg->range = 0; 1536 dst_reg->range = 0;
1537 if (had_id)
1538 dst_reg->aux_off_align = min(dst_reg->aux_off_align,
1539 src_reg->min_align);
1540 else
1541 dst_reg->aux_off_align = src_reg->min_align;
1475 } 1542 }
1476 return 0; 1543 return 0;
1477} 1544}
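A worked example of the new bookkeeping, using the register states from the diff's comments: if R7 is a packet pointer with off=14 and a register known to be a multiple of 4 (min_align=4, no prior id) is added to it, the constant 14 moves into aux_off, off and range reset to zero, and aux_off_align records the 4-byte guarantee. A compact sketch (hypothetical helper mirroring the hunk, not the kernel code):

static void fold_variable_pkt_add(struct bpf_reg_state *dst,
                                  u32 src_min_align, bool had_id)
{
        dst->aux_off += dst->off;       /* e.g. 14 moves into aux_off */
        dst->off = 0;
        dst->range = 0;                 /* packet range must be re-proven */
        if (had_id)                     /* several variable adds: keep the
                                         * weaker of the two guarantees */
                dst->aux_off_align = min(dst->aux_off_align, src_min_align);
        else
                dst->aux_off_align = src_min_align;
}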
@@ -1645,6 +1712,13 @@ static void check_reg_overflow(struct bpf_reg_state *reg)
1645 reg->min_value = BPF_REGISTER_MIN_RANGE; 1712 reg->min_value = BPF_REGISTER_MIN_RANGE;
1646} 1713}
1647 1714
1715static u32 calc_align(u32 imm)
1716{
1717 if (!imm)
1718 return 1U << 31;
1719 return imm - ((imm - 1) & imm);
1720}
1721
1648static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, 1722static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1649 struct bpf_insn *insn) 1723 struct bpf_insn *insn)
1650{ 1724{
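calc_align() returns the largest power of two that divides imm, i.e. the alignment a constant is known to have: (imm - 1) & imm clears the lowest set bit, so the subtraction leaves exactly that bit. For example calc_align(6) == 2, calc_align(12) == 4, calc_align(64) == 64, and calc_align(0) is treated as "aligned to anything" (1 << 31). An equivalent one-liner using the common bit trick, valid for imm != 0:

static u32 lowest_set_bit(u32 imm)
{
        return imm & -imm;      /* same result as imm - ((imm - 1) & imm) */
}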
@@ -1652,8 +1726,10 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1652 s64 min_val = BPF_REGISTER_MIN_RANGE; 1726 s64 min_val = BPF_REGISTER_MIN_RANGE;
1653 u64 max_val = BPF_REGISTER_MAX_RANGE; 1727 u64 max_val = BPF_REGISTER_MAX_RANGE;
1654 u8 opcode = BPF_OP(insn->code); 1728 u8 opcode = BPF_OP(insn->code);
1729 u32 dst_align, src_align;
1655 1730
1656 dst_reg = &regs[insn->dst_reg]; 1731 dst_reg = &regs[insn->dst_reg];
1732 src_align = 0;
1657 if (BPF_SRC(insn->code) == BPF_X) { 1733 if (BPF_SRC(insn->code) == BPF_X) {
1658 check_reg_overflow(&regs[insn->src_reg]); 1734 check_reg_overflow(&regs[insn->src_reg]);
1659 min_val = regs[insn->src_reg].min_value; 1735 min_val = regs[insn->src_reg].min_value;
@@ -1669,12 +1745,18 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1669 regs[insn->src_reg].type != UNKNOWN_VALUE) { 1745 regs[insn->src_reg].type != UNKNOWN_VALUE) {
1670 min_val = BPF_REGISTER_MIN_RANGE; 1746 min_val = BPF_REGISTER_MIN_RANGE;
1671 max_val = BPF_REGISTER_MAX_RANGE; 1747 max_val = BPF_REGISTER_MAX_RANGE;
1748 src_align = 0;
1749 } else {
1750 src_align = regs[insn->src_reg].min_align;
1672 } 1751 }
1673 } else if (insn->imm < BPF_REGISTER_MAX_RANGE && 1752 } else if (insn->imm < BPF_REGISTER_MAX_RANGE &&
1674 (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { 1753 (s64)insn->imm > BPF_REGISTER_MIN_RANGE) {
1675 min_val = max_val = insn->imm; 1754 min_val = max_val = insn->imm;
1755 src_align = calc_align(insn->imm);
1676 } 1756 }
1677 1757
1758 dst_align = dst_reg->min_align;
1759
1678 /* We don't know anything about what was done to this register, mark it 1760 /* We don't know anything about what was done to this register, mark it
1679 * as unknown. 1761 * as unknown.
1680 */ 1762 */
@@ -1699,18 +1781,21 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1699 dst_reg->min_value += min_val; 1781 dst_reg->min_value += min_val;
1700 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) 1782 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
1701 dst_reg->max_value += max_val; 1783 dst_reg->max_value += max_val;
1784 dst_reg->min_align = min(src_align, dst_align);
1702 break; 1785 break;
1703 case BPF_SUB: 1786 case BPF_SUB:
1704 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) 1787 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
1705 dst_reg->min_value -= min_val; 1788 dst_reg->min_value -= min_val;
1706 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) 1789 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
1707 dst_reg->max_value -= max_val; 1790 dst_reg->max_value -= max_val;
1791 dst_reg->min_align = min(src_align, dst_align);
1708 break; 1792 break;
1709 case BPF_MUL: 1793 case BPF_MUL:
1710 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) 1794 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
1711 dst_reg->min_value *= min_val; 1795 dst_reg->min_value *= min_val;
1712 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) 1796 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
1713 dst_reg->max_value *= max_val; 1797 dst_reg->max_value *= max_val;
1798 dst_reg->min_align = max(src_align, dst_align);
1714 break; 1799 break;
1715 case BPF_AND: 1800 case BPF_AND:
1716 /* Disallow AND'ing of negative numbers, ain't nobody got time 1801 /* Disallow AND'ing of negative numbers, ain't nobody got time
@@ -1722,17 +1807,23 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1722 else 1807 else
1723 dst_reg->min_value = 0; 1808 dst_reg->min_value = 0;
1724 dst_reg->max_value = max_val; 1809 dst_reg->max_value = max_val;
1810 dst_reg->min_align = max(src_align, dst_align);
1725 break; 1811 break;
1726 case BPF_LSH: 1812 case BPF_LSH:
1727 /* Gotta have special overflow logic here, if we're shifting 1813 /* Gotta have special overflow logic here, if we're shifting
1728 * more than MAX_RANGE then just assume we have an invalid 1814 * more than MAX_RANGE then just assume we have an invalid
1729 * range. 1815 * range.
1730 */ 1816 */
1731 if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) 1817 if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) {
1732 dst_reg->min_value = BPF_REGISTER_MIN_RANGE; 1818 dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
1733 else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) 1819 dst_reg->min_align = 1;
1734 dst_reg->min_value <<= min_val; 1820 } else {
1735 1821 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
1822 dst_reg->min_value <<= min_val;
1823 if (!dst_reg->min_align)
1824 dst_reg->min_align = 1;
1825 dst_reg->min_align <<= min_val;
1826 }
1736 if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) 1827 if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
1737 dst_reg->max_value = BPF_REGISTER_MAX_RANGE; 1828 dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
1738 else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) 1829 else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
@@ -1742,11 +1833,19 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1742 /* RSH by a negative number is undefined, and the BPF_RSH is an 1833 /* RSH by a negative number is undefined, and the BPF_RSH is an
1743 * unsigned shift, so make the appropriate casts. 1834 * unsigned shift, so make the appropriate casts.
1744 */ 1835 */
1745 if (min_val < 0 || dst_reg->min_value < 0) 1836 if (min_val < 0 || dst_reg->min_value < 0) {
1746 dst_reg->min_value = BPF_REGISTER_MIN_RANGE; 1837 dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
1747 else 1838 } else {
1748 dst_reg->min_value = 1839 dst_reg->min_value =
1749 (u64)(dst_reg->min_value) >> min_val; 1840 (u64)(dst_reg->min_value) >> min_val;
1841 }
1842 if (min_val < 0) {
1843 dst_reg->min_align = 1;
1844 } else {
1845 dst_reg->min_align >>= (u64) min_val;
1846 if (!dst_reg->min_align)
1847 dst_reg->min_align = 1;
1848 }
1750 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) 1849 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
1751 dst_reg->max_value >>= max_val; 1850 dst_reg->max_value >>= max_val;
1752 break; 1851 break;
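The shift cases propagate min_align alongside the value range: a left shift can only improve the known alignment, a right shift can only weaken it, and anything suspicious (a possibly negative shift, or a shift past the tracked range) collapses it to 1. A hedged sketch of the two directions (illustrative helpers, not the kernel code):

static u32 align_after_lsh(u32 align, u32 shift)
{
        if (!align)
                align = 1;
        return align << shift;  /* e.g. 4-byte aligned << 3 => 32-byte aligned */
}

static u32 align_after_rsh(u32 align, u32 shift)
{
        align >>= shift;        /* e.g. 32 >> 5 => 0, clamped to 1: nothing known */
        return align ? align : 1;
}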
@@ -1848,6 +1947,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
1848 regs[insn->dst_reg].imm = insn->imm; 1947 regs[insn->dst_reg].imm = insn->imm;
1849 regs[insn->dst_reg].max_value = insn->imm; 1948 regs[insn->dst_reg].max_value = insn->imm;
1850 regs[insn->dst_reg].min_value = insn->imm; 1949 regs[insn->dst_reg].min_value = insn->imm;
1950 regs[insn->dst_reg].min_align = calc_align(insn->imm);
1851 } 1951 }
1852 1952
1853 } else if (opcode > BPF_END) { 1953 } else if (opcode > BPF_END) {
@@ -1911,6 +2011,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
1911 return 0; 2011 return 0;
1912 } else if (opcode == BPF_ADD && 2012 } else if (opcode == BPF_ADD &&
1913 BPF_CLASS(insn->code) == BPF_ALU64 && 2013 BPF_CLASS(insn->code) == BPF_ALU64 &&
2014 dst_reg->type == PTR_TO_STACK &&
2015 ((BPF_SRC(insn->code) == BPF_X &&
2016 regs[insn->src_reg].type == CONST_IMM) ||
2017 BPF_SRC(insn->code) == BPF_K)) {
2018 if (BPF_SRC(insn->code) == BPF_X)
2019 dst_reg->imm += regs[insn->src_reg].imm;
2020 else
2021 dst_reg->imm += insn->imm;
2022 return 0;
2023 } else if (opcode == BPF_ADD &&
2024 BPF_CLASS(insn->code) == BPF_ALU64 &&
1914 (dst_reg->type == PTR_TO_PACKET || 2025 (dst_reg->type == PTR_TO_PACKET ||
1915 (BPF_SRC(insn->code) == BPF_X && 2026 (BPF_SRC(insn->code) == BPF_X &&
1916 regs[insn->src_reg].type == PTR_TO_PACKET))) { 2027 regs[insn->src_reg].type == PTR_TO_PACKET))) {
@@ -2112,14 +2223,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
2112 struct bpf_reg_state *reg = &regs[regno]; 2223 struct bpf_reg_state *reg = &regs[regno];
2113 2224
2114 if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { 2225 if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
2115 reg->type = type; 2226 if (type == UNKNOWN_VALUE) {
2227 __mark_reg_unknown_value(regs, regno);
2228 } else if (reg->map_ptr->inner_map_meta) {
2229 reg->type = CONST_PTR_TO_MAP;
2230 reg->map_ptr = reg->map_ptr->inner_map_meta;
2231 } else {
2232 reg->type = type;
2233 }
2116 /* We don't need id from this point onwards anymore, thus we 2234 /* We don't need id from this point onwards anymore, thus we
2117 * should better reset it, so that state pruning has chances 2235 * should better reset it, so that state pruning has chances
2118 * to take effect. 2236 * to take effect.
2119 */ 2237 */
2120 reg->id = 0; 2238 reg->id = 0;
2121 if (type == UNKNOWN_VALUE)
2122 __mark_reg_unknown_value(regs, regno);
2123 } 2239 }
2124} 2240}
2125 2241
@@ -2524,6 +2640,7 @@ peek_stack:
2524 env->explored_states[t + 1] = STATE_LIST_MARK; 2640 env->explored_states[t + 1] = STATE_LIST_MARK;
2525 } else { 2641 } else {
2526 /* conditional jump with two edges */ 2642 /* conditional jump with two edges */
2643 env->explored_states[t] = STATE_LIST_MARK;
2527 ret = push_insn(t, t + 1, FALLTHROUGH, env); 2644 ret = push_insn(t, t + 1, FALLTHROUGH, env);
2528 if (ret == 1) 2645 if (ret == 1)
2529 goto peek_stack; 2646 goto peek_stack;
@@ -2682,6 +2799,12 @@ static bool states_equal(struct bpf_verifier_env *env,
2682 rcur->type != NOT_INIT)) 2799 rcur->type != NOT_INIT))
2683 continue; 2800 continue;
2684 2801
2802 /* Don't care about the reg->id in this case. */
2803 if (rold->type == PTR_TO_MAP_VALUE_OR_NULL &&
2804 rcur->type == PTR_TO_MAP_VALUE_OR_NULL &&
2805 rold->map_ptr == rcur->map_ptr)
2806 continue;
2807
2685 if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && 2808 if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
2686 compare_ptrs_to_packet(rold, rcur)) 2809 compare_ptrs_to_packet(rold, rcur))
2687 continue; 2810 continue;
@@ -2816,15 +2939,22 @@ static int do_check(struct bpf_verifier_env *env)
2816 goto process_bpf_exit; 2939 goto process_bpf_exit;
2817 } 2940 }
2818 2941
2819 if (log_level && do_print_state) { 2942 if (need_resched())
2820 verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); 2943 cond_resched();
2944
2945 if (log_level > 1 || (log_level && do_print_state)) {
2946 if (log_level > 1)
2947 verbose("%d:", insn_idx);
2948 else
2949 verbose("\nfrom %d to %d:",
2950 prev_insn_idx, insn_idx);
2821 print_verifier_state(&env->cur_state); 2951 print_verifier_state(&env->cur_state);
2822 do_print_state = false; 2952 do_print_state = false;
2823 } 2953 }
2824 2954
2825 if (log_level) { 2955 if (log_level) {
2826 verbose("%d: ", insn_idx); 2956 verbose("%d: ", insn_idx);
2827 print_bpf_insn(insn); 2957 print_bpf_insn(env, insn);
2828 } 2958 }
2829 2959
2830 err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); 2960 err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
@@ -2960,7 +3090,7 @@ static int do_check(struct bpf_verifier_env *env)
2960 return -EINVAL; 3090 return -EINVAL;
2961 } 3091 }
2962 3092
2963 err = check_call(env, insn->imm); 3093 err = check_call(env, insn->imm, insn_idx);
2964 if (err) 3094 if (err)
2965 return err; 3095 return err;
2966 3096
@@ -3044,16 +3174,33 @@ process_bpf_exit:
3044 return 0; 3174 return 0;
3045} 3175}
3046 3176
3177static int check_map_prealloc(struct bpf_map *map)
3178{
3179 return (map->map_type != BPF_MAP_TYPE_HASH &&
3180 map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
3181 map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) ||
3182 !(map->map_flags & BPF_F_NO_PREALLOC);
3183}
3184
3047static int check_map_prog_compatibility(struct bpf_map *map, 3185static int check_map_prog_compatibility(struct bpf_map *map,
3048 struct bpf_prog *prog) 3186 struct bpf_prog *prog)
3049 3187
3050{ 3188{
3051 if (prog->type == BPF_PROG_TYPE_PERF_EVENT && 3189 /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use
3052 (map->map_type == BPF_MAP_TYPE_HASH || 3190 * preallocated hash maps, since doing memory allocation
3053 map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && 3191 * in overflow_handler can crash depending on where nmi got
3054 (map->map_flags & BPF_F_NO_PREALLOC)) { 3192 * triggered.
3055 verbose("perf_event programs can only use preallocated hash map\n"); 3193 */
3056 return -EINVAL; 3194 if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
3195 if (!check_map_prealloc(map)) {
3196 verbose("perf_event programs can only use preallocated hash map\n");
3197 return -EINVAL;
3198 }
3199 if (map->inner_map_meta &&
3200 !check_map_prealloc(map->inner_map_meta)) {
3201 verbose("perf_event programs can only use preallocated inner hash map\n");
3202 return -EINVAL;
3203 }
3057 } 3204 }
3058 return 0; 3205 return 0;
3059} 3206}
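The rule generalises the old two-type check: any hash-flavoured map (including the new HASH_OF_MAPS and the inner map it carries) must be preallocated before a perf_event program may use it, because element allocation from NMI context is unsafe. A hedged libbpf-sample-style sketch of an accepted and a rejected map for such a program (macro and struct names follow the common sample conventions, not this patch):

struct bpf_map_def SEC("maps") ok_for_perf = {
        .type        = BPF_MAP_TYPE_HASH,
        .key_size    = sizeof(__u64),
        .value_size  = sizeof(__u64),
        .max_entries = 1024,
        /* no BPF_F_NO_PREALLOC: all elements exist up front, NMI-safe */
};

struct bpf_map_def SEC("maps") rejected_for_perf = {
        .type        = BPF_MAP_TYPE_HASH,
        .key_size    = sizeof(__u64),
        .value_size  = sizeof(__u64),
        .max_entries = 1024,
        .map_flags   = BPF_F_NO_PREALLOC,       /* would allocate on update ->
                                                 * "can only use preallocated
                                                 * hash map" at load time */
};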
@@ -3182,6 +3329,41 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
3182 insn->src_reg = 0; 3329 insn->src_reg = 0;
3183} 3330}
3184 3331
3332/* single env->prog->insnsi[off] instruction was replaced with the range
 3333 * insnsi[off, off + cnt). Adjust corresponding insn_aux_data by copying
3334 * [0, off) and [off, end) to new locations, so the patched range stays zero
3335 */
3336static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
3337 u32 off, u32 cnt)
3338{
3339 struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
3340
3341 if (cnt == 1)
3342 return 0;
3343 new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len);
3344 if (!new_data)
3345 return -ENOMEM;
3346 memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
3347 memcpy(new_data + off + cnt - 1, old_data + off,
3348 sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
3349 env->insn_aux_data = new_data;
3350 vfree(old_data);
3351 return 0;
3352}
3353
3354static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
3355 const struct bpf_insn *patch, u32 len)
3356{
3357 struct bpf_prog *new_prog;
3358
3359 new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
3360 if (!new_prog)
3361 return NULL;
3362 if (adjust_insn_aux_data(env, new_prog->len, off, len))
3363 return NULL;
3364 return new_prog;
3365}
3366
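A worked example of the index arithmetic in adjust_insn_aux_data(): a 10-insn program whose insn 3 is replaced by a 4-insn patch ends up with prog_len = 13, and the aux array is rebuilt as:

        /* 10-insn program, insn 3 replaced by a 4-insn patch => prog_len = 13 */
        memcpy(new, old, 3 * sizeof(*new));             /* insns 0..2 unchanged  */
        memcpy(new + 3 + 4 - 1, old + 3,                /* old 3..9 -> new 6..12 */
               (13 - 3 - 4 + 1) * sizeof(*new));        /* = 7 entries           */
        /* new[3..5] stay zeroed: the aux data of the freshly inserted insns */

which is what the comment in the diff means by the patched range staying zero.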
3185/* convert load instructions that access fields of 'struct __sk_buff' 3367/* convert load instructions that access fields of 'struct __sk_buff'
3186 * into sequence of instructions that access fields of 'struct sk_buff' 3368 * into sequence of instructions that access fields of 'struct sk_buff'
3187 */ 3369 */
@@ -3201,10 +3383,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
3201 verbose("bpf verifier is misconfigured\n"); 3383 verbose("bpf verifier is misconfigured\n");
3202 return -EINVAL; 3384 return -EINVAL;
3203 } else if (cnt) { 3385 } else if (cnt) {
3204 new_prog = bpf_patch_insn_single(env->prog, 0, 3386 new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
3205 insn_buf, cnt);
3206 if (!new_prog) 3387 if (!new_prog)
3207 return -ENOMEM; 3388 return -ENOMEM;
3389
3208 env->prog = new_prog; 3390 env->prog = new_prog;
3209 delta += cnt - 1; 3391 delta += cnt - 1;
3210 } 3392 }
@@ -3229,7 +3411,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
3229 else 3411 else
3230 continue; 3412 continue;
3231 3413
3232 if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) 3414 if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
3233 continue; 3415 continue;
3234 3416
3235 cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); 3417 cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
@@ -3238,8 +3420,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
3238 return -EINVAL; 3420 return -EINVAL;
3239 } 3421 }
3240 3422
3241 new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf, 3423 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
3242 cnt);
3243 if (!new_prog) 3424 if (!new_prog)
3244 return -ENOMEM; 3425 return -ENOMEM;
3245 3426
@@ -3253,6 +3434,89 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
3253 return 0; 3434 return 0;
3254} 3435}
3255 3436
3437/* fixup insn->imm field of bpf_call instructions
3438 * and inline eligible helpers as explicit sequence of BPF instructions
3439 *
3440 * this function is called after eBPF program passed verification
3441 */
3442static int fixup_bpf_calls(struct bpf_verifier_env *env)
3443{
3444 struct bpf_prog *prog = env->prog;
3445 struct bpf_insn *insn = prog->insnsi;
3446 const struct bpf_func_proto *fn;
3447 const int insn_cnt = prog->len;
3448 struct bpf_insn insn_buf[16];
3449 struct bpf_prog *new_prog;
3450 struct bpf_map *map_ptr;
3451 int i, cnt, delta = 0;
3452
3453 for (i = 0; i < insn_cnt; i++, insn++) {
3454 if (insn->code != (BPF_JMP | BPF_CALL))
3455 continue;
3456
3457 if (insn->imm == BPF_FUNC_get_route_realm)
3458 prog->dst_needed = 1;
3459 if (insn->imm == BPF_FUNC_get_prandom_u32)
3460 bpf_user_rnd_init_once();
3461 if (insn->imm == BPF_FUNC_tail_call) {
3462 /* If we tail call into other programs, we
3463 * cannot make any assumptions since they can
3464 * be replaced dynamically during runtime in
3465 * the program array.
3466 */
3467 prog->cb_access = 1;
3468
3469 /* mark bpf_tail_call as different opcode to avoid
 3470 * conditional branch in the interpreter for every normal
3471 * call and to prevent accidental JITing by JIT compiler
3472 * that doesn't support bpf_tail_call yet
3473 */
3474 insn->imm = 0;
3475 insn->code |= BPF_X;
3476 continue;
3477 }
3478
3479 if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) {
3480 map_ptr = env->insn_aux_data[i + delta].map_ptr;
3481 if (map_ptr == BPF_MAP_PTR_POISON ||
3482 !map_ptr->ops->map_gen_lookup)
3483 goto patch_call_imm;
3484
3485 cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
3486 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
3487 verbose("bpf verifier is misconfigured\n");
3488 return -EINVAL;
3489 }
3490
3491 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
3492 cnt);
3493 if (!new_prog)
3494 return -ENOMEM;
3495
3496 delta += cnt - 1;
3497
3498 /* keep walking new program and skip insns we just inserted */
3499 env->prog = prog = new_prog;
3500 insn = new_prog->insnsi + i + delta;
3501 continue;
3502 }
3503
3504patch_call_imm:
3505 fn = prog->aux->ops->get_func_proto(insn->imm);
3506 /* all functions that have prototype and verifier allowed
3507 * programs to call them, must be real in-kernel functions
3508 */
3509 if (!fn->func) {
3510 verbose("kernel subsystem misconfigured func %s#%d\n",
3511 func_id_name(insn->imm), insn->imm);
3512 return -EFAULT;
3513 }
3514 insn->imm = fn->func - __bpf_call_base;
3515 }
3516
3517 return 0;
3518}
3519
3256static void free_states(struct bpf_verifier_env *env) 3520static void free_states(struct bpf_verifier_env *env)
3257{ 3521{
3258 struct bpf_verifier_state_list *sl, *sln; 3522 struct bpf_verifier_state_list *sl, *sln;
@@ -3320,6 +3584,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
3320 } else { 3584 } else {
3321 log_level = 0; 3585 log_level = 0;
3322 } 3586 }
3587 if (attr->prog_flags & BPF_F_STRICT_ALIGNMENT)
3588 env->strict_alignment = true;
3589 else
3590 env->strict_alignment = false;
3323 3591
3324 ret = replace_map_fd_with_map_ptr(env); 3592 ret = replace_map_fd_with_map_ptr(env);
3325 if (ret < 0) 3593 if (ret < 0)
@@ -3348,6 +3616,9 @@ skip_full_check:
3348 /* program is valid, convert *(u32*)(ctx + off) accesses */ 3616 /* program is valid, convert *(u32*)(ctx + off) accesses */
3349 ret = convert_ctx_accesses(env); 3617 ret = convert_ctx_accesses(env);
3350 3618
3619 if (ret == 0)
3620 ret = fixup_bpf_calls(env);
3621
3351 if (log_level && log_len >= log_size - 1) { 3622 if (log_level && log_len >= log_size - 1) {
3352 BUG_ON(log_len >= log_size); 3623 BUG_ON(log_len >= log_size);
3353 /* verifier log exceeded user supplied buffer */ 3624 /* verifier log exceeded user supplied buffer */
@@ -3422,6 +3693,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
3422 mutex_lock(&bpf_verifier_lock); 3693 mutex_lock(&bpf_verifier_lock);
3423 3694
3424 log_level = 0; 3695 log_level = 0;
3696 env->strict_alignment = false;
3425 3697
3426 env->explored_states = kcalloc(env->prog->len, 3698 env->explored_states = kcalloc(env->prog->len,
3427 sizeof(struct bpf_verifier_state_list *), 3699 sizeof(struct bpf_verifier_state_list *),
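From userspace, the stricter checking is opted into through prog_flags at load time. A hedged sketch of a raw bpf(2) loader setting BPF_F_STRICT_ALIGNMENT (assumes a uapi linux/bpf.h that already carries the new flag and the prog_flags member; error handling trimmed):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int load_prog_strict(const struct bpf_insn *insns, int insn_cnt,
                            char *log, unsigned int log_sz)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.prog_type  = BPF_PROG_TYPE_SOCKET_FILTER;
        attr.insns      = (unsigned long)insns;
        attr.insn_cnt   = insn_cnt;
        attr.license    = (unsigned long)"GPL";
        attr.log_buf    = (unsigned long)log;
        attr.log_size   = log_sz;
        attr.log_level  = 1;
        attr.prog_flags = BPF_F_STRICT_ALIGNMENT;       /* new in this series */

        return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}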
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 9203bfb05603..00f4d6bf048f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -5,6 +5,7 @@
5#include <linux/kernfs.h> 5#include <linux/kernfs.h>
6#include <linux/workqueue.h> 6#include <linux/workqueue.h>
7#include <linux/list.h> 7#include <linux/list.h>
8#include <linux/refcount.h>
8 9
9/* 10/*
10 * A cgroup can be associated with multiple css_sets as different tasks may 11 * A cgroup can be associated with multiple css_sets as different tasks may
@@ -134,7 +135,7 @@ static inline void put_css_set(struct css_set *cset)
134 * can see it. Similar to atomic_dec_and_lock(), but for an 135 * can see it. Similar to atomic_dec_and_lock(), but for an
135 * rwlock 136 * rwlock
136 */ 137 */
137 if (atomic_add_unless(&cset->refcount, -1, 1)) 138 if (refcount_dec_not_one(&cset->refcount))
138 return; 139 return;
139 140
140 spin_lock_irqsave(&css_set_lock, flags); 141 spin_lock_irqsave(&css_set_lock, flags);
@@ -147,7 +148,7 @@ static inline void put_css_set(struct css_set *cset)
147 */ 148 */
148static inline void get_css_set(struct css_set *cset) 149static inline void get_css_set(struct css_set *cset)
149{ 150{
150 atomic_inc(&cset->refcount); 151 refcount_inc(&cset->refcount);
151} 152}
152 153
153bool cgroup_ssid_enabled(int ssid); 154bool cgroup_ssid_enabled(int ssid);
@@ -163,7 +164,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
163 164
164void cgroup_free_root(struct cgroup_root *root); 165void cgroup_free_root(struct cgroup_root *root);
165void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); 166void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
166int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); 167int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
167int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); 168int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
168struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, 169struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
169 struct cgroup_root *root, unsigned long magic, 170 struct cgroup_root *root, unsigned long magic,
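The atomic_t to refcount_t conversion keeps the same locking pattern; refcount_dec_not_one() is the saturating replacement for atomic_add_unless(&count, -1, 1). A minimal sketch of the pattern outside of cgroup (illustrative struct, not kernel code):

#include <linux/refcount.h>
#include <linux/spinlock.h>

struct obj {
        refcount_t      ref;
        spinlock_t      *lock;          /* serialises the final teardown */
};

static void obj_put(struct obj *o)
{
        unsigned long flags;

        /* fast path: drop the reference unless we would be the last holder */
        if (refcount_dec_not_one(&o->ref))
                return;

        /* slow path: take the lock first, so readers under the lock never
         * observe the object mid-destruction */
        spin_lock_irqsave(o->lock, flags);
        if (refcount_dec_and_test(&o->ref)) {
                /* unlink and free the object here */
        }
        spin_unlock_irqrestore(o->lock, flags);
}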
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 1dc22f6b49f5..85d75152402d 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -346,7 +346,7 @@ static int cgroup_task_count(const struct cgroup *cgrp)
346 346
347 spin_lock_irq(&css_set_lock); 347 spin_lock_irq(&css_set_lock);
348 list_for_each_entry(link, &cgrp->cset_links, cset_link) 348 list_for_each_entry(link, &cgrp->cset_links, cset_link)
349 count += atomic_read(&link->cset->refcount); 349 count += refcount_read(&link->cset->refcount);
350 spin_unlock_irq(&css_set_lock); 350 spin_unlock_irq(&css_set_lock);
351 return count; 351 return count;
352} 352}
@@ -1072,6 +1072,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
1072 struct cgroup_subsys *ss; 1072 struct cgroup_subsys *ss;
1073 struct dentry *dentry; 1073 struct dentry *dentry;
1074 int i, ret; 1074 int i, ret;
1075 bool new_root = false;
1075 1076
1076 cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); 1077 cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1077 1078
@@ -1181,10 +1182,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
1181 ret = -ENOMEM; 1182 ret = -ENOMEM;
1182 goto out_unlock; 1183 goto out_unlock;
1183 } 1184 }
1185 new_root = true;
1184 1186
1185 init_cgroup_root(root, &opts); 1187 init_cgroup_root(root, &opts);
1186 1188
1187 ret = cgroup_setup_root(root, opts.subsys_mask); 1189 ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
1188 if (ret) 1190 if (ret)
1189 cgroup_free_root(root); 1191 cgroup_free_root(root);
1190 1192
@@ -1201,6 +1203,18 @@ out_free:
1201 CGROUP_SUPER_MAGIC, ns); 1203 CGROUP_SUPER_MAGIC, ns);
1202 1204
1203 /* 1205 /*
1206 * There's a race window after we release cgroup_mutex and before
1207 * allocating a superblock. Make sure a concurrent process won't
1208 * be able to re-use the root during this window by delaying the
1209 * initialization of root refcnt.
1210 */
1211 if (new_root) {
1212 mutex_lock(&cgroup_mutex);
1213 percpu_ref_reinit(&root->cgrp.self.refcnt);
1214 mutex_unlock(&cgroup_mutex);
1215 }
1216
1217 /*
1204 * If @pinned_sb, we're reusing an existing root and holding an 1218 * If @pinned_sb, we're reusing an existing root and holding an
1205 * extra ref on its sb. Mount is complete. Put the extra ref. 1219 * extra ref on its sb. Mount is complete. Put the extra ref.
1206 */ 1220 */
@@ -1286,7 +1300,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
1286 u64 count; 1300 u64 count;
1287 1301
1288 rcu_read_lock(); 1302 rcu_read_lock();
1289 count = atomic_read(&task_css_set(current)->refcount); 1303 count = refcount_read(&task_css_set(current)->refcount);
1290 rcu_read_unlock(); 1304 rcu_read_unlock();
1291 return count; 1305 return count;
1292} 1306}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 687f5e0194ef..c3c9a0e1b3c9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -189,7 +189,7 @@ static u16 have_canfork_callback __read_mostly;
189 189
190/* cgroup namespace for init task */ 190/* cgroup namespace for init task */
191struct cgroup_namespace init_cgroup_ns = { 191struct cgroup_namespace init_cgroup_ns = {
192 .count = { .counter = 2, }, 192 .count = REFCOUNT_INIT(2),
193 .user_ns = &init_user_ns, 193 .user_ns = &init_user_ns,
194 .ns.ops = &cgroupns_operations, 194 .ns.ops = &cgroupns_operations,
195 .ns.inum = PROC_CGROUP_INIT_INO, 195 .ns.inum = PROC_CGROUP_INIT_INO,
@@ -436,7 +436,12 @@ out_unlock:
436 return css; 436 return css;
437} 437}
438 438
439static void cgroup_get(struct cgroup *cgrp) 439static void __maybe_unused cgroup_get(struct cgroup *cgrp)
440{
441 css_get(&cgrp->self);
442}
443
444static void cgroup_get_live(struct cgroup *cgrp)
440{ 445{
441 WARN_ON_ONCE(cgroup_is_dead(cgrp)); 446 WARN_ON_ONCE(cgroup_is_dead(cgrp));
442 css_get(&cgrp->self); 447 css_get(&cgrp->self);
@@ -554,7 +559,7 @@ EXPORT_SYMBOL_GPL(of_css);
554 * haven't been created. 559 * haven't been created.
555 */ 560 */
556struct css_set init_css_set = { 561struct css_set init_css_set = {
557 .refcount = ATOMIC_INIT(1), 562 .refcount = REFCOUNT_INIT(1),
558 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 563 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
559 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), 564 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
560 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), 565 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
@@ -724,7 +729,7 @@ void put_css_set_locked(struct css_set *cset)
724 729
725 lockdep_assert_held(&css_set_lock); 730 lockdep_assert_held(&css_set_lock);
726 731
727 if (!atomic_dec_and_test(&cset->refcount)) 732 if (!refcount_dec_and_test(&cset->refcount))
728 return; 733 return;
729 734
730 /* This css_set is dead. unlink it and release cgroup and css refs */ 735 /* This css_set is dead. unlink it and release cgroup and css refs */
@@ -932,7 +937,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
932 list_add_tail(&link->cgrp_link, &cset->cgrp_links); 937 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
933 938
934 if (cgroup_parent(cgrp)) 939 if (cgroup_parent(cgrp))
935 cgroup_get(cgrp); 940 cgroup_get_live(cgrp);
936} 941}
937 942
938/** 943/**
@@ -977,7 +982,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
977 return NULL; 982 return NULL;
978 } 983 }
979 984
980 atomic_set(&cset->refcount, 1); 985 refcount_set(&cset->refcount, 1);
981 INIT_LIST_HEAD(&cset->tasks); 986 INIT_LIST_HEAD(&cset->tasks);
982 INIT_LIST_HEAD(&cset->mg_tasks); 987 INIT_LIST_HEAD(&cset->mg_tasks);
983 INIT_LIST_HEAD(&cset->task_iters); 988 INIT_LIST_HEAD(&cset->task_iters);
@@ -1640,7 +1645,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
1640 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1645 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1641} 1646}
1642 1647
1643int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) 1648int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
1644{ 1649{
1645 LIST_HEAD(tmp_links); 1650 LIST_HEAD(tmp_links);
1646 struct cgroup *root_cgrp = &root->cgrp; 1651 struct cgroup *root_cgrp = &root->cgrp;
@@ -1656,8 +1661,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
1656 root_cgrp->id = ret; 1661 root_cgrp->id = ret;
1657 root_cgrp->ancestor_ids[0] = ret; 1662 root_cgrp->ancestor_ids[0] = ret;
1658 1663
1659 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, 1664 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
1660 GFP_KERNEL); 1665 ref_flags, GFP_KERNEL);
1661 if (ret) 1666 if (ret)
1662 goto out; 1667 goto out;
1663 1668
@@ -1802,7 +1807,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1802 return ERR_PTR(-EINVAL); 1807 return ERR_PTR(-EINVAL);
1803 } 1808 }
1804 cgrp_dfl_visible = true; 1809 cgrp_dfl_visible = true;
1805 cgroup_get(&cgrp_dfl_root.cgrp); 1810 cgroup_get_live(&cgrp_dfl_root.cgrp);
1806 1811
1807 dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, 1812 dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
1808 CGROUP2_SUPER_MAGIC, ns); 1813 CGROUP2_SUPER_MAGIC, ns);
@@ -2576,7 +2581,7 @@ restart:
2576 if (!css || !percpu_ref_is_dying(&css->refcnt)) 2581 if (!css || !percpu_ref_is_dying(&css->refcnt))
2577 continue; 2582 continue;
2578 2583
2579 cgroup_get(dsct); 2584 cgroup_get_live(dsct);
2580 prepare_to_wait(&dsct->offline_waitq, &wait, 2585 prepare_to_wait(&dsct->offline_waitq, &wait,
2581 TASK_UNINTERRUPTIBLE); 2586 TASK_UNINTERRUPTIBLE);
2582 2587
@@ -3947,7 +3952,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
3947{ 3952{
3948 lockdep_assert_held(&cgroup_mutex); 3953 lockdep_assert_held(&cgroup_mutex);
3949 3954
3950 cgroup_get(cgrp); 3955 cgroup_get_live(cgrp);
3951 3956
3952 memset(css, 0, sizeof(*css)); 3957 memset(css, 0, sizeof(*css));
3953 css->cgroup = cgrp; 3958 css->cgroup = cgrp;
@@ -4123,7 +4128,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
4123 /* allocation complete, commit to creation */ 4128 /* allocation complete, commit to creation */
4124 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); 4129 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
4125 atomic_inc(&root->nr_cgrps); 4130 atomic_inc(&root->nr_cgrps);
4126 cgroup_get(parent); 4131 cgroup_get_live(parent);
4127 4132
4128 /* 4133 /*
4129 * @cgrp is now fully operational. If something fails after this 4134 * @cgrp is now fully operational. If something fails after this
@@ -4513,7 +4518,7 @@ int __init cgroup_init(void)
4513 hash_add(css_set_table, &init_css_set.hlist, 4518 hash_add(css_set_table, &init_css_set.hlist,
4514 css_set_hash(init_css_set.subsys)); 4519 css_set_hash(init_css_set.subsys));
4515 4520
4516 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); 4521 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
4517 4522
4518 mutex_unlock(&cgroup_mutex); 4523 mutex_unlock(&cgroup_mutex);
4519 4524
@@ -4947,7 +4952,7 @@ struct cgroup *cgroup_get_from_path(const char *path)
4947 if (kn) { 4952 if (kn) {
4948 if (kernfs_type(kn) == KERNFS_DIR) { 4953 if (kernfs_type(kn) == KERNFS_DIR) {
4949 cgrp = kn->priv; 4954 cgrp = kn->priv;
4950 cgroup_get(cgrp); 4955 cgroup_get_live(cgrp);
4951 } else { 4956 } else {
4952 cgrp = ERR_PTR(-ENOTDIR); 4957 cgrp = ERR_PTR(-ENOTDIR);
4953 } 4958 }
@@ -5027,6 +5032,11 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
5027 5032
5028 /* Socket clone path */ 5033 /* Socket clone path */
5029 if (skcd->val) { 5034 if (skcd->val) {
5035 /*
5036 * We might be cloning a socket which is left in an empty
5037 * cgroup and the cgroup might have already been rmdir'd.
5038 * Don't use cgroup_get_live().
5039 */
5030 cgroup_get(sock_cgroup_ptr(skcd)); 5040 cgroup_get(sock_cgroup_ptr(skcd));
5031 return; 5041 return;
5032 } 5042 }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0f41292be0fb..f6501f4f6040 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2121,10 +2121,8 @@ int __init cpuset_init(void)
2121{ 2121{
2122 int err = 0; 2122 int err = 0;
2123 2123
2124 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) 2124 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
2125 BUG(); 2125 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
2126 if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
2127 BUG();
2128 2126
2129 cpumask_setall(top_cpuset.cpus_allowed); 2127 cpumask_setall(top_cpuset.cpus_allowed);
2130 nodes_setall(top_cpuset.mems_allowed); 2128 nodes_setall(top_cpuset.mems_allowed);
@@ -2139,8 +2137,7 @@ int __init cpuset_init(void)
2139 if (err < 0) 2137 if (err < 0)
2140 return err; 2138 return err;
2141 2139
2142 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) 2140 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
2143 BUG();
2144 2141
2145 return 0; 2142 return 0;
2146} 2143}
@@ -2354,7 +2351,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2354 rebuild_sched_domains(); 2351 rebuild_sched_domains();
2355} 2352}
2356 2353
2357void cpuset_update_active_cpus(bool cpu_online) 2354void cpuset_update_active_cpus(void)
2358{ 2355{
2359 /* 2356 /*
2360 * We're inside cpu hotplug critical region which usually nests 2357 * We're inside cpu hotplug critical region which usually nests
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 96d38dab6fb2..66129eb4371d 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -31,7 +31,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
31 kfree(new_ns); 31 kfree(new_ns);
32 return ERR_PTR(ret); 32 return ERR_PTR(ret);
33 } 33 }
34 atomic_set(&new_ns->count, 1); 34 refcount_set(&new_ns->count, 1);
35 new_ns->ns.ops = &cgroupns_operations; 35 new_ns->ns.ops = &cgroupns_operations;
36 return new_ns; 36 return new_ns;
37} 37}
diff --git a/kernel/compat.c b/kernel/compat.c
index 19aec5d98108..933bcb31ae10 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -108,8 +108,8 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
108COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, 108COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
109 struct timezone __user *, tz) 109 struct timezone __user *, tz)
110{ 110{
111 struct timespec64 new_ts;
111 struct timeval user_tv; 112 struct timeval user_tv;
112 struct timespec new_ts;
113 struct timezone new_tz; 113 struct timezone new_tz;
114 114
115 if (tv) { 115 if (tv) {
@@ -123,7 +123,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
123 return -EFAULT; 123 return -EFAULT;
124 } 124 }
125 125
126 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); 126 return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
127} 127}
128 128
129static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) 129static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
@@ -240,18 +240,20 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
240 struct compat_timespec __user *, rmtp) 240 struct compat_timespec __user *, rmtp)
241{ 241{
242 struct timespec tu, rmt; 242 struct timespec tu, rmt;
243 struct timespec64 tu64;
243 mm_segment_t oldfs; 244 mm_segment_t oldfs;
244 long ret; 245 long ret;
245 246
246 if (compat_get_timespec(&tu, rqtp)) 247 if (compat_get_timespec(&tu, rqtp))
247 return -EFAULT; 248 return -EFAULT;
248 249
249 if (!timespec_valid(&tu)) 250 tu64 = timespec_to_timespec64(tu);
251 if (!timespec64_valid(&tu64))
250 return -EINVAL; 252 return -EINVAL;
251 253
252 oldfs = get_fs(); 254 oldfs = get_fs();
253 set_fs(KERNEL_DS); 255 set_fs(KERNEL_DS);
254 ret = hrtimer_nanosleep(&tu, 256 ret = hrtimer_nanosleep(&tu64,
255 rmtp ? (struct timespec __user *)&rmt : NULL, 257 rmtp ? (struct timespec __user *)&rmt : NULL,
256 HRTIMER_MODE_REL, CLOCK_MONOTONIC); 258 HRTIMER_MODE_REL, CLOCK_MONOTONIC);
257 set_fs(oldfs); 259 set_fs(oldfs);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 37b223e4fc05..9ae6fbe5b5cf 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1125,6 +1125,8 @@ core_initcall(cpu_hotplug_pm_sync_init);
1125 1125
1126#endif /* CONFIG_PM_SLEEP_SMP */ 1126#endif /* CONFIG_PM_SLEEP_SMP */
1127 1127
1128int __boot_cpu_id;
1129
1128#endif /* CONFIG_SMP */ 1130#endif /* CONFIG_SMP */
1129 1131
1130/* Boot processor state steps */ 1132/* Boot processor state steps */
@@ -1815,6 +1817,10 @@ void __init boot_cpu_init(void)
1815 set_cpu_active(cpu, true); 1817 set_cpu_active(cpu, true);
1816 set_cpu_present(cpu, true); 1818 set_cpu_present(cpu, true);
1817 set_cpu_possible(cpu, true); 1819 set_cpu_possible(cpu, true);
1820
1821#ifdef CONFIG_SMP
1822 __boot_cpu_id = cpu;
1823#endif
1818} 1824}
1819 1825
1820/* 1826/*
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
new file mode 100644
index 000000000000..fcbd568f1e95
--- /dev/null
+++ b/kernel/crash_core.c
@@ -0,0 +1,439 @@
1/*
2 * crash.c - kernel crash support code.
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/crash_core.h>
10#include <linux/utsname.h>
11#include <linux/vmalloc.h>
12
13#include <asm/page.h>
14#include <asm/sections.h>
15
16/* vmcoreinfo stuff */
17static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
18u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
19size_t vmcoreinfo_size;
20size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
21
22/*
23 * parsing the "crashkernel" commandline
24 *
25 * this code is intended to be called from architecture specific code
26 */
27
28
29/*
30 * This function parses command lines in the format
31 *
32 * crashkernel=ramsize-range:size[,...][@offset]
33 *
34 * The function returns 0 on success and -EINVAL on failure.
35 */
36static int __init parse_crashkernel_mem(char *cmdline,
37 unsigned long long system_ram,
38 unsigned long long *crash_size,
39 unsigned long long *crash_base)
40{
41 char *cur = cmdline, *tmp;
42
43 /* for each entry of the comma-separated list */
44 do {
45 unsigned long long start, end = ULLONG_MAX, size;
46
47 /* get the start of the range */
48 start = memparse(cur, &tmp);
49 if (cur == tmp) {
50 pr_warn("crashkernel: Memory value expected\n");
51 return -EINVAL;
52 }
53 cur = tmp;
54 if (*cur != '-') {
55 pr_warn("crashkernel: '-' expected\n");
56 return -EINVAL;
57 }
58 cur++;
59
 60 /* if no ':' is here, then we read the end */
61 if (*cur != ':') {
62 end = memparse(cur, &tmp);
63 if (cur == tmp) {
64 pr_warn("crashkernel: Memory value expected\n");
65 return -EINVAL;
66 }
67 cur = tmp;
68 if (end <= start) {
69 pr_warn("crashkernel: end <= start\n");
70 return -EINVAL;
71 }
72 }
73
74 if (*cur != ':') {
75 pr_warn("crashkernel: ':' expected\n");
76 return -EINVAL;
77 }
78 cur++;
79
80 size = memparse(cur, &tmp);
81 if (cur == tmp) {
82 pr_warn("Memory value expected\n");
83 return -EINVAL;
84 }
85 cur = tmp;
86 if (size >= system_ram) {
87 pr_warn("crashkernel: invalid size\n");
88 return -EINVAL;
89 }
90
91 /* match ? */
92 if (system_ram >= start && system_ram < end) {
93 *crash_size = size;
94 break;
95 }
96 } while (*cur++ == ',');
97
98 if (*crash_size > 0) {
99 while (*cur && *cur != ' ' && *cur != '@')
100 cur++;
101 if (*cur == '@') {
102 cur++;
103 *crash_base = memparse(cur, &tmp);
104 if (cur == tmp) {
105 pr_warn("Memory value expected after '@'\n");
106 return -EINVAL;
107 }
108 }
109 }
110
111 return 0;
112}
113
114/*
115 * That function parses "simple" (old) crashkernel command lines like
116 *
117 * crashkernel=size[@offset]
118 *
119 * It returns 0 on success and -EINVAL on failure.
120 */
121static int __init parse_crashkernel_simple(char *cmdline,
122 unsigned long long *crash_size,
123 unsigned long long *crash_base)
124{
125 char *cur = cmdline;
126
127 *crash_size = memparse(cmdline, &cur);
128 if (cmdline == cur) {
129 pr_warn("crashkernel: memory value expected\n");
130 return -EINVAL;
131 }
132
133 if (*cur == '@')
134 *crash_base = memparse(cur+1, &cur);
135 else if (*cur != ' ' && *cur != '\0') {
136 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
137 return -EINVAL;
138 }
139
140 return 0;
141}
142
143#define SUFFIX_HIGH 0
144#define SUFFIX_LOW 1
145#define SUFFIX_NULL 2
146static __initdata char *suffix_tbl[] = {
147 [SUFFIX_HIGH] = ",high",
148 [SUFFIX_LOW] = ",low",
149 [SUFFIX_NULL] = NULL,
150};
151
152/*
 153 * This function parses "suffix" crashkernel command lines like
154 *
155 * crashkernel=size,[high|low]
156 *
157 * It returns 0 on success and -EINVAL on failure.
158 */
159static int __init parse_crashkernel_suffix(char *cmdline,
160 unsigned long long *crash_size,
161 const char *suffix)
162{
163 char *cur = cmdline;
164
165 *crash_size = memparse(cmdline, &cur);
166 if (cmdline == cur) {
167 pr_warn("crashkernel: memory value expected\n");
168 return -EINVAL;
169 }
170
171 /* check with suffix */
172 if (strncmp(cur, suffix, strlen(suffix))) {
173 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
174 return -EINVAL;
175 }
176 cur += strlen(suffix);
177 if (*cur != ' ' && *cur != '\0') {
178 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
179 return -EINVAL;
180 }
181
182 return 0;
183}
184
185static __init char *get_last_crashkernel(char *cmdline,
186 const char *name,
187 const char *suffix)
188{
189 char *p = cmdline, *ck_cmdline = NULL;
190
191 /* find crashkernel and use the last one if there are more */
192 p = strstr(p, name);
193 while (p) {
194 char *end_p = strchr(p, ' ');
195 char *q;
196
197 if (!end_p)
198 end_p = p + strlen(p);
199
200 if (!suffix) {
201 int i;
202
203 /* skip the one with any known suffix */
204 for (i = 0; suffix_tbl[i]; i++) {
205 q = end_p - strlen(suffix_tbl[i]);
206 if (!strncmp(q, suffix_tbl[i],
207 strlen(suffix_tbl[i])))
208 goto next;
209 }
210 ck_cmdline = p;
211 } else {
212 q = end_p - strlen(suffix);
213 if (!strncmp(q, suffix, strlen(suffix)))
214 ck_cmdline = p;
215 }
216next:
217 p = strstr(p+1, name);
218 }
219
220 if (!ck_cmdline)
221 return NULL;
222
223 return ck_cmdline;
224}
225
226static int __init __parse_crashkernel(char *cmdline,
227 unsigned long long system_ram,
228 unsigned long long *crash_size,
229 unsigned long long *crash_base,
230 const char *name,
231 const char *suffix)
232{
233 char *first_colon, *first_space;
234 char *ck_cmdline;
235
236 BUG_ON(!crash_size || !crash_base);
237 *crash_size = 0;
238 *crash_base = 0;
239
240 ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
241
242 if (!ck_cmdline)
243 return -EINVAL;
244
245 ck_cmdline += strlen(name);
246
247 if (suffix)
248 return parse_crashkernel_suffix(ck_cmdline, crash_size,
249 suffix);
250 /*
251 * if the commandline contains a ':', then that's the extended
252 * syntax -- if not, it must be the classic syntax
253 */
254 first_colon = strchr(ck_cmdline, ':');
255 first_space = strchr(ck_cmdline, ' ');
256 if (first_colon && (!first_space || first_colon < first_space))
257 return parse_crashkernel_mem(ck_cmdline, system_ram,
258 crash_size, crash_base);
259
260 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
261}
262
263/*
 264 * This function is the entry point for command line parsing and should be
265 * called from the arch-specific code.
266 */
267int __init parse_crashkernel(char *cmdline,
268 unsigned long long system_ram,
269 unsigned long long *crash_size,
270 unsigned long long *crash_base)
271{
272 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
273 "crashkernel=", NULL);
274}
275
276int __init parse_crashkernel_high(char *cmdline,
277 unsigned long long system_ram,
278 unsigned long long *crash_size,
279 unsigned long long *crash_base)
280{
281 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
282 "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
283}
284
285int __init parse_crashkernel_low(char *cmdline,
286 unsigned long long system_ram,
287 unsigned long long *crash_size,
288 unsigned long long *crash_base)
289{
290 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
291 "crashkernel=", suffix_tbl[SUFFIX_LOW]);
292}
293
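A few worked examples of the command lines these parsers accept (values are illustrative); note that get_last_crashkernel() means the last matching crashkernel= option on the command line wins:

        crashkernel=128M                  -> simple syntax: reserve 128M anywhere
        crashkernel=128M@16M              -> simple syntax with a fixed base address
        crashkernel=512M-2G:64M,2G-:128M  -> range syntax: a machine with 4G of RAM
                                             matches the "2G-" entry, reserving 128M
        crashkernel=256M,high             -> suffix syntax, handled when the arch
                                             calls parse_crashkernel_high()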
294Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
295 void *data, size_t data_len)
296{
297 struct elf_note *note = (struct elf_note *)buf;
298
299 note->n_namesz = strlen(name) + 1;
300 note->n_descsz = data_len;
301 note->n_type = type;
302 buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word));
303 memcpy(buf, name, note->n_namesz);
304 buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word));
305 memcpy(buf, data, data_len);
306 buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word));
307
308 return buf;
309}
310
311void final_note(Elf_Word *buf)
312{
313 memset(buf, 0, sizeof(struct elf_note));
314}
315
316static void update_vmcoreinfo_note(void)
317{
318 u32 *buf = vmcoreinfo_note;
319
320 if (!vmcoreinfo_size)
321 return;
322 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
323 vmcoreinfo_size);
324 final_note(buf);
325}
326
327void crash_save_vmcoreinfo(void)
328{
329 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
330 update_vmcoreinfo_note();
331}
332
333void vmcoreinfo_append_str(const char *fmt, ...)
334{
335 va_list args;
336 char buf[0x50];
337 size_t r;
338
339 va_start(args, fmt);
340 r = vscnprintf(buf, sizeof(buf), fmt, args);
341 va_end(args);
342
343 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
344
345 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
346
347 vmcoreinfo_size += r;
348}
349
350/*
351 * provide an empty default implementation here -- architecture
352 * code may override this
353 */
354void __weak arch_crash_save_vmcoreinfo(void)
355{}
356
357phys_addr_t __weak paddr_vmcoreinfo_note(void)
358{
359 return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
360}
361
362static int __init crash_save_vmcoreinfo_init(void)
363{
364 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
365 VMCOREINFO_PAGESIZE(PAGE_SIZE);
366
367 VMCOREINFO_SYMBOL(init_uts_ns);
368 VMCOREINFO_SYMBOL(node_online_map);
369#ifdef CONFIG_MMU
370 VMCOREINFO_SYMBOL(swapper_pg_dir);
371#endif
372 VMCOREINFO_SYMBOL(_stext);
373 VMCOREINFO_SYMBOL(vmap_area_list);
374
375#ifndef CONFIG_NEED_MULTIPLE_NODES
376 VMCOREINFO_SYMBOL(mem_map);
377 VMCOREINFO_SYMBOL(contig_page_data);
378#endif
379#ifdef CONFIG_SPARSEMEM
380 VMCOREINFO_SYMBOL(mem_section);
381 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
382 VMCOREINFO_STRUCT_SIZE(mem_section);
383 VMCOREINFO_OFFSET(mem_section, section_mem_map);
384#endif
385 VMCOREINFO_STRUCT_SIZE(page);
386 VMCOREINFO_STRUCT_SIZE(pglist_data);
387 VMCOREINFO_STRUCT_SIZE(zone);
388 VMCOREINFO_STRUCT_SIZE(free_area);
389 VMCOREINFO_STRUCT_SIZE(list_head);
390 VMCOREINFO_SIZE(nodemask_t);
391 VMCOREINFO_OFFSET(page, flags);
392 VMCOREINFO_OFFSET(page, _refcount);
393 VMCOREINFO_OFFSET(page, mapping);
394 VMCOREINFO_OFFSET(page, lru);
395 VMCOREINFO_OFFSET(page, _mapcount);
396 VMCOREINFO_OFFSET(page, private);
397 VMCOREINFO_OFFSET(page, compound_dtor);
398 VMCOREINFO_OFFSET(page, compound_order);
399 VMCOREINFO_OFFSET(page, compound_head);
400 VMCOREINFO_OFFSET(pglist_data, node_zones);
401 VMCOREINFO_OFFSET(pglist_data, nr_zones);
402#ifdef CONFIG_FLAT_NODE_MEM_MAP
403 VMCOREINFO_OFFSET(pglist_data, node_mem_map);
404#endif
405 VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
406 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
407 VMCOREINFO_OFFSET(pglist_data, node_id);
408 VMCOREINFO_OFFSET(zone, free_area);
409 VMCOREINFO_OFFSET(zone, vm_stat);
410 VMCOREINFO_OFFSET(zone, spanned_pages);
411 VMCOREINFO_OFFSET(free_area, free_list);
412 VMCOREINFO_OFFSET(list_head, next);
413 VMCOREINFO_OFFSET(list_head, prev);
414 VMCOREINFO_OFFSET(vmap_area, va_start);
415 VMCOREINFO_OFFSET(vmap_area, list);
416 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
417 log_buf_vmcoreinfo_setup();
418 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
419 VMCOREINFO_NUMBER(NR_FREE_PAGES);
420 VMCOREINFO_NUMBER(PG_lru);
421 VMCOREINFO_NUMBER(PG_private);
422 VMCOREINFO_NUMBER(PG_swapcache);
423 VMCOREINFO_NUMBER(PG_slab);
424#ifdef CONFIG_MEMORY_FAILURE
425 VMCOREINFO_NUMBER(PG_hwpoison);
426#endif
427 VMCOREINFO_NUMBER(PG_head_mask);
428 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
429#ifdef CONFIG_HUGETLB_PAGE
430 VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
431#endif
432
433 arch_crash_save_vmcoreinfo();
434 update_vmcoreinfo_note();
435
436 return 0;
437}
438
439subsys_initcall(crash_save_vmcoreinfo_init);
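For reference, the buffer built by append_elf_note() and update_vmcoreinfo_note() above follows the standard ELF note layout; roughly (contents are illustrative, sizes depend on the running kernel):

        struct elf_note { n_namesz = 11, n_descsz = vmcoreinfo_size, n_type = 0 }
        "VMCOREINFO\0"      padded up to the next Elf_Word (4-byte) boundary
        vmcoreinfo_data[]   e.g. "OSRELEASE=4.12.0\nPAGESIZE=4096\n
                            SYMBOL(init_uts_ns)=ffffffff...\n" ...
        all-zero elf_note   written by final_note() to terminate the note list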
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c04917cad1bf..1b2be63c8528 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -229,12 +229,18 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
229 } 229 }
230 230
231 if (regs) { 231 if (regs) {
232 mm_segment_t fs;
233
232 if (crosstask) 234 if (crosstask)
233 goto exit_put; 235 goto exit_put;
234 236
235 if (add_mark) 237 if (add_mark)
236 perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); 238 perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
239
240 fs = get_fs();
241 set_fs(USER_DS);
237 perf_callchain_user(&ctx, regs); 242 perf_callchain_user(&ctx, regs);
243 set_fs(fs);
238 } 244 }
239 } 245 }
240 246
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ff01cba86f43..6e75a5c9412d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -48,6 +48,8 @@
48#include <linux/parser.h> 48#include <linux/parser.h>
49#include <linux/sched/clock.h> 49#include <linux/sched/clock.h>
50#include <linux/sched/mm.h> 50#include <linux/sched/mm.h>
51#include <linux/proc_ns.h>
52#include <linux/mount.h>
51 53
52#include "internal.h" 54#include "internal.h"
53 55
@@ -379,6 +381,7 @@ static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
379 381
380static atomic_t nr_mmap_events __read_mostly; 382static atomic_t nr_mmap_events __read_mostly;
381static atomic_t nr_comm_events __read_mostly; 383static atomic_t nr_comm_events __read_mostly;
384static atomic_t nr_namespaces_events __read_mostly;
382static atomic_t nr_task_events __read_mostly; 385static atomic_t nr_task_events __read_mostly;
383static atomic_t nr_freq_events __read_mostly; 386static atomic_t nr_freq_events __read_mostly;
384static atomic_t nr_switch_events __read_mostly; 387static atomic_t nr_switch_events __read_mostly;
@@ -3991,6 +3994,8 @@ static void unaccount_event(struct perf_event *event)
3991 atomic_dec(&nr_mmap_events); 3994 atomic_dec(&nr_mmap_events);
3992 if (event->attr.comm) 3995 if (event->attr.comm)
3993 atomic_dec(&nr_comm_events); 3996 atomic_dec(&nr_comm_events);
3997 if (event->attr.namespaces)
3998 atomic_dec(&nr_namespaces_events);
3994 if (event->attr.task) 3999 if (event->attr.task)
3995 atomic_dec(&nr_task_events); 4000 atomic_dec(&nr_task_events);
3996 if (event->attr.freq) 4001 if (event->attr.freq)
@@ -6491,6 +6496,7 @@ static void perf_event_task(struct task_struct *task,
6491void perf_event_fork(struct task_struct *task) 6496void perf_event_fork(struct task_struct *task)
6492{ 6497{
6493 perf_event_task(task, NULL, 1); 6498 perf_event_task(task, NULL, 1);
6499 perf_event_namespaces(task);
6494} 6500}
6495 6501
6496/* 6502/*
@@ -6593,6 +6599,132 @@ void perf_event_comm(struct task_struct *task, bool exec)
6593} 6599}
6594 6600
6595/* 6601/*
6602 * namespaces tracking
6603 */
6604
6605struct perf_namespaces_event {
6606 struct task_struct *task;
6607
6608 struct {
6609 struct perf_event_header header;
6610
6611 u32 pid;
6612 u32 tid;
6613 u64 nr_namespaces;
6614 struct perf_ns_link_info link_info[NR_NAMESPACES];
6615 } event_id;
6616};
6617
6618static int perf_event_namespaces_match(struct perf_event *event)
6619{
6620 return event->attr.namespaces;
6621}
6622
6623static void perf_event_namespaces_output(struct perf_event *event,
6624 void *data)
6625{
6626 struct perf_namespaces_event *namespaces_event = data;
6627 struct perf_output_handle handle;
6628 struct perf_sample_data sample;
6629 int ret;
6630
6631 if (!perf_event_namespaces_match(event))
6632 return;
6633
6634 perf_event_header__init_id(&namespaces_event->event_id.header,
6635 &sample, event);
6636 ret = perf_output_begin(&handle, event,
6637 namespaces_event->event_id.header.size);
6638 if (ret)
6639 return;
6640
6641 namespaces_event->event_id.pid = perf_event_pid(event,
6642 namespaces_event->task);
6643 namespaces_event->event_id.tid = perf_event_tid(event,
6644 namespaces_event->task);
6645
6646 perf_output_put(&handle, namespaces_event->event_id);
6647
6648 perf_event__output_id_sample(event, &handle, &sample);
6649
6650 perf_output_end(&handle);
6651}
6652
6653static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
6654 struct task_struct *task,
6655 const struct proc_ns_operations *ns_ops)
6656{
6657 struct path ns_path;
6658 struct inode *ns_inode;
6659 void *error;
6660
6661 error = ns_get_path(&ns_path, task, ns_ops);
6662 if (!error) {
6663 ns_inode = ns_path.dentry->d_inode;
6664 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
6665 ns_link_info->ino = ns_inode->i_ino;
6666 }
6667}
6668
6669void perf_event_namespaces(struct task_struct *task)
6670{
6671 struct perf_namespaces_event namespaces_event;
6672 struct perf_ns_link_info *ns_link_info;
6673
6674 if (!atomic_read(&nr_namespaces_events))
6675 return;
6676
6677 namespaces_event = (struct perf_namespaces_event){
6678 .task = task,
6679 .event_id = {
6680 .header = {
6681 .type = PERF_RECORD_NAMESPACES,
6682 .misc = 0,
6683 .size = sizeof(namespaces_event.event_id),
6684 },
6685 /* .pid */
6686 /* .tid */
6687 .nr_namespaces = NR_NAMESPACES,
6688 /* .link_info[NR_NAMESPACES] */
6689 },
6690 };
6691
6692 ns_link_info = namespaces_event.event_id.link_info;
6693
6694 perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
6695 task, &mntns_operations);
6696
6697#ifdef CONFIG_USER_NS
6698 perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
6699 task, &userns_operations);
6700#endif
6701#ifdef CONFIG_NET_NS
6702 perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
6703 task, &netns_operations);
6704#endif
6705#ifdef CONFIG_UTS_NS
6706 perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
6707 task, &utsns_operations);
6708#endif
6709#ifdef CONFIG_IPC_NS
6710 perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
6711 task, &ipcns_operations);
6712#endif
6713#ifdef CONFIG_PID_NS
6714 perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
6715 task, &pidns_operations);
6716#endif
6717#ifdef CONFIG_CGROUPS
6718 perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
6719 task, &cgroupns_operations);
6720#endif
6721
6722 perf_iterate_sb(perf_event_namespaces_output,
6723 &namespaces_event,
6724 NULL);
6725}
6726
6727/*
6596 * mmap tracking 6728 * mmap tracking
6597 */ 6729 */
6598 6730
@@ -9146,6 +9278,8 @@ static void account_event(struct perf_event *event)
9146 atomic_inc(&nr_mmap_events); 9278 atomic_inc(&nr_mmap_events);
9147 if (event->attr.comm) 9279 if (event->attr.comm)
9148 atomic_inc(&nr_comm_events); 9280 atomic_inc(&nr_comm_events);
9281 if (event->attr.namespaces)
9282 atomic_inc(&nr_namespaces_events);
9149 if (event->attr.task) 9283 if (event->attr.task)
9150 atomic_inc(&nr_task_events); 9284 atomic_inc(&nr_task_events);
9151 if (event->attr.freq) 9285 if (event->attr.freq)
@@ -9691,6 +9825,11 @@ SYSCALL_DEFINE5(perf_event_open,
9691 return -EACCES; 9825 return -EACCES;
9692 } 9826 }
9693 9827
9828 if (attr.namespaces) {
9829 if (!capable(CAP_SYS_ADMIN))
9830 return -EACCES;
9831 }
9832
9694 if (attr.freq) { 9833 if (attr.freq) {
9695 if (attr.sample_freq > sysctl_perf_event_sample_rate) 9834 if (attr.sample_freq > sysctl_perf_event_sample_rate)
9696 return -EINVAL; 9835 return -EINVAL;
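The core.c hunks above add namespace tracking to perf: nr_namespaces_events accounting, PERF_RECORD_NAMESPACES emission from the fork/unshare paths, and a CAP_SYS_ADMIN check in perf_event_open(). As a rough illustration of how a consumer requests these records — a minimal userspace sketch, assuming uapi headers that already carry the new attr.namespaces bit; the helper name is made up:

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Open a dummy software event that only delivers side-band records,
 * including PERF_RECORD_NAMESPACES. Needs CAP_SYS_ADMIN per the check
 * added to perf_event_open() above.
 */
static int open_namespaces_event(pid_t pid)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_DUMMY;
        attr.namespaces = 1;            /* request PERF_RECORD_NAMESPACES */
        attr.sample_id_all = 1;

        return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}

The records themselves are then read from the event's mmap'ed ring buffer like any other side-band record.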
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 257fa460b846..2831480c63a2 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -297,6 +297,19 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
297 rb->paused = 1; 297 rb->paused = 1;
298} 298}
299 299
300void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
301{
302 /*
303 * OVERWRITE is determined by perf_aux_output_end() and can't
304 * be passed in directly.
305 */
306 if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
307 return;
308
309 handle->aux_flags |= flags;
310}
311EXPORT_SYMBOL_GPL(perf_aux_output_flag);
312
300/* 313/*
301 * This is called before hardware starts writing to the AUX area to 314 * This is called before hardware starts writing to the AUX area to
302 * obtain an output handle and make sure there's room in the buffer. 315 * obtain an output handle and make sure there's room in the buffer.
@@ -360,6 +373,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
360 handle->event = event; 373 handle->event = event;
361 handle->head = aux_head; 374 handle->head = aux_head;
362 handle->size = 0; 375 handle->size = 0;
376 handle->aux_flags = 0;
363 377
364 /* 378 /*
365 * In overwrite mode, AUX data stores do not depend on aux_tail, 379 * In overwrite mode, AUX data stores do not depend on aux_tail,
@@ -408,34 +422,32 @@ err:
408 * of the AUX buffer management code is that after pmu::stop(), the AUX 422 * of the AUX buffer management code is that after pmu::stop(), the AUX
409 * transaction must be stopped and therefore drop the AUX reference count. 423 * transaction must be stopped and therefore drop the AUX reference count.
410 */ 424 */
411void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, 425void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
412 bool truncated)
413{ 426{
427 bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
414 struct ring_buffer *rb = handle->rb; 428 struct ring_buffer *rb = handle->rb;
415 bool wakeup = truncated;
416 unsigned long aux_head; 429 unsigned long aux_head;
417 u64 flags = 0;
418
419 if (truncated)
420 flags |= PERF_AUX_FLAG_TRUNCATED;
421 430
422 /* in overwrite mode, driver provides aux_head via handle */ 431 /* in overwrite mode, driver provides aux_head via handle */
423 if (rb->aux_overwrite) { 432 if (rb->aux_overwrite) {
424 flags |= PERF_AUX_FLAG_OVERWRITE; 433 handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
425 434
426 aux_head = handle->head; 435 aux_head = handle->head;
427 local_set(&rb->aux_head, aux_head); 436 local_set(&rb->aux_head, aux_head);
428 } else { 437 } else {
438 handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
439
429 aux_head = local_read(&rb->aux_head); 440 aux_head = local_read(&rb->aux_head);
430 local_add(size, &rb->aux_head); 441 local_add(size, &rb->aux_head);
431 } 442 }
432 443
433 if (size || flags) { 444 if (size || handle->aux_flags) {
434 /* 445 /*
435 * Only send RECORD_AUX if we have something useful to communicate 446 * Only send RECORD_AUX if we have something useful to communicate
436 */ 447 */
437 448
438 perf_event_aux_event(handle->event, aux_head, size, flags); 449 perf_event_aux_event(handle->event, aux_head, size,
450 handle->aux_flags);
439 } 451 }
440 452
441 aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); 453 aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
@@ -446,7 +458,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
446 } 458 }
447 459
448 if (wakeup) { 460 if (wakeup) {
449 if (truncated) 461 if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
450 handle->event->pending_disable = 1; 462 handle->event->pending_disable = 1;
451 perf_output_wakeup(handle); 463 perf_output_wakeup(handle);
452 } 464 }
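The ring_buffer.c hunks move AUX flags onto the output handle: drivers accumulate them with the new perf_aux_output_flag(), and perf_aux_output_end() loses its 'truncated' argument. A hedged sketch of a PMU driver on the reworked interface; my_pmu_read_hw() and the truncation condition are hypothetical:

static void my_pmu_update_aux(struct perf_event *event)
{
        struct perf_output_handle handle;
        unsigned long size;
        void *base;

        base = perf_aux_output_begin(&handle, event);
        if (!base)
                return;

        /* account the data the hardware produced (hypothetical helper) */
        size = my_pmu_read_hw(base, handle.size);

        /* hardware filled the whole buffer: flag the record as truncated */
        if (size == handle.size)
                perf_aux_output_flag(&handle, PERF_AUX_FLAG_TRUNCATED);

        perf_aux_output_end(&handle, size);
}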
diff --git a/kernel/fork.c b/kernel/fork.c
index 6c463c80e93d..aa1076c5e4a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -87,6 +87,7 @@
87#include <linux/compiler.h> 87#include <linux/compiler.h>
88#include <linux/sysctl.h> 88#include <linux/sysctl.h>
89#include <linux/kcov.h> 89#include <linux/kcov.h>
90#include <linux/livepatch.h>
90 91
91#include <asm/pgtable.h> 92#include <asm/pgtable.h>
92#include <asm/pgalloc.h> 93#include <asm/pgalloc.h>
@@ -178,6 +179,24 @@ void __weak arch_release_thread_stack(unsigned long *stack)
178 */ 179 */
179#define NR_CACHED_STACKS 2 180#define NR_CACHED_STACKS 2
180static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); 181static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
182
183static int free_vm_stack_cache(unsigned int cpu)
184{
185 struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
186 int i;
187
188 for (i = 0; i < NR_CACHED_STACKS; i++) {
189 struct vm_struct *vm_stack = cached_vm_stacks[i];
190
191 if (!vm_stack)
192 continue;
193
194 vfree(vm_stack->addr);
195 cached_vm_stacks[i] = NULL;
196 }
197
198 return 0;
199}
181#endif 200#endif
182 201
183static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) 202static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
@@ -202,7 +221,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
202 221
203 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, 222 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
204 VMALLOC_START, VMALLOC_END, 223 VMALLOC_START, VMALLOC_END,
205 THREADINFO_GFP | __GFP_HIGHMEM, 224 THREADINFO_GFP,
206 PAGE_KERNEL, 225 PAGE_KERNEL,
207 0, node, __builtin_return_address(0)); 226 0, node, __builtin_return_address(0));
208 227
@@ -466,6 +485,11 @@ void __init fork_init(void)
466 for (i = 0; i < UCOUNT_COUNTS; i++) { 485 for (i = 0; i < UCOUNT_COUNTS; i++) {
467 init_user_ns.ucount_max[i] = max_threads/2; 486 init_user_ns.ucount_max[i] = max_threads/2;
468 } 487 }
488
489#ifdef CONFIG_VMAP_STACK
490 cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
491 NULL, free_vm_stack_cache);
492#endif
469} 493}
470 494
471int __weak arch_dup_task_struct(struct task_struct *dst, 495int __weak arch_dup_task_struct(struct task_struct *dst,
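fork_init() now registers a CPU hotplug teardown callback so the per-CPU cache of vmalloc'ed thread stacks is drained when a CPU goes offline. The same pattern in isolation — a sketch with hypothetical names, not part of the patch:

#include <linux/cpuhotplug.h>
#include <linux/init.h>

static int my_drain_cpu_cache(unsigned int cpu)
{
        /* free whatever was cached for @cpu; teardown must not fail */
        return 0;
}

static int __init my_cache_init(void)
{
        int ret;

        /* dynamic prepare-stage state: no startup callback, teardown only */
        ret = cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "mydrv:cache",
                                NULL, my_drain_cpu_cache);
        return ret < 0 ? ret : 0;
}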
@@ -536,7 +560,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
536 set_task_stack_end_magic(tsk); 560 set_task_stack_end_magic(tsk);
537 561
538#ifdef CONFIG_CC_STACKPROTECTOR 562#ifdef CONFIG_CC_STACKPROTECTOR
539 tsk->stack_canary = get_random_int(); 563 tsk->stack_canary = get_random_long();
540#endif 564#endif
541 565
542 /* 566 /*
@@ -1313,7 +1337,7 @@ void __cleanup_sighand(struct sighand_struct *sighand)
1313 if (atomic_dec_and_test(&sighand->count)) { 1337 if (atomic_dec_and_test(&sighand->count)) {
1314 signalfd_cleanup(sighand); 1338 signalfd_cleanup(sighand);
1315 /* 1339 /*
1316 * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it 1340 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
1317 * without an RCU grace period, see __lock_task_sighand(). 1341 * without an RCU grace period, see __lock_task_sighand().
1318 */ 1342 */
1319 kmem_cache_free(sighand_cachep, sighand); 1343 kmem_cache_free(sighand_cachep, sighand);
@@ -1438,6 +1462,7 @@ static void rt_mutex_init_task(struct task_struct *p)
1438#ifdef CONFIG_RT_MUTEXES 1462#ifdef CONFIG_RT_MUTEXES
1439 p->pi_waiters = RB_ROOT; 1463 p->pi_waiters = RB_ROOT;
1440 p->pi_waiters_leftmost = NULL; 1464 p->pi_waiters_leftmost = NULL;
1465 p->pi_top_task = NULL;
1441 p->pi_blocked_on = NULL; 1466 p->pi_blocked_on = NULL;
1442#endif 1467#endif
1443} 1468}
@@ -1679,9 +1704,12 @@ static __latent_entropy struct task_struct *copy_process(
1679 goto bad_fork_cleanup_perf; 1704 goto bad_fork_cleanup_perf;
1680 /* copy all the process information */ 1705 /* copy all the process information */
1681 shm_init_task(p); 1706 shm_init_task(p);
1682 retval = copy_semundo(clone_flags, p); 1707 retval = security_task_alloc(p, clone_flags);
1683 if (retval) 1708 if (retval)
1684 goto bad_fork_cleanup_audit; 1709 goto bad_fork_cleanup_audit;
1710 retval = copy_semundo(clone_flags, p);
1711 if (retval)
1712 goto bad_fork_cleanup_security;
1685 retval = copy_files(clone_flags, p); 1713 retval = copy_files(clone_flags, p);
1686 if (retval) 1714 if (retval)
1687 goto bad_fork_cleanup_semundo; 1715 goto bad_fork_cleanup_semundo;
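copy_process() now calls the revived security_task_alloc() hook before copy_semundo() and unwinds it through the new bad_fork_cleanup_security label. A hedged sketch of an LSM supplying that hook; the "example" names are hypothetical and registration via security_add_hooks() is left out:

#include <linux/lsm_hooks.h>

static int example_task_alloc(struct task_struct *task,
                              unsigned long clone_flags)
{
        /* allocate and attach per-task security state; 0 means success */
        return 0;
}

static void example_task_free(struct task_struct *task)
{
        /* release whatever example_task_alloc() attached */
}

static struct security_hook_list example_hooks[] = {
        LSM_HOOK_INIT(task_alloc, example_task_alloc),
        LSM_HOOK_INIT(task_free, example_task_free),
};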
@@ -1797,6 +1825,8 @@ static __latent_entropy struct task_struct *copy_process(
1797 p->parent_exec_id = current->self_exec_id; 1825 p->parent_exec_id = current->self_exec_id;
1798 } 1826 }
1799 1827
1828 klp_copy_process(p);
1829
1800 spin_lock(&current->sighand->siglock); 1830 spin_lock(&current->sighand->siglock);
1801 1831
1802 /* 1832 /*
@@ -1815,11 +1845,13 @@ static __latent_entropy struct task_struct *copy_process(
1815 */ 1845 */
1816 recalc_sigpending(); 1846 recalc_sigpending();
1817 if (signal_pending(current)) { 1847 if (signal_pending(current)) {
1818 spin_unlock(&current->sighand->siglock);
1819 write_unlock_irq(&tasklist_lock);
1820 retval = -ERESTARTNOINTR; 1848 retval = -ERESTARTNOINTR;
1821 goto bad_fork_cancel_cgroup; 1849 goto bad_fork_cancel_cgroup;
1822 } 1850 }
1851 if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
1852 retval = -ENOMEM;
1853 goto bad_fork_cancel_cgroup;
1854 }
1823 1855
1824 if (likely(p->pid)) { 1856 if (likely(p->pid)) {
1825 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1857 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -1877,6 +1909,8 @@ static __latent_entropy struct task_struct *copy_process(
1877 return p; 1909 return p;
1878 1910
1879bad_fork_cancel_cgroup: 1911bad_fork_cancel_cgroup:
1912 spin_unlock(&current->sighand->siglock);
1913 write_unlock_irq(&tasklist_lock);
1880 cgroup_cancel_fork(p); 1914 cgroup_cancel_fork(p);
1881bad_fork_free_pid: 1915bad_fork_free_pid:
1882 cgroup_threadgroup_change_end(current); 1916 cgroup_threadgroup_change_end(current);
@@ -1903,6 +1937,8 @@ bad_fork_cleanup_files:
1903 exit_files(p); /* blocking */ 1937 exit_files(p); /* blocking */
1904bad_fork_cleanup_semundo: 1938bad_fork_cleanup_semundo:
1905 exit_sem(p); 1939 exit_sem(p);
1940bad_fork_cleanup_security:
1941 security_task_free(p);
1906bad_fork_cleanup_audit: 1942bad_fork_cleanup_audit:
1907 audit_free(p); 1943 audit_free(p);
1908bad_fork_cleanup_perf: 1944bad_fork_cleanup_perf:
@@ -2144,7 +2180,7 @@ void __init proc_caches_init(void)
2144{ 2180{
2145 sighand_cachep = kmem_cache_create("sighand_cache", 2181 sighand_cachep = kmem_cache_create("sighand_cache",
2146 sizeof(struct sighand_struct), 0, 2182 sizeof(struct sighand_struct), 0,
2147 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| 2183 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
2148 SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); 2184 SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
2149 signal_cachep = kmem_cache_create("signal_cache", 2185 signal_cachep = kmem_cache_create("signal_cache",
2150 sizeof(struct signal_struct), 0, 2186 sizeof(struct signal_struct), 0,
@@ -2352,6 +2388,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
2352 } 2388 }
2353 } 2389 }
2354 2390
2391 perf_event_namespaces(current);
2392
2355bad_unshare_cleanup_cred: 2393bad_unshare_cleanup_cred:
2356 if (new_cred) 2394 if (new_cred)
2357 put_cred(new_cred); 2395 put_cred(new_cred);
diff --git a/kernel/futex.c b/kernel/futex.c
index 45858ec73941..357348a6cf6b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -802,7 +802,7 @@ static int refill_pi_state_cache(void)
802 return 0; 802 return 0;
803} 803}
804 804
805static struct futex_pi_state * alloc_pi_state(void) 805static struct futex_pi_state *alloc_pi_state(void)
806{ 806{
807 struct futex_pi_state *pi_state = current->pi_state_cache; 807 struct futex_pi_state *pi_state = current->pi_state_cache;
808 808
@@ -812,6 +812,11 @@ static struct futex_pi_state * alloc_pi_state(void)
812 return pi_state; 812 return pi_state;
813} 813}
814 814
815static void get_pi_state(struct futex_pi_state *pi_state)
816{
817 WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
818}
819
815/* 820/*
816 * Drops a reference to the pi_state object and frees or caches it 821 * Drops a reference to the pi_state object and frees or caches it
817 * when the last reference is gone. 822 * when the last reference is gone.
@@ -856,7 +861,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
856 * Look up the task based on what TID userspace gave us. 861 * Look up the task based on what TID userspace gave us.
857 * We dont trust it. 862 * We dont trust it.
858 */ 863 */
859static struct task_struct * futex_find_get_task(pid_t pid) 864static struct task_struct *futex_find_get_task(pid_t pid)
860{ 865{
861 struct task_struct *p; 866 struct task_struct *p;
862 867
@@ -916,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr)
916 pi_state->owner = NULL; 921 pi_state->owner = NULL;
917 raw_spin_unlock_irq(&curr->pi_lock); 922 raw_spin_unlock_irq(&curr->pi_lock);
918 923
919 rt_mutex_unlock(&pi_state->pi_mutex); 924 get_pi_state(pi_state);
920
921 spin_unlock(&hb->lock); 925 spin_unlock(&hb->lock);
922 926
927 rt_mutex_futex_unlock(&pi_state->pi_mutex);
928 put_pi_state(pi_state);
929
923 raw_spin_lock_irq(&curr->pi_lock); 930 raw_spin_lock_irq(&curr->pi_lock);
924 } 931 }
925 raw_spin_unlock_irq(&curr->pi_lock); 932 raw_spin_unlock_irq(&curr->pi_lock);
@@ -973,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr)
973 * 980 *
974 * [10] There is no transient state which leaves owner and user space 981 * [10] There is no transient state which leaves owner and user space
975 * TID out of sync. 982 * TID out of sync.
983 *
984 *
985 * Serialization and lifetime rules:
986 *
987 * hb->lock:
988 *
989 * hb -> futex_q, relation
990 * futex_q -> pi_state, relation
991 *
992 * (cannot be raw because hb can contain arbitrary amount
993 * of futex_q's)
994 *
995 * pi_mutex->wait_lock:
996 *
997 * {uval, pi_state}
998 *
999 * (and pi_mutex 'obviously')
1000 *
1001 * p->pi_lock:
1002 *
1003 * p->pi_state_list -> pi_state->list, relation
1004 *
1005 * pi_state->refcount:
1006 *
1007 * pi_state lifetime
1008 *
1009 *
1010 * Lock order:
1011 *
1012 * hb->lock
1013 * pi_mutex->wait_lock
1014 * p->pi_lock
1015 *
976 */ 1016 */
977 1017
978/* 1018/*
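The comment block added above documents what each lock protects and the hb->lock -> pi_mutex->wait_lock -> p->pi_lock ordering that the rest of the series enforces. Purely as an illustration of that ordering (a sketch, not code from the patch):

static void pi_lock_order_sketch(struct futex_hash_bucket *hb,
                                 struct futex_pi_state *pi_state,
                                 struct task_struct *p)
{
        spin_lock(&hb->lock);                             /* hb -> futex_q */
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); /* {uval, pi_state} */
        raw_spin_lock(&p->pi_lock);                       /* p->pi_state_list */

        /* ... manipulate pi_state / rt_mutex state here ... */

        raw_spin_unlock(&p->pi_lock);
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        spin_unlock(&hb->lock);
}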
@@ -980,10 +1020,13 @@ void exit_pi_state_list(struct task_struct *curr)
980 * the pi_state against the user space value. If correct, attach to 1020 * the pi_state against the user space value. If correct, attach to
981 * it. 1021 * it.
982 */ 1022 */
983static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, 1023static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
1024 struct futex_pi_state *pi_state,
984 struct futex_pi_state **ps) 1025 struct futex_pi_state **ps)
985{ 1026{
986 pid_t pid = uval & FUTEX_TID_MASK; 1027 pid_t pid = uval & FUTEX_TID_MASK;
1028 u32 uval2;
1029 int ret;
987 1030
988 /* 1031 /*
989 * Userspace might have messed up non-PI and PI futexes [3] 1032 * Userspace might have messed up non-PI and PI futexes [3]
@@ -991,9 +1034,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
991 if (unlikely(!pi_state)) 1034 if (unlikely(!pi_state))
992 return -EINVAL; 1035 return -EINVAL;
993 1036
1037 /*
1038 * We get here with hb->lock held, and having found a
1039 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
1040 * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
1041 * which in turn means that futex_lock_pi() still has a reference on
1042 * our pi_state.
1043 *
1044 * The waiter holding a reference on @pi_state also protects against
1045 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
1046 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
1047 * free pi_state before we can take a reference ourselves.
1048 */
994 WARN_ON(!atomic_read(&pi_state->refcount)); 1049 WARN_ON(!atomic_read(&pi_state->refcount));
995 1050
996 /* 1051 /*
1052 * Now that we have a pi_state, we can acquire wait_lock
1053 * and do the state validation.
1054 */
1055 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1056
1057 /*
1058 * Since {uval, pi_state} is serialized by wait_lock, and our current
1059 * uval was read without holding it, it can have changed. Verify it
1060 * still is what we expect it to be, otherwise retry the entire
1061 * operation.
1062 */
1063 if (get_futex_value_locked(&uval2, uaddr))
1064 goto out_efault;
1065
1066 if (uval != uval2)
1067 goto out_eagain;
1068
1069 /*
997 * Handle the owner died case: 1070 * Handle the owner died case:
998 */ 1071 */
999 if (uval & FUTEX_OWNER_DIED) { 1072 if (uval & FUTEX_OWNER_DIED) {
@@ -1008,11 +1081,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
1008 * is not 0. Inconsistent state. [5] 1081 * is not 0. Inconsistent state. [5]
1009 */ 1082 */
1010 if (pid) 1083 if (pid)
1011 return -EINVAL; 1084 goto out_einval;
1012 /* 1085 /*
1013 * Take a ref on the state and return success. [4] 1086 * Take a ref on the state and return success. [4]
1014 */ 1087 */
1015 goto out_state; 1088 goto out_attach;
1016 } 1089 }
1017 1090
1018 /* 1091 /*
@@ -1024,14 +1097,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
1024 * Take a ref on the state and return success. [6] 1097 * Take a ref on the state and return success. [6]
1025 */ 1098 */
1026 if (!pid) 1099 if (!pid)
1027 goto out_state; 1100 goto out_attach;
1028 } else { 1101 } else {
1029 /* 1102 /*
1030 * If the owner died bit is not set, then the pi_state 1103 * If the owner died bit is not set, then the pi_state
1031 * must have an owner. [7] 1104 * must have an owner. [7]
1032 */ 1105 */
1033 if (!pi_state->owner) 1106 if (!pi_state->owner)
1034 return -EINVAL; 1107 goto out_einval;
1035 } 1108 }
1036 1109
1037 /* 1110 /*
@@ -1040,11 +1113,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
1040 * user space TID. [9/10] 1113 * user space TID. [9/10]
1041 */ 1114 */
1042 if (pid != task_pid_vnr(pi_state->owner)) 1115 if (pid != task_pid_vnr(pi_state->owner))
1043 return -EINVAL; 1116 goto out_einval;
1044out_state: 1117
1045 atomic_inc(&pi_state->refcount); 1118out_attach:
1119 get_pi_state(pi_state);
1120 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1046 *ps = pi_state; 1121 *ps = pi_state;
1047 return 0; 1122 return 0;
1123
1124out_einval:
1125 ret = -EINVAL;
1126 goto out_error;
1127
1128out_eagain:
1129 ret = -EAGAIN;
1130 goto out_error;
1131
1132out_efault:
1133 ret = -EFAULT;
1134 goto out_error;
1135
1136out_error:
1137 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1138 return ret;
1048} 1139}
1049 1140
1050/* 1141/*
@@ -1095,6 +1186,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
1095 1186
1096 /* 1187 /*
1097 * No existing pi state. First waiter. [2] 1188 * No existing pi state. First waiter. [2]
1189 *
1190 * This creates pi_state, we have hb->lock held, this means nothing can
1191 * observe this state, wait_lock is irrelevant.
1098 */ 1192 */
1099 pi_state = alloc_pi_state(); 1193 pi_state = alloc_pi_state();
1100 1194
@@ -1119,17 +1213,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
1119 return 0; 1213 return 0;
1120} 1214}
1121 1215
1122static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, 1216static int lookup_pi_state(u32 __user *uaddr, u32 uval,
1217 struct futex_hash_bucket *hb,
1123 union futex_key *key, struct futex_pi_state **ps) 1218 union futex_key *key, struct futex_pi_state **ps)
1124{ 1219{
1125 struct futex_q *match = futex_top_waiter(hb, key); 1220 struct futex_q *top_waiter = futex_top_waiter(hb, key);
1126 1221
1127 /* 1222 /*
1128 * If there is a waiter on that futex, validate it and 1223 * If there is a waiter on that futex, validate it and
1129 * attach to the pi_state when the validation succeeds. 1224 * attach to the pi_state when the validation succeeds.
1130 */ 1225 */
1131 if (match) 1226 if (top_waiter)
1132 return attach_to_pi_state(uval, match->pi_state, ps); 1227 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
1133 1228
1134 /* 1229 /*
1135 * We are the first waiter - try to look up the owner based on 1230 * We are the first waiter - try to look up the owner based on
@@ -1148,7 +1243,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
1148 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) 1243 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
1149 return -EFAULT; 1244 return -EFAULT;
1150 1245
1151 /*If user space value changed, let the caller retry */ 1246 /* If user space value changed, let the caller retry */
1152 return curval != uval ? -EAGAIN : 0; 1247 return curval != uval ? -EAGAIN : 0;
1153} 1248}
1154 1249
@@ -1176,7 +1271,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
1176 struct task_struct *task, int set_waiters) 1271 struct task_struct *task, int set_waiters)
1177{ 1272{
1178 u32 uval, newval, vpid = task_pid_vnr(task); 1273 u32 uval, newval, vpid = task_pid_vnr(task);
1179 struct futex_q *match; 1274 struct futex_q *top_waiter;
1180 int ret; 1275 int ret;
1181 1276
1182 /* 1277 /*
@@ -1202,9 +1297,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
1202 * Lookup existing state first. If it exists, try to attach to 1297 * Lookup existing state first. If it exists, try to attach to
1203 * its pi_state. 1298 * its pi_state.
1204 */ 1299 */
1205 match = futex_top_waiter(hb, key); 1300 top_waiter = futex_top_waiter(hb, key);
1206 if (match) 1301 if (top_waiter)
1207 return attach_to_pi_state(uval, match->pi_state, ps); 1302 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
1208 1303
1209 /* 1304 /*
1210 * No waiter and user TID is 0. We are here because the 1305 * No waiter and user TID is 0. We are here because the
@@ -1285,50 +1380,44 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
1285 wake_q_add(wake_q, p); 1380 wake_q_add(wake_q, p);
1286 __unqueue_futex(q); 1381 __unqueue_futex(q);
1287 /* 1382 /*
1288 * The waiting task can free the futex_q as soon as 1383 * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
1289 * q->lock_ptr = NULL is written, without taking any locks. A 1384 * is written, without taking any locks. This is possible in the event
1290 * memory barrier is required here to prevent the following 1385 * of a spurious wakeup, for example. A memory barrier is required here
1291 * store to lock_ptr from getting ahead of the plist_del. 1386 * to prevent the following store to lock_ptr from getting ahead of the
1387 * plist_del in __unqueue_futex().
1292 */ 1388 */
1293 smp_wmb(); 1389 smp_store_release(&q->lock_ptr, NULL);
1294 q->lock_ptr = NULL;
1295} 1390}
1296 1391
1297static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, 1392/*
1298 struct futex_hash_bucket *hb) 1393 * Caller must hold a reference on @pi_state.
1394 */
1395static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
1299{ 1396{
1300 struct task_struct *new_owner;
1301 struct futex_pi_state *pi_state = this->pi_state;
1302 u32 uninitialized_var(curval), newval; 1397 u32 uninitialized_var(curval), newval;
1398 struct task_struct *new_owner;
1399 bool postunlock = false;
1303 DEFINE_WAKE_Q(wake_q); 1400 DEFINE_WAKE_Q(wake_q);
1304 bool deboost;
1305 int ret = 0; 1401 int ret = 0;
1306 1402
1307 if (!pi_state)
1308 return -EINVAL;
1309
1310 /*
1311 * If current does not own the pi_state then the futex is
1312 * inconsistent and user space fiddled with the futex value.
1313 */
1314 if (pi_state->owner != current)
1315 return -EINVAL;
1316
1317 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1318 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 1403 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
1404 if (WARN_ON_ONCE(!new_owner)) {
1405 /*
1406 * As per the comment in futex_unlock_pi() this should not happen.
1407 *
1408 * When this happens, give up our locks and try again, giving
1409 * the futex_lock_pi() instance time to complete, either by
1410 * waiting on the rtmutex or removing itself from the futex
1411 * queue.
1412 */
1413 ret = -EAGAIN;
1414 goto out_unlock;
1415 }
1319 1416
1320 /* 1417 /*
1321 * It is possible that the next waiter (the one that brought 1418 * We pass it to the next owner. The WAITERS bit is always kept
1322 * this owner to the kernel) timed out and is no longer 1419 * enabled while there is PI state around. We cleanup the owner
1323 * waiting on the lock. 1420 * died bit, because we are the owner.
1324 */
1325 if (!new_owner)
1326 new_owner = this->task;
1327
1328 /*
1329 * We pass it to the next owner. The WAITERS bit is always
1330 * kept enabled while there is PI state around. We cleanup the
1331 * owner died bit, because we are the owner.
1332 */ 1421 */
1333 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 1422 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1334 1423
@@ -1337,6 +1426,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
1337 1426
1338 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { 1427 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
1339 ret = -EFAULT; 1428 ret = -EFAULT;
1429
1340 } else if (curval != uval) { 1430 } else if (curval != uval) {
1341 /* 1431 /*
1342 * If a unconditional UNLOCK_PI operation (user space did not 1432 * If a unconditional UNLOCK_PI operation (user space did not
@@ -1349,10 +1439,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
1349 else 1439 else
1350 ret = -EINVAL; 1440 ret = -EINVAL;
1351 } 1441 }
1352 if (ret) { 1442
1353 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1443 if (ret)
1354 return ret; 1444 goto out_unlock;
1355 } 1445
1446 /*
1447 * This is a point of no return; once we modify the uval there is no
1448 * going back and subsequent operations must not fail.
1449 */
1356 1450
1357 raw_spin_lock(&pi_state->owner->pi_lock); 1451 raw_spin_lock(&pi_state->owner->pi_lock);
1358 WARN_ON(list_empty(&pi_state->list)); 1452 WARN_ON(list_empty(&pi_state->list));
@@ -1365,22 +1459,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
1365 pi_state->owner = new_owner; 1459 pi_state->owner = new_owner;
1366 raw_spin_unlock(&new_owner->pi_lock); 1460 raw_spin_unlock(&new_owner->pi_lock);
1367 1461
1368 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1462 postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
1369 1463
1370 deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); 1464out_unlock:
1465 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1371 1466
1372 /* 1467 if (postunlock)
1373 * First unlock HB so the waiter does not spin on it once he got woken 1468 rt_mutex_postunlock(&wake_q);
1374 * up. Second wake up the waiter before the priority is adjusted. If we
1375 * deboost first (and lose our higher priority), then the task might get
1376 * scheduled away before the wake up can take place.
1377 */
1378 spin_unlock(&hb->lock);
1379 wake_up_q(&wake_q);
1380 if (deboost)
1381 rt_mutex_adjust_prio(current);
1382 1469
1383 return 0; 1470 return ret;
1384} 1471}
1385 1472
1386/* 1473/*
@@ -1826,7 +1913,7 @@ retry_private:
1826 * If that call succeeds then we have pi_state and an 1913 * If that call succeeds then we have pi_state and an
1827 * initial refcount on it. 1914 * initial refcount on it.
1828 */ 1915 */
1829 ret = lookup_pi_state(ret, hb2, &key2, &pi_state); 1916 ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
1830 } 1917 }
1831 1918
1832 switch (ret) { 1919 switch (ret) {
@@ -1909,7 +1996,7 @@ retry_private:
1909 * refcount on the pi_state and store the pointer in 1996 * refcount on the pi_state and store the pointer in
1910 * the futex_q object of the waiter. 1997 * the futex_q object of the waiter.
1911 */ 1998 */
1912 atomic_inc(&pi_state->refcount); 1999 get_pi_state(pi_state);
1913 this->pi_state = pi_state; 2000 this->pi_state = pi_state;
1914 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 2001 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1915 this->rt_waiter, 2002 this->rt_waiter,
@@ -2009,20 +2096,7 @@ queue_unlock(struct futex_hash_bucket *hb)
2009 hb_waiters_dec(hb); 2096 hb_waiters_dec(hb);
2010} 2097}
2011 2098
2012/** 2099static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2013 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
2014 * @q: The futex_q to enqueue
2015 * @hb: The destination hash bucket
2016 *
2017 * The hb->lock must be held by the caller, and is released here. A call to
2018 * queue_me() is typically paired with exactly one call to unqueue_me(). The
2019 * exceptions involve the PI related operations, which may use unqueue_me_pi()
2020 * or nothing if the unqueue is done as part of the wake process and the unqueue
2021 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
2022 * an example).
2023 */
2024static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2025 __releases(&hb->lock)
2026{ 2100{
2027 int prio; 2101 int prio;
2028 2102
@@ -2039,6 +2113,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2039 plist_node_init(&q->list, prio); 2113 plist_node_init(&q->list, prio);
2040 plist_add(&q->list, &hb->chain); 2114 plist_add(&q->list, &hb->chain);
2041 q->task = current; 2115 q->task = current;
2116}
2117
2118/**
2119 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
2120 * @q: The futex_q to enqueue
2121 * @hb: The destination hash bucket
2122 *
2123 * The hb->lock must be held by the caller, and is released here. A call to
2124 * queue_me() is typically paired with exactly one call to unqueue_me(). The
2125 * exceptions involve the PI related operations, which may use unqueue_me_pi()
2126 * or nothing if the unqueue is done as part of the wake process and the unqueue
2127 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
2128 * an example).
2129 */
2130static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2131 __releases(&hb->lock)
2132{
2133 __queue_me(q, hb);
2042 spin_unlock(&hb->lock); 2134 spin_unlock(&hb->lock);
2043} 2135}
2044 2136
@@ -2125,10 +2217,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2125{ 2217{
2126 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 2218 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2127 struct futex_pi_state *pi_state = q->pi_state; 2219 struct futex_pi_state *pi_state = q->pi_state;
2128 struct task_struct *oldowner = pi_state->owner;
2129 u32 uval, uninitialized_var(curval), newval; 2220 u32 uval, uninitialized_var(curval), newval;
2221 struct task_struct *oldowner;
2130 int ret; 2222 int ret;
2131 2223
2224 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2225
2226 oldowner = pi_state->owner;
2132 /* Owner died? */ 2227 /* Owner died? */
2133 if (!pi_state->owner) 2228 if (!pi_state->owner)
2134 newtid |= FUTEX_OWNER_DIED; 2229 newtid |= FUTEX_OWNER_DIED;
@@ -2136,7 +2231,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2136 /* 2231 /*
2137 * We are here either because we stole the rtmutex from the 2232 * We are here either because we stole the rtmutex from the
2138 * previous highest priority waiter or we are the highest priority 2233 * previous highest priority waiter or we are the highest priority
2139 * waiter but failed to get the rtmutex the first time. 2234 * waiter but have failed to get the rtmutex the first time.
2235 *
2140 * We have to replace the newowner TID in the user space variable. 2236 * We have to replace the newowner TID in the user space variable.
2141 * This must be atomic as we have to preserve the owner died bit here. 2237 * This must be atomic as we have to preserve the owner died bit here.
2142 * 2238 *
@@ -2144,17 +2240,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2144 * because we can fault here. Imagine swapped out pages or a fork 2240 * because we can fault here. Imagine swapped out pages or a fork
2145 * that marked all the anonymous memory readonly for cow. 2241 * that marked all the anonymous memory readonly for cow.
2146 * 2242 *
2147 * Modifying pi_state _before_ the user space value would 2243 * Modifying pi_state _before_ the user space value would leave the
2148 * leave the pi_state in an inconsistent state when we fault 2244 * pi_state in an inconsistent state when we fault here, because we
2149 * here, because we need to drop the hash bucket lock to 2245 * need to drop the locks to handle the fault. This might be observed
2150 * handle the fault. This might be observed in the PID check 2246 * in the PID check in lookup_pi_state.
2151 * in lookup_pi_state.
2152 */ 2247 */
2153retry: 2248retry:
2154 if (get_futex_value_locked(&uval, uaddr)) 2249 if (get_futex_value_locked(&uval, uaddr))
2155 goto handle_fault; 2250 goto handle_fault;
2156 2251
2157 while (1) { 2252 for (;;) {
2158 newval = (uval & FUTEX_OWNER_DIED) | newtid; 2253 newval = (uval & FUTEX_OWNER_DIED) | newtid;
2159 2254
2160 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 2255 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2169,47 +2264,60 @@ retry:
2169 * itself. 2264 * itself.
2170 */ 2265 */
2171 if (pi_state->owner != NULL) { 2266 if (pi_state->owner != NULL) {
2172 raw_spin_lock_irq(&pi_state->owner->pi_lock); 2267 raw_spin_lock(&pi_state->owner->pi_lock);
2173 WARN_ON(list_empty(&pi_state->list)); 2268 WARN_ON(list_empty(&pi_state->list));
2174 list_del_init(&pi_state->list); 2269 list_del_init(&pi_state->list);
2175 raw_spin_unlock_irq(&pi_state->owner->pi_lock); 2270 raw_spin_unlock(&pi_state->owner->pi_lock);
2176 } 2271 }
2177 2272
2178 pi_state->owner = newowner; 2273 pi_state->owner = newowner;
2179 2274
2180 raw_spin_lock_irq(&newowner->pi_lock); 2275 raw_spin_lock(&newowner->pi_lock);
2181 WARN_ON(!list_empty(&pi_state->list)); 2276 WARN_ON(!list_empty(&pi_state->list));
2182 list_add(&pi_state->list, &newowner->pi_state_list); 2277 list_add(&pi_state->list, &newowner->pi_state_list);
2183 raw_spin_unlock_irq(&newowner->pi_lock); 2278 raw_spin_unlock(&newowner->pi_lock);
2279 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2280
2184 return 0; 2281 return 0;
2185 2282
2186 /* 2283 /*
2187 * To handle the page fault we need to drop the hash bucket 2284 * To handle the page fault we need to drop the locks here. That gives
2188 * lock here. That gives the other task (either the highest priority 2285 * the other task (either the highest priority waiter itself or the
2189 * waiter itself or the task which stole the rtmutex) the 2286 * task which stole the rtmutex) the chance to try the fixup of the
2190 * chance to try the fixup of the pi_state. So once we are 2287 * pi_state. So once we are back from handling the fault we need to
2191 * back from handling the fault we need to check the pi_state 2288 * check the pi_state after reacquiring the locks and before trying to
2192 * after reacquiring the hash bucket lock and before trying to 2289 * do another fixup. When the fixup has been done already we simply
2193 * do another fixup. When the fixup has been done already we 2290 * return.
2194 * simply return. 2291 *
2292 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
2293 * drop hb->lock since the caller owns the hb -> futex_q relation.
2294 * Dropping the pi_mutex->wait_lock requires the state revalidate.
2195 */ 2295 */
2196handle_fault: 2296handle_fault:
2297 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2197 spin_unlock(q->lock_ptr); 2298 spin_unlock(q->lock_ptr);
2198 2299
2199 ret = fault_in_user_writeable(uaddr); 2300 ret = fault_in_user_writeable(uaddr);
2200 2301
2201 spin_lock(q->lock_ptr); 2302 spin_lock(q->lock_ptr);
2303 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2202 2304
2203 /* 2305 /*
2204 * Check if someone else fixed it for us: 2306 * Check if someone else fixed it for us:
2205 */ 2307 */
2206 if (pi_state->owner != oldowner) 2308 if (pi_state->owner != oldowner) {
2207 return 0; 2309 ret = 0;
2310 goto out_unlock;
2311 }
2208 2312
2209 if (ret) 2313 if (ret)
2210 return ret; 2314 goto out_unlock;
2211 2315
2212 goto retry; 2316 goto retry;
2317
2318out_unlock:
2319 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2320 return ret;
2213} 2321}
2214 2322
2215static long futex_wait_restart(struct restart_block *restart); 2323static long futex_wait_restart(struct restart_block *restart);
@@ -2231,13 +2339,16 @@ static long futex_wait_restart(struct restart_block *restart);
2231 */ 2339 */
2232static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 2340static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2233{ 2341{
2234 struct task_struct *owner;
2235 int ret = 0; 2342 int ret = 0;
2236 2343
2237 if (locked) { 2344 if (locked) {
2238 /* 2345 /*
2239 * Got the lock. We might not be the anticipated owner if we 2346 * Got the lock. We might not be the anticipated owner if we
2240 * did a lock-steal - fix up the PI-state in that case: 2347 * did a lock-steal - fix up the PI-state in that case:
2348 *
2349 * We can safely read pi_state->owner without holding wait_lock
2350 * because we now own the rt_mutex, only the owner will attempt
2351 * to change it.
2241 */ 2352 */
2242 if (q->pi_state->owner != current) 2353 if (q->pi_state->owner != current)
2243 ret = fixup_pi_state_owner(uaddr, q, current); 2354 ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2245,43 +2356,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2245 } 2356 }
2246 2357
2247 /* 2358 /*
2248 * Catch the rare case, where the lock was released when we were on the
2249 * way back before we locked the hash bucket.
2250 */
2251 if (q->pi_state->owner == current) {
2252 /*
2253 * Try to get the rt_mutex now. This might fail as some other
2254 * task acquired the rt_mutex after we removed ourself from the
2255 * rt_mutex waiters list.
2256 */
2257 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
2258 locked = 1;
2259 goto out;
2260 }
2261
2262 /*
2263 * pi_state is incorrect, some other task did a lock steal and
2264 * we returned due to timeout or signal without taking the
2265 * rt_mutex. Too late.
2266 */
2267 raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
2268 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
2269 if (!owner)
2270 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
2271 raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
2272 ret = fixup_pi_state_owner(uaddr, q, owner);
2273 goto out;
2274 }
2275
2276 /*
2277 * Paranoia check. If we did not take the lock, then we should not be 2359 * Paranoia check. If we did not take the lock, then we should not be
2278 * the owner of the rt_mutex. 2360 * the owner of the rt_mutex.
2279 */ 2361 */
2280 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) 2362 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
2281 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 2363 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
2282 "pi-state %p\n", ret, 2364 "pi-state %p\n", ret,
2283 q->pi_state->pi_mutex.owner, 2365 q->pi_state->pi_mutex.owner,
2284 q->pi_state->owner); 2366 q->pi_state->owner);
2367 }
2285 2368
2286out: 2369out:
2287 return ret ? ret : locked; 2370 return ret ? ret : locked;
@@ -2505,6 +2588,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2505 ktime_t *time, int trylock) 2588 ktime_t *time, int trylock)
2506{ 2589{
2507 struct hrtimer_sleeper timeout, *to = NULL; 2590 struct hrtimer_sleeper timeout, *to = NULL;
2591 struct futex_pi_state *pi_state = NULL;
2592 struct rt_mutex_waiter rt_waiter;
2508 struct futex_hash_bucket *hb; 2593 struct futex_hash_bucket *hb;
2509 struct futex_q q = futex_q_init; 2594 struct futex_q q = futex_q_init;
2510 int res, ret; 2595 int res, ret;
@@ -2557,25 +2642,68 @@ retry_private:
2557 } 2642 }
2558 } 2643 }
2559 2644
2645 WARN_ON(!q.pi_state);
2646
2560 /* 2647 /*
2561 * Only actually queue now that the atomic ops are done: 2648 * Only actually queue now that the atomic ops are done:
2562 */ 2649 */
2563 queue_me(&q, hb); 2650 __queue_me(&q, hb);
2564 2651
2565 WARN_ON(!q.pi_state); 2652 if (trylock) {
2566 /* 2653 ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
2567 * Block on the PI mutex:
2568 */
2569 if (!trylock) {
2570 ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
2571 } else {
2572 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2573 /* Fixup the trylock return value: */ 2654 /* Fixup the trylock return value: */
2574 ret = ret ? 0 : -EWOULDBLOCK; 2655 ret = ret ? 0 : -EWOULDBLOCK;
2656 goto no_block;
2575 } 2657 }
2576 2658
2659 rt_mutex_init_waiter(&rt_waiter);
2660
2661 /*
2662 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
2663 * hold it while doing rt_mutex_start_proxy(), because then it will
 2664	 * include hb->lock in the blocking chain, even though we'll not in
2665 * fact hold it while blocking. This will lead it to report -EDEADLK
2666 * and BUG when futex_unlock_pi() interleaves with this.
2667 *
2668 * Therefore acquire wait_lock while holding hb->lock, but drop the
2669 * latter before calling rt_mutex_start_proxy_lock(). This still fully
2670 * serializes against futex_unlock_pi() as that does the exact same
2671 * lock handoff sequence.
2672 */
2673 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
2674 spin_unlock(q.lock_ptr);
2675 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
2676 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
2677
2678 if (ret) {
2679 if (ret == 1)
2680 ret = 0;
2681
2682 spin_lock(q.lock_ptr);
2683 goto no_block;
2684 }
2685
2686
2687 if (unlikely(to))
2688 hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
2689
2690 ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
2691
2577 spin_lock(q.lock_ptr); 2692 spin_lock(q.lock_ptr);
2578 /* 2693 /*
2694 * If we failed to acquire the lock (signal/timeout), we must
2695 * first acquire the hb->lock before removing the lock from the
2696 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
2697 * wait lists consistent.
2698 *
2699 * In particular; it is important that futex_unlock_pi() can not
2700 * observe this inconsistency.
2701 */
2702 if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
2703 ret = 0;
2704
2705no_block:
2706 /*
2579 * Fixup the pi_state owner and possibly acquire the lock if we 2707 * Fixup the pi_state owner and possibly acquire the lock if we
2580 * haven't already. 2708 * haven't already.
2581 */ 2709 */
@@ -2591,12 +2719,19 @@ retry_private:
2591 * If fixup_owner() faulted and was unable to handle the fault, unlock 2719 * If fixup_owner() faulted and was unable to handle the fault, unlock
2592 * it and return the fault to userspace. 2720 * it and return the fault to userspace.
2593 */ 2721 */
2594 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) 2722 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
2595 rt_mutex_unlock(&q.pi_state->pi_mutex); 2723 pi_state = q.pi_state;
2724 get_pi_state(pi_state);
2725 }
2596 2726
2597 /* Unqueue and drop the lock */ 2727 /* Unqueue and drop the lock */
2598 unqueue_me_pi(&q); 2728 unqueue_me_pi(&q);
2599 2729
2730 if (pi_state) {
2731 rt_mutex_futex_unlock(&pi_state->pi_mutex);
2732 put_pi_state(pi_state);
2733 }
2734
2600 goto out_put_key; 2735 goto out_put_key;
2601 2736
2602out_unlock_put_key: 2737out_unlock_put_key:
@@ -2605,8 +2740,10 @@ out_unlock_put_key:
2605out_put_key: 2740out_put_key:
2606 put_futex_key(&q.key); 2741 put_futex_key(&q.key);
2607out: 2742out:
2608 if (to) 2743 if (to) {
2744 hrtimer_cancel(&to->timer);
2609 destroy_hrtimer_on_stack(&to->timer); 2745 destroy_hrtimer_on_stack(&to->timer);
2746 }
2610 return ret != -EINTR ? ret : -ERESTARTNOINTR; 2747 return ret != -EINTR ? ret : -ERESTARTNOINTR;
2611 2748
2612uaddr_faulted: 2749uaddr_faulted:
@@ -2633,7 +2770,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2633 u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); 2770 u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
2634 union futex_key key = FUTEX_KEY_INIT; 2771 union futex_key key = FUTEX_KEY_INIT;
2635 struct futex_hash_bucket *hb; 2772 struct futex_hash_bucket *hb;
2636 struct futex_q *match; 2773 struct futex_q *top_waiter;
2637 int ret; 2774 int ret;
2638 2775
2639retry: 2776retry:
@@ -2657,12 +2794,37 @@ retry:
2657 * all and we at least want to know if user space fiddled 2794 * all and we at least want to know if user space fiddled
2658 * with the futex value instead of blindly unlocking. 2795 * with the futex value instead of blindly unlocking.
2659 */ 2796 */
2660 match = futex_top_waiter(hb, &key); 2797 top_waiter = futex_top_waiter(hb, &key);
2661 if (match) { 2798 if (top_waiter) {
2662 ret = wake_futex_pi(uaddr, uval, match, hb); 2799 struct futex_pi_state *pi_state = top_waiter->pi_state;
2800
2801 ret = -EINVAL;
2802 if (!pi_state)
2803 goto out_unlock;
2804
2663 /* 2805 /*
2664 * In case of success wake_futex_pi dropped the hash 2806 * If current does not own the pi_state then the futex is
2665 * bucket lock. 2807 * inconsistent and user space fiddled with the futex value.
2808 */
2809 if (pi_state->owner != current)
2810 goto out_unlock;
2811
2812 get_pi_state(pi_state);
2813 /*
2814 * By taking wait_lock while still holding hb->lock, we ensure
2815 * there is no point where we hold neither; and therefore
2816 * wake_futex_pi() must observe a state consistent with what we
2817 * observed.
2818 */
2819 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2820 spin_unlock(&hb->lock);
2821
2822 ret = wake_futex_pi(uaddr, uval, pi_state);
2823
2824 put_pi_state(pi_state);
2825
2826 /*
2827 * Success, we're done! No tricky corner cases.
2666 */ 2828 */
2667 if (!ret) 2829 if (!ret)
2668 goto out_putkey; 2830 goto out_putkey;
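futex_unlock_pi() now pins the top waiter's pi_state, takes wait_lock before dropping hb->lock so there is no window in which neither lock is held, and leaves the deboost to rt_mutex_postunlock() inside wake_futex_pi(). Reduced to its locking skeleton — a sketch only, with error handling omitted:

static void unlock_pi_skeleton(struct futex_hash_bucket *hb,
                               struct futex_pi_state *pi_state,
                               u32 __user *uaddr, u32 uval)
{
        get_pi_state(pi_state);

        /* take wait_lock while still holding hb->lock: no gap without a lock */
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
        spin_unlock(&hb->lock);

        /* wake_futex_pi() updates uval/owner and drops wait_lock itself */
        wake_futex_pi(uaddr, uval, pi_state);

        put_pi_state(pi_state);
}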
@@ -2677,7 +2839,6 @@ retry:
2677 * setting the FUTEX_WAITERS bit. Try again. 2839 * setting the FUTEX_WAITERS bit. Try again.
2678 */ 2840 */
2679 if (ret == -EAGAIN) { 2841 if (ret == -EAGAIN) {
2680 spin_unlock(&hb->lock);
2681 put_futex_key(&key); 2842 put_futex_key(&key);
2682 goto retry; 2843 goto retry;
2683 } 2844 }
@@ -2685,7 +2846,7 @@ retry:
2685 * wake_futex_pi has detected invalid state. Tell user 2846 * wake_futex_pi has detected invalid state. Tell user
2686 * space. 2847 * space.
2687 */ 2848 */
2688 goto out_unlock; 2849 goto out_putkey;
2689 } 2850 }
2690 2851
2691 /* 2852 /*
@@ -2695,8 +2856,10 @@ retry:
2695 * preserve the WAITERS bit not the OWNER_DIED one. We are the 2856 * preserve the WAITERS bit not the OWNER_DIED one. We are the
2696 * owner. 2857 * owner.
2697 */ 2858 */
2698 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) 2859 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
2860 spin_unlock(&hb->lock);
2699 goto pi_faulted; 2861 goto pi_faulted;
2862 }
2700 2863
2701 /* 2864 /*
2702 * If uval has changed, let user space handle it. 2865 * If uval has changed, let user space handle it.
@@ -2710,7 +2873,6 @@ out_putkey:
2710 return ret; 2873 return ret;
2711 2874
2712pi_faulted: 2875pi_faulted:
2713 spin_unlock(&hb->lock);
2714 put_futex_key(&key); 2876 put_futex_key(&key);
2715 2877
2716 ret = fault_in_user_writeable(uaddr); 2878 ret = fault_in_user_writeable(uaddr);
@@ -2814,6 +2976,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2814 u32 __user *uaddr2) 2976 u32 __user *uaddr2)
2815{ 2977{
2816 struct hrtimer_sleeper timeout, *to = NULL; 2978 struct hrtimer_sleeper timeout, *to = NULL;
2979 struct futex_pi_state *pi_state = NULL;
2817 struct rt_mutex_waiter rt_waiter; 2980 struct rt_mutex_waiter rt_waiter;
2818 struct futex_hash_bucket *hb; 2981 struct futex_hash_bucket *hb;
2819 union futex_key key2 = FUTEX_KEY_INIT; 2982 union futex_key key2 = FUTEX_KEY_INIT;
@@ -2840,10 +3003,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2840 * The waiter is allocated on our stack, manipulated by the requeue 3003 * The waiter is allocated on our stack, manipulated by the requeue
2841 * code while we sleep on uaddr. 3004 * code while we sleep on uaddr.
2842 */ 3005 */
2843 debug_rt_mutex_init_waiter(&rt_waiter); 3006 rt_mutex_init_waiter(&rt_waiter);
2844 RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
2845 RB_CLEAR_NODE(&rt_waiter.tree_entry);
2846 rt_waiter.task = NULL;
2847 3007
2848 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 3008 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2849 if (unlikely(ret != 0)) 3009 if (unlikely(ret != 0))
@@ -2898,8 +3058,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2898 if (q.pi_state && (q.pi_state->owner != current)) { 3058 if (q.pi_state && (q.pi_state->owner != current)) {
2899 spin_lock(q.lock_ptr); 3059 spin_lock(q.lock_ptr);
2900 ret = fixup_pi_state_owner(uaddr2, &q, current); 3060 ret = fixup_pi_state_owner(uaddr2, &q, current);
2901 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) 3061 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
2902 rt_mutex_unlock(&q.pi_state->pi_mutex); 3062 pi_state = q.pi_state;
3063 get_pi_state(pi_state);
3064 }
2903 /* 3065 /*
2904 * Drop the reference to the pi state which 3066 * Drop the reference to the pi state which
2905 * the requeue_pi() code acquired for us. 3067 * the requeue_pi() code acquired for us.
@@ -2917,10 +3079,13 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2917 */ 3079 */
2918 WARN_ON(!q.pi_state); 3080 WARN_ON(!q.pi_state);
2919 pi_mutex = &q.pi_state->pi_mutex; 3081 pi_mutex = &q.pi_state->pi_mutex;
2920 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter); 3082 ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
2921 debug_rt_mutex_free_waiter(&rt_waiter);
2922 3083
2923 spin_lock(q.lock_ptr); 3084 spin_lock(q.lock_ptr);
3085 if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
3086 ret = 0;
3087
3088 debug_rt_mutex_free_waiter(&rt_waiter);
2924 /* 3089 /*
2925 * Fixup the pi_state owner and possibly acquire the lock if we 3090 * Fixup the pi_state owner and possibly acquire the lock if we
2926 * haven't already. 3091 * haven't already.
@@ -2938,13 +3103,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2938 * the fault, unlock the rt_mutex and return the fault to 3103 * the fault, unlock the rt_mutex and return the fault to
2939 * userspace. 3104 * userspace.
2940 */ 3105 */
2941 if (ret && rt_mutex_owner(pi_mutex) == current) 3106 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
2942 rt_mutex_unlock(pi_mutex); 3107 pi_state = q.pi_state;
3108 get_pi_state(pi_state);
3109 }
2943 3110
2944 /* Unqueue and drop the lock. */ 3111 /* Unqueue and drop the lock. */
2945 unqueue_me_pi(&q); 3112 unqueue_me_pi(&q);
2946 } 3113 }
2947 3114
3115 if (pi_state) {
3116 rt_mutex_futex_unlock(&pi_state->pi_mutex);
3117 put_pi_state(pi_state);
3118 }
3119
2948 if (ret == -EINTR) { 3120 if (ret == -EINTR) {
2949 /* 3121 /*
2950 * We've already been requeued, but cannot restart by calling 3122 * We've already been requeued, but cannot restart by calling
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 2f9df37940a0..c51a49c9be70 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -98,6 +98,12 @@ void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
98} 98}
99EXPORT_SYMBOL(__gcov_merge_icall_topn); 99EXPORT_SYMBOL(__gcov_merge_icall_topn);
100 100
101void __gcov_exit(void)
102{
103 /* Unused. */
104}
105EXPORT_SYMBOL(__gcov_exit);
106
101/** 107/**
102 * gcov_enable_events - enable event reporting through gcov_event() 108 * gcov_enable_events - enable event reporting through gcov_event()
103 * 109 *
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 6a5c239c7669..46a18e72bce6 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,9 @@
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include "gcov.h" 19#include "gcov.h"
20 20
21#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) 21#if (__GNUC__ >= 7)
22#define GCOV_COUNTERS 9
23#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
22#define GCOV_COUNTERS 10 24#define GCOV_COUNTERS 10
23#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 25#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
24#define GCOV_COUNTERS 9 26#define GCOV_COUNTERS 9
diff --git a/kernel/groups.c b/kernel/groups.c
index 8dd7a61b7115..d09727692a2a 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -18,7 +18,7 @@ struct group_info *groups_alloc(int gidsetsize)
18 len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; 18 len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize;
19 gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); 19 gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY);
20 if (!gi) 20 if (!gi)
21 gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); 21 gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL);
22 if (!gi) 22 if (!gi)
23 return NULL; 23 return NULL;
24 24
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index f0f8e2a8496f..751593ed7c0b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -43,6 +43,7 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_
43int __read_mostly sysctl_hung_task_warnings = 10; 43int __read_mostly sysctl_hung_task_warnings = 10;
44 44
45static int __read_mostly did_panic; 45static int __read_mostly did_panic;
46static bool hung_task_show_lock;
46 47
47static struct task_struct *watchdog_task; 48static struct task_struct *watchdog_task;
48 49
@@ -120,12 +121,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
120 pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 121 pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
121 " disables this message.\n"); 122 " disables this message.\n");
122 sched_show_task(t); 123 sched_show_task(t);
123 debug_show_all_locks(); 124 hung_task_show_lock = true;
124 } 125 }
125 126
126 touch_nmi_watchdog(); 127 touch_nmi_watchdog();
127 128
128 if (sysctl_hung_task_panic) { 129 if (sysctl_hung_task_panic) {
130 if (hung_task_show_lock)
131 debug_show_all_locks();
129 trigger_all_cpu_backtrace(); 132 trigger_all_cpu_backtrace();
130 panic("hung_task: blocked tasks"); 133 panic("hung_task: blocked tasks");
131 } 134 }
@@ -172,6 +175,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
172 if (test_taint(TAINT_DIE) || did_panic) 175 if (test_taint(TAINT_DIE) || did_panic)
173 return; 176 return;
174 177
178 hung_task_show_lock = false;
175 rcu_read_lock(); 179 rcu_read_lock();
176 for_each_process_thread(g, t) { 180 for_each_process_thread(g, t) {
177 if (!max_count--) 181 if (!max_count--)
@@ -187,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
187 } 191 }
188 unlock: 192 unlock:
189 rcu_read_unlock(); 193 rcu_read_unlock();
194 if (hung_task_show_lock)
195 debug_show_all_locks();
190} 196}
191 197
192static long hung_timeout_jiffies(unsigned long last_checked, 198static long hung_timeout_jiffies(unsigned long last_checked,
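
The hung_task change above stops calling debug_show_all_locks() from inside the RCU-protected task scan for every detected hung task; check_hung_task() now only sets hung_task_show_lock, and a single lock dump is emitted after rcu_read_unlock() (or just before panicking). A generic sketch of that "note it now, report it later" pattern, with made-up names (widget, dump_diagnostics):

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

struct widget {                         /* hypothetical example type */
        struct list_head node;
        bool stuck;
};

static LIST_HEAD(widget_list);
static bool need_report;

static void dump_diagnostics(void)      /* stand-in for debug_show_all_locks() */
{
}

static void scan_widgets(void)
{
        struct widget *w;

        need_report = false;

        rcu_read_lock();
        list_for_each_entry_rcu(w, &widget_list, node)
                if (w->stuck)
                        need_report = true;     /* only take a note here */
        rcu_read_unlock();

        if (need_report)
                dump_diagnostics();     /* heavy reporting outside the RCU section */
}
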
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index be3c34e4f2ac..c94da688ee9b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -348,7 +348,10 @@ void handle_nested_irq(unsigned int irq)
348 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); 348 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
349 raw_spin_unlock_irq(&desc->lock); 349 raw_spin_unlock_irq(&desc->lock);
350 350
351 action_ret = action->thread_fn(action->irq, action->dev_id); 351 action_ret = IRQ_NONE;
352 for_each_action_of_desc(desc, action)
353 action_ret |= action->thread_fn(action->irq, action->dev_id);
354
352 if (!noirqdebug) 355 if (!noirqdebug)
353 note_interrupt(desc, action_ret); 356 note_interrupt(desc, action_ret);
354 357
@@ -877,8 +880,8 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
877 if (!desc) 880 if (!desc)
878 return; 881 return;
879 882
880 __irq_do_set_handler(desc, handle, 1, NULL);
881 desc->irq_common_data.handler_data = data; 883 desc->irq_common_data.handler_data = data;
884 __irq_do_set_handler(desc, handle, 1, NULL);
882 885
883 irq_put_desc_busunlock(desc, flags); 886 irq_put_desc_busunlock(desc, flags);
884} 887}
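
Two small fixes in chip.c: handle_nested_irq() now runs every irqaction of a (possibly shared) nested interrupt instead of only the first one, OR-ing the return values so note_interrupt() still sees the line as handled if any handler claimed it, and irq_set_chained_handler_and_data() stores the handler data before installing the handler, so the data is already in place when the new handler can first run. The accumulation works because irqreturn_t values are flag bits with IRQ_NONE == 0; a hypothetical sketch (real code uses each action's own dev_id):

#include <linux/interrupt.h>

static irqreturn_t example_run_actions(irq_handler_t *handlers, int n,
                                       int irq, void *dev_id)
{
        irqreturn_t ret = IRQ_NONE;
        int i;

        /* one IRQ_HANDLED from any handler is enough to mark the line handled */
        for (i = 0; i < n; i++)
                ret |= handlers[i](irq, dev_id);

        return ret;
}
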
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a4afe5cc5af1..070be980c37a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -852,7 +852,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
852 * This code is triggered unconditionally. Check the affinity 852 * This code is triggered unconditionally. Check the affinity
853 * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. 853 * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
854 */ 854 */
855 if (desc->irq_common_data.affinity) 855 if (cpumask_available(desc->irq_common_data.affinity))
856 cpumask_copy(mask, desc->irq_common_data.affinity); 856 cpumask_copy(mask, desc->irq_common_data.affinity);
857 else 857 else
858 valid = false; 858 valid = false;
@@ -1212,8 +1212,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1212 * set the trigger type must match. Also all must 1212 * set the trigger type must match. Also all must
1213 * agree on ONESHOT. 1213 * agree on ONESHOT.
1214 */ 1214 */
1215 unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data);
1216
1215 if (!((old->flags & new->flags) & IRQF_SHARED) || 1217 if (!((old->flags & new->flags) & IRQF_SHARED) ||
1216 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || 1218 (oldtype != (new->flags & IRQF_TRIGGER_MASK)) ||
1217 ((old->flags ^ new->flags) & IRQF_ONESHOT)) 1219 ((old->flags ^ new->flags) & IRQF_ONESHOT))
1218 goto mismatch; 1220 goto mismatch;
1219 1221
@@ -1557,7 +1559,7 @@ void remove_irq(unsigned int irq, struct irqaction *act)
1557 struct irq_desc *desc = irq_to_desc(irq); 1559 struct irq_desc *desc = irq_to_desc(irq);
1558 1560
1559 if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) 1561 if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1560 __free_irq(irq, act->dev_id); 1562 __free_irq(irq, act->dev_id);
1561} 1563}
1562EXPORT_SYMBOL_GPL(remove_irq); 1564EXPORT_SYMBOL_GPL(remove_irq);
1563 1565
@@ -1574,20 +1576,27 @@ EXPORT_SYMBOL_GPL(remove_irq);
1574 * have completed. 1576 * have completed.
1575 * 1577 *
1576 * This function must not be called from interrupt context. 1578 * This function must not be called from interrupt context.
1579 *
1580 * Returns the devname argument passed to request_irq.
1577 */ 1581 */
1578void free_irq(unsigned int irq, void *dev_id) 1582const void *free_irq(unsigned int irq, void *dev_id)
1579{ 1583{
1580 struct irq_desc *desc = irq_to_desc(irq); 1584 struct irq_desc *desc = irq_to_desc(irq);
1585 struct irqaction *action;
1586 const char *devname;
1581 1587
1582 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) 1588 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1583 return; 1589 return NULL;
1584 1590
1585#ifdef CONFIG_SMP 1591#ifdef CONFIG_SMP
1586 if (WARN_ON(desc->affinity_notify)) 1592 if (WARN_ON(desc->affinity_notify))
1587 desc->affinity_notify = NULL; 1593 desc->affinity_notify = NULL;
1588#endif 1594#endif
1589 1595
1590 kfree(__free_irq(irq, dev_id)); 1596 action = __free_irq(irq, dev_id);
1597 devname = action->name;
1598 kfree(action);
1599 return devname;
1591} 1600}
1592EXPORT_SYMBOL(free_irq); 1601EXPORT_SYMBOL(free_irq);
1593 1602
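
The manage.c hunks do three things: the affinity test uses cpumask_available(), which does the right thing whether or not CPUMASK_OFFSTACK is enabled; shared-interrupt setup compares the new request's trigger flags against the trigger type actually programmed in irq_data (oldtype) rather than against the first requester's flag bits; and free_irq() now returns the devname string originally passed to request_irq(). A hypothetical caller of the new return value:

#include <linux/interrupt.h>
#include <linux/printk.h>

/* Hypothetical driver teardown making use of the new free_irq() return. */
static void example_shutdown(unsigned int irq, void *dev_id)
{
        const void *devname = free_irq(irq, dev_id);

        /*
         * devname is the string given to request_irq(), or NULL if the
         * descriptor was invalid; useful for logging or for re-requesting
         * the interrupt under the same name.
         */
        pr_info("released irq %u (%s)\n", irq,
                devname ? (const char *)devname : "?");
}
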
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 85e5546cd791..cd771993f96f 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -60,15 +60,8 @@ void notrace __sanitizer_cov_trace_pc(void)
60 /* 60 /*
61 * We are interested in code coverage as a function of a syscall inputs, 61 * We are interested in code coverage as a function of a syscall inputs,
62 * so we ignore code executed in interrupts. 62 * so we ignore code executed in interrupts.
63 * The checks for whether we are in an interrupt are open-coded, because
64 * 1. We can't use in_interrupt() here, since it also returns true
65 * when we are inside local_bh_disable() section.
66 * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
67 * since that leads to slower generated code (three separate tests,
68 * one for each of the flags).
69 */ 63 */
70 if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET 64 if (!t || !in_task())
71 | NMI_MASK)))
72 return; 65 return;
73 mode = READ_ONCE(t->kcov_mode); 66 mode = READ_ONCE(t->kcov_mode);
74 if (mode == KCOV_MODE_TRACE) { 67 if (mode == KCOV_MODE_TRACE) {
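
The kcov change swaps the open-coded preempt_count() test for in_task(). The removed comment explained the constraints: in_interrupt() would be too strict because it is also true inside local_bh_disable() sections, and testing in_irq(), in_serving_softirq() and in_nmi() separately generates slower code. in_task() tests the same HARDIRQ_MASK | SOFTIRQ_OFFSET | NMI_MASK bits in one mask, so the behaviour should be unchanged while the helper name documents the intent. A hypothetical hook applying the same filter:

#include <linux/preempt.h>
#include <linux/sched.h>

static void example_trace_hook(void)
{
        struct task_struct *t = current;

        if (!t || !in_task())
                return;         /* skip hard IRQ, softirq and NMI context */

        /* ... record the event against t here ... */
}
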
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index bfe62d5b3872..ae1a3ba24df5 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -51,12 +51,6 @@ DEFINE_MUTEX(kexec_mutex);
51/* Per cpu memory for storing cpu states in case of system crash. */ 51/* Per cpu memory for storing cpu states in case of system crash. */
52note_buf_t __percpu *crash_notes; 52note_buf_t __percpu *crash_notes;
53 53
54/* vmcoreinfo stuff */
55static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
56u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
57size_t vmcoreinfo_size;
58size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
59
60/* Flag to indicate we are going to kexec a new kernel */ 54/* Flag to indicate we are going to kexec a new kernel */
61bool kexec_in_progress = false; 55bool kexec_in_progress = false;
62 56
@@ -996,34 +990,6 @@ unlock:
996 return ret; 990 return ret;
997} 991}
998 992
999static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1000 size_t data_len)
1001{
1002 struct elf_note note;
1003
1004 note.n_namesz = strlen(name) + 1;
1005 note.n_descsz = data_len;
1006 note.n_type = type;
1007 memcpy(buf, &note, sizeof(note));
1008 buf += (sizeof(note) + 3)/4;
1009 memcpy(buf, name, note.n_namesz);
1010 buf += (note.n_namesz + 3)/4;
1011 memcpy(buf, data, note.n_descsz);
1012 buf += (note.n_descsz + 3)/4;
1013
1014 return buf;
1015}
1016
1017static void final_note(u32 *buf)
1018{
1019 struct elf_note note;
1020
1021 note.n_namesz = 0;
1022 note.n_descsz = 0;
1023 note.n_type = 0;
1024 memcpy(buf, &note, sizeof(note));
1025}
1026
1027void crash_save_cpu(struct pt_regs *regs, int cpu) 993void crash_save_cpu(struct pt_regs *regs, int cpu)
1028{ 994{
1029 struct elf_prstatus prstatus; 995 struct elf_prstatus prstatus;
@@ -1085,403 +1051,6 @@ subsys_initcall(crash_notes_memory_init);
1085 1051
1086 1052
1087/* 1053/*
1088 * parsing the "crashkernel" commandline
1089 *
1090 * this code is intended to be called from architecture specific code
1091 */
1092
1093
1094/*
1095 * This function parses command lines in the format
1096 *
1097 * crashkernel=ramsize-range:size[,...][@offset]
1098 *
1099 * The function returns 0 on success and -EINVAL on failure.
1100 */
1101static int __init parse_crashkernel_mem(char *cmdline,
1102 unsigned long long system_ram,
1103 unsigned long long *crash_size,
1104 unsigned long long *crash_base)
1105{
1106 char *cur = cmdline, *tmp;
1107
1108 /* for each entry of the comma-separated list */
1109 do {
1110 unsigned long long start, end = ULLONG_MAX, size;
1111
1112 /* get the start of the range */
1113 start = memparse(cur, &tmp);
1114 if (cur == tmp) {
1115 pr_warn("crashkernel: Memory value expected\n");
1116 return -EINVAL;
1117 }
1118 cur = tmp;
1119 if (*cur != '-') {
1120 pr_warn("crashkernel: '-' expected\n");
1121 return -EINVAL;
1122 }
1123 cur++;
1124
1125 /* if no ':' is here, than we read the end */
1126 if (*cur != ':') {
1127 end = memparse(cur, &tmp);
1128 if (cur == tmp) {
1129 pr_warn("crashkernel: Memory value expected\n");
1130 return -EINVAL;
1131 }
1132 cur = tmp;
1133 if (end <= start) {
1134 pr_warn("crashkernel: end <= start\n");
1135 return -EINVAL;
1136 }
1137 }
1138
1139 if (*cur != ':') {
1140 pr_warn("crashkernel: ':' expected\n");
1141 return -EINVAL;
1142 }
1143 cur++;
1144
1145 size = memparse(cur, &tmp);
1146 if (cur == tmp) {
1147 pr_warn("Memory value expected\n");
1148 return -EINVAL;
1149 }
1150 cur = tmp;
1151 if (size >= system_ram) {
1152 pr_warn("crashkernel: invalid size\n");
1153 return -EINVAL;
1154 }
1155
1156 /* match ? */
1157 if (system_ram >= start && system_ram < end) {
1158 *crash_size = size;
1159 break;
1160 }
1161 } while (*cur++ == ',');
1162
1163 if (*crash_size > 0) {
1164 while (*cur && *cur != ' ' && *cur != '@')
1165 cur++;
1166 if (*cur == '@') {
1167 cur++;
1168 *crash_base = memparse(cur, &tmp);
1169 if (cur == tmp) {
1170 pr_warn("Memory value expected after '@'\n");
1171 return -EINVAL;
1172 }
1173 }
1174 }
1175
1176 return 0;
1177}
1178
1179/*
1180 * That function parses "simple" (old) crashkernel command lines like
1181 *
1182 * crashkernel=size[@offset]
1183 *
1184 * It returns 0 on success and -EINVAL on failure.
1185 */
1186static int __init parse_crashkernel_simple(char *cmdline,
1187 unsigned long long *crash_size,
1188 unsigned long long *crash_base)
1189{
1190 char *cur = cmdline;
1191
1192 *crash_size = memparse(cmdline, &cur);
1193 if (cmdline == cur) {
1194 pr_warn("crashkernel: memory value expected\n");
1195 return -EINVAL;
1196 }
1197
1198 if (*cur == '@')
1199 *crash_base = memparse(cur+1, &cur);
1200 else if (*cur != ' ' && *cur != '\0') {
1201 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1202 return -EINVAL;
1203 }
1204
1205 return 0;
1206}
1207
1208#define SUFFIX_HIGH 0
1209#define SUFFIX_LOW 1
1210#define SUFFIX_NULL 2
1211static __initdata char *suffix_tbl[] = {
1212 [SUFFIX_HIGH] = ",high",
1213 [SUFFIX_LOW] = ",low",
1214 [SUFFIX_NULL] = NULL,
1215};
1216
1217/*
1218 * That function parses "suffix" crashkernel command lines like
1219 *
1220 * crashkernel=size,[high|low]
1221 *
1222 * It returns 0 on success and -EINVAL on failure.
1223 */
1224static int __init parse_crashkernel_suffix(char *cmdline,
1225 unsigned long long *crash_size,
1226 const char *suffix)
1227{
1228 char *cur = cmdline;
1229
1230 *crash_size = memparse(cmdline, &cur);
1231 if (cmdline == cur) {
1232 pr_warn("crashkernel: memory value expected\n");
1233 return -EINVAL;
1234 }
1235
1236 /* check with suffix */
1237 if (strncmp(cur, suffix, strlen(suffix))) {
1238 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1239 return -EINVAL;
1240 }
1241 cur += strlen(suffix);
1242 if (*cur != ' ' && *cur != '\0') {
1243 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1244 return -EINVAL;
1245 }
1246
1247 return 0;
1248}
1249
1250static __init char *get_last_crashkernel(char *cmdline,
1251 const char *name,
1252 const char *suffix)
1253{
1254 char *p = cmdline, *ck_cmdline = NULL;
1255
1256 /* find crashkernel and use the last one if there are more */
1257 p = strstr(p, name);
1258 while (p) {
1259 char *end_p = strchr(p, ' ');
1260 char *q;
1261
1262 if (!end_p)
1263 end_p = p + strlen(p);
1264
1265 if (!suffix) {
1266 int i;
1267
1268 /* skip the one with any known suffix */
1269 for (i = 0; suffix_tbl[i]; i++) {
1270 q = end_p - strlen(suffix_tbl[i]);
1271 if (!strncmp(q, suffix_tbl[i],
1272 strlen(suffix_tbl[i])))
1273 goto next;
1274 }
1275 ck_cmdline = p;
1276 } else {
1277 q = end_p - strlen(suffix);
1278 if (!strncmp(q, suffix, strlen(suffix)))
1279 ck_cmdline = p;
1280 }
1281next:
1282 p = strstr(p+1, name);
1283 }
1284
1285 if (!ck_cmdline)
1286 return NULL;
1287
1288 return ck_cmdline;
1289}
1290
1291static int __init __parse_crashkernel(char *cmdline,
1292 unsigned long long system_ram,
1293 unsigned long long *crash_size,
1294 unsigned long long *crash_base,
1295 const char *name,
1296 const char *suffix)
1297{
1298 char *first_colon, *first_space;
1299 char *ck_cmdline;
1300
1301 BUG_ON(!crash_size || !crash_base);
1302 *crash_size = 0;
1303 *crash_base = 0;
1304
1305 ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
1306
1307 if (!ck_cmdline)
1308 return -EINVAL;
1309
1310 ck_cmdline += strlen(name);
1311
1312 if (suffix)
1313 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1314 suffix);
1315 /*
1316 * if the commandline contains a ':', then that's the extended
1317 * syntax -- if not, it must be the classic syntax
1318 */
1319 first_colon = strchr(ck_cmdline, ':');
1320 first_space = strchr(ck_cmdline, ' ');
1321 if (first_colon && (!first_space || first_colon < first_space))
1322 return parse_crashkernel_mem(ck_cmdline, system_ram,
1323 crash_size, crash_base);
1324
1325 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1326}
1327
1328/*
1329 * That function is the entry point for command line parsing and should be
1330 * called from the arch-specific code.
1331 */
1332int __init parse_crashkernel(char *cmdline,
1333 unsigned long long system_ram,
1334 unsigned long long *crash_size,
1335 unsigned long long *crash_base)
1336{
1337 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1338 "crashkernel=", NULL);
1339}
1340
1341int __init parse_crashkernel_high(char *cmdline,
1342 unsigned long long system_ram,
1343 unsigned long long *crash_size,
1344 unsigned long long *crash_base)
1345{
1346 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1347 "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
1348}
1349
1350int __init parse_crashkernel_low(char *cmdline,
1351 unsigned long long system_ram,
1352 unsigned long long *crash_size,
1353 unsigned long long *crash_base)
1354{
1355 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1356 "crashkernel=", suffix_tbl[SUFFIX_LOW]);
1357}
1358
1359static void update_vmcoreinfo_note(void)
1360{
1361 u32 *buf = vmcoreinfo_note;
1362
1363 if (!vmcoreinfo_size)
1364 return;
1365 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1366 vmcoreinfo_size);
1367 final_note(buf);
1368}
1369
1370void crash_save_vmcoreinfo(void)
1371{
1372 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1373 update_vmcoreinfo_note();
1374}
1375
1376void vmcoreinfo_append_str(const char *fmt, ...)
1377{
1378 va_list args;
1379 char buf[0x50];
1380 size_t r;
1381
1382 va_start(args, fmt);
1383 r = vscnprintf(buf, sizeof(buf), fmt, args);
1384 va_end(args);
1385
1386 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
1387
1388 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1389
1390 vmcoreinfo_size += r;
1391}
1392
1393/*
1394 * provide an empty default implementation here -- architecture
1395 * code may override this
1396 */
1397void __weak arch_crash_save_vmcoreinfo(void)
1398{}
1399
1400phys_addr_t __weak paddr_vmcoreinfo_note(void)
1401{
1402 return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
1403}
1404
1405static int __init crash_save_vmcoreinfo_init(void)
1406{
1407 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1408 VMCOREINFO_PAGESIZE(PAGE_SIZE);
1409
1410 VMCOREINFO_SYMBOL(init_uts_ns);
1411 VMCOREINFO_SYMBOL(node_online_map);
1412#ifdef CONFIG_MMU
1413 VMCOREINFO_SYMBOL(swapper_pg_dir);
1414#endif
1415 VMCOREINFO_SYMBOL(_stext);
1416 VMCOREINFO_SYMBOL(vmap_area_list);
1417
1418#ifndef CONFIG_NEED_MULTIPLE_NODES
1419 VMCOREINFO_SYMBOL(mem_map);
1420 VMCOREINFO_SYMBOL(contig_page_data);
1421#endif
1422#ifdef CONFIG_SPARSEMEM
1423 VMCOREINFO_SYMBOL(mem_section);
1424 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1425 VMCOREINFO_STRUCT_SIZE(mem_section);
1426 VMCOREINFO_OFFSET(mem_section, section_mem_map);
1427#endif
1428 VMCOREINFO_STRUCT_SIZE(page);
1429 VMCOREINFO_STRUCT_SIZE(pglist_data);
1430 VMCOREINFO_STRUCT_SIZE(zone);
1431 VMCOREINFO_STRUCT_SIZE(free_area);
1432 VMCOREINFO_STRUCT_SIZE(list_head);
1433 VMCOREINFO_SIZE(nodemask_t);
1434 VMCOREINFO_OFFSET(page, flags);
1435 VMCOREINFO_OFFSET(page, _refcount);
1436 VMCOREINFO_OFFSET(page, mapping);
1437 VMCOREINFO_OFFSET(page, lru);
1438 VMCOREINFO_OFFSET(page, _mapcount);
1439 VMCOREINFO_OFFSET(page, private);
1440 VMCOREINFO_OFFSET(page, compound_dtor);
1441 VMCOREINFO_OFFSET(page, compound_order);
1442 VMCOREINFO_OFFSET(page, compound_head);
1443 VMCOREINFO_OFFSET(pglist_data, node_zones);
1444 VMCOREINFO_OFFSET(pglist_data, nr_zones);
1445#ifdef CONFIG_FLAT_NODE_MEM_MAP
1446 VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1447#endif
1448 VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1449 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1450 VMCOREINFO_OFFSET(pglist_data, node_id);
1451 VMCOREINFO_OFFSET(zone, free_area);
1452 VMCOREINFO_OFFSET(zone, vm_stat);
1453 VMCOREINFO_OFFSET(zone, spanned_pages);
1454 VMCOREINFO_OFFSET(free_area, free_list);
1455 VMCOREINFO_OFFSET(list_head, next);
1456 VMCOREINFO_OFFSET(list_head, prev);
1457 VMCOREINFO_OFFSET(vmap_area, va_start);
1458 VMCOREINFO_OFFSET(vmap_area, list);
1459 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1460 log_buf_kexec_setup();
1461 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1462 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1463 VMCOREINFO_NUMBER(PG_lru);
1464 VMCOREINFO_NUMBER(PG_private);
1465 VMCOREINFO_NUMBER(PG_swapcache);
1466 VMCOREINFO_NUMBER(PG_slab);
1467#ifdef CONFIG_MEMORY_FAILURE
1468 VMCOREINFO_NUMBER(PG_hwpoison);
1469#endif
1470 VMCOREINFO_NUMBER(PG_head_mask);
1471 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1472#ifdef CONFIG_HUGETLB_PAGE
1473 VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
1474#endif
1475
1476 arch_crash_save_vmcoreinfo();
1477 update_vmcoreinfo_note();
1478
1479 return 0;
1480}
1481
1482subsys_initcall(crash_save_vmcoreinfo_init);
1483
1484/*
1485 * Move into place and start executing a preloaded standalone 1054 * Move into place and start executing a preloaded standalone
1486 * executable. If nothing was preloaded return an error. 1055 * executable. If nothing was preloaded return an error.
1487 */ 1056 */
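
This hunk removes the ELF-note helpers, the crashkernel= command-line parsers and the vmcoreinfo machinery from kexec_core.c; the ksysfs.c hunk below correspondingly gates the vmcoreinfo attribute on CONFIG_CRASH_CORE instead of CONFIG_KEXEC_CORE. The grammar the removed parsers implement stays the same and is easiest to see by example:

        crashkernel=256M                        simple form: just a size
                                                (parse_crashkernel_simple)
        crashkernel=256M@16M                    simple form with a fixed base offset
        crashkernel=512M-2G:64M,2G-:128M@16M    ranged form: the range containing the
                                                system RAM size selects the size
                                                (parse_crashkernel_mem)
        crashkernel=256M,high                   suffix forms: only a size is parsed,
        crashkernel=128M,low                    no @offset (parse_crashkernel_suffix)
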
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 699c5bc51a92..2d2d3a568e4e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -58,15 +58,6 @@
58#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) 58#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
59 59
60 60
61/*
62 * Some oddball architectures like 64bit powerpc have function descriptors
63 * so this must be overridable.
64 */
65#ifndef kprobe_lookup_name
66#define kprobe_lookup_name(name, addr) \
67 addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
68#endif
69
70static int kprobes_initialized; 61static int kprobes_initialized;
71static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 62static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
72static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 63static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
@@ -81,6 +72,12 @@ static struct {
81 raw_spinlock_t lock ____cacheline_aligned_in_smp; 72 raw_spinlock_t lock ____cacheline_aligned_in_smp;
82} kretprobe_table_locks[KPROBE_TABLE_SIZE]; 73} kretprobe_table_locks[KPROBE_TABLE_SIZE];
83 74
75kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
76 unsigned int __unused)
77{
78 return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
79}
80
84static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) 81static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
85{ 82{
86 return &(kretprobe_table_locks[hash].lock); 83 return &(kretprobe_table_locks[hash].lock);
@@ -598,7 +595,7 @@ static void kprobe_optimizer(struct work_struct *work)
598} 595}
599 596
600/* Wait for completing optimization and unoptimization */ 597/* Wait for completing optimization and unoptimization */
601static void wait_for_kprobe_optimizer(void) 598void wait_for_kprobe_optimizer(void)
602{ 599{
603 mutex_lock(&kprobe_mutex); 600 mutex_lock(&kprobe_mutex);
604 601
@@ -746,13 +743,20 @@ static void kill_optimized_kprobe(struct kprobe *p)
746 arch_remove_optimized_kprobe(op); 743 arch_remove_optimized_kprobe(op);
747} 744}
748 745
746static inline
747void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
748{
749 if (!kprobe_ftrace(p))
750 arch_prepare_optimized_kprobe(op, p);
751}
752
749/* Try to prepare optimized instructions */ 753/* Try to prepare optimized instructions */
750static void prepare_optimized_kprobe(struct kprobe *p) 754static void prepare_optimized_kprobe(struct kprobe *p)
751{ 755{
752 struct optimized_kprobe *op; 756 struct optimized_kprobe *op;
753 757
754 op = container_of(p, struct optimized_kprobe, kp); 758 op = container_of(p, struct optimized_kprobe, kp);
755 arch_prepare_optimized_kprobe(op, p); 759 __prepare_optimized_kprobe(op, p);
756} 760}
757 761
758/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 762/* Allocate new optimized_kprobe and try to prepare optimized instructions */
@@ -766,7 +770,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
766 770
767 INIT_LIST_HEAD(&op->list); 771 INIT_LIST_HEAD(&op->list);
768 op->kp.addr = p->addr; 772 op->kp.addr = p->addr;
769 arch_prepare_optimized_kprobe(op, p); 773 __prepare_optimized_kprobe(op, p);
770 774
771 return &op->kp; 775 return &op->kp;
772} 776}
@@ -1391,21 +1395,19 @@ bool within_kprobe_blacklist(unsigned long addr)
1391 * This returns encoded errors if it fails to look up symbol or invalid 1395 * This returns encoded errors if it fails to look up symbol or invalid
1392 * combination of parameters. 1396 * combination of parameters.
1393 */ 1397 */
1394static kprobe_opcode_t *kprobe_addr(struct kprobe *p) 1398static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
1399 const char *symbol_name, unsigned int offset)
1395{ 1400{
1396 kprobe_opcode_t *addr = p->addr; 1401 if ((symbol_name && addr) || (!symbol_name && !addr))
1397
1398 if ((p->symbol_name && p->addr) ||
1399 (!p->symbol_name && !p->addr))
1400 goto invalid; 1402 goto invalid;
1401 1403
1402 if (p->symbol_name) { 1404 if (symbol_name) {
1403 kprobe_lookup_name(p->symbol_name, addr); 1405 addr = kprobe_lookup_name(symbol_name, offset);
1404 if (!addr) 1406 if (!addr)
1405 return ERR_PTR(-ENOENT); 1407 return ERR_PTR(-ENOENT);
1406 } 1408 }
1407 1409
1408 addr = (kprobe_opcode_t *)(((char *)addr) + p->offset); 1410 addr = (kprobe_opcode_t *)(((char *)addr) + offset);
1409 if (addr) 1411 if (addr)
1410 return addr; 1412 return addr;
1411 1413
@@ -1413,6 +1415,11 @@ invalid:
1413 return ERR_PTR(-EINVAL); 1415 return ERR_PTR(-EINVAL);
1414} 1416}
1415 1417
1418static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
1419{
1420 return _kprobe_addr(p->addr, p->symbol_name, p->offset);
1421}
1422
1416/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1423/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1417static struct kprobe *__get_valid_kprobe(struct kprobe *p) 1424static struct kprobe *__get_valid_kprobe(struct kprobe *p)
1418{ 1425{
@@ -1740,11 +1747,12 @@ void unregister_kprobes(struct kprobe **kps, int num)
1740} 1747}
1741EXPORT_SYMBOL_GPL(unregister_kprobes); 1748EXPORT_SYMBOL_GPL(unregister_kprobes);
1742 1749
1743int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self, 1750int __weak kprobe_exceptions_notify(struct notifier_block *self,
1744 unsigned long val, void *data) 1751 unsigned long val, void *data)
1745{ 1752{
1746 return NOTIFY_DONE; 1753 return NOTIFY_DONE;
1747} 1754}
1755NOKPROBE_SYMBOL(kprobe_exceptions_notify);
1748 1756
1749static struct notifier_block kprobe_exceptions_nb = { 1757static struct notifier_block kprobe_exceptions_nb = {
1750 .notifier_call = kprobe_exceptions_notify, 1758 .notifier_call = kprobe_exceptions_notify,
@@ -1875,6 +1883,25 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
1875} 1883}
1876NOKPROBE_SYMBOL(pre_handler_kretprobe); 1884NOKPROBE_SYMBOL(pre_handler_kretprobe);
1877 1885
1886bool __weak arch_function_offset_within_entry(unsigned long offset)
1887{
1888 return !offset;
1889}
1890
1891bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
1892{
1893 kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
1894
1895 if (IS_ERR(kp_addr))
1896 return false;
1897
1898 if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
1899 !arch_function_offset_within_entry(offset))
1900 return false;
1901
1902 return true;
1903}
1904
1878int register_kretprobe(struct kretprobe *rp) 1905int register_kretprobe(struct kretprobe *rp)
1879{ 1906{
1880 int ret = 0; 1907 int ret = 0;
@@ -1882,6 +1909,9 @@ int register_kretprobe(struct kretprobe *rp)
1882 int i; 1909 int i;
1883 void *addr; 1910 void *addr;
1884 1911
1912 if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
1913 return -EINVAL;
1914
1885 if (kretprobe_blacklist_size) { 1915 if (kretprobe_blacklist_size) {
1886 addr = kprobe_addr(&rp->kp); 1916 addr = kprobe_addr(&rp->kp);
1887 if (IS_ERR(addr)) 1917 if (IS_ERR(addr))
@@ -2153,6 +2183,12 @@ static int kprobes_module_callback(struct notifier_block *nb,
2153 * The vaddr this probe is installed will soon 2183 * The vaddr this probe is installed will soon
2154 * be vfreed buy not synced to disk. Hence, 2184 * be vfreed buy not synced to disk. Hence,
2155 * disarming the breakpoint isn't needed. 2185 * disarming the breakpoint isn't needed.
2186 *
2187 * Note, this will also move any optimized probes
2188 * that are pending to be removed from their
2189 * corresponding lists to the freeing_list and
2190 * will not be touched by the delayed
2191 * kprobe_optimizer work handler.
2156 */ 2192 */
2157 kill_kprobe(p); 2193 kill_kprobe(p);
2158 } 2194 }
@@ -2192,8 +2228,8 @@ static int __init init_kprobes(void)
2192 if (kretprobe_blacklist_size) { 2228 if (kretprobe_blacklist_size) {
2193 /* lookup the function address from its name */ 2229 /* lookup the function address from its name */
2194 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 2230 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
2195 kprobe_lookup_name(kretprobe_blacklist[i].name, 2231 kretprobe_blacklist[i].addr =
2196 kretprobe_blacklist[i].addr); 2232 kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
2197 if (!kretprobe_blacklist[i].addr) 2233 if (!kretprobe_blacklist[i].addr)
2198 printk("kretprobe: lookup failed: %s\n", 2234 printk("kretprobe: lookup failed: %s\n",
2199 kretprobe_blacklist[i].name); 2235 kretprobe_blacklist[i].name);
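
The kprobes changes turn kprobe_lookup_name() into an ordinary weak function (the removed comment notes that 64-bit powerpc needs to override it because of function descriptors), factor kprobe_addr() into _kprobe_addr(), make wait_for_kprobe_optimizer() non-static, skip arch_prepare_optimized_kprobe() for ftrace-based probes, and make register_kretprobe() return -EINVAL when the requested symbol+offset does not land on a function entry, as checked by function_offset_within_entry(). A conventional kretprobe module is unaffected because a bare symbol name resolves to offset 0; a hypothetical example (the probed symbol _do_fork is just an illustration):

#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/ptrace.h>

static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
        pr_info("_do_fork returned %ld\n", (long)regs_return_value(regs));
        return 0;
}

static struct kretprobe my_kretprobe = {
        .kp.symbol_name = "_do_fork",   /* function entry, offset 0 */
        .handler        = ret_handler,
        .maxactive      = 20,
};

static int __init example_init(void)
{
        /* now fails with -EINVAL if this does not point at a function entry */
        return register_kretprobe(&my_kretprobe);
}

static void __exit example_exit(void)
{
        unregister_kretprobe(&my_kretprobe);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
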
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0999679d6f26..23cd70651238 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -125,6 +125,10 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
125} 125}
126KERNEL_ATTR_RW(kexec_crash_size); 126KERNEL_ATTR_RW(kexec_crash_size);
127 127
128#endif /* CONFIG_KEXEC_CORE */
129
130#ifdef CONFIG_CRASH_CORE
131
128static ssize_t vmcoreinfo_show(struct kobject *kobj, 132static ssize_t vmcoreinfo_show(struct kobject *kobj,
129 struct kobj_attribute *attr, char *buf) 133 struct kobj_attribute *attr, char *buf)
130{ 134{
@@ -134,7 +138,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
134} 138}
135KERNEL_ATTR_RO(vmcoreinfo); 139KERNEL_ATTR_RO(vmcoreinfo);
136 140
137#endif /* CONFIG_KEXEC_CORE */ 141#endif /* CONFIG_CRASH_CORE */
138 142
139/* whether file capabilities are enabled */ 143/* whether file capabilities are enabled */
140static ssize_t fscaps_show(struct kobject *kobj, 144static ssize_t fscaps_show(struct kobject *kobj,
@@ -219,6 +223,8 @@ static struct attribute * kernel_attrs[] = {
219 &kexec_loaded_attr.attr, 223 &kexec_loaded_attr.attr,
220 &kexec_crash_loaded_attr.attr, 224 &kexec_crash_loaded_attr.attr,
221 &kexec_crash_size_attr.attr, 225 &kexec_crash_size_attr.attr,
226#endif
227#ifdef CONFIG_CRASH_CORE
222 &vmcoreinfo_attr.attr, 228 &vmcoreinfo_attr.attr,
223#endif 229#endif
224#ifndef CONFIG_TINY_RCU 230#ifndef CONFIG_TINY_RCU
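
For ksysfs.c the functional point is simply that /sys/kernel/vmcoreinfo is now built when CONFIG_CRASH_CORE is set rather than CONFIG_KEXEC_CORE; the attribute itself is unchanged. A trivial userspace check (plain C, purely illustrative) that the file is still present and readable:

#include <stdio.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/sys/kernel/vmcoreinfo", "r");

        if (!f) {
                perror("/sys/kernel/vmcoreinfo");
                return 1;
        }
        if (fgets(line, sizeof(line), f))
                printf("vmcoreinfo note: %s", line);
        fclose(f);
        return 0;
}
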
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index e8780c0901d9..2b8bdb1925da 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,3 @@
1obj-$(CONFIG_LIVEPATCH) += livepatch.o 1obj-$(CONFIG_LIVEPATCH) += livepatch.o
2 2
3livepatch-objs := core.o 3livepatch-objs := core.o patch.o transition.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index af4643873e71..b9628e43c78f 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -24,61 +24,31 @@
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/ftrace.h>
28#include <linux/list.h> 27#include <linux/list.h>
29#include <linux/kallsyms.h> 28#include <linux/kallsyms.h>
30#include <linux/livepatch.h> 29#include <linux/livepatch.h>
31#include <linux/elf.h> 30#include <linux/elf.h>
32#include <linux/moduleloader.h> 31#include <linux/moduleloader.h>
32#include <linux/completion.h>
33#include <asm/cacheflush.h> 33#include <asm/cacheflush.h>
34 34#include "core.h"
35/** 35#include "patch.h"
36 * struct klp_ops - structure for tracking registered ftrace ops structs 36#include "transition.h"
37 *
38 * A single ftrace_ops is shared between all enabled replacement functions
39 * (klp_func structs) which have the same old_addr. This allows the switch
40 * between function versions to happen instantaneously by updating the klp_ops
41 * struct's func_stack list. The winner is the klp_func at the top of the
42 * func_stack (front of the list).
43 *
44 * @node: node for the global klp_ops list
45 * @func_stack: list head for the stack of klp_func's (active func is on top)
46 * @fops: registered ftrace ops struct
47 */
48struct klp_ops {
49 struct list_head node;
50 struct list_head func_stack;
51 struct ftrace_ops fops;
52};
53 37
54/* 38/*
55 * The klp_mutex protects the global lists and state transitions of any 39 * klp_mutex is a coarse lock which serializes access to klp data. All
56 * structure reachable from them. References to any structure must be obtained 40 * accesses to klp-related variables and structures must have mutex protection,
57 * under mutex protection (except in klp_ftrace_handler(), which uses RCU to 41 * except within the following functions which carefully avoid the need for it:
58 * ensure it gets consistent data). 42 *
43 * - klp_ftrace_handler()
44 * - klp_update_patch_state()
59 */ 45 */
60static DEFINE_MUTEX(klp_mutex); 46DEFINE_MUTEX(klp_mutex);
61 47
62static LIST_HEAD(klp_patches); 48static LIST_HEAD(klp_patches);
63static LIST_HEAD(klp_ops);
64 49
65static struct kobject *klp_root_kobj; 50static struct kobject *klp_root_kobj;
66 51
67static struct klp_ops *klp_find_ops(unsigned long old_addr)
68{
69 struct klp_ops *ops;
70 struct klp_func *func;
71
72 list_for_each_entry(ops, &klp_ops, node) {
73 func = list_first_entry(&ops->func_stack, struct klp_func,
74 stack_node);
75 if (func->old_addr == old_addr)
76 return ops;
77 }
78
79 return NULL;
80}
81
82static bool klp_is_module(struct klp_object *obj) 52static bool klp_is_module(struct klp_object *obj)
83{ 53{
84 return obj->name; 54 return obj->name;
@@ -117,7 +87,6 @@ static void klp_find_object_module(struct klp_object *obj)
117 mutex_unlock(&module_mutex); 87 mutex_unlock(&module_mutex);
118} 88}
119 89
120/* klp_mutex must be held by caller */
121static bool klp_is_patch_registered(struct klp_patch *patch) 90static bool klp_is_patch_registered(struct klp_patch *patch)
122{ 91{
123 struct klp_patch *mypatch; 92 struct klp_patch *mypatch;
@@ -182,7 +151,10 @@ static int klp_find_object_symbol(const char *objname, const char *name,
182 }; 151 };
183 152
184 mutex_lock(&module_mutex); 153 mutex_lock(&module_mutex);
185 kallsyms_on_each_symbol(klp_find_callback, &args); 154 if (objname)
155 module_kallsyms_on_each_symbol(klp_find_callback, &args);
156 else
157 kallsyms_on_each_symbol(klp_find_callback, &args);
186 mutex_unlock(&module_mutex); 158 mutex_unlock(&module_mutex);
187 159
188 /* 160 /*
@@ -233,7 +205,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod)
233 for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { 205 for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
234 sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info); 206 sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info);
235 if (sym->st_shndx != SHN_LIVEPATCH) { 207 if (sym->st_shndx != SHN_LIVEPATCH) {
236 pr_err("symbol %s is not marked as a livepatch symbol", 208 pr_err("symbol %s is not marked as a livepatch symbol\n",
237 strtab + sym->st_name); 209 strtab + sym->st_name);
238 return -EINVAL; 210 return -EINVAL;
239 } 211 }
@@ -243,7 +215,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod)
243 ".klp.sym.%55[^.].%127[^,],%lu", 215 ".klp.sym.%55[^.].%127[^,],%lu",
244 objname, symname, &sympos); 216 objname, symname, &sympos);
245 if (cnt != 3) { 217 if (cnt != 3) {
246 pr_err("symbol %s has an incorrectly formatted name", 218 pr_err("symbol %s has an incorrectly formatted name\n",
247 strtab + sym->st_name); 219 strtab + sym->st_name);
248 return -EINVAL; 220 return -EINVAL;
249 } 221 }
@@ -288,7 +260,7 @@ static int klp_write_object_relocations(struct module *pmod,
288 */ 260 */
289 cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname); 261 cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname);
290 if (cnt != 1) { 262 if (cnt != 1) {
291 pr_err("section %s has an incorrectly formatted name", 263 pr_err("section %s has an incorrectly formatted name\n",
292 secname); 264 secname);
293 ret = -EINVAL; 265 ret = -EINVAL;
294 break; 266 break;
@@ -311,191 +283,30 @@ static int klp_write_object_relocations(struct module *pmod,
311 return ret; 283 return ret;
312} 284}
313 285
314static void notrace klp_ftrace_handler(unsigned long ip,
315 unsigned long parent_ip,
316 struct ftrace_ops *fops,
317 struct pt_regs *regs)
318{
319 struct klp_ops *ops;
320 struct klp_func *func;
321
322 ops = container_of(fops, struct klp_ops, fops);
323
324 rcu_read_lock();
325 func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
326 stack_node);
327 if (WARN_ON_ONCE(!func))
328 goto unlock;
329
330 klp_arch_set_pc(regs, (unsigned long)func->new_func);
331unlock:
332 rcu_read_unlock();
333}
334
335/*
336 * Convert a function address into the appropriate ftrace location.
337 *
338 * Usually this is just the address of the function, but on some architectures
339 * it's more complicated so allow them to provide a custom behaviour.
340 */
341#ifndef klp_get_ftrace_location
342static unsigned long klp_get_ftrace_location(unsigned long faddr)
343{
344 return faddr;
345}
346#endif
347
348static void klp_disable_func(struct klp_func *func)
349{
350 struct klp_ops *ops;
351
352 if (WARN_ON(func->state != KLP_ENABLED))
353 return;
354 if (WARN_ON(!func->old_addr))
355 return;
356
357 ops = klp_find_ops(func->old_addr);
358 if (WARN_ON(!ops))
359 return;
360
361 if (list_is_singular(&ops->func_stack)) {
362 unsigned long ftrace_loc;
363
364 ftrace_loc = klp_get_ftrace_location(func->old_addr);
365 if (WARN_ON(!ftrace_loc))
366 return;
367
368 WARN_ON(unregister_ftrace_function(&ops->fops));
369 WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0));
370
371 list_del_rcu(&func->stack_node);
372 list_del(&ops->node);
373 kfree(ops);
374 } else {
375 list_del_rcu(&func->stack_node);
376 }
377
378 func->state = KLP_DISABLED;
379}
380
381static int klp_enable_func(struct klp_func *func)
382{
383 struct klp_ops *ops;
384 int ret;
385
386 if (WARN_ON(!func->old_addr))
387 return -EINVAL;
388
389 if (WARN_ON(func->state != KLP_DISABLED))
390 return -EINVAL;
391
392 ops = klp_find_ops(func->old_addr);
393 if (!ops) {
394 unsigned long ftrace_loc;
395
396 ftrace_loc = klp_get_ftrace_location(func->old_addr);
397 if (!ftrace_loc) {
398 pr_err("failed to find location for function '%s'\n",
399 func->old_name);
400 return -EINVAL;
401 }
402
403 ops = kzalloc(sizeof(*ops), GFP_KERNEL);
404 if (!ops)
405 return -ENOMEM;
406
407 ops->fops.func = klp_ftrace_handler;
408 ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
409 FTRACE_OPS_FL_DYNAMIC |
410 FTRACE_OPS_FL_IPMODIFY;
411
412 list_add(&ops->node, &klp_ops);
413
414 INIT_LIST_HEAD(&ops->func_stack);
415 list_add_rcu(&func->stack_node, &ops->func_stack);
416
417 ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0);
418 if (ret) {
419 pr_err("failed to set ftrace filter for function '%s' (%d)\n",
420 func->old_name, ret);
421 goto err;
422 }
423
424 ret = register_ftrace_function(&ops->fops);
425 if (ret) {
426 pr_err("failed to register ftrace handler for function '%s' (%d)\n",
427 func->old_name, ret);
428 ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0);
429 goto err;
430 }
431
432
433 } else {
434 list_add_rcu(&func->stack_node, &ops->func_stack);
435 }
436
437 func->state = KLP_ENABLED;
438
439 return 0;
440
441err:
442 list_del_rcu(&func->stack_node);
443 list_del(&ops->node);
444 kfree(ops);
445 return ret;
446}
447
448static void klp_disable_object(struct klp_object *obj)
449{
450 struct klp_func *func;
451
452 klp_for_each_func(obj, func)
453 if (func->state == KLP_ENABLED)
454 klp_disable_func(func);
455
456 obj->state = KLP_DISABLED;
457}
458
459static int klp_enable_object(struct klp_object *obj)
460{
461 struct klp_func *func;
462 int ret;
463
464 if (WARN_ON(obj->state != KLP_DISABLED))
465 return -EINVAL;
466
467 if (WARN_ON(!klp_is_object_loaded(obj)))
468 return -EINVAL;
469
470 klp_for_each_func(obj, func) {
471 ret = klp_enable_func(func);
472 if (ret) {
473 klp_disable_object(obj);
474 return ret;
475 }
476 }
477 obj->state = KLP_ENABLED;
478
479 return 0;
480}
481
482static int __klp_disable_patch(struct klp_patch *patch) 286static int __klp_disable_patch(struct klp_patch *patch)
483{ 287{
484 struct klp_object *obj; 288 if (klp_transition_patch)
289 return -EBUSY;
485 290
486 /* enforce stacking: only the last enabled patch can be disabled */ 291 /* enforce stacking: only the last enabled patch can be disabled */
487 if (!list_is_last(&patch->list, &klp_patches) && 292 if (!list_is_last(&patch->list, &klp_patches) &&
488 list_next_entry(patch, list)->state == KLP_ENABLED) 293 list_next_entry(patch, list)->enabled)
489 return -EBUSY; 294 return -EBUSY;
490 295
491 pr_notice("disabling patch '%s'\n", patch->mod->name); 296 klp_init_transition(patch, KLP_UNPATCHED);
492 297
493 klp_for_each_object(patch, obj) { 298 /*
494 if (obj->state == KLP_ENABLED) 299 * Enforce the order of the func->transition writes in
495 klp_disable_object(obj); 300 * klp_init_transition() and the TIF_PATCH_PENDING writes in
496 } 301 * klp_start_transition(). In the rare case where klp_ftrace_handler()
302 * is called shortly after klp_update_patch_state() switches the task,
303 * this ensures the handler sees that func->transition is set.
304 */
305 smp_wmb();
497 306
498 patch->state = KLP_DISABLED; 307 klp_start_transition();
308 klp_try_complete_transition();
309 patch->enabled = false;
499 310
500 return 0; 311 return 0;
501} 312}
@@ -519,7 +330,7 @@ int klp_disable_patch(struct klp_patch *patch)
519 goto err; 330 goto err;
520 } 331 }
521 332
522 if (patch->state == KLP_DISABLED) { 333 if (!patch->enabled) {
523 ret = -EINVAL; 334 ret = -EINVAL;
524 goto err; 335 goto err;
525 } 336 }
@@ -537,32 +348,61 @@ static int __klp_enable_patch(struct klp_patch *patch)
537 struct klp_object *obj; 348 struct klp_object *obj;
538 int ret; 349 int ret;
539 350
540 if (WARN_ON(patch->state != KLP_DISABLED)) 351 if (klp_transition_patch)
352 return -EBUSY;
353
354 if (WARN_ON(patch->enabled))
541 return -EINVAL; 355 return -EINVAL;
542 356
543 /* enforce stacking: only the first disabled patch can be enabled */ 357 /* enforce stacking: only the first disabled patch can be enabled */
544 if (patch->list.prev != &klp_patches && 358 if (patch->list.prev != &klp_patches &&
545 list_prev_entry(patch, list)->state == KLP_DISABLED) 359 !list_prev_entry(patch, list)->enabled)
546 return -EBUSY; 360 return -EBUSY;
547 361
362 /*
363 * A reference is taken on the patch module to prevent it from being
364 * unloaded.
365 *
366 * Note: For immediate (no consistency model) patches we don't allow
367 * patch modules to unload since there is no safe/sane method to
368 * determine if a thread is still running in the patched code contained
369 * in the patch module once the ftrace registration is successful.
370 */
371 if (!try_module_get(patch->mod))
372 return -ENODEV;
373
548 pr_notice("enabling patch '%s'\n", patch->mod->name); 374 pr_notice("enabling patch '%s'\n", patch->mod->name);
549 375
376 klp_init_transition(patch, KLP_PATCHED);
377
378 /*
379 * Enforce the order of the func->transition writes in
380 * klp_init_transition() and the ops->func_stack writes in
381 * klp_patch_object(), so that klp_ftrace_handler() will see the
382 * func->transition updates before the handler is registered and the
383 * new funcs become visible to the handler.
384 */
385 smp_wmb();
386
550 klp_for_each_object(patch, obj) { 387 klp_for_each_object(patch, obj) {
551 if (!klp_is_object_loaded(obj)) 388 if (!klp_is_object_loaded(obj))
552 continue; 389 continue;
553 390
554 ret = klp_enable_object(obj); 391 ret = klp_patch_object(obj);
555 if (ret) 392 if (ret) {
556 goto unregister; 393 pr_warn("failed to enable patch '%s'\n",
394 patch->mod->name);
395
396 klp_cancel_transition();
397 return ret;
398 }
557 } 399 }
558 400
559 patch->state = KLP_ENABLED; 401 klp_start_transition();
402 klp_try_complete_transition();
403 patch->enabled = true;
560 404
561 return 0; 405 return 0;
562
563unregister:
564 WARN_ON(__klp_disable_patch(patch));
565 return ret;
566} 406}
567 407
568/** 408/**
@@ -599,6 +439,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch);
599 * /sys/kernel/livepatch 439 * /sys/kernel/livepatch
600 * /sys/kernel/livepatch/<patch> 440 * /sys/kernel/livepatch/<patch>
601 * /sys/kernel/livepatch/<patch>/enabled 441 * /sys/kernel/livepatch/<patch>/enabled
442 * /sys/kernel/livepatch/<patch>/transition
602 * /sys/kernel/livepatch/<patch>/<object> 443 * /sys/kernel/livepatch/<patch>/<object>
603 * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> 444 * /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
604 */ 445 */
@@ -608,26 +449,34 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
608{ 449{
609 struct klp_patch *patch; 450 struct klp_patch *patch;
610 int ret; 451 int ret;
611 unsigned long val; 452 bool enabled;
612 453
613 ret = kstrtoul(buf, 10, &val); 454 ret = kstrtobool(buf, &enabled);
614 if (ret) 455 if (ret)
615 return -EINVAL; 456 return ret;
616
617 if (val != KLP_DISABLED && val != KLP_ENABLED)
618 return -EINVAL;
619 457
620 patch = container_of(kobj, struct klp_patch, kobj); 458 patch = container_of(kobj, struct klp_patch, kobj);
621 459
622 mutex_lock(&klp_mutex); 460 mutex_lock(&klp_mutex);
623 461
624 if (val == patch->state) { 462 if (!klp_is_patch_registered(patch)) {
463 /*
464 * Module with the patch could either disappear meanwhile or is
465 * not properly initialized yet.
466 */
467 ret = -EINVAL;
468 goto err;
469 }
470
471 if (patch->enabled == enabled) {
625 /* already in requested state */ 472 /* already in requested state */
626 ret = -EINVAL; 473 ret = -EINVAL;
627 goto err; 474 goto err;
628 } 475 }
629 476
630 if (val == KLP_ENABLED) { 477 if (patch == klp_transition_patch) {
478 klp_reverse_transition();
479 } else if (enabled) {
631 ret = __klp_enable_patch(patch); 480 ret = __klp_enable_patch(patch);
632 if (ret) 481 if (ret)
633 goto err; 482 goto err;
@@ -652,21 +501,33 @@ static ssize_t enabled_show(struct kobject *kobj,
652 struct klp_patch *patch; 501 struct klp_patch *patch;
653 502
654 patch = container_of(kobj, struct klp_patch, kobj); 503 patch = container_of(kobj, struct klp_patch, kobj);
655 return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state); 504 return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->enabled);
505}
506
507static ssize_t transition_show(struct kobject *kobj,
508 struct kobj_attribute *attr, char *buf)
509{
510 struct klp_patch *patch;
511
512 patch = container_of(kobj, struct klp_patch, kobj);
513 return snprintf(buf, PAGE_SIZE-1, "%d\n",
514 patch == klp_transition_patch);
656} 515}
657 516
658static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); 517static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
518static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);
659static struct attribute *klp_patch_attrs[] = { 519static struct attribute *klp_patch_attrs[] = {
660 &enabled_kobj_attr.attr, 520 &enabled_kobj_attr.attr,
521 &transition_kobj_attr.attr,
661 NULL 522 NULL
662}; 523};
663 524
664static void klp_kobj_release_patch(struct kobject *kobj) 525static void klp_kobj_release_patch(struct kobject *kobj)
665{ 526{
666 /* 527 struct klp_patch *patch;
667 * Once we have a consistency model we'll need to module_put() the 528
668 * patch module here. See klp_register_patch() for more details. 529 patch = container_of(kobj, struct klp_patch, kobj);
669 */ 530 complete(&patch->finish);
670} 531}
671 532
672static struct kobj_type klp_ktype_patch = { 533static struct kobj_type klp_ktype_patch = {
@@ -737,7 +598,6 @@ static void klp_free_patch(struct klp_patch *patch)
737 klp_free_objects_limited(patch, NULL); 598 klp_free_objects_limited(patch, NULL);
738 if (!list_empty(&patch->list)) 599 if (!list_empty(&patch->list))
739 list_del(&patch->list); 600 list_del(&patch->list);
740 kobject_put(&patch->kobj);
741} 601}
742 602
743static int klp_init_func(struct klp_object *obj, struct klp_func *func) 603static int klp_init_func(struct klp_object *obj, struct klp_func *func)
@@ -746,7 +606,8 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
746 return -EINVAL; 606 return -EINVAL;
747 607
748 INIT_LIST_HEAD(&func->stack_node); 608 INIT_LIST_HEAD(&func->stack_node);
749 func->state = KLP_DISABLED; 609 func->patched = false;
610 func->transition = false;
750 611
751 /* The format for the sysfs directory is <function,sympos> where sympos 612 /* The format for the sysfs directory is <function,sympos> where sympos
752 * is the nth occurrence of this symbol in kallsyms for the patched 613 * is the nth occurrence of this symbol in kallsyms for the patched
@@ -787,6 +648,22 @@ static int klp_init_object_loaded(struct klp_patch *patch,
787 &func->old_addr); 648 &func->old_addr);
788 if (ret) 649 if (ret)
789 return ret; 650 return ret;
651
652 ret = kallsyms_lookup_size_offset(func->old_addr,
653 &func->old_size, NULL);
654 if (!ret) {
655 pr_err("kallsyms size lookup failed for '%s'\n",
656 func->old_name);
657 return -ENOENT;
658 }
659
660 ret = kallsyms_lookup_size_offset((unsigned long)func->new_func,
661 &func->new_size, NULL);
662 if (!ret) {
663 pr_err("kallsyms size lookup failed for '%s' replacement\n",
664 func->old_name);
665 return -ENOENT;
666 }
790 } 667 }
791 668
792 return 0; 669 return 0;
@@ -801,7 +678,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
801 if (!obj->funcs) 678 if (!obj->funcs)
802 return -EINVAL; 679 return -EINVAL;
803 680
804 obj->state = KLP_DISABLED; 681 obj->patched = false;
805 obj->mod = NULL; 682 obj->mod = NULL;
806 683
807 klp_find_object_module(obj); 684 klp_find_object_module(obj);
@@ -842,12 +719,15 @@ static int klp_init_patch(struct klp_patch *patch)
842 719
843 mutex_lock(&klp_mutex); 720 mutex_lock(&klp_mutex);
844 721
845 patch->state = KLP_DISABLED; 722 patch->enabled = false;
723 init_completion(&patch->finish);
846 724
847 ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, 725 ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
848 klp_root_kobj, "%s", patch->mod->name); 726 klp_root_kobj, "%s", patch->mod->name);
849 if (ret) 727 if (ret) {
850 goto unlock; 728 mutex_unlock(&klp_mutex);
729 return ret;
730 }
851 731
852 klp_for_each_object(patch, obj) { 732 klp_for_each_object(patch, obj) {
853 ret = klp_init_object(patch, obj); 733 ret = klp_init_object(patch, obj);
@@ -863,9 +743,12 @@ static int klp_init_patch(struct klp_patch *patch)
863 743
864free: 744free:
865 klp_free_objects_limited(patch, obj); 745 klp_free_objects_limited(patch, obj);
866 kobject_put(&patch->kobj); 746
867unlock:
868 mutex_unlock(&klp_mutex); 747 mutex_unlock(&klp_mutex);
748
749 kobject_put(&patch->kobj);
750 wait_for_completion(&patch->finish);
751
869 return ret; 752 return ret;
870} 753}
871 754
@@ -879,23 +762,29 @@ unlock:
879 */ 762 */
880int klp_unregister_patch(struct klp_patch *patch) 763int klp_unregister_patch(struct klp_patch *patch)
881{ 764{
882 int ret = 0; 765 int ret;
883 766
884 mutex_lock(&klp_mutex); 767 mutex_lock(&klp_mutex);
885 768
886 if (!klp_is_patch_registered(patch)) { 769 if (!klp_is_patch_registered(patch)) {
887 ret = -EINVAL; 770 ret = -EINVAL;
888 goto out; 771 goto err;
889 } 772 }
890 773
891 if (patch->state == KLP_ENABLED) { 774 if (patch->enabled) {
892 ret = -EBUSY; 775 ret = -EBUSY;
893 goto out; 776 goto err;
894 } 777 }
895 778
896 klp_free_patch(patch); 779 klp_free_patch(patch);
897 780
898out: 781 mutex_unlock(&klp_mutex);
782
783 kobject_put(&patch->kobj);
784 wait_for_completion(&patch->finish);
785
786 return 0;
787err:
899 mutex_unlock(&klp_mutex); 788 mutex_unlock(&klp_mutex);
900 return ret; 789 return ret;
901} 790}
@@ -908,17 +797,18 @@ EXPORT_SYMBOL_GPL(klp_unregister_patch);
908 * Initializes the data structure associated with the patch and 797 * Initializes the data structure associated with the patch and
909 * creates the sysfs interface. 798 * creates the sysfs interface.
910 * 799 *
800 * There is no need to take the reference on the patch module here. It is done
801 * later when the patch is enabled.
802 *
911 * Return: 0 on success, otherwise error 803 * Return: 0 on success, otherwise error
912 */ 804 */
913int klp_register_patch(struct klp_patch *patch) 805int klp_register_patch(struct klp_patch *patch)
914{ 806{
915 int ret;
916
917 if (!patch || !patch->mod) 807 if (!patch || !patch->mod)
918 return -EINVAL; 808 return -EINVAL;
919 809
920 if (!is_livepatch_module(patch->mod)) { 810 if (!is_livepatch_module(patch->mod)) {
921 pr_err("module %s is not marked as a livepatch module", 811 pr_err("module %s is not marked as a livepatch module\n",
922 patch->mod->name); 812 patch->mod->name);
923 return -EINVAL; 813 return -EINVAL;
924 } 814 }
@@ -927,20 +817,16 @@ int klp_register_patch(struct klp_patch *patch)
927 return -ENODEV; 817 return -ENODEV;
928 818
929 /* 819 /*
930 * A reference is taken on the patch module to prevent it from being 820 * Architectures without reliable stack traces have to set
931 * unloaded. Right now, we don't allow patch modules to unload since 821 * patch->immediate because there's currently no way to patch kthreads
932 * there is currently no method to determine if a thread is still 822 * with the consistency model.
933 * running in the patched code contained in the patch module once
934 * the ftrace registration is successful.
935 */ 823 */
936 if (!try_module_get(patch->mod)) 824 if (!klp_have_reliable_stack() && !patch->immediate) {
937 return -ENODEV; 825 pr_err("This architecture doesn't have support for the livepatch consistency model.\n");
938 826 return -ENOSYS;
939 ret = klp_init_patch(patch); 827 }
940 if (ret)
941 module_put(patch->mod);
942 828
943 return ret; 829 return klp_init_patch(patch);
944} 830}
945EXPORT_SYMBOL_GPL(klp_register_patch); 831EXPORT_SYMBOL_GPL(klp_register_patch);
946 832
@@ -975,13 +861,17 @@ int klp_module_coming(struct module *mod)
975 goto err; 861 goto err;
976 } 862 }
977 863
978 if (patch->state == KLP_DISABLED) 864 /*
865 * Only patch the module if the patch is enabled or is
866 * in transition.
867 */
868 if (!patch->enabled && patch != klp_transition_patch)
979 break; 869 break;
980 870
981 pr_notice("applying patch '%s' to loading module '%s'\n", 871 pr_notice("applying patch '%s' to loading module '%s'\n",
982 patch->mod->name, obj->mod->name); 872 patch->mod->name, obj->mod->name);
983 873
984 ret = klp_enable_object(obj); 874 ret = klp_patch_object(obj);
985 if (ret) { 875 if (ret) {
986 pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", 876 pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
987 patch->mod->name, obj->mod->name, ret); 877 patch->mod->name, obj->mod->name, ret);
@@ -1032,10 +922,14 @@ void klp_module_going(struct module *mod)
1032 if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) 922 if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
1033 continue; 923 continue;
1034 924
1035 if (patch->state != KLP_DISABLED) { 925 /*
926 * Only unpatch the module if the patch is enabled or
927 * is in transition.
928 */
929 if (patch->enabled || patch == klp_transition_patch) {
1036 pr_notice("reverting patch '%s' on unloading module '%s'\n", 930 pr_notice("reverting patch '%s' on unloading module '%s'\n",
1037 patch->mod->name, obj->mod->name); 931 patch->mod->name, obj->mod->name);
1038 klp_disable_object(obj); 932 klp_unpatch_object(obj);
1039 } 933 }
1040 934
1041 klp_free_object_loaded(obj); 935 klp_free_object_loaded(obj);
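
Taken together, the core.c rework replaces the old KLP_ENABLED/KLP_DISABLED state fields with patched/transition flags, moves the ftrace plumbing into patch.c, takes the module reference at enable time rather than at registration, ties the patch lifetime to its kobject via a completion, and adds a read-only transition attribute next to enabled in sysfs. From a patch author's point of view the API is unchanged; a minimal patch module, condensed from the in-tree livepatch sample, still looks like this:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/livepatch.h>
#include <linux/seq_file.h>

/* Replacement function; the patched symbol cmdline_proc_show is the one
 * used by the upstream sample and is only an example target here. */
static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
{
        seq_printf(m, "%s\n", "this has been live patched");
        return 0;
}

static struct klp_func funcs[] = {
        {
                .old_name = "cmdline_proc_show",
                .new_func = livepatch_cmdline_proc_show,
        }, { }
};

static struct klp_object objs[] = {
        {
                /* NULL name means the object is vmlinux itself */
                .funcs = funcs,
        }, { }
};

static struct klp_patch patch = {
        .mod  = THIS_MODULE,
        .objs = objs,
        /* on architectures without reliable stack traces, .immediate = true
         * is required by klp_register_patch() above */
};

static int livepatch_init(void)
{
        int ret;

        ret = klp_register_patch(&patch);
        if (ret)
                return ret;
        ret = klp_enable_patch(&patch);
        if (ret) {
                WARN_ON(klp_unregister_patch(&patch));
                return ret;
        }
        return 0;
}

static void livepatch_exit(void)
{
        WARN_ON(klp_unregister_patch(&patch));
}

module_init(livepatch_init);
module_exit(livepatch_exit);
MODULE_LICENSE("GPL");
MODULE_INFO(livepatch, "Y");
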
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
new file mode 100644
index 000000000000..c74f24c47837
--- /dev/null
+++ b/kernel/livepatch/core.h
@@ -0,0 +1,6 @@
1#ifndef _LIVEPATCH_CORE_H
2#define _LIVEPATCH_CORE_H
3
4extern struct mutex klp_mutex;
5
6#endif /* _LIVEPATCH_CORE_H */
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
new file mode 100644
index 000000000000..f8269036bf0b
--- /dev/null
+++ b/kernel/livepatch/patch.c
@@ -0,0 +1,272 @@
1/*
2 * patch.c - livepatch patching functions
3 *
4 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
5 * Copyright (C) 2014 SUSE
6 * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version 2
11 * of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21
22#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
23
24#include <linux/livepatch.h>
25#include <linux/list.h>
26#include <linux/ftrace.h>
27#include <linux/rculist.h>
28#include <linux/slab.h>
29#include <linux/bug.h>
30#include <linux/printk.h>
31#include "patch.h"
32#include "transition.h"
33
34static LIST_HEAD(klp_ops);
35
36struct klp_ops *klp_find_ops(unsigned long old_addr)
37{
38 struct klp_ops *ops;
39 struct klp_func *func;
40
41 list_for_each_entry(ops, &klp_ops, node) {
42 func = list_first_entry(&ops->func_stack, struct klp_func,
43 stack_node);
44 if (func->old_addr == old_addr)
45 return ops;
46 }
47
48 return NULL;
49}
50
51static void notrace klp_ftrace_handler(unsigned long ip,
52 unsigned long parent_ip,
53 struct ftrace_ops *fops,
54 struct pt_regs *regs)
55{
56 struct klp_ops *ops;
57 struct klp_func *func;
58 int patch_state;
59
60 ops = container_of(fops, struct klp_ops, fops);
61
62 rcu_read_lock();
63
64 func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
65 stack_node);
66
67 /*
68 * func should never be NULL because preemption should be disabled here
69 * and unregister_ftrace_function() does the equivalent of a
70 * synchronize_sched() before the func_stack removal.
71 */
72 if (WARN_ON_ONCE(!func))
73 goto unlock;
74
75 /*
76 * In the enable path, enforce the order of the ops->func_stack and
77 * func->transition reads. The corresponding write barrier is in
78 * __klp_enable_patch().
79 *
80 * (Note that this barrier technically isn't needed in the disable
81 * path. In the rare case where klp_update_patch_state() runs before
82 * this handler, its TIF_PATCH_PENDING read and this func->transition
83 * read need to be ordered. But klp_update_patch_state() already
84 * enforces that.)
85 */
86 smp_rmb();
87
88 if (unlikely(func->transition)) {
89
90 /*
91 * Enforce the order of the func->transition and
92 * current->patch_state reads. Otherwise we could read an
93 * out-of-date task state and pick the wrong function. The
94 * corresponding write barrier is in klp_init_transition().
95 */
96 smp_rmb();
97
98 patch_state = current->patch_state;
99
100 WARN_ON_ONCE(patch_state == KLP_UNDEFINED);
101
102 if (patch_state == KLP_UNPATCHED) {
103 /*
104 * Use the previously patched version of the function.
105 * If no previous patches exist, continue with the
106 * original function.
107 */
108 func = list_entry_rcu(func->stack_node.next,
109 struct klp_func, stack_node);
110
111 if (&func->stack_node == &ops->func_stack)
112 goto unlock;
113 }
114 }
115
116 klp_arch_set_pc(regs, (unsigned long)func->new_func);
117unlock:
118 rcu_read_unlock();
119}
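
The handler above always runs the klp_func at the head of ops->func_stack, unless that func is still in transition and the task is (or should be) unpatched, in which case it falls back one entry, or to the original code. A minimal user-space sketch of that selection, with hypothetical names and the RCU list, TIF flag and memory barriers left out:

    #include <stdio.h>

    enum { KLP_UNPATCHED = 0, KLP_PATCHED = 1 };

    struct func {
            const char *name;       /* replacement implementation */
            int transition;         /* still transitioning? */
            struct func *next;      /* older patch, or NULL for "run the original" */
    };

    /* Model of the handler: the newest func wins unless the task is still
     * KLP_UNPATCHED during a transition, in which case fall back one level. */
    static const char *select_func(struct func *stack_top, int task_patch_state)
    {
            struct func *f = stack_top;

            if (!f)
                    return "original";

            if (f->transition && task_patch_state == KLP_UNPATCHED) {
                    f = f->next;
                    if (!f)
                            return "original";
            }
            return f->name;
    }

    int main(void)
    {
            struct func v1 = { "v1", 0, NULL };
            struct func v2 = { "v2", 1, &v1 };      /* v2 is mid-transition */

            printf("%s\n", select_func(&v2, KLP_PATCHED));   /* v2 */
            printf("%s\n", select_func(&v2, KLP_UNPATCHED)); /* v1 */
            return 0;
    }
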
120
121/*
122 * Convert a function address into the appropriate ftrace location.
123 *
124 * Usually this is just the address of the function, but on some architectures
125 * it's more complicated so allow them to provide a custom behaviour.
126 */
127#ifndef klp_get_ftrace_location
128static unsigned long klp_get_ftrace_location(unsigned long faddr)
129{
130 return faddr;
131}
132#endif
133
134static void klp_unpatch_func(struct klp_func *func)
135{
136 struct klp_ops *ops;
137
138 if (WARN_ON(!func->patched))
139 return;
140 if (WARN_ON(!func->old_addr))
141 return;
142
143 ops = klp_find_ops(func->old_addr);
144 if (WARN_ON(!ops))
145 return;
146
147 if (list_is_singular(&ops->func_stack)) {
148 unsigned long ftrace_loc;
149
150 ftrace_loc = klp_get_ftrace_location(func->old_addr);
151 if (WARN_ON(!ftrace_loc))
152 return;
153
154 WARN_ON(unregister_ftrace_function(&ops->fops));
155 WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0));
156
157 list_del_rcu(&func->stack_node);
158 list_del(&ops->node);
159 kfree(ops);
160 } else {
161 list_del_rcu(&func->stack_node);
162 }
163
164 func->patched = false;
165}
166
167static int klp_patch_func(struct klp_func *func)
168{
169 struct klp_ops *ops;
170 int ret;
171
172 if (WARN_ON(!func->old_addr))
173 return -EINVAL;
174
175 if (WARN_ON(func->patched))
176 return -EINVAL;
177
178 ops = klp_find_ops(func->old_addr);
179 if (!ops) {
180 unsigned long ftrace_loc;
181
182 ftrace_loc = klp_get_ftrace_location(func->old_addr);
183 if (!ftrace_loc) {
184 pr_err("failed to find location for function '%s'\n",
185 func->old_name);
186 return -EINVAL;
187 }
188
189 ops = kzalloc(sizeof(*ops), GFP_KERNEL);
190 if (!ops)
191 return -ENOMEM;
192
193 ops->fops.func = klp_ftrace_handler;
194 ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
195 FTRACE_OPS_FL_DYNAMIC |
196 FTRACE_OPS_FL_IPMODIFY;
197
198 list_add(&ops->node, &klp_ops);
199
200 INIT_LIST_HEAD(&ops->func_stack);
201 list_add_rcu(&func->stack_node, &ops->func_stack);
202
203 ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0);
204 if (ret) {
205 pr_err("failed to set ftrace filter for function '%s' (%d)\n",
206 func->old_name, ret);
207 goto err;
208 }
209
210 ret = register_ftrace_function(&ops->fops);
211 if (ret) {
212 pr_err("failed to register ftrace handler for function '%s' (%d)\n",
213 func->old_name, ret);
214 ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0);
215 goto err;
216 }
217
218
219 } else {
220 list_add_rcu(&func->stack_node, &ops->func_stack);
221 }
222
223 func->patched = true;
224
225 return 0;
226
227err:
228 list_del_rcu(&func->stack_node);
229 list_del(&ops->node);
230 kfree(ops);
231 return ret;
232}
233
234void klp_unpatch_object(struct klp_object *obj)
235{
236 struct klp_func *func;
237
238 klp_for_each_func(obj, func)
239 if (func->patched)
240 klp_unpatch_func(func);
241
242 obj->patched = false;
243}
244
245int klp_patch_object(struct klp_object *obj)
246{
247 struct klp_func *func;
248 int ret;
249
250 if (WARN_ON(obj->patched))
251 return -EINVAL;
252
253 klp_for_each_func(obj, func) {
254 ret = klp_patch_func(func);
255 if (ret) {
256 klp_unpatch_object(obj);
257 return ret;
258 }
259 }
260 obj->patched = true;
261
262 return 0;
263}
264
265void klp_unpatch_objects(struct klp_patch *patch)
266{
267 struct klp_object *obj;
268
269 klp_for_each_object(patch, obj)
270 if (obj->patched)
271 klp_unpatch_object(obj);
272}
diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h
new file mode 100644
index 000000000000..0db227170c36
--- /dev/null
+++ b/kernel/livepatch/patch.h
@@ -0,0 +1,33 @@
1#ifndef _LIVEPATCH_PATCH_H
2#define _LIVEPATCH_PATCH_H
3
4#include <linux/livepatch.h>
5#include <linux/list.h>
6#include <linux/ftrace.h>
7
8/**
9 * struct klp_ops - structure for tracking registered ftrace ops structs
10 *
11 * A single ftrace_ops is shared between all enabled replacement functions
12 * (klp_func structs) which have the same old_addr. This allows the switch
13 * between function versions to happen instantaneously by updating the klp_ops
14 * struct's func_stack list. The winner is the klp_func at the top of the
15 * func_stack (front of the list).
16 *
17 * @node: node for the global klp_ops list
18 * @func_stack: list head for the stack of klp_func's (active func is on top)
19 * @fops: registered ftrace ops struct
20 */
21struct klp_ops {
22 struct list_head node;
23 struct list_head func_stack;
24 struct ftrace_ops fops;
25};
26
27struct klp_ops *klp_find_ops(unsigned long old_addr);
28
29int klp_patch_object(struct klp_object *obj);
30void klp_unpatch_object(struct klp_object *obj);
31void klp_unpatch_objects(struct klp_patch *patch);
32
33#endif /* _LIVEPATCH_PATCH_H */
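
As the klp_ops comment above describes, all replacement functions for the same old_addr share a single klp_ops, and only the newest entry on its func_stack is live. A toy model of that lifecycle, with made-up names, a plain list in place of the RCU-protected ones, and printf standing in for the ftrace (un)registration calls:

    #include <stdio.h>
    #include <stdlib.h>

    struct ops {
            unsigned long old_addr;
            int nr_funcs;           /* depth of the func_stack */
            struct ops *next;
    };

    static struct ops *klp_ops;

    static struct ops *find_ops(unsigned long old_addr)
    {
            struct ops *o;

            for (o = klp_ops; o; o = o->next)
                    if (o->old_addr == old_addr)
                            return o;
            return NULL;
    }

    static void patch_func(unsigned long old_addr)
    {
            struct ops *o = find_ops(old_addr);

            if (!o) {                       /* first patch of this address */
                    o = calloc(1, sizeof(*o));
                    o->old_addr = old_addr;
                    o->next = klp_ops;
                    klp_ops = o;
                    printf("register handler for %#lx\n", old_addr);
            }
            o->nr_funcs++;                  /* another func stacked on this ops */
    }

    static void unpatch_func(unsigned long old_addr)
    {
            struct ops **p = &klp_ops, *o = find_ops(old_addr);

            if (!o || !o->nr_funcs)
                    return;
            if (--o->nr_funcs == 0) {       /* last func: tear the ops down */
                    printf("unregister handler for %#lx\n", old_addr);
                    while (*p != o)
                            p = &(*p)->next;
                    *p = o->next;
                    free(o);
            }
    }

    int main(void)
    {
            patch_func(0x1000);     /* registers */
            patch_func(0x1000);     /* just stacks */
            unpatch_func(0x1000);   /* one func still live */
            unpatch_func(0x1000);   /* unregisters */
            return 0;
    }
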
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
new file mode 100644
index 000000000000..adc0cc64aa4b
--- /dev/null
+++ b/kernel/livepatch/transition.c
@@ -0,0 +1,553 @@
1/*
2 * transition.c - Kernel Live Patching transition functions
3 *
4 * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
21
22#include <linux/cpu.h>
23#include <linux/stacktrace.h>
24#include "core.h"
25#include "patch.h"
26#include "transition.h"
27#include "../sched/sched.h"
28
29#define MAX_STACK_ENTRIES 100
30#define STACK_ERR_BUF_SIZE 128
31
32struct klp_patch *klp_transition_patch;
33
34static int klp_target_state = KLP_UNDEFINED;
35
36/*
37 * This work can be performed periodically to finish patching or unpatching any
38 * "straggler" tasks which failed to transition in the first attempt.
39 */
40static void klp_transition_work_fn(struct work_struct *work)
41{
42 mutex_lock(&klp_mutex);
43
44 if (klp_transition_patch)
45 klp_try_complete_transition();
46
47 mutex_unlock(&klp_mutex);
48}
49static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn);
50
51/*
52 * The transition to the target patch state is complete. Clean up the data
53 * structures.
54 */
55static void klp_complete_transition(void)
56{
57 struct klp_object *obj;
58 struct klp_func *func;
59 struct task_struct *g, *task;
60 unsigned int cpu;
61 bool immediate_func = false;
62
63 if (klp_target_state == KLP_UNPATCHED) {
64 /*
65 * All tasks have transitioned to KLP_UNPATCHED so we can now
66 * remove the new functions from the func_stack.
67 */
68 klp_unpatch_objects(klp_transition_patch);
69
70 /*
71 * Make sure klp_ftrace_handler() can no longer see functions
72 * from this patch on the ops->func_stack. Otherwise, after
73 * func->transition gets cleared, the handler may choose a
74 * removed function.
75 */
76 synchronize_rcu();
77 }
78
79 if (klp_transition_patch->immediate)
80 goto done;
81
82 klp_for_each_object(klp_transition_patch, obj) {
83 klp_for_each_func(obj, func) {
84 func->transition = false;
85 if (func->immediate)
86 immediate_func = true;
87 }
88 }
89
90 if (klp_target_state == KLP_UNPATCHED && !immediate_func)
91 module_put(klp_transition_patch->mod);
92
93 /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
94 if (klp_target_state == KLP_PATCHED)
95 synchronize_rcu();
96
97 read_lock(&tasklist_lock);
98 for_each_process_thread(g, task) {
99 WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING));
100 task->patch_state = KLP_UNDEFINED;
101 }
102 read_unlock(&tasklist_lock);
103
104 for_each_possible_cpu(cpu) {
105 task = idle_task(cpu);
106 WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING));
107 task->patch_state = KLP_UNDEFINED;
108 }
109
110done:
111 klp_target_state = KLP_UNDEFINED;
112 klp_transition_patch = NULL;
113}
114
115/*
116 * This is called in the error path, to cancel a transition before it has
117 * started, i.e. klp_init_transition() has been called but
118 * klp_start_transition() hasn't. If the transition *has* been started,
119 * klp_reverse_transition() should be used instead.
120 */
121void klp_cancel_transition(void)
122{
123 if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED))
124 return;
125
126 klp_target_state = KLP_UNPATCHED;
127 klp_complete_transition();
128}
129
130/*
131 * Switch the patched state of the task to the set of functions in the target
132 * patch state.
133 *
134 * NOTE: If task is not 'current', the caller must ensure the task is inactive.
135 * Otherwise klp_ftrace_handler() might read the wrong 'patch_state' value.
136 */
137void klp_update_patch_state(struct task_struct *task)
138{
139 rcu_read_lock();
140
141 /*
142 * This test_and_clear_tsk_thread_flag() call also serves as a read
143 * barrier (smp_rmb) for two cases:
144 *
145 * 1) Enforce the order of the TIF_PATCH_PENDING read and the
146 * klp_target_state read. The corresponding write barrier is in
147 * klp_init_transition().
148 *
149 * 2) Enforce the order of the TIF_PATCH_PENDING read and a future read
150 * of func->transition, if klp_ftrace_handler() is called later on
151 * the same CPU. See __klp_disable_patch().
152 */
153 if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING))
154 task->patch_state = READ_ONCE(klp_target_state);
155
156 rcu_read_unlock();
157}
158
159/*
160 * Determine whether the given stack trace includes any references to a
161 * to-be-patched or to-be-unpatched function.
162 */
163static int klp_check_stack_func(struct klp_func *func,
164 struct stack_trace *trace)
165{
166 unsigned long func_addr, func_size, address;
167 struct klp_ops *ops;
168 int i;
169
170 if (func->immediate)
171 return 0;
172
173 for (i = 0; i < trace->nr_entries; i++) {
174 address = trace->entries[i];
175
176 if (klp_target_state == KLP_UNPATCHED) {
177 /*
178 * Check for the to-be-unpatched function
179 * (the func itself).
180 */
181 func_addr = (unsigned long)func->new_func;
182 func_size = func->new_size;
183 } else {
184 /*
185 * Check for the to-be-patched function
186 * (the previous func).
187 */
188 ops = klp_find_ops(func->old_addr);
189
190 if (list_is_singular(&ops->func_stack)) {
191 /* original function */
192 func_addr = func->old_addr;
193 func_size = func->old_size;
194 } else {
195 /* previously patched function */
196 struct klp_func *prev;
197
198 prev = list_next_entry(func, stack_node);
199 func_addr = (unsigned long)prev->new_func;
200 func_size = prev->new_size;
201 }
202 }
203
204 if (address >= func_addr && address < func_addr + func_size)
205 return -EAGAIN;
206 }
207
208 return 0;
209}
210
211/*
212 * Determine whether it's safe to transition the task to the target patch state
213 * by looking for any to-be-patched or to-be-unpatched functions on its stack.
214 */
215static int klp_check_stack(struct task_struct *task, char *err_buf)
216{
217 static unsigned long entries[MAX_STACK_ENTRIES];
218 struct stack_trace trace;
219 struct klp_object *obj;
220 struct klp_func *func;
221 int ret;
222
223 trace.skip = 0;
224 trace.nr_entries = 0;
225 trace.max_entries = MAX_STACK_ENTRIES;
226 trace.entries = entries;
227 ret = save_stack_trace_tsk_reliable(task, &trace);
228 WARN_ON_ONCE(ret == -ENOSYS);
229 if (ret) {
230 snprintf(err_buf, STACK_ERR_BUF_SIZE,
231 "%s: %s:%d has an unreliable stack\n",
232 __func__, task->comm, task->pid);
233 return ret;
234 }
235
236 klp_for_each_object(klp_transition_patch, obj) {
237 if (!obj->patched)
238 continue;
239 klp_for_each_func(obj, func) {
240 ret = klp_check_stack_func(func, &trace);
241 if (ret) {
242 snprintf(err_buf, STACK_ERR_BUF_SIZE,
243 "%s: %s:%d is sleeping on function %s\n",
244 __func__, task->comm, task->pid,
245 func->old_name);
246 return ret;
247 }
248 }
249 }
250
251 return 0;
252}
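
In essence, klp_check_stack() rejects a task whenever any saved return address lands inside a function that is about to be patched or unpatched. A small self-contained sketch of that range check (the reliable stack saving and the per-object/per-func iteration are omitted, and the names are illustrative):

    #include <stdio.h>

    struct range { unsigned long addr, size; };

    /* Return -1 (think -EAGAIN) if any stack entry falls inside one of the
     * ranges being patched or unpatched; 0 if the task is safe to switch. */
    static int check_stack(const unsigned long *entries, int nr,
                           const struct range *funcs, int nr_funcs)
    {
            int i, j;

            for (i = 0; i < nr; i++)
                    for (j = 0; j < nr_funcs; j++)
                            if (entries[i] >= funcs[j].addr &&
                                entries[i] <  funcs[j].addr + funcs[j].size)
                                    return -1;
            return 0;
    }

    int main(void)
    {
            unsigned long stack[] = { 0x400100, 0x400850, 0x401000 };
            struct range patched[] = { { 0x400800, 0x100 } };

            printf("%d\n", check_stack(stack, 3, patched, 1)); /* -1: 0x400850 hits */
            return 0;
    }
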
253
254/*
255 * Try to safely switch a task to the target patch state. If it's currently
256 * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
257 * if the stack is unreliable, return false.
258 */
259static bool klp_try_switch_task(struct task_struct *task)
260{
261 struct rq *rq;
262 struct rq_flags flags;
263 int ret;
264 bool success = false;
265 char err_buf[STACK_ERR_BUF_SIZE];
266
267 err_buf[0] = '\0';
268
269 /* check if this task has already switched over */
270 if (task->patch_state == klp_target_state)
271 return true;
272
273 /*
274 * For arches which don't have reliable stack traces, we have to rely
275 * on other methods (e.g., switching tasks at kernel exit).
276 */
277 if (!klp_have_reliable_stack())
278 return false;
279
280 /*
281 * Now try to check the stack for any to-be-patched or to-be-unpatched
282 * functions. If all goes well, switch the task to the target patch
283 * state.
284 */
285 rq = task_rq_lock(task, &flags);
286
287 if (task_running(rq, task) && task != current) {
288 snprintf(err_buf, STACK_ERR_BUF_SIZE,
289 "%s: %s:%d is running\n", __func__, task->comm,
290 task->pid);
291 goto done;
292 }
293
294 ret = klp_check_stack(task, err_buf);
295 if (ret)
296 goto done;
297
298 success = true;
299
300 clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
301 task->patch_state = klp_target_state;
302
303done:
304 task_rq_unlock(rq, task, &flags);
305
306 /*
307 * Due to console deadlock issues, pr_debug() can't be used while
308 * holding the task rq lock. Instead we have to use a temporary buffer
309 * and print the debug message after releasing the lock.
310 */
311 if (err_buf[0] != '\0')
312 pr_debug("%s", err_buf);
313
314 return success;
315
316}
317
318/*
319 * Try to switch all remaining tasks to the target patch state by walking the
320 * stacks of sleeping tasks and looking for any to-be-patched or
321 * to-be-unpatched functions. If such functions are found, the task can't be
322 * switched yet.
323 *
324 * If any tasks are still stuck in the initial patch state, schedule a retry.
325 */
326void klp_try_complete_transition(void)
327{
328 unsigned int cpu;
329 struct task_struct *g, *task;
330 bool complete = true;
331
332 WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
333
334 /*
335 * If the patch can be applied or reverted immediately, skip the
336 * per-task transitions.
337 */
338 if (klp_transition_patch->immediate)
339 goto success;
340
341 /*
342 * Try to switch the tasks to the target patch state by walking their
343 * stacks and looking for any to-be-patched or to-be-unpatched
344 * functions. If such functions are found on a stack, or if the stack
345 * is deemed unreliable, the task can't be switched yet.
346 *
347 * Usually this will transition most (or all) of the tasks on a system
348 * unless the patch includes changes to a very common function.
349 */
350 read_lock(&tasklist_lock);
351 for_each_process_thread(g, task)
352 if (!klp_try_switch_task(task))
353 complete = false;
354 read_unlock(&tasklist_lock);
355
356 /*
357 * Ditto for the idle "swapper" tasks.
358 */
359 get_online_cpus();
360 for_each_possible_cpu(cpu) {
361 task = idle_task(cpu);
362 if (cpu_online(cpu)) {
363 if (!klp_try_switch_task(task))
364 complete = false;
365 } else if (task->patch_state != klp_target_state) {
366 /* offline idle tasks can be switched immediately */
367 clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
368 task->patch_state = klp_target_state;
369 }
370 }
371 put_online_cpus();
372
373 if (!complete) {
374 /*
375 * Some tasks weren't able to be switched over. Try again
376 * later and/or wait for other methods like kernel exit
377 * switching.
378 */
379 schedule_delayed_work(&klp_transition_work,
380 round_jiffies_relative(HZ));
381 return;
382 }
383
384success:
385 pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
386 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
387
388 /* we're done, now cleanup the data structures */
389 klp_complete_transition();
390}
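
The overall flow is: switch every task whose stack is clean, and if anything is left over, come back later via the delayed work. A compilable toy version of that retry loop, with invented names, no rq locks or idle tasks, and the stack check reduced to a single flag:

    #include <stdio.h>
    #include <stdbool.h>

    enum { KLP_UNPATCHED, KLP_PATCHED };

    struct task { bool on_patched_func; int patch_state; };

    /* A task can be switched only if none of the affected functions are on
     * its stack (here reduced to one flag). */
    static bool try_switch(struct task *t, int target)
    {
            if (t->patch_state == target)
                    return true;
            if (t->on_patched_func)
                    return false;
            t->patch_state = target;
            return true;
    }

    int main(void)
    {
            struct task tasks[] = {
                    { false, KLP_UNPATCHED },
                    { true,  KLP_UNPATCHED },   /* sleeping in a patched func */
            };
            int pass, i;

            for (pass = 1; pass <= 3; pass++) {
                    bool complete = true;

                    for (i = 0; i < 2; i++)
                            if (!try_switch(&tasks[i], KLP_PATCHED))
                                    complete = false;

                    printf("pass %d: %s\n", pass, complete ? "done" : "retry later");
                    if (complete)
                            break;
                    tasks[1].on_patched_func = false;   /* task eventually moves on */
            }
            return 0;
    }
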
391
392/*
393 * Start the transition to the specified target patch state so tasks can begin
394 * switching to it.
395 */
396void klp_start_transition(void)
397{
398 struct task_struct *g, *task;
399 unsigned int cpu;
400
401 WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
402
403 pr_notice("'%s': %s...\n", klp_transition_patch->mod->name,
404 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
405
406 /*
407 * If the patch can be applied or reverted immediately, skip the
408 * per-task transitions.
409 */
410 if (klp_transition_patch->immediate)
411 return;
412
413 /*
414 * Mark all normal tasks as needing a patch state update. They'll
415 * switch either in klp_try_complete_transition() or as they exit the
416 * kernel.
417 */
418 read_lock(&tasklist_lock);
419 for_each_process_thread(g, task)
420 if (task->patch_state != klp_target_state)
421 set_tsk_thread_flag(task, TIF_PATCH_PENDING);
422 read_unlock(&tasklist_lock);
423
424 /*
425 * Mark all idle tasks as needing a patch state update. They'll switch
426 * either in klp_try_complete_transition() or at the idle loop switch
427 * point.
428 */
429 for_each_possible_cpu(cpu) {
430 task = idle_task(cpu);
431 if (task->patch_state != klp_target_state)
432 set_tsk_thread_flag(task, TIF_PATCH_PENDING);
433 }
434}
435
436/*
437 * Initialize the global target patch state and all tasks to the initial patch
438 * state, and initialize all function transition states to true in preparation
439 * for patching or unpatching.
440 */
441void klp_init_transition(struct klp_patch *patch, int state)
442{
443 struct task_struct *g, *task;
444 unsigned int cpu;
445 struct klp_object *obj;
446 struct klp_func *func;
447 int initial_state = !state;
448
449 WARN_ON_ONCE(klp_target_state != KLP_UNDEFINED);
450
451 klp_transition_patch = patch;
452
453 /*
454 * Set the global target patch state which tasks will switch to. This
455 * has no effect until the TIF_PATCH_PENDING flags get set later.
456 */
457 klp_target_state = state;
458
459 /*
460 * If the patch can be applied or reverted immediately, skip the
461 * per-task transitions.
462 */
463 if (patch->immediate)
464 return;
465
466 /*
467 * Initialize all tasks to the initial patch state to prepare them for
468 * switching to the target state.
469 */
470 read_lock(&tasklist_lock);
471 for_each_process_thread(g, task) {
472 WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED);
473 task->patch_state = initial_state;
474 }
475 read_unlock(&tasklist_lock);
476
477 /*
478 * Ditto for the idle "swapper" tasks.
479 */
480 for_each_possible_cpu(cpu) {
481 task = idle_task(cpu);
482 WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED);
483 task->patch_state = initial_state;
484 }
485
486 /*
487 * Enforce the order of the task->patch_state initializations and the
488 * func->transition updates to ensure that klp_ftrace_handler() doesn't
489 * see a func in transition with a task->patch_state of KLP_UNDEFINED.
490 *
491 * Also enforce the order of the klp_target_state write and future
492 * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't
493 * set a task->patch_state to KLP_UNDEFINED.
494 */
495 smp_wmb();
496
497 /*
498 * Set the func transition states so klp_ftrace_handler() will know to
499 * switch to the transition logic.
500 *
501 * When patching, the funcs aren't yet in the func_stack and will be
502 * made visible to the ftrace handler shortly by the calls to
503 * klp_patch_object().
504 *
505 * When unpatching, the funcs are already in the func_stack and so are
506 * already visible to the ftrace handler.
507 */
508 klp_for_each_object(patch, obj)
509 klp_for_each_func(obj, func)
510 func->transition = true;
511}
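
The smp_wmb() above pairs with the smp_rmb() calls in klp_ftrace_handler() and with the implicit barrier in klp_update_patch_state(): once a reader observes func->transition (or TIF_PATCH_PENDING), it must also observe the initialized patch state. A C11 model of that publish/observe pairing, using fences in place of the kernel barriers; the names and the single-threaded demo are illustrative only:

    #include <stdatomic.h>
    #include <stdio.h>

    static int patch_state;             /* plain data: per-task state */
    static atomic_int transition;       /* flag read by the "handler" side */

    /* Writer (init side): publish patch_state before the transition flag. */
    static void init_transition(void)
    {
            patch_state = 1;                                /* KLP_UNPATCHED, say */
            atomic_thread_fence(memory_order_release);      /* ~ smp_wmb() */
            atomic_store_explicit(&transition, 1, memory_order_relaxed);
    }

    /* Reader (handler side): if transition is seen, the state must already
     * be initialized, thanks to the pairing acquire fence (~ smp_rmb()). */
    static void handler(void)
    {
            if (atomic_load_explicit(&transition, memory_order_relaxed)) {
                    atomic_thread_fence(memory_order_acquire);
                    printf("patch_state=%d (never uninitialized here)\n", patch_state);
            }
    }

    int main(void)
    {
            init_transition();
            handler();
            return 0;
    }
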
512
513/*
514 * This function can be called in the middle of an existing transition to
515 * reverse the direction of the target patch state. This can be done to
516 * effectively cancel an existing enable or disable operation if there are any
517 * tasks which are stuck in the initial patch state.
518 */
519void klp_reverse_transition(void)
520{
521 unsigned int cpu;
522 struct task_struct *g, *task;
523
524 klp_transition_patch->enabled = !klp_transition_patch->enabled;
525
526 klp_target_state = !klp_target_state;
527
528 /*
529 * Clear all TIF_PATCH_PENDING flags to prevent races caused by
530 * klp_update_patch_state() running in parallel with
531 * klp_start_transition().
532 */
533 read_lock(&tasklist_lock);
534 for_each_process_thread(g, task)
535 clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
536 read_unlock(&tasklist_lock);
537
538 for_each_possible_cpu(cpu)
539 clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
540
541 /* Let any remaining calls to klp_update_patch_state() complete */
542 synchronize_rcu();
543
544 klp_start_transition();
545}
546
547/* Called from copy_process() during fork */
548void klp_copy_process(struct task_struct *child)
549{
550 child->patch_state = current->patch_state;
551
552 /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */
553}
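
Taken together, klp_target_state is a small tri-state machine: KLP_UNDEFINED outside a transition, KLP_PATCHED or KLP_UNPATCHED during one, and a reversal simply flips the target and restarts the transition. A tiny sketch of just that bookkeeping, with the task flags and synchronization omitted:

    #include <stdio.h>

    enum klp_state { KLP_UNDEFINED = -1, KLP_UNPATCHED = 0, KLP_PATCHED = 1 };

    static enum klp_state target = KLP_UNDEFINED;

    static void start(enum klp_state s) { target = s; }

    /* Only meaningful mid-transition: PATCHED <-> UNPATCHED. */
    static void reverse(void) { target = !target; }

    static void complete(void) { target = KLP_UNDEFINED; }

    int main(void)
    {
            start(KLP_PATCHED);             /* enable begins */
            reverse();                      /* stuck tasks: turn it into a disable */
            printf("target=%d\n", target);  /* 0 == KLP_UNPATCHED */
            complete();
            return 0;
    }
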
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h
new file mode 100644
index 000000000000..ce09b326546c
--- /dev/null
+++ b/kernel/livepatch/transition.h
@@ -0,0 +1,14 @@
1#ifndef _LIVEPATCH_TRANSITION_H
2#define _LIVEPATCH_TRANSITION_H
3
4#include <linux/livepatch.h>
5
6extern struct klp_patch *klp_transition_patch;
7
8void klp_init_transition(struct klp_patch *patch, int state);
9void klp_cancel_transition(void);
10void klp_start_transition(void);
11void klp_try_complete_transition(void);
12void klp_reverse_transition(void);
13
14#endif /* _LIVEPATCH_TRANSITION_H */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index a95e5d1f4a9c..c0e31bfee25c 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -30,6 +30,7 @@
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/sched/clock.h> 31#include <linux/sched/clock.h>
32#include <linux/sched/task.h> 32#include <linux/sched/task.h>
33#include <linux/sched/mm.h>
33#include <linux/delay.h> 34#include <linux/delay.h>
34#include <linux/module.h> 35#include <linux/module.h>
35#include <linux/proc_fs.h> 36#include <linux/proc_fs.h>
@@ -660,6 +661,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
660 struct lockdep_subclass_key *key; 661 struct lockdep_subclass_key *key;
661 struct hlist_head *hash_head; 662 struct hlist_head *hash_head;
662 struct lock_class *class; 663 struct lock_class *class;
664 bool is_static = false;
663 665
664 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { 666 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
665 debug_locks_off(); 667 debug_locks_off();
@@ -673,10 +675,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
673 675
674 /* 676 /*
675 * Static locks do not have their class-keys yet - for them the key 677 * Static locks do not have their class-keys yet - for them the key
676 * is the lock object itself: 678 * is the lock object itself. If the lock is in the per cpu area,
679 * the canonical address of the lock (per cpu offset removed) is
680 * used.
677 */ 681 */
678 if (unlikely(!lock->key)) 682 if (unlikely(!lock->key)) {
679 lock->key = (void *)lock; 683 unsigned long can_addr, addr = (unsigned long)lock;
684
685 if (__is_kernel_percpu_address(addr, &can_addr))
686 lock->key = (void *)can_addr;
687 else if (__is_module_percpu_address(addr, &can_addr))
688 lock->key = (void *)can_addr;
689 else if (static_obj(lock))
690 lock->key = (void *)lock;
691 else
692 return ERR_PTR(-EINVAL);
693 is_static = true;
694 }
680 695
681 /* 696 /*
682 * NOTE: the class-key must be unique. For dynamic locks, a static 697 * NOTE: the class-key must be unique. For dynamic locks, a static
@@ -708,7 +723,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
708 } 723 }
709 } 724 }
710 725
711 return NULL; 726 return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
712} 727}
713 728
714/* 729/*
@@ -726,19 +741,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
726 DEBUG_LOCKS_WARN_ON(!irqs_disabled()); 741 DEBUG_LOCKS_WARN_ON(!irqs_disabled());
727 742
728 class = look_up_lock_class(lock, subclass); 743 class = look_up_lock_class(lock, subclass);
729 if (likely(class)) 744 if (likely(!IS_ERR_OR_NULL(class)))
730 goto out_set_class_cache; 745 goto out_set_class_cache;
731 746
732 /* 747 /*
733 * Debug-check: all keys must be persistent! 748 * Debug-check: all keys must be persistent!
734 */ 749 */
735 if (!static_obj(lock->key)) { 750 if (IS_ERR(class)) {
736 debug_locks_off(); 751 debug_locks_off();
737 printk("INFO: trying to register non-static key.\n"); 752 printk("INFO: trying to register non-static key.\n");
738 printk("the code is fine but needs lockdep annotation.\n"); 753 printk("the code is fine but needs lockdep annotation.\n");
739 printk("turning off the locking correctness validator.\n"); 754 printk("turning off the locking correctness validator.\n");
740 dump_stack(); 755 dump_stack();
741
742 return NULL; 756 return NULL;
743 } 757 }
744 758
@@ -1144,10 +1158,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1144 return 0; 1158 return 0;
1145 1159
1146 printk("\n"); 1160 printk("\n");
1147 printk("======================================================\n"); 1161 pr_warn("======================================================\n");
1148 printk("[ INFO: possible circular locking dependency detected ]\n"); 1162 pr_warn("WARNING: possible circular locking dependency detected\n");
1149 print_kernel_ident(); 1163 print_kernel_ident();
1150 printk("-------------------------------------------------------\n"); 1164 pr_warn("------------------------------------------------------\n");
1151 printk("%s/%d is trying to acquire lock:\n", 1165 printk("%s/%d is trying to acquire lock:\n",
1152 curr->comm, task_pid_nr(curr)); 1166 curr->comm, task_pid_nr(curr));
1153 print_lock(check_src); 1167 print_lock(check_src);
@@ -1482,11 +1496,11 @@ print_bad_irq_dependency(struct task_struct *curr,
1482 return 0; 1496 return 0;
1483 1497
1484 printk("\n"); 1498 printk("\n");
1485 printk("======================================================\n"); 1499 pr_warn("=====================================================\n");
1486 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", 1500 pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
1487 irqclass, irqclass); 1501 irqclass, irqclass);
1488 print_kernel_ident(); 1502 print_kernel_ident();
1489 printk("------------------------------------------------------\n"); 1503 pr_warn("-----------------------------------------------------\n");
1490 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1504 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1491 curr->comm, task_pid_nr(curr), 1505 curr->comm, task_pid_nr(curr),
1492 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1506 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1711,10 +1725,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1711 return 0; 1725 return 0;
1712 1726
1713 printk("\n"); 1727 printk("\n");
1714 printk("=============================================\n"); 1728 pr_warn("============================================\n");
1715 printk("[ INFO: possible recursive locking detected ]\n"); 1729 pr_warn("WARNING: possible recursive locking detected\n");
1716 print_kernel_ident(); 1730 print_kernel_ident();
1717 printk("---------------------------------------------\n"); 1731 pr_warn("--------------------------------------------\n");
1718 printk("%s/%d is trying to acquire lock:\n", 1732 printk("%s/%d is trying to acquire lock:\n",
1719 curr->comm, task_pid_nr(curr)); 1733 curr->comm, task_pid_nr(curr));
1720 print_lock(next); 1734 print_lock(next);
@@ -2061,10 +2075,10 @@ static void print_collision(struct task_struct *curr,
2061 struct lock_chain *chain) 2075 struct lock_chain *chain)
2062{ 2076{
2063 printk("\n"); 2077 printk("\n");
2064 printk("======================\n"); 2078 pr_warn("============================\n");
2065 printk("[chain_key collision ]\n"); 2079 pr_warn("WARNING: chain_key collision\n");
2066 print_kernel_ident(); 2080 print_kernel_ident();
2067 printk("----------------------\n"); 2081 pr_warn("----------------------------\n");
2068 printk("%s/%d: ", current->comm, task_pid_nr(current)); 2082 printk("%s/%d: ", current->comm, task_pid_nr(current));
2069 printk("Hash chain already cached but the contents don't match!\n"); 2083 printk("Hash chain already cached but the contents don't match!\n");
2070 2084
@@ -2360,10 +2374,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2360 return 0; 2374 return 0;
2361 2375
2362 printk("\n"); 2376 printk("\n");
2363 printk("=================================\n"); 2377 pr_warn("================================\n");
2364 printk("[ INFO: inconsistent lock state ]\n"); 2378 pr_warn("WARNING: inconsistent lock state\n");
2365 print_kernel_ident(); 2379 print_kernel_ident();
2366 printk("---------------------------------\n"); 2380 pr_warn("--------------------------------\n");
2367 2381
2368 printk("inconsistent {%s} -> {%s} usage.\n", 2382 printk("inconsistent {%s} -> {%s} usage.\n",
2369 usage_str[prev_bit], usage_str[new_bit]); 2383 usage_str[prev_bit], usage_str[new_bit]);
@@ -2425,10 +2439,10 @@ print_irq_inversion_bug(struct task_struct *curr,
2425 return 0; 2439 return 0;
2426 2440
2427 printk("\n"); 2441 printk("\n");
2428 printk("=========================================================\n"); 2442 pr_warn("========================================================\n");
2429 printk("[ INFO: possible irq lock inversion dependency detected ]\n"); 2443 pr_warn("WARNING: possible irq lock inversion dependency detected\n");
2430 print_kernel_ident(); 2444 print_kernel_ident();
2431 printk("---------------------------------------------------------\n"); 2445 pr_warn("--------------------------------------------------------\n");
2432 printk("%s/%d just changed the state of lock:\n", 2446 printk("%s/%d just changed the state of lock:\n",
2433 curr->comm, task_pid_nr(curr)); 2447 curr->comm, task_pid_nr(curr));
2434 print_lock(this); 2448 print_lock(this);
@@ -2863,6 +2877,8 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2863 if (unlikely(!debug_locks)) 2877 if (unlikely(!debug_locks))
2864 return; 2878 return;
2865 2879
2880 gfp_mask = current_gfp_context(gfp_mask);
2881
2866 /* no reclaim without waiting on it */ 2882 /* no reclaim without waiting on it */
2867 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) 2883 if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
2868 return; 2884 return;
@@ -2872,7 +2888,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2872 return; 2888 return;
2873 2889
2874 /* We're only interested __GFP_FS allocations for now */ 2890 /* We're only interested __GFP_FS allocations for now */
2875 if (!(gfp_mask & __GFP_FS)) 2891 if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS))
2876 return; 2892 return;
2877 2893
2878 /* 2894 /*
@@ -2881,6 +2897,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2881 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) 2897 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
2882 return; 2898 return;
2883 2899
2900 /* Disable lockdep if explicitly requested */
2901 if (gfp_mask & __GFP_NOLOCKDEP)
2902 return;
2903
2884 mark_held_locks(curr, RECLAIM_FS); 2904 mark_held_locks(curr, RECLAIM_FS);
2885} 2905}
2886 2906
@@ -3170,10 +3190,10 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
3170 return 0; 3190 return 0;
3171 3191
3172 printk("\n"); 3192 printk("\n");
3173 printk("==================================\n"); 3193 pr_warn("==================================\n");
3174 printk("[ BUG: Nested lock was not taken ]\n"); 3194 pr_warn("WARNING: Nested lock was not taken\n");
3175 print_kernel_ident(); 3195 print_kernel_ident();
3176 printk("----------------------------------\n"); 3196 pr_warn("----------------------------------\n");
3177 3197
3178 printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); 3198 printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
3179 print_lock(hlock); 3199 print_lock(hlock);
@@ -3383,10 +3403,10 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3383 return 0; 3403 return 0;
3384 3404
3385 printk("\n"); 3405 printk("\n");
3386 printk("=====================================\n"); 3406 pr_warn("=====================================\n");
3387 printk("[ BUG: bad unlock balance detected! ]\n"); 3407 pr_warn("WARNING: bad unlock balance detected!\n");
3388 print_kernel_ident(); 3408 print_kernel_ident();
3389 printk("-------------------------------------\n"); 3409 pr_warn("-------------------------------------\n");
3390 printk("%s/%d is trying to release lock (", 3410 printk("%s/%d is trying to release lock (",
3391 curr->comm, task_pid_nr(curr)); 3411 curr->comm, task_pid_nr(curr));
3392 print_lockdep_cache(lock); 3412 print_lockdep_cache(lock);
@@ -3419,7 +3439,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
3419 * Clearly if the lock hasn't been acquired _ever_, we're not 3439 * Clearly if the lock hasn't been acquired _ever_, we're not
3420 * holding it either, so report failure. 3440 * holding it either, so report failure.
3421 */ 3441 */
3422 if (!class) 3442 if (IS_ERR_OR_NULL(class))
3423 return 0; 3443 return 0;
3424 3444
3425 /* 3445 /*
@@ -3437,13 +3457,67 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
3437 return 0; 3457 return 0;
3438} 3458}
3439 3459
3460/* @depth must not be zero */
3461static struct held_lock *find_held_lock(struct task_struct *curr,
3462 struct lockdep_map *lock,
3463 unsigned int depth, int *idx)
3464{
3465 struct held_lock *ret, *hlock, *prev_hlock;
3466 int i;
3467
3468 i = depth - 1;
3469 hlock = curr->held_locks + i;
3470 ret = hlock;
3471 if (match_held_lock(hlock, lock))
3472 goto out;
3473
3474 ret = NULL;
3475 for (i--, prev_hlock = hlock--;
3476 i >= 0;
3477 i--, prev_hlock = hlock--) {
3478 /*
3479 * We must not cross into another context:
3480 */
3481 if (prev_hlock->irq_context != hlock->irq_context) {
3482 ret = NULL;
3483 break;
3484 }
3485 if (match_held_lock(hlock, lock)) {
3486 ret = hlock;
3487 break;
3488 }
3489 }
3490
3491out:
3492 *idx = i;
3493 return ret;
3494}
3495
3496static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
3497 int idx)
3498{
3499 struct held_lock *hlock;
3500
3501 for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
3502 if (!__lock_acquire(hlock->instance,
3503 hlock_class(hlock)->subclass,
3504 hlock->trylock,
3505 hlock->read, hlock->check,
3506 hlock->hardirqs_off,
3507 hlock->nest_lock, hlock->acquire_ip,
3508 hlock->references, hlock->pin_count))
3509 return 1;
3510 }
3511 return 0;
3512}
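
find_held_lock() and reacquire_held_locks() factor out the "search the held-lock stack from the top without crossing an irq-context boundary, then re-acquire what was popped" pattern that __lock_set_class(), __lock_release(), __lock_contended() and __lock_acquired() used to open-code. A user-space sketch of the search half, assuming a plain array of held locks:

    #include <stdio.h>

    struct held { const void *instance; int irq_context; };

    /* Search from the top of the stack down, never crossing into a
     * different irq context; return the index or -1. */
    static int find_held_lock(const struct held *stack, int depth, const void *lock)
    {
            int i = depth - 1;

            if (stack[i].instance == lock)
                    return i;

            for (i--; i >= 0; i--) {
                    if (stack[i].irq_context != stack[i + 1].irq_context)
                            return -1;      /* crossed a context boundary */
                    if (stack[i].instance == lock)
                            return i;
            }
            return -1;
    }

    int main(void)
    {
            int a, b, c;
            struct held stack[] = { { &a, 0 }, { &b, 0 }, { &c, 1 } };

            printf("%d\n", find_held_lock(stack, 3, &b));   /* -1: behind an irq boundary */
            printf("%d\n", find_held_lock(stack, 3, &c));   /* 2 */
            return 0;
    }
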
3513
3440static int 3514static int
3441__lock_set_class(struct lockdep_map *lock, const char *name, 3515__lock_set_class(struct lockdep_map *lock, const char *name,
3442 struct lock_class_key *key, unsigned int subclass, 3516 struct lock_class_key *key, unsigned int subclass,
3443 unsigned long ip) 3517 unsigned long ip)
3444{ 3518{
3445 struct task_struct *curr = current; 3519 struct task_struct *curr = current;
3446 struct held_lock *hlock, *prev_hlock; 3520 struct held_lock *hlock;
3447 struct lock_class *class; 3521 struct lock_class *class;
3448 unsigned int depth; 3522 unsigned int depth;
3449 int i; 3523 int i;
@@ -3456,21 +3530,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3456 if (DEBUG_LOCKS_WARN_ON(!depth)) 3530 if (DEBUG_LOCKS_WARN_ON(!depth))
3457 return 0; 3531 return 0;
3458 3532
3459 prev_hlock = NULL; 3533 hlock = find_held_lock(curr, lock, depth, &i);
3460 for (i = depth-1; i >= 0; i--) { 3534 if (!hlock)
3461 hlock = curr->held_locks + i; 3535 return print_unlock_imbalance_bug(curr, lock, ip);
3462 /*
3463 * We must not cross into another context:
3464 */
3465 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3466 break;
3467 if (match_held_lock(hlock, lock))
3468 goto found_it;
3469 prev_hlock = hlock;
3470 }
3471 return print_unlock_imbalance_bug(curr, lock, ip);
3472 3536
3473found_it:
3474 lockdep_init_map(lock, name, key, 0); 3537 lockdep_init_map(lock, name, key, 0);
3475 class = register_lock_class(lock, subclass, 0); 3538 class = register_lock_class(lock, subclass, 0);
3476 hlock->class_idx = class - lock_classes + 1; 3539 hlock->class_idx = class - lock_classes + 1;
@@ -3478,15 +3541,46 @@ found_it:
3478 curr->lockdep_depth = i; 3541 curr->lockdep_depth = i;
3479 curr->curr_chain_key = hlock->prev_chain_key; 3542 curr->curr_chain_key = hlock->prev_chain_key;
3480 3543
3481 for (; i < depth; i++) { 3544 if (reacquire_held_locks(curr, depth, i))
3482 hlock = curr->held_locks + i; 3545 return 0;
3483 if (!__lock_acquire(hlock->instance, 3546
3484 hlock_class(hlock)->subclass, hlock->trylock, 3547 /*
3485 hlock->read, hlock->check, hlock->hardirqs_off, 3548 * I took it apart and put it back together again, except now I have
3486 hlock->nest_lock, hlock->acquire_ip, 3549 * these 'spare' parts.. where shall I put them.
3487 hlock->references, hlock->pin_count)) 3550 */
3488 return 0; 3551 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
3489 } 3552 return 0;
3553 return 1;
3554}
3555
3556static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
3557{
3558 struct task_struct *curr = current;
3559 struct held_lock *hlock;
3560 unsigned int depth;
3561 int i;
3562
3563 depth = curr->lockdep_depth;
3564 /*
3565 * This function is about (re)setting the class of a held lock,
3566 * yet we're not actually holding any locks. Naughty user!
3567 */
3568 if (DEBUG_LOCKS_WARN_ON(!depth))
3569 return 0;
3570
3571 hlock = find_held_lock(curr, lock, depth, &i);
3572 if (!hlock)
3573 return print_unlock_imbalance_bug(curr, lock, ip);
3574
3575 curr->lockdep_depth = i;
3576 curr->curr_chain_key = hlock->prev_chain_key;
3577
3578 WARN(hlock->read, "downgrading a read lock");
3579 hlock->read = 1;
3580 hlock->acquire_ip = ip;
3581
3582 if (reacquire_held_locks(curr, depth, i))
3583 return 0;
3490 3584
3491 /* 3585 /*
3492 * I took it apart and put it back together again, except now I have 3586 * I took it apart and put it back together again, except now I have
@@ -3508,7 +3602,7 @@ static int
3508__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) 3602__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
3509{ 3603{
3510 struct task_struct *curr = current; 3604 struct task_struct *curr = current;
3511 struct held_lock *hlock, *prev_hlock; 3605 struct held_lock *hlock;
3512 unsigned int depth; 3606 unsigned int depth;
3513 int i; 3607 int i;
3514 3608
@@ -3527,21 +3621,10 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
3527 * Check whether the lock exists in the current stack 3621 * Check whether the lock exists in the current stack
3528 * of held locks: 3622 * of held locks:
3529 */ 3623 */
3530 prev_hlock = NULL; 3624 hlock = find_held_lock(curr, lock, depth, &i);
3531 for (i = depth-1; i >= 0; i--) { 3625 if (!hlock)
3532 hlock = curr->held_locks + i; 3626 return print_unlock_imbalance_bug(curr, lock, ip);
3533 /*
3534 * We must not cross into another context:
3535 */
3536 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3537 break;
3538 if (match_held_lock(hlock, lock))
3539 goto found_it;
3540 prev_hlock = hlock;
3541 }
3542 return print_unlock_imbalance_bug(curr, lock, ip);
3543 3627
3544found_it:
3545 if (hlock->instance == lock) 3628 if (hlock->instance == lock)
3546 lock_release_holdtime(hlock); 3629 lock_release_holdtime(hlock);
3547 3630
@@ -3568,15 +3651,8 @@ found_it:
3568 curr->lockdep_depth = i; 3651 curr->lockdep_depth = i;
3569 curr->curr_chain_key = hlock->prev_chain_key; 3652 curr->curr_chain_key = hlock->prev_chain_key;
3570 3653
3571 for (i++; i < depth; i++) { 3654 if (reacquire_held_locks(curr, depth, i + 1))
3572 hlock = curr->held_locks + i; 3655 return 0;
3573 if (!__lock_acquire(hlock->instance,
3574 hlock_class(hlock)->subclass, hlock->trylock,
3575 hlock->read, hlock->check, hlock->hardirqs_off,
3576 hlock->nest_lock, hlock->acquire_ip,
3577 hlock->references, hlock->pin_count))
3578 return 0;
3579 }
3580 3656
3581 /* 3657 /*
3582 * We had N bottles of beer on the wall, we drank one, but now 3658 * We had N bottles of beer on the wall, we drank one, but now
@@ -3741,6 +3817,23 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
3741} 3817}
3742EXPORT_SYMBOL_GPL(lock_set_class); 3818EXPORT_SYMBOL_GPL(lock_set_class);
3743 3819
3820void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
3821{
3822 unsigned long flags;
3823
3824 if (unlikely(current->lockdep_recursion))
3825 return;
3826
3827 raw_local_irq_save(flags);
3828 current->lockdep_recursion = 1;
3829 check_flags(flags);
3830 if (__lock_downgrade(lock, ip))
3831 check_chain_key(current);
3832 current->lockdep_recursion = 0;
3833 raw_local_irq_restore(flags);
3834}
3835EXPORT_SYMBOL_GPL(lock_downgrade);
3836
3744/* 3837/*
3745 * We are not always called with irqs disabled - do that here, 3838 * We are not always called with irqs disabled - do that here,
3746 * and also avoid lockdep recursion: 3839 * and also avoid lockdep recursion:
@@ -3861,13 +3954,15 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock);
3861 3954
3862void lockdep_set_current_reclaim_state(gfp_t gfp_mask) 3955void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
3863{ 3956{
3864 current->lockdep_reclaim_gfp = gfp_mask; 3957 current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask);
3865} 3958}
3959EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state);
3866 3960
3867void lockdep_clear_current_reclaim_state(void) 3961void lockdep_clear_current_reclaim_state(void)
3868{ 3962{
3869 current->lockdep_reclaim_gfp = 0; 3963 current->lockdep_reclaim_gfp = 0;
3870} 3964}
3965EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state);
3871 3966
3872#ifdef CONFIG_LOCK_STAT 3967#ifdef CONFIG_LOCK_STAT
3873static int 3968static int
@@ -3880,10 +3975,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3880 return 0; 3975 return 0;
3881 3976
3882 printk("\n"); 3977 printk("\n");
3883 printk("=================================\n"); 3978 pr_warn("=================================\n");
3884 printk("[ BUG: bad contention detected! ]\n"); 3979 pr_warn("WARNING: bad contention detected!\n");
3885 print_kernel_ident(); 3980 print_kernel_ident();
3886 printk("---------------------------------\n"); 3981 pr_warn("---------------------------------\n");
3887 printk("%s/%d is trying to contend lock (", 3982 printk("%s/%d is trying to contend lock (",
3888 curr->comm, task_pid_nr(curr)); 3983 curr->comm, task_pid_nr(curr));
3889 print_lockdep_cache(lock); 3984 print_lockdep_cache(lock);
@@ -3903,7 +3998,7 @@ static void
3903__lock_contended(struct lockdep_map *lock, unsigned long ip) 3998__lock_contended(struct lockdep_map *lock, unsigned long ip)
3904{ 3999{
3905 struct task_struct *curr = current; 4000 struct task_struct *curr = current;
3906 struct held_lock *hlock, *prev_hlock; 4001 struct held_lock *hlock;
3907 struct lock_class_stats *stats; 4002 struct lock_class_stats *stats;
3908 unsigned int depth; 4003 unsigned int depth;
3909 int i, contention_point, contending_point; 4004 int i, contention_point, contending_point;
@@ -3916,22 +4011,12 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3916 if (DEBUG_LOCKS_WARN_ON(!depth)) 4011 if (DEBUG_LOCKS_WARN_ON(!depth))
3917 return; 4012 return;
3918 4013
3919 prev_hlock = NULL; 4014 hlock = find_held_lock(curr, lock, depth, &i);
3920 for (i = depth-1; i >= 0; i--) { 4015 if (!hlock) {
3921 hlock = curr->held_locks + i; 4016 print_lock_contention_bug(curr, lock, ip);
3922 /* 4017 return;
3923 * We must not cross into another context:
3924 */
3925 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3926 break;
3927 if (match_held_lock(hlock, lock))
3928 goto found_it;
3929 prev_hlock = hlock;
3930 } 4018 }
3931 print_lock_contention_bug(curr, lock, ip);
3932 return;
3933 4019
3934found_it:
3935 if (hlock->instance != lock) 4020 if (hlock->instance != lock)
3936 return; 4021 return;
3937 4022
@@ -3955,7 +4040,7 @@ static void
3955__lock_acquired(struct lockdep_map *lock, unsigned long ip) 4040__lock_acquired(struct lockdep_map *lock, unsigned long ip)
3956{ 4041{
3957 struct task_struct *curr = current; 4042 struct task_struct *curr = current;
3958 struct held_lock *hlock, *prev_hlock; 4043 struct held_lock *hlock;
3959 struct lock_class_stats *stats; 4044 struct lock_class_stats *stats;
3960 unsigned int depth; 4045 unsigned int depth;
3961 u64 now, waittime = 0; 4046 u64 now, waittime = 0;
@@ -3969,22 +4054,12 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3969 if (DEBUG_LOCKS_WARN_ON(!depth)) 4054 if (DEBUG_LOCKS_WARN_ON(!depth))
3970 return; 4055 return;
3971 4056
3972 prev_hlock = NULL; 4057 hlock = find_held_lock(curr, lock, depth, &i);
3973 for (i = depth-1; i >= 0; i--) { 4058 if (!hlock) {
3974 hlock = curr->held_locks + i; 4059 print_lock_contention_bug(curr, lock, _RET_IP_);
3975 /* 4060 return;
3976 * We must not cross into another context:
3977 */
3978 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3979 break;
3980 if (match_held_lock(hlock, lock))
3981 goto found_it;
3982 prev_hlock = hlock;
3983 } 4061 }
3984 print_lock_contention_bug(curr, lock, _RET_IP_);
3985 return;
3986 4062
3987found_it:
3988 if (hlock->instance != lock) 4063 if (hlock->instance != lock)
3989 return; 4064 return;
3990 4065
@@ -4172,7 +4247,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
4172 * If the class exists we look it up and zap it: 4247 * If the class exists we look it up and zap it:
4173 */ 4248 */
4174 class = look_up_lock_class(lock, j); 4249 class = look_up_lock_class(lock, j);
4175 if (class) 4250 if (!IS_ERR_OR_NULL(class))
4176 zap_class(class); 4251 zap_class(class);
4177 } 4252 }
4178 /* 4253 /*
@@ -4244,10 +4319,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
4244 return; 4319 return;
4245 4320
4246 printk("\n"); 4321 printk("\n");
4247 printk("=========================\n"); 4322 pr_warn("=========================\n");
4248 printk("[ BUG: held lock freed! ]\n"); 4323 pr_warn("WARNING: held lock freed!\n");
4249 print_kernel_ident(); 4324 print_kernel_ident();
4250 printk("-------------------------\n"); 4325 pr_warn("-------------------------\n");
4251 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 4326 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
4252 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 4327 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
4253 print_lock(hlock); 4328 print_lock(hlock);
@@ -4302,11 +4377,11 @@ static void print_held_locks_bug(void)
4302 return; 4377 return;
4303 4378
4304 printk("\n"); 4379 printk("\n");
4305 printk("=====================================\n"); 4380 pr_warn("====================================\n");
4306 printk("[ BUG: %s/%d still has locks held! ]\n", 4381 pr_warn("WARNING: %s/%d still has locks held!\n",
4307 current->comm, task_pid_nr(current)); 4382 current->comm, task_pid_nr(current));
4308 print_kernel_ident(); 4383 print_kernel_ident();
4309 printk("-------------------------------------\n"); 4384 pr_warn("------------------------------------\n");
4310 lockdep_print_held_locks(current); 4385 lockdep_print_held_locks(current);
4311 printk("\nstack backtrace:\n"); 4386 printk("\nstack backtrace:\n");
4312 dump_stack(); 4387 dump_stack();
@@ -4371,7 +4446,7 @@ retry:
4371 } while_each_thread(g, p); 4446 } while_each_thread(g, p);
4372 4447
4373 printk("\n"); 4448 printk("\n");
4374 printk("=============================================\n\n"); 4449 pr_warn("=============================================\n\n");
4375 4450
4376 if (unlock) 4451 if (unlock)
4377 read_unlock(&tasklist_lock); 4452 read_unlock(&tasklist_lock);
@@ -4401,10 +4476,10 @@ asmlinkage __visible void lockdep_sys_exit(void)
4401 if (!debug_locks_off()) 4476 if (!debug_locks_off())
4402 return; 4477 return;
4403 printk("\n"); 4478 printk("\n");
4404 printk("================================================\n"); 4479 pr_warn("================================================\n");
4405 printk("[ BUG: lock held when returning to user space! ]\n"); 4480 pr_warn("WARNING: lock held when returning to user space!\n");
4406 print_kernel_ident(); 4481 print_kernel_ident();
4407 printk("------------------------------------------------\n"); 4482 pr_warn("------------------------------------------------\n");
4408 printk("%s/%d is leaving the kernel with locks still held!\n", 4483 printk("%s/%d is leaving the kernel with locks still held!\n",
4409 curr->comm, curr->pid); 4484 curr->comm, curr->pid);
4410 lockdep_print_held_locks(curr); 4485 lockdep_print_held_locks(curr);
@@ -4421,13 +4496,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4421#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ 4496#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
4422 /* Note: the following can be executed concurrently, so be careful. */ 4497 /* Note: the following can be executed concurrently, so be careful. */
4423 printk("\n"); 4498 printk("\n");
4424 pr_err("===============================\n"); 4499 pr_warn("=============================\n");
4425 pr_err("[ ERR: suspicious RCU usage. ]\n"); 4500 pr_warn("WARNING: suspicious RCU usage\n");
4426 print_kernel_ident(); 4501 print_kernel_ident();
4427 pr_err("-------------------------------\n"); 4502 pr_warn("-----------------------------\n");
4428 pr_err("%s:%d %s!\n", file, line, s); 4503 printk("%s:%d %s!\n", file, line, s);
4429 pr_err("\nother info that might help us debug this:\n\n"); 4504 printk("\nother info that might help us debug this:\n\n");
4430 pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n", 4505 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4431 !rcu_lockdep_current_cpu_online() 4506 !rcu_lockdep_current_cpu_online()
4432 ? "RCU used illegally from offline CPU!\n" 4507 ? "RCU used illegally from offline CPU!\n"
4433 : !rcu_is_watching() 4508 : !rcu_is_watching()
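
With this change look_up_lock_class() distinguishes "no class registered yet" (NULL) from "the key is not a static object" (ERR_PTR(-EINVAL)), which is why its callers switch to IS_ERR_OR_NULL()/IS_ERR(). A self-contained sketch of that pointer-encoded error convention, using simplified stand-ins for the <linux/err.h> macros and an invented look_up() helper:

    #include <stdio.h>
    #include <errno.h>

    /* Simplified versions of the <linux/err.h> helpers: small negative
     * errno values are stuffed into the top of the pointer range. */
    #define MAX_ERRNO       4095
    #define ERR_PTR(err)    ((void *)(long)(err))
    #define PTR_ERR(ptr)    ((long)(ptr))
    #define IS_ERR(ptr)     ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)
    #define IS_ERR_OR_NULL(ptr) (!(ptr) || IS_ERR(ptr))

    static int dummy_class;

    static void *look_up(int have_class, int static_key)
    {
            if (!static_key)
                    return ERR_PTR(-EINVAL);        /* caller must warn and bail */
            return have_class ? (void *)&dummy_class : NULL; /* NULL: register one */
    }

    int main(void)
    {
            void *class = look_up(0, 0);

            if (IS_ERR(class))
                    printf("non-static key: %ld\n", PTR_ERR(class)); /* -22 */
            else if (!class)
                    printf("no class yet, register one\n");
            return 0;
    }
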
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 97ee9df32e0f..58e366ad36f4 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -102,10 +102,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
102 return; 102 return;
103 } 103 }
104 104
105 printk("\n============================================\n"); 105 pr_warn("\n");
106 printk( "[ BUG: circular locking deadlock detected! ]\n"); 106 pr_warn("============================================\n");
107 printk("%s\n", print_tainted()); 107 pr_warn("WARNING: circular locking deadlock detected!\n");
108 printk( "--------------------------------------------\n"); 108 pr_warn("%s\n", print_tainted());
109 pr_warn("--------------------------------------------\n");
109 printk("%s/%d is deadlocking current task %s/%d\n\n", 110 printk("%s/%d is deadlocking current task %s/%d\n\n",
110 task->comm, task_pid_nr(task), 111 task->comm, task_pid_nr(task),
111 current->comm, task_pid_nr(current)); 112 current->comm, task_pid_nr(current));
@@ -174,12 +175,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
174 lock->name = name; 175 lock->name = name;
175} 176}
176 177
177void
178rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
179{
180}
181
182void rt_mutex_deadlock_account_unlock(struct task_struct *task)
183{
184}
185
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index d0519c3432b6..b585af9a1b50 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -9,9 +9,6 @@
9 * This file contains macros used solely by rtmutex.c. Debug version. 9 * This file contains macros used solely by rtmutex.c. Debug version.
10 */ 10 */
11 11
12extern void
13rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
14extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
15extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); 12extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
16extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); 13extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
17extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); 14extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6edc32ecd9c5..b95509416909 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -224,6 +224,12 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
224} 224}
225#endif 225#endif
226 226
227/*
228 * Only use with rt_mutex_waiter_{less,equal}()
229 */
230#define task_to_waiter(p) \
231 &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
232
227static inline int 233static inline int
228rt_mutex_waiter_less(struct rt_mutex_waiter *left, 234rt_mutex_waiter_less(struct rt_mutex_waiter *left,
229 struct rt_mutex_waiter *right) 235 struct rt_mutex_waiter *right)
@@ -238,12 +244,30 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
238 * then right waiter has a dl_prio() too. 244 * then right waiter has a dl_prio() too.
239 */ 245 */
240 if (dl_prio(left->prio)) 246 if (dl_prio(left->prio))
241 return dl_time_before(left->task->dl.deadline, 247 return dl_time_before(left->deadline, right->deadline);
242 right->task->dl.deadline);
243 248
244 return 0; 249 return 0;
245} 250}
246 251
252static inline int
253rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
254 struct rt_mutex_waiter *right)
255{
256 if (left->prio != right->prio)
257 return 0;
258
259 /*
260 * If both waiters have dl_prio(), we check the deadlines of the
261 * associated tasks.
262 * If left waiter has a dl_prio(), and we didn't return 0 above,
263 * then right waiter has a dl_prio() too.
264 */
265 if (dl_prio(left->prio))
266 return left->deadline == right->deadline;
267
268 return 1;
269}
270
247static void 271static void
248rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) 272rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
249{ 273{
@@ -322,72 +346,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
322 RB_CLEAR_NODE(&waiter->pi_tree_entry); 346 RB_CLEAR_NODE(&waiter->pi_tree_entry);
323} 347}
324 348
325/* 349static void rt_mutex_adjust_prio(struct task_struct *p)
326 * Calculate task priority from the waiter tree priority
327 *
328 * Return task->normal_prio when the waiter tree is empty or when
329 * the waiter is not allowed to do priority boosting
330 */
331int rt_mutex_getprio(struct task_struct *task)
332{
333 if (likely(!task_has_pi_waiters(task)))
334 return task->normal_prio;
335
336 return min(task_top_pi_waiter(task)->prio,
337 task->normal_prio);
338}
339
340struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
341{ 350{
342 if (likely(!task_has_pi_waiters(task))) 351 struct task_struct *pi_task = NULL;
343 return NULL;
344
345 return task_top_pi_waiter(task)->task;
346}
347 352
348/* 353 lockdep_assert_held(&p->pi_lock);
349 * Called by sched_setscheduler() to get the priority which will be
350 * effective after the change.
351 */
352int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
353{
354 if (!task_has_pi_waiters(task))
355 return newprio;
356 354
357 if (task_top_pi_waiter(task)->task->prio <= newprio) 355 if (task_has_pi_waiters(p))
358 return task_top_pi_waiter(task)->task->prio; 356 pi_task = task_top_pi_waiter(p)->task;
359 return newprio;
360}
361 357
362/* 358 rt_mutex_setprio(p, pi_task);
363 * Adjust the priority of a task, after its pi_waiters got modified.
364 *
365 * This can be both boosting and unboosting. task->pi_lock must be held.
366 */
367static void __rt_mutex_adjust_prio(struct task_struct *task)
368{
369 int prio = rt_mutex_getprio(task);
370
371 if (task->prio != prio || dl_prio(prio))
372 rt_mutex_setprio(task, prio);
373}
374
375/*
376 * Adjust task priority (undo boosting). Called from the exit path of
377 * rt_mutex_slowunlock() and rt_mutex_slowlock().
378 *
379 * (Note: We do this outside of the protection of lock->wait_lock to
380 * allow the lock to be taken while or before we readjust the priority
381 * of task. We do not use the spin_xx_mutex() variants here as we are
382 * outside of the debug path.)
383 */
384void rt_mutex_adjust_prio(struct task_struct *task)
385{
386 unsigned long flags;
387
388 raw_spin_lock_irqsave(&task->pi_lock, flags);
389 __rt_mutex_adjust_prio(task);
390 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
391} 359}
392 360
393/* 361/*
@@ -610,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
610 * enabled we continue, but stop the requeueing in the chain 578 * enabled we continue, but stop the requeueing in the chain
611 * walk. 579 * walk.
612 */ 580 */
613 if (waiter->prio == task->prio) { 581 if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
614 if (!detect_deadlock) 582 if (!detect_deadlock)
615 goto out_unlock_pi; 583 goto out_unlock_pi;
616 else 584 else
@@ -706,7 +674,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
706 674
707 /* [7] Requeue the waiter in the lock waiter tree. */ 675 /* [7] Requeue the waiter in the lock waiter tree. */
708 rt_mutex_dequeue(lock, waiter); 676 rt_mutex_dequeue(lock, waiter);
677
678 /*
679 * Update the waiter prio fields now that we're dequeued.
680 *
681 * These values can have changed through either:
682 *
683 * sys_sched_set_scheduler() / sys_sched_setattr()
684 *
685 * or
686 *
687 * DL CBS enforcement advancing the effective deadline.
688 *
689 * Even though pi_waiters also uses these fields, and that tree is only
690 * updated in [11], we can do this here, since we hold [L], which
691 * serializes all pi_waiters access and rb_erase() does not care about
692 * the values of the node being removed.
693 */
709 waiter->prio = task->prio; 694 waiter->prio = task->prio;
695 waiter->deadline = task->dl.deadline;
696
710 rt_mutex_enqueue(lock, waiter); 697 rt_mutex_enqueue(lock, waiter);
711 698
712 /* [8] Release the task */ 699 /* [8] Release the task */
@@ -747,7 +734,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
747 */ 734 */
748 rt_mutex_dequeue_pi(task, prerequeue_top_waiter); 735 rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
749 rt_mutex_enqueue_pi(task, waiter); 736 rt_mutex_enqueue_pi(task, waiter);
750 __rt_mutex_adjust_prio(task); 737 rt_mutex_adjust_prio(task);
751 738
752 } else if (prerequeue_top_waiter == waiter) { 739 } else if (prerequeue_top_waiter == waiter) {
753 /* 740 /*
@@ -763,7 +750,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
763 rt_mutex_dequeue_pi(task, waiter); 750 rt_mutex_dequeue_pi(task, waiter);
764 waiter = rt_mutex_top_waiter(lock); 751 waiter = rt_mutex_top_waiter(lock);
765 rt_mutex_enqueue_pi(task, waiter); 752 rt_mutex_enqueue_pi(task, waiter);
766 __rt_mutex_adjust_prio(task); 753 rt_mutex_adjust_prio(task);
767 } else { 754 } else {
768 /* 755 /*
769 * Nothing changed. No need to do any priority 756 * Nothing changed. No need to do any priority
@@ -833,6 +820,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
833static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, 820static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
834 struct rt_mutex_waiter *waiter) 821 struct rt_mutex_waiter *waiter)
835{ 822{
823 lockdep_assert_held(&lock->wait_lock);
824
836 /* 825 /*
837 * Before testing whether we can acquire @lock, we set the 826 * Before testing whether we can acquire @lock, we set the
838 * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all 827 * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -892,7 +881,8 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
892 * the top waiter priority (kernel view), 881 * the top waiter priority (kernel view),
893 * @task lost. 882 * @task lost.
894 */ 883 */
895 if (task->prio >= rt_mutex_top_waiter(lock)->prio) 884 if (!rt_mutex_waiter_less(task_to_waiter(task),
885 rt_mutex_top_waiter(lock)))
896 return 0; 886 return 0;
897 887
898 /* 888 /*
@@ -938,8 +928,6 @@ takeit:
938 */ 928 */
939 rt_mutex_set_owner(lock, task); 929 rt_mutex_set_owner(lock, task);
940 930
941 rt_mutex_deadlock_account_lock(lock, task);
942
943 return 1; 931 return 1;
944} 932}
945 933
@@ -960,6 +948,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
960 struct rt_mutex *next_lock; 948 struct rt_mutex *next_lock;
961 int chain_walk = 0, res; 949 int chain_walk = 0, res;
962 950
951 lockdep_assert_held(&lock->wait_lock);
952
963 /* 953 /*
964 * Early deadlock detection. We really don't want the task to 954 * Early deadlock detection. We really don't want the task to
965 * enqueue on itself just to untangle the mess later. It's not 955 * enqueue on itself just to untangle the mess later. It's not
@@ -973,10 +963,11 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
973 return -EDEADLK; 963 return -EDEADLK;
974 964
975 raw_spin_lock(&task->pi_lock); 965 raw_spin_lock(&task->pi_lock);
976 __rt_mutex_adjust_prio(task); 966 rt_mutex_adjust_prio(task);
977 waiter->task = task; 967 waiter->task = task;
978 waiter->lock = lock; 968 waiter->lock = lock;
979 waiter->prio = task->prio; 969 waiter->prio = task->prio;
970 waiter->deadline = task->dl.deadline;
980 971
981 /* Get the top priority waiter on the lock */ 972 /* Get the top priority waiter on the lock */
982 if (rt_mutex_has_waiters(lock)) 973 if (rt_mutex_has_waiters(lock))
@@ -995,7 +986,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
995 rt_mutex_dequeue_pi(owner, top_waiter); 986 rt_mutex_dequeue_pi(owner, top_waiter);
996 rt_mutex_enqueue_pi(owner, waiter); 987 rt_mutex_enqueue_pi(owner, waiter);
997 988
998 __rt_mutex_adjust_prio(owner); 989 rt_mutex_adjust_prio(owner);
999 if (owner->pi_blocked_on) 990 if (owner->pi_blocked_on)
1000 chain_walk = 1; 991 chain_walk = 1;
1001 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { 992 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1047,12 +1038,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
1047 waiter = rt_mutex_top_waiter(lock); 1038 waiter = rt_mutex_top_waiter(lock);
1048 1039
1049 /* 1040 /*
1050 * Remove it from current->pi_waiters. We do not adjust a 1041 * Remove it from current->pi_waiters and deboost.
1051 * possible priority boost right now. We execute wakeup in the 1042 *
1052 * boosted mode and go back to normal after releasing 1043 * We must in fact deboost here in order to ensure we call
1053 * lock->wait_lock. 1044 * rt_mutex_setprio() to update p->pi_top_task before the
1045 * task unblocks.
1054 */ 1046 */
1055 rt_mutex_dequeue_pi(current, waiter); 1047 rt_mutex_dequeue_pi(current, waiter);
1048 rt_mutex_adjust_prio(current);
1056 1049
1057 /* 1050 /*
1058 * As we are waking up the top waiter, and the waiter stays 1051 * As we are waking up the top waiter, and the waiter stays
@@ -1064,9 +1057,19 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
1064 */ 1057 */
1065 lock->owner = (void *) RT_MUTEX_HAS_WAITERS; 1058 lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
1066 1059
1067 raw_spin_unlock(&current->pi_lock); 1060 /*
1068 1061 * We deboosted before waking the top waiter task such that we don't
1062 * run two tasks with the 'same' priority (and ensure the
1063 * p->pi_top_task pointer points to a blocked task). This however can
1064 * lead to priority inversion if we would get preempted after the
1065 * deboost but before waking our donor task, hence the preempt_disable()
1066 * before unlock.
1067 *
1068 * Pairs with preempt_enable() in rt_mutex_postunlock();
1069 */
1070 preempt_disable();
1069 wake_q_add(wake_q, waiter->task); 1071 wake_q_add(wake_q, waiter->task);
1072 raw_spin_unlock(&current->pi_lock);
1070} 1073}
1071 1074
1072/* 1075/*
@@ -1082,6 +1085,8 @@ static void remove_waiter(struct rt_mutex *lock,
1082 struct task_struct *owner = rt_mutex_owner(lock); 1085 struct task_struct *owner = rt_mutex_owner(lock);
1083 struct rt_mutex *next_lock; 1086 struct rt_mutex *next_lock;
1084 1087
1088 lockdep_assert_held(&lock->wait_lock);
1089
1085 raw_spin_lock(&current->pi_lock); 1090 raw_spin_lock(&current->pi_lock);
1086 rt_mutex_dequeue(lock, waiter); 1091 rt_mutex_dequeue(lock, waiter);
1087 current->pi_blocked_on = NULL; 1092 current->pi_blocked_on = NULL;
@@ -1101,7 +1106,7 @@ static void remove_waiter(struct rt_mutex *lock,
1101 if (rt_mutex_has_waiters(lock)) 1106 if (rt_mutex_has_waiters(lock))
1102 rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); 1107 rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
1103 1108
1104 __rt_mutex_adjust_prio(owner); 1109 rt_mutex_adjust_prio(owner);
1105 1110
1106 /* Store the lock on which owner is blocked or NULL */ 1111 /* Store the lock on which owner is blocked or NULL */
1107 next_lock = task_blocked_on_lock(owner); 1112 next_lock = task_blocked_on_lock(owner);
@@ -1140,8 +1145,7 @@ void rt_mutex_adjust_pi(struct task_struct *task)
1140 raw_spin_lock_irqsave(&task->pi_lock, flags); 1145 raw_spin_lock_irqsave(&task->pi_lock, flags);
1141 1146
1142 waiter = task->pi_blocked_on; 1147 waiter = task->pi_blocked_on;
1143 if (!waiter || (waiter->prio == task->prio && 1148 if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
1144 !dl_prio(task->prio))) {
1145 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 1149 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
1146 return; 1150 return;
1147 } 1151 }
@@ -1155,6 +1159,14 @@ void rt_mutex_adjust_pi(struct task_struct *task)
1155 next_lock, NULL, task); 1159 next_lock, NULL, task);
1156} 1160}
1157 1161
1162void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
1163{
1164 debug_rt_mutex_init_waiter(waiter);
1165 RB_CLEAR_NODE(&waiter->pi_tree_entry);
1166 RB_CLEAR_NODE(&waiter->tree_entry);
1167 waiter->task = NULL;
1168}
1169
1158/** 1170/**
1159 * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop 1171 * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
1160 * @lock: the rt_mutex to take 1172 * @lock: the rt_mutex to take
@@ -1237,9 +1249,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
1237 unsigned long flags; 1249 unsigned long flags;
1238 int ret = 0; 1250 int ret = 0;
1239 1251
1240 debug_rt_mutex_init_waiter(&waiter); 1252 rt_mutex_init_waiter(&waiter);
1241 RB_CLEAR_NODE(&waiter.pi_tree_entry);
1242 RB_CLEAR_NODE(&waiter.tree_entry);
1243 1253
1244 /* 1254 /*
1245 * Technically we could use raw_spin_[un]lock_irq() here, but this can 1255 * Technically we could use raw_spin_[un]lock_irq() here, but this can
@@ -1330,7 +1340,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
1330 1340
1331/* 1341/*
1332 * Slow path to release a rt-mutex. 1342 * Slow path to release a rt-mutex.
1333 * Return whether the current task needs to undo a potential priority boosting. 1343 *
1344 * Return whether the current task needs to call rt_mutex_postunlock().
1334 */ 1345 */
1335static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, 1346static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
1336 struct wake_q_head *wake_q) 1347 struct wake_q_head *wake_q)
@@ -1342,8 +1353,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
1342 1353
1343 debug_rt_mutex_unlock(lock); 1354 debug_rt_mutex_unlock(lock);
1344 1355
1345 rt_mutex_deadlock_account_unlock(current);
1346
1347 /* 1356 /*
1348 * We must be careful here if the fast path is enabled. If we 1357 * We must be careful here if the fast path is enabled. If we
1349 * have no waiters queued we cannot set owner to NULL here 1358 * have no waiters queued we cannot set owner to NULL here
@@ -1390,11 +1399,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
1390 * Queue the next waiter for wakeup once we release the wait_lock. 1399 * Queue the next waiter for wakeup once we release the wait_lock.
1391 */ 1400 */
1392 mark_wakeup_next_waiter(wake_q, lock); 1401 mark_wakeup_next_waiter(wake_q, lock);
1393
1394 raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 1402 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1395 1403
1396 /* check PI boosting */ 1404 return true; /* call rt_mutex_postunlock() */
1397 return true;
1398} 1405}
1399 1406
1400/* 1407/*
@@ -1409,11 +1416,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
1409 struct hrtimer_sleeper *timeout, 1416 struct hrtimer_sleeper *timeout,
1410 enum rtmutex_chainwalk chwalk)) 1417 enum rtmutex_chainwalk chwalk))
1411{ 1418{
1412 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { 1419 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
1413 rt_mutex_deadlock_account_lock(lock, current);
1414 return 0; 1420 return 0;
1415 } else 1421
1416 return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); 1422 return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
1417} 1423}
1418 1424
1419static inline int 1425static inline int
@@ -1425,24 +1431,33 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
1425 enum rtmutex_chainwalk chwalk)) 1431 enum rtmutex_chainwalk chwalk))
1426{ 1432{
1427 if (chwalk == RT_MUTEX_MIN_CHAINWALK && 1433 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
1428 likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { 1434 likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
1429 rt_mutex_deadlock_account_lock(lock, current);
1430 return 0; 1435 return 0;
1431 } else 1436
1432 return slowfn(lock, state, timeout, chwalk); 1437 return slowfn(lock, state, timeout, chwalk);
1433} 1438}
1434 1439
1435static inline int 1440static inline int
1436rt_mutex_fasttrylock(struct rt_mutex *lock, 1441rt_mutex_fasttrylock(struct rt_mutex *lock,
1437 int (*slowfn)(struct rt_mutex *lock)) 1442 int (*slowfn)(struct rt_mutex *lock))
1438{ 1443{
1439 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { 1444 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
1440 rt_mutex_deadlock_account_lock(lock, current);
1441 return 1; 1445 return 1;
1442 } 1446
1443 return slowfn(lock); 1447 return slowfn(lock);
1444} 1448}
1445 1449
1450/*
1451 * Performs the wakeup of the top-waiter and re-enables preemption.
1452 */
1453void rt_mutex_postunlock(struct wake_q_head *wake_q)
1454{
1455 wake_up_q(wake_q);
1456
1457 /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
1458 preempt_enable();
1459}
1460
1446static inline void 1461static inline void
1447rt_mutex_fastunlock(struct rt_mutex *lock, 1462rt_mutex_fastunlock(struct rt_mutex *lock,
1448 bool (*slowfn)(struct rt_mutex *lock, 1463 bool (*slowfn)(struct rt_mutex *lock,
@@ -1450,18 +1465,11 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
1450{ 1465{
1451 DEFINE_WAKE_Q(wake_q); 1466 DEFINE_WAKE_Q(wake_q);
1452 1467
1453 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { 1468 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
1454 rt_mutex_deadlock_account_unlock(current); 1469 return;
1455
1456 } else {
1457 bool deboost = slowfn(lock, &wake_q);
1458
1459 wake_up_q(&wake_q);
1460 1470
1461 /* Undo pi boosting if necessary: */ 1471 if (slowfn(lock, &wake_q))
1462 if (deboost) 1472 rt_mutex_postunlock(&wake_q);
1463 rt_mutex_adjust_prio(current);
1464 }
1465} 1473}
1466 1474
1467/** 1475/**
@@ -1495,16 +1503,11 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
1495EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); 1503EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
1496 1504
1497/* 1505/*
1498 * Futex variant with full deadlock detection. 1506 * Futex variant, must not use fastpath.
1499 */ 1507 */
1500int rt_mutex_timed_futex_lock(struct rt_mutex *lock, 1508int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
1501 struct hrtimer_sleeper *timeout)
1502{ 1509{
1503 might_sleep(); 1510 return rt_mutex_slowtrylock(lock);
1504
1505 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
1506 RT_MUTEX_FULL_CHAINWALK,
1507 rt_mutex_slowlock);
1508} 1511}
1509 1512
1510/** 1513/**
@@ -1563,20 +1566,43 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
1563EXPORT_SYMBOL_GPL(rt_mutex_unlock); 1566EXPORT_SYMBOL_GPL(rt_mutex_unlock);
1564 1567
1565/** 1568/**
1566 * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock 1569 * Futex variant which, since futex variants do not use the fast-path, can be
1567 * @lock: the rt_mutex to be unlocked 1570 * simple and will not need to retry.
1568 *
1569 * Returns: true/false indicating whether priority adjustment is
1570 * required or not.
1571 */ 1571 */
1572bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, 1572bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
1573 struct wake_q_head *wqh) 1573 struct wake_q_head *wake_q)
1574{ 1574{
1575 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { 1575 lockdep_assert_held(&lock->wait_lock);
1576 rt_mutex_deadlock_account_unlock(current); 1576
1577 return false; 1577 debug_rt_mutex_unlock(lock);
1578
1579 if (!rt_mutex_has_waiters(lock)) {
1580 lock->owner = NULL;
1581 return false; /* done */
1578 } 1582 }
1579 return rt_mutex_slowunlock(lock, wqh); 1583
1584 /*
1585 * We've already deboosted, mark_wakeup_next_waiter() will
1586 * retain preempt_disabled when we drop the wait_lock, to
1587 * avoid inversion prior to the wakeup. preempt_disable()
1588 * therein pairs with rt_mutex_postunlock().
1589 */
1590 mark_wakeup_next_waiter(wake_q, lock);
1591
1592 return true; /* call postunlock() */
1593}
1594
1595void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
1596{
1597 DEFINE_WAKE_Q(wake_q);
1598 bool postunlock;
1599
1600 raw_spin_lock_irq(&lock->wait_lock);
1601 postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
1602 raw_spin_unlock_irq(&lock->wait_lock);
1603
1604 if (postunlock)
1605 rt_mutex_postunlock(&wake_q);
1580} 1606}
1581 1607
1582/** 1608/**
@@ -1637,7 +1663,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
1637 __rt_mutex_init(lock, NULL); 1663 __rt_mutex_init(lock, NULL);
1638 debug_rt_mutex_proxy_lock(lock, proxy_owner); 1664 debug_rt_mutex_proxy_lock(lock, proxy_owner);
1639 rt_mutex_set_owner(lock, proxy_owner); 1665 rt_mutex_set_owner(lock, proxy_owner);
1640 rt_mutex_deadlock_account_lock(lock, proxy_owner);
1641} 1666}
1642 1667
1643/** 1668/**
@@ -1657,34 +1682,16 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1657{ 1682{
1658 debug_rt_mutex_proxy_unlock(lock); 1683 debug_rt_mutex_proxy_unlock(lock);
1659 rt_mutex_set_owner(lock, NULL); 1684 rt_mutex_set_owner(lock, NULL);
1660 rt_mutex_deadlock_account_unlock(proxy_owner);
1661} 1685}
1662 1686
1663/** 1687int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1664 * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
1665 * @lock: the rt_mutex to take
1666 * @waiter: the pre-initialized rt_mutex_waiter
1667 * @task: the task to prepare
1668 *
1669 * Returns:
1670 * 0 - task blocked on lock
1671 * 1 - acquired the lock for task, caller should wake it up
1672 * <0 - error
1673 *
1674 * Special API call for FUTEX_REQUEUE_PI support.
1675 */
1676int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1677 struct rt_mutex_waiter *waiter, 1688 struct rt_mutex_waiter *waiter,
1678 struct task_struct *task) 1689 struct task_struct *task)
1679{ 1690{
1680 int ret; 1691 int ret;
1681 1692
1682 raw_spin_lock_irq(&lock->wait_lock); 1693 if (try_to_take_rt_mutex(lock, task, NULL))
1683
1684 if (try_to_take_rt_mutex(lock, task, NULL)) {
1685 raw_spin_unlock_irq(&lock->wait_lock);
1686 return 1; 1694 return 1;
1687 }
1688 1695
1689 /* We enforce deadlock detection for futexes */ 1696 /* We enforce deadlock detection for futexes */
1690 ret = task_blocks_on_rt_mutex(lock, waiter, task, 1697 ret = task_blocks_on_rt_mutex(lock, waiter, task,
@@ -1703,14 +1710,38 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1703 if (unlikely(ret)) 1710 if (unlikely(ret))
1704 remove_waiter(lock, waiter); 1711 remove_waiter(lock, waiter);
1705 1712
1706 raw_spin_unlock_irq(&lock->wait_lock);
1707
1708 debug_rt_mutex_print_deadlock(waiter); 1713 debug_rt_mutex_print_deadlock(waiter);
1709 1714
1710 return ret; 1715 return ret;
1711} 1716}
1712 1717
1713/** 1718/**
1719 * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
1720 * @lock: the rt_mutex to take
1721 * @waiter: the pre-initialized rt_mutex_waiter
1722 * @task: the task to prepare
1723 *
1724 * Returns:
1725 * 0 - task blocked on lock
1726 * 1 - acquired the lock for task, caller should wake it up
1727 * <0 - error
1728 *
1729 * Special API call for FUTEX_REQUEUE_PI support.
1730 */
1731int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1732 struct rt_mutex_waiter *waiter,
1733 struct task_struct *task)
1734{
1735 int ret;
1736
1737 raw_spin_lock_irq(&lock->wait_lock);
1738 ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
1739 raw_spin_unlock_irq(&lock->wait_lock);
1740
1741 return ret;
1742}
1743
1744/**
1714 * rt_mutex_next_owner - return the next owner of the lock 1745 * rt_mutex_next_owner - return the next owner of the lock
1715 * 1746 *
1716 * @lock: the rt lock query 1747 * @lock: the rt lock query
@@ -1731,21 +1762,23 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
1731} 1762}
1732 1763
1733/** 1764/**
1734 * rt_mutex_finish_proxy_lock() - Complete lock acquisition 1765 * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
1735 * @lock: the rt_mutex we were woken on 1766 * @lock: the rt_mutex we were woken on
1736 * @to: the timeout, null if none. hrtimer should already have 1767 * @to: the timeout, null if none. hrtimer should already have
1737 * been started. 1768 * been started.
1738 * @waiter: the pre-initialized rt_mutex_waiter 1769 * @waiter: the pre-initialized rt_mutex_waiter
1739 * 1770 *
1740 * Complete the lock acquisition started our behalf by another thread. 1771 * Wait for the lock acquisition started on our behalf by
1772 * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
1773 * rt_mutex_cleanup_proxy_lock().
1741 * 1774 *
1742 * Returns: 1775 * Returns:
1743 * 0 - success 1776 * 0 - success
1744 * <0 - error, one of -EINTR, -ETIMEDOUT 1777 * <0 - error, one of -EINTR, -ETIMEDOUT
1745 * 1778 *
1746 * Special API call for PI-futex requeue support 1779 * Special API call for PI-futex support
1747 */ 1780 */
1748int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, 1781int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
1749 struct hrtimer_sleeper *to, 1782 struct hrtimer_sleeper *to,
1750 struct rt_mutex_waiter *waiter) 1783 struct rt_mutex_waiter *waiter)
1751{ 1784{
@@ -1758,8 +1791,45 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1758 /* sleep on the mutex */ 1791 /* sleep on the mutex */
1759 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); 1792 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1760 1793
1761 if (unlikely(ret)) 1794 raw_spin_unlock_irq(&lock->wait_lock);
1795
1796 return ret;
1797}
1798
1799/**
1800 * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
1801 * @lock: the rt_mutex we were woken on
1802 * @waiter: the pre-initialized rt_mutex_waiter
1803 *
1804 * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
1805 *
1806 * Unless we acquired the lock, we're still enqueued on the wait-list and can
1807 * in fact still be granted ownership until we're removed. Therefore we can
1808 * find we are in fact the owner and must disregard the
1809 * rt_mutex_wait_proxy_lock() failure.
1810 *
1811 * Returns:
1812 * true - did the cleanup, we are done.
1813 * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
1814 * caller should disregard its return value.
1815 *
1816 * Special API call for PI-futex support
1817 */
1818bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
1819 struct rt_mutex_waiter *waiter)
1820{
1821 bool cleanup = false;
1822
1823 raw_spin_lock_irq(&lock->wait_lock);
1824 /*
1825 * Unless we're the owner, we're still enqueued on the wait_list.
1826 * So check if we became owner, if not, take us off the wait_list.
1827 */
1828 if (rt_mutex_owner(lock) != current) {
1762 remove_waiter(lock, waiter); 1829 remove_waiter(lock, waiter);
1830 fixup_rt_mutex_waiters(lock);
1831 cleanup = true;
1832 }
1763 1833
1764 /* 1834 /*
1765 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might 1835 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
@@ -1769,5 +1839,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1769 1839
1770 raw_spin_unlock_irq(&lock->wait_lock); 1840 raw_spin_unlock_irq(&lock->wait_lock);
1771 1841
1772 return ret; 1842 return cleanup;
1773} 1843}
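
The ordering change in the rtmutex.c hunks above is easy to lose in the diff noise: waiters now cache both prio and, for deadline tasks, the CBS deadline, and rt_mutex_waiter_less()/rt_mutex_waiter_equal() compare those cached fields instead of chasing waiter->task. The following is a minimal userspace sketch of that comparison, not kernel code; MAX_DL_PRIO, the dl_prio() test and the sample priority values are simplified stand-ins chosen for the illustration.

#include <stdio.h>

/*
 * Userspace sketch only: models how waiters are ordered once prio and
 * deadline are cached in the waiter. MAX_DL_PRIO and dl_prio() are
 * simplified stand-ins for the scheduler's definitions (deadline tasks
 * are assumed to carry a negative prio value here).
 */
#define MAX_DL_PRIO 0

struct waiter {
	int prio;			/* lower value == higher priority */
	unsigned long long deadline;	/* only meaningful for DL waiters */
};

static int dl_prio(int prio)
{
	return prio < MAX_DL_PRIO;
}

static int waiter_less(const struct waiter *left, const struct waiter *right)
{
	if (left->prio < right->prio)
		return 1;
	/* Same prio and both deadline waiters: earlier deadline wins. */
	if (dl_prio(left->prio))
		return left->deadline < right->deadline;
	return 0;
}

static int waiter_equal(const struct waiter *left, const struct waiter *right)
{
	if (left->prio != right->prio)
		return 0;
	if (dl_prio(left->prio))
		return left->deadline == right->deadline;
	return 1;
}

int main(void)
{
	struct waiter a = { .prio = -1, .deadline = 100 };	/* DL waiter */
	struct waiter b = { .prio = -1, .deadline = 200 };	/* DL, later deadline */
	struct waiter c = { .prio = 10, .deadline = 0 };	/* RT waiter */

	printf("a < b: %d, a == b: %d, a < c: %d\n",
	       waiter_less(&a, &b), waiter_equal(&a, &b), waiter_less(&a, &c));
	return 0;
}
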
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index c4060584c407..6607802efa8b 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -11,8 +11,6 @@
11 */ 11 */
12 12
13#define rt_mutex_deadlock_check(l) (0) 13#define rt_mutex_deadlock_check(l) (0)
14#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
15#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
16#define debug_rt_mutex_init_waiter(w) do { } while (0) 14#define debug_rt_mutex_init_waiter(w) do { } while (0)
17#define debug_rt_mutex_free_waiter(w) do { } while (0) 15#define debug_rt_mutex_free_waiter(w) do { } while (0)
18#define debug_rt_mutex_lock(l) do { } while (0) 16#define debug_rt_mutex_lock(l) do { } while (0)
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 856dfff5c33a..72ad45a9a794 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -34,6 +34,7 @@ struct rt_mutex_waiter {
34 struct rt_mutex *deadlock_lock; 34 struct rt_mutex *deadlock_lock;
35#endif 35#endif
36 int prio; 36 int prio;
37 u64 deadline;
37}; 38};
38 39
39/* 40/*
@@ -103,16 +104,26 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
103 struct task_struct *proxy_owner); 104 struct task_struct *proxy_owner);
104extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 105extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
105 struct task_struct *proxy_owner); 106 struct task_struct *proxy_owner);
107extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
108extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
109 struct rt_mutex_waiter *waiter,
110 struct task_struct *task);
106extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, 111extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
107 struct rt_mutex_waiter *waiter, 112 struct rt_mutex_waiter *waiter,
108 struct task_struct *task); 113 struct task_struct *task);
109extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, 114extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
110 struct hrtimer_sleeper *to, 115 struct hrtimer_sleeper *to,
111 struct rt_mutex_waiter *waiter); 116 struct rt_mutex_waiter *waiter);
112extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); 117extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
113extern bool rt_mutex_futex_unlock(struct rt_mutex *lock, 118 struct rt_mutex_waiter *waiter);
114 struct wake_q_head *wqh); 119
115extern void rt_mutex_adjust_prio(struct task_struct *task); 120extern int rt_mutex_futex_trylock(struct rt_mutex *l);
121
122extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
123extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
124 struct wake_q_head *wqh);
125
126extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
116 127
117#ifdef CONFIG_DEBUG_RT_MUTEXES 128#ifdef CONFIG_DEBUG_RT_MUTEXES
118# include "rtmutex-debug.h" 129# include "rtmutex-debug.h"
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 90a74ccd85a4..4d48b1c4870d 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -124,10 +124,8 @@ EXPORT_SYMBOL(up_write);
124 */ 124 */
125void downgrade_write(struct rw_semaphore *sem) 125void downgrade_write(struct rw_semaphore *sem)
126{ 126{
127 /* 127 lock_downgrade(&sem->dep_map, _RET_IP_);
128 * lockdep: a downgraded write will live on as a write 128
129 * dependency.
130 */
131 rwsem_set_reader_owned(sem); 129 rwsem_set_reader_owned(sem);
132 __downgrade_write(sem); 130 __downgrade_write(sem);
133} 131}
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 6b7abb334ca6..39f56c870051 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -353,8 +353,8 @@ static int test_cycle(unsigned int ncpus)
353struct stress { 353struct stress {
354 struct work_struct work; 354 struct work_struct work;
355 struct ww_mutex *locks; 355 struct ww_mutex *locks;
356 unsigned long timeout;
356 int nlocks; 357 int nlocks;
357 int nloops;
358}; 358};
359 359
360static int *get_random_order(int count) 360static int *get_random_order(int count)
@@ -398,12 +398,11 @@ static void stress_inorder_work(struct work_struct *work)
398 if (!order) 398 if (!order)
399 return; 399 return;
400 400
401 ww_acquire_init(&ctx, &ww_class);
402
403 do { 401 do {
404 int contended = -1; 402 int contended = -1;
405 int n, err; 403 int n, err;
406 404
405 ww_acquire_init(&ctx, &ww_class);
407retry: 406retry:
408 err = 0; 407 err = 0;
409 for (n = 0; n < nlocks; n++) { 408 for (n = 0; n < nlocks; n++) {
@@ -433,9 +432,9 @@ retry:
433 __func__, err); 432 __func__, err);
434 break; 433 break;
435 } 434 }
436 } while (--stress->nloops);
437 435
438 ww_acquire_fini(&ctx); 436 ww_acquire_fini(&ctx);
437 } while (!time_after(jiffies, stress->timeout));
439 438
440 kfree(order); 439 kfree(order);
441 kfree(stress); 440 kfree(stress);
@@ -470,9 +469,9 @@ static void stress_reorder_work(struct work_struct *work)
470 kfree(order); 469 kfree(order);
471 order = NULL; 470 order = NULL;
472 471
473 ww_acquire_init(&ctx, &ww_class);
474
475 do { 472 do {
473 ww_acquire_init(&ctx, &ww_class);
474
476 list_for_each_entry(ll, &locks, link) { 475 list_for_each_entry(ll, &locks, link) {
477 err = ww_mutex_lock(ll->lock, &ctx); 476 err = ww_mutex_lock(ll->lock, &ctx);
478 if (!err) 477 if (!err)
@@ -495,9 +494,9 @@ static void stress_reorder_work(struct work_struct *work)
495 dummy_load(stress); 494 dummy_load(stress);
496 list_for_each_entry(ll, &locks, link) 495 list_for_each_entry(ll, &locks, link)
497 ww_mutex_unlock(ll->lock); 496 ww_mutex_unlock(ll->lock);
498 } while (--stress->nloops);
499 497
500 ww_acquire_fini(&ctx); 498 ww_acquire_fini(&ctx);
499 } while (!time_after(jiffies, stress->timeout));
501 500
502out: 501out:
503 list_for_each_entry_safe(ll, ln, &locks, link) 502 list_for_each_entry_safe(ll, ln, &locks, link)
@@ -523,7 +522,7 @@ static void stress_one_work(struct work_struct *work)
523 __func__, err); 522 __func__, err);
524 break; 523 break;
525 } 524 }
526 } while (--stress->nloops); 525 } while (!time_after(jiffies, stress->timeout));
527 526
528 kfree(stress); 527 kfree(stress);
529} 528}
@@ -533,7 +532,7 @@ static void stress_one_work(struct work_struct *work)
533#define STRESS_ONE BIT(2) 532#define STRESS_ONE BIT(2)
534#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE) 533#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE)
535 534
536static int stress(int nlocks, int nthreads, int nloops, unsigned int flags) 535static int stress(int nlocks, int nthreads, unsigned int flags)
537{ 536{
538 struct ww_mutex *locks; 537 struct ww_mutex *locks;
539 int n; 538 int n;
@@ -575,7 +574,7 @@ static int stress(int nlocks, int nthreads, int nloops, unsigned int flags)
575 INIT_WORK(&stress->work, fn); 574 INIT_WORK(&stress->work, fn);
576 stress->locks = locks; 575 stress->locks = locks;
577 stress->nlocks = nlocks; 576 stress->nlocks = nlocks;
578 stress->nloops = nloops; 577 stress->timeout = jiffies + 2*HZ;
579 578
580 queue_work(wq, &stress->work); 579 queue_work(wq, &stress->work);
581 nthreads--; 580 nthreads--;
@@ -619,15 +618,15 @@ static int __init test_ww_mutex_init(void)
619 if (ret) 618 if (ret)
620 return ret; 619 return ret;
621 620
622 ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER); 621 ret = stress(16, 2*ncpus, STRESS_INORDER);
623 if (ret) 622 if (ret)
624 return ret; 623 return ret;
625 624
626 ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER); 625 ret = stress(16, 2*ncpus, STRESS_REORDER);
627 if (ret) 626 if (ret)
628 return ret; 627 return ret;
629 628
630 ret = stress(4095, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL); 629 ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
631 if (ret) 630 if (ret)
632 return ret; 631 return ret;
633 632
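
The test-ww_mutex.c change above replaces the fixed nloops counter with a 2*HZ deadline checked via time_after(). Below is a small userspace sketch of that pattern, assuming a fake tick counter in place of jiffies; TICK_HZ and ticks are inventions for the example, and the wrap-safe comparison mirrors the one time_after() is built on.

#include <stdio.h>

/*
 * Userspace sketch (not the kernel's implementation): run until a
 * deadline expires instead of for a fixed number of iterations, using
 * a comparison that stays correct across an unsigned counter wrap.
 */
#define TICK_HZ 100UL

static unsigned long ticks;	/* monotonically increasing, may wrap */

static int time_after(unsigned long a, unsigned long b)
{
	/* True if a is after b, even across an unsigned wrap. */
	return (long)(b - a) < 0;
}

int main(void)
{
	unsigned long timeout = ticks + 2 * TICK_HZ;	/* ~2 "seconds" */
	unsigned long iterations = 0;

	do {
		iterations++;		/* stand-in for one stress pass */
		ticks++;		/* normally advanced by the timer tick */
	} while (!time_after(ticks, timeout));

	printf("ran %lu iterations before the deadline\n", iterations);
	return 0;
}
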
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 07e85e5229da..23a6483c3666 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -182,18 +182,6 @@ struct page_map {
182 struct vmem_altmap altmap; 182 struct vmem_altmap altmap;
183}; 183};
184 184
185void get_zone_device_page(struct page *page)
186{
187 percpu_ref_get(page->pgmap->ref);
188}
189EXPORT_SYMBOL(get_zone_device_page);
190
191void put_zone_device_page(struct page *page)
192{
193 put_dev_pagemap(page->pgmap);
194}
195EXPORT_SYMBOL(put_zone_device_page);
196
197static void pgmap_radix_release(struct resource *res) 185static void pgmap_radix_release(struct resource *res)
198{ 186{
199 resource_size_t key, align_start, align_size, align_end; 187 resource_size_t key, align_start, align_size, align_end;
@@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
237 struct resource *res = &page_map->res; 225 struct resource *res = &page_map->res;
238 resource_size_t align_start, align_size; 226 resource_size_t align_start, align_size;
239 struct dev_pagemap *pgmap = &page_map->pgmap; 227 struct dev_pagemap *pgmap = &page_map->pgmap;
228 unsigned long pfn;
229
230 for_each_device_pfn(pfn, page_map)
231 put_page(pfn_to_page(pfn));
240 232
241 if (percpu_ref_tryget_live(pgmap->ref)) { 233 if (percpu_ref_tryget_live(pgmap->ref)) {
242 dev_WARN(dev, "%s: page mapping is still live!\n", __func__); 234 dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
@@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
277 * 269 *
278 * Notes: 270 * Notes:
279 * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time 271 * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
280 * (or devm release event). 272 * (or devm release event). The expected order of events is that @ref has
273 * been through percpu_ref_kill() before devm_memremap_pages_release(). The
274 * wait for the completion of all references being dropped and
275 * percpu_ref_exit() must occur after devm_memremap_pages_release().
281 * 276 *
282 * 2/ @res is expected to be a host memory range that could feasibly be 277 * 2/ @res is expected to be a host memory range that could feasibly be
283 * treated as a "System RAM" range, i.e. not a device mmio range, but 278 * treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
379 */ 374 */
380 list_del(&page->lru); 375 list_del(&page->lru);
381 page->pgmap = pgmap; 376 page->pgmap = pgmap;
377 percpu_ref_get(ref);
382 } 378 }
383 devres_add(dev, page_map); 379 devres_add(dev, page_map);
384 return __va(res->start); 380 return __va(res->start);
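
For the memremap.c hunks above, the key idea is that every ZONE_DEVICE page now pins the pagemap at devm_memremap_pages() time, and the release path drops one reference per pfn before tearing the mapping down. Here is a plain-C sketch of that bookkeeping, with a simple counter standing in for the percpu_ref; struct pagemap and its fields are illustrative only, not the kernel's types.

#include <stdio.h>

/*
 * Sketch only: one reference is taken per mapped page at setup time and
 * one dropped per page at release time, on top of the device's own
 * reference, so the mapping cannot go away while any page is in use.
 */
struct pagemap {
	long refs;
	unsigned long nr_pages;
};

static void pagemap_setup(struct pagemap *pgmap, unsigned long nr_pages)
{
	pgmap->nr_pages = nr_pages;
	pgmap->refs = 1;			/* the device's own reference */
	for (unsigned long pfn = 0; pfn < nr_pages; pfn++)
		pgmap->refs++;			/* "percpu_ref_get()" per page */
}

static void pagemap_release(struct pagemap *pgmap)
{
	for (unsigned long pfn = 0; pfn < pgmap->nr_pages; pfn++)
		pgmap->refs--;			/* "put_page()" per page */
	pgmap->refs--;				/* drop the device reference */
}

int main(void)
{
	struct pagemap pgmap;

	pagemap_setup(&pgmap, 4);
	printf("after setup: %ld refs held\n", pgmap.refs);
	pagemap_release(&pgmap);
	printf("after release: %ld refs held (0 means the mapping is dead)\n",
	       pgmap.refs);
	return 0;
}
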
diff --git a/kernel/module.c b/kernel/module.c
index 7eba6dea4f41..4a3665f8f837 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -49,6 +49,9 @@
49#include <linux/rculist.h> 49#include <linux/rculist.h>
50#include <linux/uaccess.h> 50#include <linux/uaccess.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#ifdef CONFIG_STRICT_MODULE_RWX
53#include <asm/set_memory.h>
54#endif
52#include <asm/mmu_context.h> 55#include <asm/mmu_context.h>
53#include <linux/license.h> 56#include <linux/license.h>
54#include <asm/sections.h> 57#include <asm/sections.h>
@@ -665,16 +668,7 @@ static void percpu_modcopy(struct module *mod,
665 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); 668 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
666} 669}
667 670
668/** 671bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
669 * is_module_percpu_address - test whether address is from module static percpu
670 * @addr: address to test
671 *
672 * Test whether @addr belongs to module static percpu area.
673 *
674 * RETURNS:
675 * %true if @addr is from module static percpu area
676 */
677bool is_module_percpu_address(unsigned long addr)
678{ 672{
679 struct module *mod; 673 struct module *mod;
680 unsigned int cpu; 674 unsigned int cpu;
@@ -688,9 +682,15 @@ bool is_module_percpu_address(unsigned long addr)
688 continue; 682 continue;
689 for_each_possible_cpu(cpu) { 683 for_each_possible_cpu(cpu) {
690 void *start = per_cpu_ptr(mod->percpu, cpu); 684 void *start = per_cpu_ptr(mod->percpu, cpu);
691 685 void *va = (void *)addr;
692 if ((void *)addr >= start && 686
693 (void *)addr < start + mod->percpu_size) { 687 if (va >= start && va < start + mod->percpu_size) {
688 if (can_addr) {
689 *can_addr = (unsigned long) (va - start);
690 *can_addr += (unsigned long)
691 per_cpu_ptr(mod->percpu,
692 get_boot_cpu_id());
693 }
694 preempt_enable(); 694 preempt_enable();
695 return true; 695 return true;
696 } 696 }
@@ -701,6 +701,20 @@ bool is_module_percpu_address(unsigned long addr)
701 return false; 701 return false;
702} 702}
703 703
704/**
705 * is_module_percpu_address - test whether address is from module static percpu
706 * @addr: address to test
707 *
708 * Test whether @addr belongs to module static percpu area.
709 *
710 * RETURNS:
711 * %true if @addr is from module static percpu area
712 */
713bool is_module_percpu_address(unsigned long addr)
714{
715 return __is_module_percpu_address(addr, NULL);
716}
717
704#else /* ... !CONFIG_SMP */ 718#else /* ... !CONFIG_SMP */
705 719
706static inline void __percpu *mod_percpu(struct module *mod) 720static inline void __percpu *mod_percpu(struct module *mod)
@@ -732,6 +746,11 @@ bool is_module_percpu_address(unsigned long addr)
732 return false; 746 return false;
733} 747}
734 748
749bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
750{
751 return false;
752}
753
735#endif /* CONFIG_SMP */ 754#endif /* CONFIG_SMP */
736 755
737#define MODINFO_ATTR(field) \ 756#define MODINFO_ATTR(field) \
@@ -947,6 +966,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
947 return -EFAULT; 966 return -EFAULT;
948 name[MODULE_NAME_LEN-1] = '\0'; 967 name[MODULE_NAME_LEN-1] = '\0';
949 968
969 audit_log_kern_module(name);
970
950 if (mutex_lock_interruptible(&module_mutex) != 0) 971 if (mutex_lock_interruptible(&module_mutex) != 0)
951 return -EINTR; 972 return -EINTR;
952 973
@@ -2846,7 +2867,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
2846 2867
2847 /* Suck in entire file: we'll want most of it. */ 2868 /* Suck in entire file: we'll want most of it. */
2848 info->hdr = __vmalloc(info->len, 2869 info->hdr = __vmalloc(info->len,
2849 GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL); 2870 GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL);
2850 if (!info->hdr) 2871 if (!info->hdr)
2851 return -ENOMEM; 2872 return -ENOMEM;
2852 2873
@@ -4017,7 +4038,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
4017 4038
4018 /* Don't lock: we're in enough trouble already. */ 4039 /* Don't lock: we're in enough trouble already. */
4019 preempt_disable(); 4040 preempt_disable();
4020 if ((colon = strchr(name, ':')) != NULL) { 4041 if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
4021 if ((mod = find_module_all(name, colon - name, false)) != NULL) 4042 if ((mod = find_module_all(name, colon - name, false)) != NULL)
4022 ret = mod_find_symname(mod, colon+1); 4043 ret = mod_find_symname(mod, colon+1);
4023 } else { 4044 } else {
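
The new __is_module_percpu_address() above reports, via *can_addr, the address rebased onto the boot CPU's copy of the module's static percpu area, which gives lockdep a canonical key independent of which CPU's copy was passed in. A userspace model of that calculation follows; NR_CPUS, AREA_SIZE, BOOT_CPU and the flat buffers are assumptions made purely for the illustration.

#include <stdio.h>
#include <stdint.h>

/*
 * Userspace model, not kernel code: if @addr falls inside some CPU's
 * copy of the per-CPU area, compute the matching address inside the
 * boot CPU's copy (the "canonical" address).
 */
#define NR_CPUS   4
#define AREA_SIZE 64
#define BOOT_CPU  0

static char percpu_area[NR_CPUS][AREA_SIZE];

static int percpu_to_canonical(const void *addr, uintptr_t *can_addr)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		const char *start = percpu_area[cpu];

		if ((const char *)addr >= start &&
		    (const char *)addr < start + AREA_SIZE) {
			/* offset within this CPU's copy ... */
			uintptr_t off = (const char *)addr - start;
			/* ... rebased onto the boot CPU's copy */
			*can_addr = (uintptr_t)(percpu_area[BOOT_CPU] + off);
			return 1;
		}
	}
	return 0;
}

int main(void)
{
	const void *addr = &percpu_area[2][17];	/* some CPU 2 variable */
	uintptr_t can;

	if (percpu_to_canonical(addr, &can))
		printf("addr %p -> canonical %p\n", addr, (void *)can);
	return 0;
}
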
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 782102e59eed..f6c5d330059a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/syscalls.h> 27#include <linux/syscalls.h>
28#include <linux/cgroup.h> 28#include <linux/cgroup.h>
29#include <linux/perf_event.h>
29 30
30static struct kmem_cache *nsproxy_cachep; 31static struct kmem_cache *nsproxy_cachep;
31 32
@@ -262,6 +263,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
262 goto out; 263 goto out;
263 } 264 }
264 switch_task_namespaces(tsk, new_nsproxy); 265 switch_task_namespaces(tsk, new_nsproxy);
266
267 perf_event_namespaces(tsk);
265out: 268out:
266 fput(file); 269 fput(file);
267 return err; 270 return err;
diff --git a/kernel/padata.c b/kernel/padata.c
index 3202aa17492c..ac8f1e524836 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -154,8 +154,6 @@ EXPORT_SYMBOL(padata_do_parallel);
154 * A pointer to the control struct of the next object that needs 154 * A pointer to the control struct of the next object that needs
155 * serialization, if present in one of the percpu reorder queues. 155 * serialization, if present in one of the percpu reorder queues.
156 * 156 *
157 * NULL, if all percpu reorder queues are empty.
158 *
159 * -EINPROGRESS, if the next object that needs serialization will 157 * -EINPROGRESS, if the next object that needs serialization will
160 * be parallel processed by another cpu and is not yet present in 158 * be parallel processed by another cpu and is not yet present in
161 * the cpu's reorder queue. 159 * the cpu's reorder queue.
@@ -182,8 +180,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
182 cpu = padata_index_to_cpu(pd, next_index); 180 cpu = padata_index_to_cpu(pd, next_index);
183 next_queue = per_cpu_ptr(pd->pqueue, cpu); 181 next_queue = per_cpu_ptr(pd->pqueue, cpu);
184 182
185 padata = NULL;
186
187 reorder = &next_queue->reorder; 183 reorder = &next_queue->reorder;
188 184
189 spin_lock(&reorder->lock); 185 spin_lock(&reorder->lock);
@@ -235,12 +231,11 @@ static void padata_reorder(struct parallel_data *pd)
235 padata = padata_get_next(pd); 231 padata = padata_get_next(pd);
236 232
237 /* 233 /*
238 * All reorder queues are empty, or the next object that needs 234 * If the next object that needs serialization is parallel
239 * serialization is parallel processed by another cpu and is 235 * processed by another cpu and is still on its way to the
240 * still on it's way to the cpu's reorder queue, nothing to 236 * cpu's reorder queue, nothing to do for now.
241 * do for now.
242 */ 237 */
243 if (!padata || PTR_ERR(padata) == -EINPROGRESS) 238 if (PTR_ERR(padata) == -EINPROGRESS)
244 break; 239 break;
245 240
246 /* 241 /*
@@ -354,7 +349,7 @@ static int padata_setup_cpumasks(struct parallel_data *pd,
354 349
355 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); 350 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
356 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { 351 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
357 free_cpumask_var(pd->cpumask.cbcpu); 352 free_cpumask_var(pd->cpumask.pcpu);
358 return -ENOMEM; 353 return -ENOMEM;
359 } 354 }
360 355
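
After the padata change above, padata_get_next() never returns NULL; callers only ever see a valid object or an error encoded in the pointer, such as -EINPROGRESS, so the reorder loop tests PTR_ERR() directly. Below is a simplified userspace rendition of that convention; the helpers mimic, but are not, the kernel's ERR_PTR()/PTR_ERR()/IS_ERR() macros, and get_next() is a made-up stand-in.

#include <stdio.h>
#include <errno.h>

/*
 * Sketch of the pointer-encoded-error convention: error values live in
 * the top MAX_ERRNO addresses, so a pointer is either a real object or
 * a recognizable error code, never NULL.
 */
#define MAX_ERRNO 4095

static void *ERR_PTR(long error)
{
	return (void *)error;
}

static long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int fake_item = 42;

/* Stand-in for padata_get_next(): sometimes the object is not ready yet. */
static void *get_next(int ready)
{
	return ready ? (void *)&fake_item : ERR_PTR(-EINPROGRESS);
}

int main(void)
{
	for (int ready = 0; ready <= 1; ready++) {
		void *obj = get_next(ready);

		if (IS_ERR(obj) && PTR_ERR(obj) == -EINPROGRESS) {
			printf("not ready yet, try again later\n");
			continue;
		}
		printf("got object: %d\n", *(int *)obj);
	}
	return 0;
}
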
diff --git a/kernel/params.c b/kernel/params.c
index a6d6149c0fe6..60b2d8101355 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -160,58 +160,6 @@ static int parse_one(char *param,
160 return -ENOENT; 160 return -ENOENT;
161} 161}
162 162
163/* You can use " around spaces, but can't escape ". */
164/* Hyphens and underscores equivalent in parameter names. */
165static char *next_arg(char *args, char **param, char **val)
166{
167 unsigned int i, equals = 0;
168 int in_quote = 0, quoted = 0;
169 char *next;
170
171 if (*args == '"') {
172 args++;
173 in_quote = 1;
174 quoted = 1;
175 }
176
177 for (i = 0; args[i]; i++) {
178 if (isspace(args[i]) && !in_quote)
179 break;
180 if (equals == 0) {
181 if (args[i] == '=')
182 equals = i;
183 }
184 if (args[i] == '"')
185 in_quote = !in_quote;
186 }
187
188 *param = args;
189 if (!equals)
190 *val = NULL;
191 else {
192 args[equals] = '\0';
193 *val = args + equals + 1;
194
195 /* Don't include quotes in value. */
196 if (**val == '"') {
197 (*val)++;
198 if (args[i-1] == '"')
199 args[i-1] = '\0';
200 }
201 }
202 if (quoted && args[i-1] == '"')
203 args[i-1] = '\0';
204
205 if (args[i]) {
206 args[i] = '\0';
207 next = args + i + 1;
208 } else
209 next = args + i;
210
211 /* Chew up trailing spaces. */
212 return skip_spaces(next);
213}
214
215/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 163/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
216char *parse_args(const char *doing, 164char *parse_args(const char *doing,
217 char *args, 165 char *args,
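
The next_arg() helper deleted from kernel/params.c above is shown in full by the hunk, so a standalone copy of the same parsing logic can serve as a reference for how quoted values and '=' splitting behave. This is a userspace transcription, with isspace() and an open-coded loop replacing the kernel's helpers and skip_spaces().

#include <ctype.h>
#include <stdio.h>

/*
 * Standalone copy of the removed next_arg() logic: spaces split
 * arguments, double quotes protect spaces, '=' splits param from value.
 */
static char *next_arg(char *args, char **param, char **val)
{
	unsigned int i, equals = 0;
	int in_quote = 0, quoted = 0;

	if (*args == '"') {
		args++;
		in_quote = 1;
		quoted = 1;
	}

	for (i = 0; args[i]; i++) {
		if (isspace((unsigned char)args[i]) && !in_quote)
			break;
		if (equals == 0 && args[i] == '=')
			equals = i;
		if (args[i] == '"')
			in_quote = !in_quote;
	}

	*param = args;
	if (!equals) {
		*val = NULL;
	} else {
		args[equals] = '\0';
		*val = args + equals + 1;
		/* Don't include quotes in the value. */
		if (**val == '"') {
			(*val)++;
			if (args[i - 1] == '"')
				args[i - 1] = '\0';
		}
	}
	if (quoted && args[i - 1] == '"')
		args[i - 1] = '\0';

	if (args[i]) {
		args[i] = '\0';
		args += i + 1;
	} else {
		args += i;
	}

	while (isspace((unsigned char)*args))	/* chew up trailing spaces */
		args++;
	return args;
}

int main(void)
{
	char cmdline[] = "foo=bar \"baz=one two\" flag";
	char *args = cmdline, *param, *val;

	while (*args) {
		args = next_arg(args, &param, &val);
		printf("param=%s val=%s\n", param, val ? val : "(none)");
	}
	return 0;
}
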
diff --git a/kernel/pid.c b/kernel/pid.c
index 0143ac0ddceb..fd1cde1e4576 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -321,8 +321,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
321 } 321 }
322 322
323 if (unlikely(is_child_reaper(pid))) { 323 if (unlikely(is_child_reaper(pid))) {
324 if (pid_ns_prepare_proc(ns)) 324 if (pid_ns_prepare_proc(ns)) {
325 disable_pid_allocation(ns);
325 goto out_free; 326 goto out_free;
327 }
326 } 328 }
327 329
328 get_pid_ns(ns); 330 get_pid_ns(ns);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index de461aa0bf9a..74a5a7255b4d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -277,7 +277,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
277 * if reparented. 277 * if reparented.
278 */ 278 */
279 for (;;) { 279 for (;;) {
280 set_current_state(TASK_UNINTERRUPTIBLE); 280 set_current_state(TASK_INTERRUPTIBLE);
281 if (pid_ns->nr_hashed == init_pids) 281 if (pid_ns->nr_hashed == init_pids)
282 break; 282 break;
283 schedule(); 283 schedule();
@@ -374,6 +374,29 @@ static struct ns_common *pidns_get(struct task_struct *task)
374 return ns ? &ns->ns : NULL; 374 return ns ? &ns->ns : NULL;
375} 375}
376 376
377static struct ns_common *pidns_for_children_get(struct task_struct *task)
378{
379 struct pid_namespace *ns = NULL;
380
381 task_lock(task);
382 if (task->nsproxy) {
383 ns = task->nsproxy->pid_ns_for_children;
384 get_pid_ns(ns);
385 }
386 task_unlock(task);
387
388 if (ns) {
389 read_lock(&tasklist_lock);
390 if (!ns->child_reaper) {
391 put_pid_ns(ns);
392 ns = NULL;
393 }
394 read_unlock(&tasklist_lock);
395 }
396
397 return ns ? &ns->ns : NULL;
398}
399
377static void pidns_put(struct ns_common *ns) 400static void pidns_put(struct ns_common *ns)
378{ 401{
379 put_pid_ns(to_pid_ns(ns)); 402 put_pid_ns(to_pid_ns(ns));
@@ -443,6 +466,17 @@ const struct proc_ns_operations pidns_operations = {
443 .get_parent = pidns_get_parent, 466 .get_parent = pidns_get_parent,
444}; 467};
445 468
469const struct proc_ns_operations pidns_for_children_operations = {
470 .name = "pid_for_children",
471 .real_ns_name = "pid",
472 .type = CLONE_NEWPID,
473 .get = pidns_for_children_get,
474 .put = pidns_put,
475 .install = pidns_install,
476 .owner = pidns_owner,
477 .get_parent = pidns_get_parent,
478};
479
446static __init int pid_namespaces_init(void) 480static __init int pid_namespaces_init(void)
447{ 481{
448 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 482 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index c7209f060eeb..78672d324a6e 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -132,7 +132,7 @@ int freeze_processes(void)
132 if (!pm_freezing) 132 if (!pm_freezing)
133 atomic_inc(&system_freezing_cnt); 133 atomic_inc(&system_freezing_cnt);
134 134
135 pm_wakeup_clear(); 135 pm_wakeup_clear(true);
136 pr_info("Freezing user space processes ... "); 136 pr_info("Freezing user space processes ... ");
137 pm_freezing = true; 137 pm_freezing = true;
138 error = try_to_freeze_tasks(true); 138 error = try_to_freeze_tasks(true);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d79a38de425a..fa46606f3356 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -36,6 +36,9 @@
36#include <asm/pgtable.h> 36#include <asm/pgtable.h>
37#include <asm/tlbflush.h> 37#include <asm/tlbflush.h>
38#include <asm/io.h> 38#include <asm/io.h>
39#ifdef CONFIG_STRICT_KERNEL_RWX
40#include <asm/set_memory.h>
41#endif
39 42
40#include "power.h" 43#include "power.h"
41 44
@@ -1422,7 +1425,7 @@ static unsigned int nr_meta_pages;
1422 * Numbers of normal and highmem page frames allocated for hibernation image 1425 * Numbers of normal and highmem page frames allocated for hibernation image
1423 * before suspending devices. 1426 * before suspending devices.
1424 */ 1427 */
1425unsigned int alloc_normal, alloc_highmem; 1428static unsigned int alloc_normal, alloc_highmem;
1426/* 1429/*
1427 * Memory bitmap used for marking saveable pages (during hibernation) or 1430 * Memory bitmap used for marking saveable pages (during hibernation) or
1428 * hibernation image pages (during restore) 1431 * hibernation image pages (during restore)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 15e6baef5c73..c0248c74d6d4 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -72,6 +72,8 @@ static void freeze_begin(void)
72 72
73static void freeze_enter(void) 73static void freeze_enter(void)
74{ 74{
75 trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true);
76
75 spin_lock_irq(&suspend_freeze_lock); 77 spin_lock_irq(&suspend_freeze_lock);
76 if (pm_wakeup_pending()) 78 if (pm_wakeup_pending())
77 goto out; 79 goto out;
@@ -98,6 +100,27 @@ static void freeze_enter(void)
98 out: 100 out:
99 suspend_freeze_state = FREEZE_STATE_NONE; 101 suspend_freeze_state = FREEZE_STATE_NONE;
100 spin_unlock_irq(&suspend_freeze_lock); 102 spin_unlock_irq(&suspend_freeze_lock);
103
104 trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false);
105}
106
107static void s2idle_loop(void)
108{
109 do {
110 freeze_enter();
111
112 if (freeze_ops && freeze_ops->wake)
113 freeze_ops->wake();
114
115 dpm_resume_noirq(PMSG_RESUME);
116 if (freeze_ops && freeze_ops->sync)
117 freeze_ops->sync();
118
119 if (pm_wakeup_pending())
120 break;
121
122 pm_wakeup_clear(false);
123 } while (!dpm_suspend_noirq(PMSG_SUSPEND));
101} 124}
102 125
103void freeze_wake(void) 126void freeze_wake(void)
@@ -371,10 +394,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
371 * all the devices are suspended. 394 * all the devices are suspended.
372 */ 395 */
373 if (state == PM_SUSPEND_FREEZE) { 396 if (state == PM_SUSPEND_FREEZE) {
374 trace_suspend_resume(TPS("machine_suspend"), state, true); 397 s2idle_loop();
375 freeze_enter(); 398 goto Platform_early_resume;
376 trace_suspend_resume(TPS("machine_suspend"), state, false);
377 goto Platform_wake;
378 } 399 }
379 400
380 error = disable_nonboot_cpus(); 401 error = disable_nonboot_cpus();
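
The new s2idle_loop() above keeps re-entering the freeze state until either a genuine wakeup source fires or the noirq suspend phase fails. The following is only a control-flow sketch in plain C with invented fake_*() stubs; it simulates two spurious wakeups followed by a real one and is not the kernel's code.

#include <stdio.h>
#include <stdbool.h>

/*
 * Control-flow sketch: go back to sleep after a wakeup unless a real
 * wakeup source is pending or re-suspending fails.
 */
static int wakeups;

static void fake_enter_idle(void)     { printf("enter idle\n"); wakeups++; }
static bool fake_wakeup_pending(void) { return wakeups >= 3; }
static int  fake_suspend_noirq(void)  { return 0; }	/* 0 == success */

static void s2idle_loop_model(void)
{
	do {
		fake_enter_idle();
		if (fake_wakeup_pending()) {
			printf("real wakeup, leaving the loop\n");
			break;
		}
		printf("spurious wakeup, re-entering idle\n");
	} while (!fake_suspend_noirq());
}

int main(void)
{
	s2idle_loop_model();
	return 0;
}
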
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index d5760c42f042..61d41ca41844 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -2,12 +2,13 @@
2 2
3#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/console.h> 4#include <linux/console.h>
5#include <linux/errno.h>
5#include <linux/string.h> 6#include <linux/string.h>
6 7
7#include "console_cmdline.h" 8#include "console_cmdline.h"
8#include "braille.h" 9#include "braille.h"
9 10
10char *_braille_console_setup(char **str, char **brl_options) 11int _braille_console_setup(char **str, char **brl_options)
11{ 12{
12 if (!strncmp(*str, "brl,", 4)) { 13 if (!strncmp(*str, "brl,", 4)) {
13 *brl_options = ""; 14 *brl_options = "";
@@ -15,14 +16,14 @@ char *_braille_console_setup(char **str, char **brl_options)
15 } else if (!strncmp(*str, "brl=", 4)) { 16 } else if (!strncmp(*str, "brl=", 4)) {
16 *brl_options = *str + 4; 17 *brl_options = *str + 4;
17 *str = strchr(*brl_options, ','); 18 *str = strchr(*brl_options, ',');
18 if (!*str) 19 if (!*str) {
19 pr_err("need port name after brl=\n"); 20 pr_err("need port name after brl=\n");
20 else 21 return -EINVAL;
21 *((*str)++) = 0; 22 }
22 } else 23 *((*str)++) = 0;
23 return NULL; 24 }
24 25
25 return *str; 26 return 0;
26} 27}
27 28
28int 29int
diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h
index 769d771145c8..749a6756843a 100644
--- a/kernel/printk/braille.h
+++ b/kernel/printk/braille.h
@@ -9,7 +9,14 @@ braille_set_options(struct console_cmdline *c, char *brl_options)
9 c->brl_options = brl_options; 9 c->brl_options = brl_options;
10} 10}
11 11
12char * 12/*
13 * Setup console according to braille options.
14 * Return -EINVAL on syntax error, 0 on success (or no braille option was
15 * actually given).
16 * Modifies str to point to the serial options
17 * Sets brl_options to the parsed braille options.
18 */
19int
13_braille_console_setup(char **str, char **brl_options); 20_braille_console_setup(char **str, char **brl_options);
14 21
15int 22int
@@ -25,10 +32,10 @@ braille_set_options(struct console_cmdline *c, char *brl_options)
25{ 32{
26} 33}
27 34
28static inline char * 35static inline int
29_braille_console_setup(char **str, char **brl_options) 36_braille_console_setup(char **str, char **brl_options)
30{ 37{
31 return NULL; 38 return 0;
32} 39}
33 40
34static inline int 41static inline int
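
The braille setup change in the two files above replaces a pointer-or-NULL return, which conflated "no braille option" with "syntax error", with the 0/-EINVAL convention documented in the new header comment. A minimal userspace illustration of that convention follows; parse_brl_option() is a made-up stand-in, not the kernel function.

#include <stdio.h>
#include <string.h>
#include <errno.h>

/*
 * Sketch only: return 0 on success (including "no braille option") and
 * -EINVAL on a malformed option, so the caller can tell the two apart.
 */
static int parse_brl_option(char **str, char **brl_options)
{
	if (strncmp(*str, "brl=", 4) != 0) {
		*brl_options = NULL;
		return 0;			/* nothing to do, still success */
	}
	*brl_options = *str + 4;
	*str = strchr(*brl_options, ',');
	if (!*str)
		return -EINVAL;			/* missing port name */
	*((*str)++) = '\0';
	return 0;
}

int main(void)
{
	char good[] = "brl=abcd,ttyS0", bad[] = "brl=abcd";
	char *s, *brl;

	s = good;
	printf("good: ret=%d brl=%s rest=%s\n",
	       parse_brl_option(&s, &brl), brl, s);
	s = bad;
	printf("bad:  ret=%d\n", parse_brl_option(&s, &brl));
	return 0;
}
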
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 2984fb0f0257..a1aecf44ab07 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -32,7 +32,7 @@
32#include <linux/bootmem.h> 32#include <linux/bootmem.h>
33#include <linux/memblock.h> 33#include <linux/memblock.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/crash_core.h>
36#include <linux/kdb.h> 36#include <linux/kdb.h>
37#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
38#include <linux/kmsg_dump.h> 38#include <linux/kmsg_dump.h>
@@ -269,8 +269,8 @@ static struct console *exclusive_console;
269#define MAX_CMDLINECONSOLES 8 269#define MAX_CMDLINECONSOLES 8
270 270
271static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; 271static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
272static int console_cmdline_cnt;
272 273
273static int selected_console = -1;
274static int preferred_console = -1; 274static int preferred_console = -1;
275int console_set_on_cmdline; 275int console_set_on_cmdline;
276EXPORT_SYMBOL(console_set_on_cmdline); 276EXPORT_SYMBOL(console_set_on_cmdline);
@@ -1002,7 +1002,7 @@ const struct file_operations kmsg_fops = {
1002 .release = devkmsg_release, 1002 .release = devkmsg_release,
1003}; 1003};
1004 1004
1005#ifdef CONFIG_KEXEC_CORE 1005#ifdef CONFIG_CRASH_CORE
1006/* 1006/*
1007 * This appends the listed symbols to /proc/vmcore 1007 * This appends the listed symbols to /proc/vmcore
1008 * 1008 *
@@ -1011,7 +1011,7 @@ const struct file_operations kmsg_fops = {
1011 * symbols are specifically used so that utilities can access and extract the 1011 * symbols are specifically used so that utilities can access and extract the
1012 * dmesg log from a vmcore file after a crash. 1012 * dmesg log from a vmcore file after a crash.
1013 */ 1013 */
1014void log_buf_kexec_setup(void) 1014void log_buf_vmcoreinfo_setup(void)
1015{ 1015{
1016 VMCOREINFO_SYMBOL(log_buf); 1016 VMCOREINFO_SYMBOL(log_buf);
1017 VMCOREINFO_SYMBOL(log_buf_len); 1017 VMCOREINFO_SYMBOL(log_buf_len);
@@ -1906,24 +1906,38 @@ static int __add_preferred_console(char *name, int idx, char *options,
1906 * See if this tty is not yet registered, and 1906 * See if this tty is not yet registered, and
1907 * if we have a slot free. 1907 * if we have a slot free.
1908 */ 1908 */
1909 for (i = 0, c = console_cmdline; 1909 for (i = 0, c = console_cmdline; i < console_cmdline_cnt; i++, c++) {
1910 i < MAX_CMDLINECONSOLES && c->name[0];
1911 i++, c++) {
1912 if (strcmp(c->name, name) == 0 && c->index == idx) { 1910 if (strcmp(c->name, name) == 0 && c->index == idx) {
1913 if (!brl_options) 1911 if (brl_options)
1914 selected_console = i; 1912 return 0;
1913
1914 /*
 1915 * Maintain an invariant that helps to determine whether
 1916 * the matching console is preferred; see
1917 * register_console():
1918 *
1919 * The last non-braille console is always
1920 * the preferred one.
1921 */
1922 if (i != console_cmdline_cnt - 1)
1923 swap(console_cmdline[i],
1924 console_cmdline[console_cmdline_cnt - 1]);
1925
1926 preferred_console = console_cmdline_cnt - 1;
1927
1915 return 0; 1928 return 0;
1916 } 1929 }
1917 } 1930 }
1918 if (i == MAX_CMDLINECONSOLES) 1931 if (i == MAX_CMDLINECONSOLES)
1919 return -E2BIG; 1932 return -E2BIG;
1920 if (!brl_options) 1933 if (!brl_options)
1921 selected_console = i; 1934 preferred_console = i;
1922 strlcpy(c->name, name, sizeof(c->name)); 1935 strlcpy(c->name, name, sizeof(c->name));
1923 c->options = options; 1936 c->options = options;
1924 braille_set_options(c, brl_options); 1937 braille_set_options(c, brl_options);
1925 1938
1926 c->index = idx; 1939 c->index = idx;
1940 console_cmdline_cnt++;
1927 return 0; 1941 return 0;
1928} 1942}
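
The swap above is easier to follow in isolation. Below is a minimal userspace sketch of the same invariant, not the kernel code: struct cmdline_entry and add_entry() are invented stand-ins for struct console_cmdline and __add_preferred_console(), and braille handling is omitted. Re-adding an existing entry moves it to the tail of the used part of the array, so the last entry is always the preferred one and a reverse scan (as register_console() now does) meets it first.

	#include <stdio.h>
	#include <string.h>

	#define MAX_ENTRIES 8

	struct cmdline_entry {			/* stand-in for struct console_cmdline */
		char name[16];
		int index;
	};

	static struct cmdline_entry entries[MAX_ENTRIES];
	static int entry_cnt;
	static int preferred = -1;

	/* Mimics __add_preferred_console(): a duplicate is swapped to the tail. */
	static int add_entry(const char *name, int index)
	{
		int i;

		for (i = 0; i < entry_cnt; i++) {
			if (!strcmp(entries[i].name, name) && entries[i].index == index) {
				if (i != entry_cnt - 1) {
					struct cmdline_entry tmp = entries[i];

					entries[i] = entries[entry_cnt - 1];
					entries[entry_cnt - 1] = tmp;
				}
				preferred = entry_cnt - 1;
				return 0;
			}
		}
		if (i == MAX_ENTRIES)
			return -1;
		strcpy(entries[i].name, name);
		entries[i].index = index;
		preferred = i;
		entry_cnt++;
		return 0;
	}

	int main(void)
	{
		int i;

		add_entry("pl011", 0);	/* e.g. added from SPCR */
		add_entry("ttyAMA", 0);	/* added from the command line */
		add_entry("pl011", 0);	/* re-added: swapped to the tail, becomes preferred */

		/* Reverse scan, as register_console() now does. */
		for (i = entry_cnt - 1; i >= 0; i--)
			printf("%s%d%s\n", entries[i].name, entries[i].index,
			       i == preferred ? " (preferred)" : "");
		return 0;
	}
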
1929/* 1943/*
@@ -2031,15 +2045,16 @@ void resume_console(void)
2031 * @cpu: unused 2045 * @cpu: unused
2032 * 2046 *
2033 * If printk() is called from a CPU that is not online yet, the messages 2047 * If printk() is called from a CPU that is not online yet, the messages
2034 * will be spooled but will not show up on the console. This function is 2048 * will be printed on the console only if there are CON_ANYTIME consoles.
2035 * called when a new CPU comes online (or fails to come up), and ensures 2049 * This function is called when a new CPU comes online (or fails to come
2036 * that any such output gets printed. 2050 * up) or goes offline.
2037 */ 2051 */
2038static int console_cpu_notify(unsigned int cpu) 2052static int console_cpu_notify(unsigned int cpu)
2039{ 2053{
2040 if (!cpuhp_tasks_frozen) { 2054 if (!cpuhp_tasks_frozen) {
2041 console_lock(); 2055 /* If trylock fails, someone else is doing the printing */
2042 console_unlock(); 2056 if (console_trylock())
2057 console_unlock();
2043 } 2058 }
2044 return 0; 2059 return 0;
2045} 2060}
@@ -2161,7 +2176,7 @@ void console_unlock(void)
2161 } 2176 }
2162 2177
2163 /* 2178 /*
2164 * Console drivers are called under logbuf_lock, so 2179 * Console drivers are called with interrupts disabled, so
2165 * @console_may_schedule should be cleared before; however, we may 2180 * @console_may_schedule should be cleared before; however, we may
2166 * end up dumping a lot of lines, for example, if called from 2181 * end up dumping a lot of lines, for example, if called from
2167 * console registration path, and should invoke cond_resched() 2182 * console registration path, and should invoke cond_resched()
@@ -2169,11 +2184,15 @@ void console_unlock(void)
2169 * scheduling stall on a slow console leading to RCU stall and 2184 * scheduling stall on a slow console leading to RCU stall and
2170 * softlockup warnings which exacerbate the issue with more 2185 * softlockup warnings which exacerbate the issue with more
2171 * messages practically incapacitating the system. 2186 * messages practically incapacitating the system.
2187 *
2188 * console_trylock() is not able to detect the preemptive
2189 * context reliably. Therefore the value must be stored before
 2190 * and cleared after the "again" goto label.
2172 */ 2191 */
2173 do_cond_resched = console_may_schedule; 2192 do_cond_resched = console_may_schedule;
2193again:
2174 console_may_schedule = 0; 2194 console_may_schedule = 0;
2175 2195
2176again:
2177 /* 2196 /*
2178 * We released the console_sem lock, so we need to recheck if 2197 * We released the console_sem lock, so we need to recheck if
2179 * cpu is online and (if not) is there at least one CON_ANYTIME 2198 * cpu is online and (if not) is there at least one CON_ANYTIME
@@ -2409,6 +2428,7 @@ void register_console(struct console *newcon)
2409 unsigned long flags; 2428 unsigned long flags;
2410 struct console *bcon = NULL; 2429 struct console *bcon = NULL;
2411 struct console_cmdline *c; 2430 struct console_cmdline *c;
2431 static bool has_preferred;
2412 2432
2413 if (console_drivers) 2433 if (console_drivers)
2414 for_each_console(bcon) 2434 for_each_console(bcon)
@@ -2435,15 +2455,15 @@ void register_console(struct console *newcon)
2435 if (console_drivers && console_drivers->flags & CON_BOOT) 2455 if (console_drivers && console_drivers->flags & CON_BOOT)
2436 bcon = console_drivers; 2456 bcon = console_drivers;
2437 2457
2438 if (preferred_console < 0 || bcon || !console_drivers) 2458 if (!has_preferred || bcon || !console_drivers)
2439 preferred_console = selected_console; 2459 has_preferred = preferred_console >= 0;
2440 2460
2441 /* 2461 /*
2442 * See if we want to use this console driver. If we 2462 * See if we want to use this console driver. If we
2443 * didn't select a console we take the first one 2463 * didn't select a console we take the first one
2444 * that registers here. 2464 * that registers here.
2445 */ 2465 */
2446 if (preferred_console < 0) { 2466 if (!has_preferred) {
2447 if (newcon->index < 0) 2467 if (newcon->index < 0)
2448 newcon->index = 0; 2468 newcon->index = 0;
2449 if (newcon->setup == NULL || 2469 if (newcon->setup == NULL ||
@@ -2451,18 +2471,29 @@ void register_console(struct console *newcon)
2451 newcon->flags |= CON_ENABLED; 2471 newcon->flags |= CON_ENABLED;
2452 if (newcon->device) { 2472 if (newcon->device) {
2453 newcon->flags |= CON_CONSDEV; 2473 newcon->flags |= CON_CONSDEV;
2454 preferred_console = 0; 2474 has_preferred = true;
2455 } 2475 }
2456 } 2476 }
2457 } 2477 }
2458 2478
2459 /* 2479 /*
2460 * See if this console matches one we selected on 2480 * See if this console matches one we selected on the command line.
2461 * the command line. 2481 *
2482 * There may be several entries in the console_cmdline array matching
 2483 * the same console, one with newcon->match(), another by
2484 * name/index:
2485 *
2486 * pl011,mmio,0x87e024000000,115200 -- added from SPCR
2487 * ttyAMA0 -- added from command line
2488 *
2489 * Traverse the console_cmdline array in reverse order to be
2490 * sure that if this console is preferred then it will be the first
2491 * matching entry. We use the invariant that is maintained in
2492 * __add_preferred_console().
2462 */ 2493 */
2463 for (i = 0, c = console_cmdline; 2494 for (i = console_cmdline_cnt - 1; i >= 0; i--) {
2464 i < MAX_CMDLINECONSOLES && c->name[0]; 2495 c = console_cmdline + i;
2465 i++, c++) { 2496
2466 if (!newcon->match || 2497 if (!newcon->match ||
2467 newcon->match(newcon, c->name, c->index, c->options) != 0) { 2498 newcon->match(newcon, c->name, c->index, c->options) != 0) {
2468 /* default matching */ 2499 /* default matching */
@@ -2484,9 +2515,9 @@ void register_console(struct console *newcon)
2484 } 2515 }
2485 2516
2486 newcon->flags |= CON_ENABLED; 2517 newcon->flags |= CON_ENABLED;
2487 if (i == selected_console) { 2518 if (i == preferred_console) {
2488 newcon->flags |= CON_CONSDEV; 2519 newcon->flags |= CON_CONSDEV;
2489 preferred_console = selected_console; 2520 has_preferred = true;
2490 } 2521 }
2491 break; 2522 break;
2492 } 2523 }
@@ -2611,6 +2642,30 @@ int unregister_console(struct console *console)
2611EXPORT_SYMBOL(unregister_console); 2642EXPORT_SYMBOL(unregister_console);
2612 2643
2613/* 2644/*
2645 * Initialize the console device. This is called *early*, so
2646 * we can't necessarily depend on lots of kernel help here.
2647 * Just do some early initializations, and do the complex setup
2648 * later.
2649 */
2650void __init console_init(void)
2651{
2652 initcall_t *call;
2653
2654 /* Setup the default TTY line discipline. */
2655 n_tty_init();
2656
2657 /*
2658 * set up the console device so that later boot sequences can
2659 * inform about problems etc..
2660 */
2661 call = __con_initcall_start;
2662 while (call < __con_initcall_end) {
2663 (*call)();
2664 call++;
2665 }
2666}
2667
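
The __con_initcall_start/__con_initcall_end walk in console_init() is just an iteration over a linker-provided array of function pointers. A hedged userspace model follows, with an ordinary array standing in for the linker section and made-up console names; the kernel's initcall_t actually returns int, which is simplified away here.

	#include <stdio.h>

	typedef void (*initcall_t)(void);	/* simplified: the kernel's initcall_t returns int */

	static void first_console_init(void)  { puts("first console ready"); }
	static void second_console_init(void) { puts("second console ready"); }

	/* Stand-in for the __con_initcall_start..__con_initcall_end linker section. */
	static initcall_t con_initcalls[] = { first_console_init, second_console_init };

	int main(void)
	{
		initcall_t *call = con_initcalls;
		initcall_t *end = con_initcalls +
				  sizeof(con_initcalls) / sizeof(con_initcalls[0]);

		while (call < end) {		/* same loop shape as console_init() */
			(*call)();
			call++;
		}
		return 0;
	}
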
2668/*
2614 * Some boot consoles access data that is in the init section and which will 2669 * Some boot consoles access data that is in the init section and which will
2615 * be discarded after the initcalls have been run. To make sure that no code 2670 * be discarded after the initcalls have been run. To make sure that no code
2616 * will access this data, unregister the boot consoles in a late initcall. 2671 * will access this data, unregister the boot consoles in a late initcall.
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 18dfc485225c..23803c7d5180 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -3,10 +3,13 @@
3KCOV_INSTRUMENT := n 3KCOV_INSTRUMENT := n
4 4
5obj-y += update.o sync.o 5obj-y += update.o sync.o
6obj-$(CONFIG_SRCU) += srcu.o 6obj-$(CONFIG_CLASSIC_SRCU) += srcu.o
7obj-$(CONFIG_TREE_SRCU) += srcutree.o
8obj-$(CONFIG_TINY_SRCU) += srcutiny.o
7obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 9obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
8obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o 10obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
9obj-$(CONFIG_TREE_RCU) += tree.o 11obj-$(CONFIG_TREE_RCU) += tree.o
10obj-$(CONFIG_PREEMPT_RCU) += tree.o 12obj-$(CONFIG_PREEMPT_RCU) += tree.o
11obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o 13obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
12obj-$(CONFIG_TINY_RCU) += tiny.o 14obj-$(CONFIG_TINY_RCU) += tiny.o
15obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 0d6ff3e471be..73e16ec4054b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -56,6 +56,83 @@
56#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ 56#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
57 DYNTICK_TASK_FLAG) 57 DYNTICK_TASK_FLAG)
58 58
59
60/*
61 * Grace-period counter management.
62 */
63
64#define RCU_SEQ_CTR_SHIFT 2
65#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
66
67/*
68 * Return the counter portion of a sequence number previously returned
69 * by rcu_seq_snap() or rcu_seq_current().
70 */
71static inline unsigned long rcu_seq_ctr(unsigned long s)
72{
73 return s >> RCU_SEQ_CTR_SHIFT;
74}
75
76/*
77 * Return the state portion of a sequence number previously returned
78 * by rcu_seq_snap() or rcu_seq_current().
79 */
80static inline int rcu_seq_state(unsigned long s)
81{
82 return s & RCU_SEQ_STATE_MASK;
83}
84
85/*
86 * Set the state portion of the pointed-to sequence number.
87 * The caller is responsible for preventing conflicting updates.
88 */
89static inline void rcu_seq_set_state(unsigned long *sp, int newstate)
90{
91 WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK);
92 WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate);
93}
94
95/* Adjust sequence number for start of update-side operation. */
96static inline void rcu_seq_start(unsigned long *sp)
97{
98 WRITE_ONCE(*sp, *sp + 1);
99 smp_mb(); /* Ensure update-side operation after counter increment. */
100 WARN_ON_ONCE(rcu_seq_state(*sp) != 1);
101}
102
103/* Adjust sequence number for end of update-side operation. */
104static inline void rcu_seq_end(unsigned long *sp)
105{
106 smp_mb(); /* Ensure update-side operation before counter increment. */
107 WARN_ON_ONCE(!rcu_seq_state(*sp));
108 WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1);
109}
110
111/* Take a snapshot of the update side's sequence number. */
112static inline unsigned long rcu_seq_snap(unsigned long *sp)
113{
114 unsigned long s;
115
116 s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
117 smp_mb(); /* Above access must not bleed into critical section. */
118 return s;
119}
120
 121/* Return the current value of the update side's sequence number, no ordering. */
122static inline unsigned long rcu_seq_current(unsigned long *sp)
123{
124 return READ_ONCE(*sp);
125}
126
127/*
128 * Given a snapshot from rcu_seq_snap(), determine whether or not a
129 * full update-side operation has occurred.
130 */
131static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
132{
133 return ULONG_CMP_GE(READ_ONCE(*sp), s);
134}
135
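
The layout is: the low RCU_SEQ_CTR_SHIFT bits of the sequence number hold the phase (0 = idle, 1 = grace period in progress), the remaining bits count grace periods. The plain userspace model below reproduces only the arithmetic; smp_mb(), WRITE_ONCE() and the WARN_ON_ONCE() checks are dropped, and wraparound is ignored, so this sketches the number manipulation rather than the ordering guarantees. It shows why a snapshot taken while a grace period is already running is not satisfied by that grace period's end: the running one may predate the caller's updates, so the next one is required as well.

	#include <stdio.h>

	#define SEQ_CTR_SHIFT	2
	#define SEQ_STATE_MASK	((1UL << SEQ_CTR_SHIFT) - 1)

	/* Same rounding as rcu_seq_snap(), minus the memory barrier. */
	static unsigned long seq_snap(unsigned long s)
	{
		return (s + 2 * SEQ_STATE_MASK + 1) & ~SEQ_STATE_MASK;
	}

	int main(void)
	{
		unsigned long gp_seq = 0;
		unsigned long idle_snap, busy_snap;

		idle_snap = seq_snap(gp_seq);		/* no GP in flight when snapped */

		gp_seq++;				/* rcu_seq_start(): state bits -> 1 */
		busy_snap = seq_snap(gp_seq);		/* snapped while a GP is in flight */
		gp_seq = (gp_seq | SEQ_STATE_MASK) + 1;	/* rcu_seq_end(): state bits -> 0 */

		/* rcu_seq_done(), ignoring wraparound for this sketch. */
		printf("gp_seq=%lu idle_snap=%lu done=%d\n",
		       gp_seq, idle_snap, gp_seq >= idle_snap);
		printf("gp_seq=%lu busy_snap=%lu done=%d\n",
		       gp_seq, busy_snap, gp_seq >= busy_snap);
		return 0;
	}
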
59/* 136/*
60 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally 137 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
61 * by call_rcu() and rcu callback execution, and are therefore not part of the 138 * by call_rcu() and rcu callback execution, and are therefore not part of the
@@ -109,12 +186,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
109 186
110 rcu_lock_acquire(&rcu_callback_map); 187 rcu_lock_acquire(&rcu_callback_map);
111 if (__is_kfree_rcu_offset(offset)) { 188 if (__is_kfree_rcu_offset(offset)) {
112 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); 189 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);)
113 kfree((void *)head - offset); 190 kfree((void *)head - offset);
114 rcu_lock_release(&rcu_callback_map); 191 rcu_lock_release(&rcu_callback_map);
115 return true; 192 return true;
116 } else { 193 } else {
117 RCU_TRACE(trace_rcu_invoke_callback(rn, head)); 194 RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
118 head->func(head); 195 head->func(head);
119 rcu_lock_release(&rcu_callback_map); 196 rcu_lock_release(&rcu_callback_map);
120 return false; 197 return false;
@@ -144,4 +221,76 @@ void rcu_test_sync_prims(void);
144 */ 221 */
145extern void resched_cpu(int cpu); 222extern void resched_cpu(int cpu);
146 223
224#if defined(SRCU) || !defined(TINY_RCU)
225
226#include <linux/rcu_node_tree.h>
227
228extern int rcu_num_lvls;
229extern int num_rcu_lvl[];
230extern int rcu_num_nodes;
231static bool rcu_fanout_exact;
232static int rcu_fanout_leaf;
233
234/*
235 * Compute the per-level fanout, either using the exact fanout specified
236 * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
237 */
238static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
239{
240 int i;
241
242 if (rcu_fanout_exact) {
243 levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
244 for (i = rcu_num_lvls - 2; i >= 0; i--)
245 levelspread[i] = RCU_FANOUT;
246 } else {
247 int ccur;
248 int cprv;
249
250 cprv = nr_cpu_ids;
251 for (i = rcu_num_lvls - 1; i >= 0; i--) {
252 ccur = levelcnt[i];
253 levelspread[i] = (cprv + ccur - 1) / ccur;
254 cprv = ccur;
255 }
256 }
257}
258
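
The balancing branch is easier to see with concrete numbers. The userspace sketch below reproduces the same division for an assumed three-level tree with levelcnt = {1, 4, 64} and 256 possible CPUs; these values are made up purely for illustration and are not taken from any particular configuration.

	#include <stdio.h>

	int main(void)
	{
		int levelcnt[] = { 1, 4, 64 };	/* nodes per level, root first (assumed) */
		int levelspread[3];
		int num_lvls = 3;
		int nr_cpu_ids = 256;		/* assumed number of possible CPUs */
		int i, ccur, cprv;

		/* Same computation as the !rcu_fanout_exact branch above. */
		cprv = nr_cpu_ids;
		for (i = num_lvls - 1; i >= 0; i--) {
			ccur = levelcnt[i];
			levelspread[i] = (cprv + ccur - 1) / ccur;
			cprv = ccur;
		}
		for (i = 0; i < num_lvls; i++)
			printf("level %d: %d nodes, spread %d\n",
			       i, levelcnt[i], levelspread[i]);
		return 0;
	}

With these numbers each leaf covers 4 CPUs, each interior node covers 16 leaves, and the root covers the 4 interior nodes.
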
259/*
260 * Do a full breadth-first scan of the rcu_node structures for the
261 * specified rcu_state structure.
262 */
263#define rcu_for_each_node_breadth_first(rsp, rnp) \
264 for ((rnp) = &(rsp)->node[0]; \
265 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
266
267/*
268 * Do a breadth-first scan of the non-leaf rcu_node structures for the
269 * specified rcu_state structure. Note that if there is a singleton
270 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
271 */
272#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
273 for ((rnp) = &(rsp)->node[0]; \
274 (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
275
276/*
277 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
278 * structure. Note that if there is a singleton rcu_node tree with but
279 * one rcu_node structure, this loop -will- visit the rcu_node structure.
280 * It is still a leaf node, even if it is also the root node.
281 */
282#define rcu_for_each_leaf_node(rsp, rnp) \
283 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
284 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
285
286/*
287 * Iterate over all possible CPUs in a leaf RCU node.
288 */
289#define for_each_leaf_node_possible_cpu(rnp, cpu) \
290 for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
291 cpu <= rnp->grphi; \
292 cpu = cpumask_next((cpu), cpu_possible_mask))
293
294#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
295
147#endif /* __LINUX_RCU_H */ 296#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
new file mode 100644
index 000000000000..2b62a38b080f
--- /dev/null
+++ b/kernel/rcu/rcu_segcblist.c
@@ -0,0 +1,505 @@
1/*
2 * RCU segmented callback lists, function definitions
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright IBM Corporation, 2017
19 *
20 * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/interrupt.h>
26
27#include "rcu_segcblist.h"
28
29/* Initialize simple callback list. */
30void rcu_cblist_init(struct rcu_cblist *rclp)
31{
32 rclp->head = NULL;
33 rclp->tail = &rclp->head;
34 rclp->len = 0;
35 rclp->len_lazy = 0;
36}
37
38/*
39 * Debug function to actually count the number of callbacks.
40 * If the number exceeds the limit specified, return -1.
41 */
42long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
43{
44 int cnt = 0;
45 struct rcu_head **rhpp = &rclp->head;
46
47 for (;;) {
48 if (!*rhpp)
49 return cnt;
50 if (++cnt > lim)
51 return -1;
52 rhpp = &(*rhpp)->next;
53 }
54}
55
56/*
57 * Dequeue the oldest rcu_head structure from the specified callback
58 * list. This function assumes that the callback is non-lazy, but
59 * the caller can later invoke rcu_cblist_dequeued_lazy() if it
60 * finds otherwise (and if it cares about laziness). This allows
61 * different users to have different ways of determining laziness.
62 */
63struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
64{
65 struct rcu_head *rhp;
66
67 rhp = rclp->head;
68 if (!rhp)
69 return NULL;
70 rclp->len--;
71 rclp->head = rhp->next;
72 if (!rclp->head)
73 rclp->tail = &rclp->head;
74 return rhp;
75}
76
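
The ->tail field is a pointer to the ->next pointer of the last element, or to ->head when the list is empty, which is why rcu_cblist_init() sets tail = &head and rcu_cblist_dequeue() resets it the same way when the list drains. A self-contained userspace sketch of the idiom follows; struct cb, struct cblist and cblist_enqueue() are invented analogs (the kernel's enqueue helpers live elsewhere), shown only to make the O(1), branch-free append visible.

	#include <stdio.h>
	#include <stddef.h>

	struct cb {
		struct cb *next;
		int id;
	};

	struct cblist {
		struct cb *head;
		struct cb **tail;	/* points at ->head or at the last ->next */
		long len;
	};

	static void cblist_init(struct cblist *l)
	{
		l->head = NULL;
		l->tail = &l->head;
		l->len = 0;
	}

	static void cblist_enqueue(struct cblist *l, struct cb *c)
	{
		c->next = NULL;
		*l->tail = c;		/* works for empty and non-empty lists alike */
		l->tail = &c->next;
		l->len++;
	}

	static struct cb *cblist_dequeue(struct cblist *l)
	{
		struct cb *c = l->head;

		if (!c)
			return NULL;
		l->len--;
		l->head = c->next;
		if (!l->head)
			l->tail = &l->head;	/* list went empty: reset tail */
		return c;
	}

	int main(void)
	{
		struct cblist list;
		struct cb a = { .id = 1 }, b = { .id = 2 };
		struct cb *c;

		cblist_init(&list);
		cblist_enqueue(&list, &a);
		cblist_enqueue(&list, &b);
		while ((c = cblist_dequeue(&list)) != NULL)
			printf("dequeued %d\n", c->id);
		return 0;
	}
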
77/*
78 * Initialize an rcu_segcblist structure.
79 */
80void rcu_segcblist_init(struct rcu_segcblist *rsclp)
81{
82 int i;
83
84 BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq));
85 BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq));
86 rsclp->head = NULL;
87 for (i = 0; i < RCU_CBLIST_NSEGS; i++)
88 rsclp->tails[i] = &rsclp->head;
89 rsclp->len = 0;
90 rsclp->len_lazy = 0;
91}
92
93/*
94 * Disable the specified rcu_segcblist structure, so that callbacks can
95 * no longer be posted to it. This structure must be empty.
96 */
97void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
98{
99 WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
100 WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
101 WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp));
102 rsclp->tails[RCU_NEXT_TAIL] = NULL;
103}
104
105/*
106 * Is the specified segment of the specified rcu_segcblist structure
107 * empty of callbacks?
108 */
109bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
110{
111 if (seg == RCU_DONE_TAIL)
112 return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
113 return rsclp->tails[seg - 1] == rsclp->tails[seg];
114}
115
116/*
117 * Does the specified rcu_segcblist structure contain callbacks that
118 * are ready to be invoked?
119 */
120bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp)
121{
122 return rcu_segcblist_is_enabled(rsclp) &&
123 &rsclp->head != rsclp->tails[RCU_DONE_TAIL];
124}
125
126/*
127 * Does the specified rcu_segcblist structure contain callbacks that
128 * are still pending, that is, not yet ready to be invoked?
129 */
130bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
131{
132 return rcu_segcblist_is_enabled(rsclp) &&
133 !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL);
134}
135
136/*
137 * Dequeue and return the first ready-to-invoke callback. If there
138 * are no ready-to-invoke callbacks, return NULL. Disables interrupts
139 * to avoid interference. Does not protect from interference from other
140 * CPUs or tasks.
141 */
142struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
143{
144 unsigned long flags;
145 int i;
146 struct rcu_head *rhp;
147
148 local_irq_save(flags);
149 if (!rcu_segcblist_ready_cbs(rsclp)) {
150 local_irq_restore(flags);
151 return NULL;
152 }
153 rhp = rsclp->head;
154 BUG_ON(!rhp);
155 rsclp->head = rhp->next;
156 for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
157 if (rsclp->tails[i] != &rhp->next)
158 break;
159 rsclp->tails[i] = &rsclp->head;
160 }
161 smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
162 WRITE_ONCE(rsclp->len, rsclp->len - 1);
163 local_irq_restore(flags);
164 return rhp;
165}
166
167/*
168 * Account for the fact that a previously dequeued callback turned out
169 * to be marked as lazy.
170 */
171void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
172{
173 unsigned long flags;
174
175 local_irq_save(flags);
176 rsclp->len_lazy--;
177 local_irq_restore(flags);
178}
179
180/*
181 * Return a pointer to the first callback in the specified rcu_segcblist
182 * structure. This is useful for diagnostics.
183 */
184struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp)
185{
186 if (rcu_segcblist_is_enabled(rsclp))
187 return rsclp->head;
188 return NULL;
189}
190
191/*
192 * Return a pointer to the first pending callback in the specified
193 * rcu_segcblist structure. This is useful just after posting a given
194 * callback -- if that callback is the first pending callback, then
195 * you cannot rely on someone else having already started up the required
196 * grace period.
197 */
198struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
199{
200 if (rcu_segcblist_is_enabled(rsclp))
201 return *rsclp->tails[RCU_DONE_TAIL];
202 return NULL;
203}
204
205/*
206 * Does the specified rcu_segcblist structure contain callbacks that
207 * have not yet been processed beyond having been posted, that is,
208 * does it contain callbacks in its last segment?
209 */
210bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
211{
212 return rcu_segcblist_is_enabled(rsclp) &&
213 !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
214}
215
216/*
217 * Enqueue the specified callback onto the specified rcu_segcblist
218 * structure, updating accounting as needed. Note that the ->len
219 * field may be accessed locklessly, hence the WRITE_ONCE().
220 * The ->len field is used by rcu_barrier() and friends to determine
221 * if it must post a callback on this structure, and it is OK
222 * for rcu_barrier() to sometimes post callbacks needlessly, but
223 * absolutely not OK for it to ever miss posting a callback.
224 */
225void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
226 struct rcu_head *rhp, bool lazy)
227{
228 WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */
229 if (lazy)
230 rsclp->len_lazy++;
231 smp_mb(); /* Ensure counts are updated before callback is enqueued. */
232 rhp->next = NULL;
233 *rsclp->tails[RCU_NEXT_TAIL] = rhp;
234 rsclp->tails[RCU_NEXT_TAIL] = &rhp->next;
235}
236
237/*
238 * Entrain the specified callback onto the specified rcu_segcblist at
239 * the end of the last non-empty segment. If the entire rcu_segcblist
240 * is empty, make no change, but return false.
241 *
242 * This is intended for use by rcu_barrier()-like primitives, -not-
243 * for normal grace-period use. IMPORTANT: The callback you enqueue
244 * will wait for all prior callbacks, NOT necessarily for a grace
245 * period. You have been warned.
246 */
247bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
248 struct rcu_head *rhp, bool lazy)
249{
250 int i;
251
252 if (rcu_segcblist_n_cbs(rsclp) == 0)
253 return false;
254 WRITE_ONCE(rsclp->len, rsclp->len + 1);
255 if (lazy)
256 rsclp->len_lazy++;
257 smp_mb(); /* Ensure counts are updated before callback is entrained. */
258 rhp->next = NULL;
259 for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
260 if (rsclp->tails[i] != rsclp->tails[i - 1])
261 break;
262 *rsclp->tails[i] = rhp;
263 for (; i <= RCU_NEXT_TAIL; i++)
264 rsclp->tails[i] = &rhp->next;
265 return true;
266}
267
268/*
269 * Extract only the counts from the specified rcu_segcblist structure,
270 * and place them in the specified rcu_cblist structure. This function
271 * supports both callback orphaning and invocation, hence the separation
272 * of counts and callbacks. (Callbacks ready for invocation must be
273 * orphaned and adopted separately from pending callbacks, but counts
274 * apply to all callbacks. Locking must be used to make sure that
275 * both orphaned-callbacks lists are consistent.)
276 */
277void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
278 struct rcu_cblist *rclp)
279{
280 rclp->len_lazy += rsclp->len_lazy;
281 rclp->len += rsclp->len;
282 rsclp->len_lazy = 0;
283 WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */
284}
285
286/*
287 * Extract only those callbacks ready to be invoked from the specified
288 * rcu_segcblist structure and place them in the specified rcu_cblist
289 * structure.
290 */
291void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
292 struct rcu_cblist *rclp)
293{
294 int i;
295
296 if (!rcu_segcblist_ready_cbs(rsclp))
297 return; /* Nothing to do. */
298 *rclp->tail = rsclp->head;
299 rsclp->head = *rsclp->tails[RCU_DONE_TAIL];
300 *rsclp->tails[RCU_DONE_TAIL] = NULL;
301 rclp->tail = rsclp->tails[RCU_DONE_TAIL];
302 for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
303 if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
304 rsclp->tails[i] = &rsclp->head;
305}
306
307/*
308 * Extract only those callbacks still pending (not yet ready to be
309 * invoked) from the specified rcu_segcblist structure and place them in
310 * the specified rcu_cblist structure. Note that this loses information
311 * about any callbacks that might have been partway done waiting for
312 * their grace period. Too bad! They will have to start over.
313 */
314void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
315 struct rcu_cblist *rclp)
316{
317 int i;
318
319 if (!rcu_segcblist_pend_cbs(rsclp))
320 return; /* Nothing to do. */
321 *rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
322 rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
323 *rsclp->tails[RCU_DONE_TAIL] = NULL;
324 for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
325 rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL];
326}
327
328/*
329 * Insert counts from the specified rcu_cblist structure in the
330 * specified rcu_segcblist structure.
331 */
332void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
333 struct rcu_cblist *rclp)
334{
335 rsclp->len_lazy += rclp->len_lazy;
336 /* ->len sampled locklessly. */
337 WRITE_ONCE(rsclp->len, rsclp->len + rclp->len);
338 rclp->len_lazy = 0;
339 rclp->len = 0;
340}
341
342/*
343 * Move callbacks from the specified rcu_cblist to the beginning of the
344 * done-callbacks segment of the specified rcu_segcblist.
345 */
346void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
347 struct rcu_cblist *rclp)
348{
349 int i;
350
351 if (!rclp->head)
352 return; /* No callbacks to move. */
353 *rclp->tail = rsclp->head;
354 rsclp->head = rclp->head;
355 for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
356 if (&rsclp->head == rsclp->tails[i])
357 rsclp->tails[i] = rclp->tail;
358 else
359 break;
360 rclp->head = NULL;
361 rclp->tail = &rclp->head;
362}
363
364/*
365 * Move callbacks from the specified rcu_cblist to the end of the
366 * new-callbacks segment of the specified rcu_segcblist.
367 */
368void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
369 struct rcu_cblist *rclp)
370{
371 if (!rclp->head)
372 return; /* Nothing to do. */
373 *rsclp->tails[RCU_NEXT_TAIL] = rclp->head;
374 rsclp->tails[RCU_NEXT_TAIL] = rclp->tail;
375 rclp->head = NULL;
376 rclp->tail = &rclp->head;
377}
378
379/*
380 * Advance the callbacks in the specified rcu_segcblist structure based
381 * on the current value passed in for the grace-period counter.
382 */
383void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
384{
385 int i, j;
386
387 WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
388 if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
389 return;
390
391 /*
392 * Find all callbacks whose ->gp_seq numbers indicate that they
393 * are ready to invoke, and put them into the RCU_DONE_TAIL segment.
394 */
395 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
396 if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
397 break;
398 rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i];
399 }
400
401 /* If no callbacks moved, nothing more need be done. */
402 if (i == RCU_WAIT_TAIL)
403 return;
404
405 /* Clean up tail pointers that might have been misordered above. */
406 for (j = RCU_WAIT_TAIL; j < i; j++)
407 rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL];
408
409 /*
410 * Callbacks moved, so clean up the misordered ->tails[] pointers
411 * that now point into the middle of the list of ready-to-invoke
412 * callbacks. The overall effect is to copy down the later pointers
413 * into the gap that was created by the now-ready segments.
414 */
415 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
416 if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
417 break; /* No more callbacks. */
418 rsclp->tails[j] = rsclp->tails[i];
419 rsclp->gp_seq[j] = rsclp->gp_seq[i];
420 }
421}
422
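
The pointer shuffling above amounts to sliding the RCU_DONE_TAIL boundary forward past every segment whose ->gp_seq has already elapsed. Below is a deliberately simplified userspace model that tracks only per-segment callback counts and grace-period numbers, with invented names; it ignores the tail-pointer bookkeeping, the merge of the RCU_NEXT_TAIL segment, and wraparound-safe comparisons, so it is a sketch of the semantics only.

	#include <stdio.h>

	enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, NSEGS };

	struct seg_model {
		long count[NSEGS];		/* callbacks in each segment */
		unsigned long gp_seq[NSEGS];	/* GP after which WAIT/NEXT_READY are done */
	};

	/* Move every segment whose grace period has elapsed into SEG_DONE. */
	static void model_advance(struct seg_model *m, unsigned long completed)
	{
		int i;

		for (i = SEG_WAIT; i <= SEG_NEXT_READY; i++) {
			if (completed < m->gp_seq[i])
				break;		/* this GP (and later ones) not yet done */
			m->count[SEG_DONE] += m->count[i];
			m->count[i] = 0;
		}
	}

	static void model_print(const struct seg_model *m, const char *when)
	{
		printf("%s: done=%ld wait=%ld next_ready=%ld next=%ld\n", when,
		       m->count[SEG_DONE], m->count[SEG_WAIT],
		       m->count[SEG_NEXT_READY], m->count[SEG_NEXT]);
	}

	int main(void)
	{
		struct seg_model m = {
			.count  = { 0, 3, 2, 5 },	/* 3 callbacks wait on GP 8, 2 on GP 12 */
			.gp_seq = { 0, 8, 12, 0 },
		};

		model_print(&m, "before");
		model_advance(&m, 8);	/* GP 8 has completed */
		model_print(&m, "after GP 8");
		model_advance(&m, 12);	/* GP 12 has completed */
		model_print(&m, "after GP 12");
		return 0;
	}
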
423/*
424 * "Accelerate" callbacks based on more-accurate grace-period information.
425 * The reason for this is that RCU does not synchronize the beginnings and
426 * ends of grace periods, and that callbacks are posted locally. This in
427 * turn means that the callbacks must be labelled conservatively early
428 * on, as getting exact information would degrade both performance and
429 * scalability. When more accurate grace-period information becomes
430 * available, previously posted callbacks can be "accelerated", marking
431 * them to complete at the end of the earlier grace period.
432 *
433 * This function operates on an rcu_segcblist structure, and also the
434 * grace-period sequence number seq at which new callbacks would become
435 * ready to invoke. Returns true if there are callbacks that won't be
436 * ready to invoke until seq, false otherwise.
437 */
438bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
439{
440 int i;
441
442 WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
443 if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
444 return false;
445
446 /*
447 * Find the segment preceding the oldest segment of callbacks
448 * whose ->gp_seq[] completion is at or after that passed in via
449 * "seq", skipping any empty segments. This oldest segment, along
450 * with any later segments, can be merged in with any newly arrived
451 * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq"
452 * as their ->gp_seq[] grace-period completion sequence number.
453 */
454 for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
455 if (rsclp->tails[i] != rsclp->tails[i - 1] &&
456 ULONG_CMP_LT(rsclp->gp_seq[i], seq))
457 break;
458
459 /*
460 * If all the segments contain callbacks that correspond to
461 * earlier grace-period sequence numbers than "seq", leave.
462 * Assuming that the rcu_segcblist structure has enough
463 * segments in its arrays, this can only happen if some of
464 * the non-done segments contain callbacks that really are
465 * ready to invoke. This situation will get straightened
466 * out by the next call to rcu_segcblist_advance().
467 *
468 * Also advance to the oldest segment of callbacks whose
469 * ->gp_seq[] completion is at or after that passed in via "seq",
470 * skipping any empty segments.
471 */
472 if (++i >= RCU_NEXT_TAIL)
473 return false;
474
475 /*
476 * Merge all later callbacks, including newly arrived callbacks,
477 * into the segment located by the for-loop above. Assign "seq"
478 * as the ->gp_seq[] value in order to correctly handle the case
479 * where there were no pending callbacks in the rcu_segcblist
480 * structure other than in the RCU_NEXT_TAIL segment.
481 */
482 for (; i < RCU_NEXT_TAIL; i++) {
483 rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL];
484 rsclp->gp_seq[i] = seq;
485 }
486 return true;
487}
488
489/*
490 * Scan the specified rcu_segcblist structure for callbacks that need
491 * a grace period later than the one specified by "seq". We don't look
492 * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't
493 * have a grace-period sequence number.
494 */
495bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
496 unsigned long seq)
497{
498 int i;
499
500 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
501 if (rsclp->tails[i - 1] != rsclp->tails[i] &&
502 ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
503 return true;
504 return false;
505}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
new file mode 100644
index 000000000000..6e36e36478cd
--- /dev/null
+++ b/kernel/rcu/rcu_segcblist.h
@@ -0,0 +1,164 @@
1/*
2 * RCU segmented callback lists, internal-to-rcu header file
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright IBM Corporation, 2017
19 *
20 * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#include <linux/rcu_segcblist.h>
24
25/*
26 * Account for the fact that a previously dequeued callback turned out
27 * to be marked as lazy.
28 */
29static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
30{
31 rclp->len_lazy--;
32}
33
34/*
35 * Interim function to return rcu_cblist head pointer. Longer term, the
36 * rcu_cblist will be used more pervasively, removing the need for this
37 * function.
38 */
39static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
40{
41 return rclp->head;
42}
43
44/*
 45 * Interim function to return rcu_cblist tail pointer. Longer term, the
46 * rcu_cblist will be used more pervasively, removing the need for this
47 * function.
48 */
49static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
50{
51 WARN_ON_ONCE(!rclp->head);
52 return rclp->tail;
53}
54
55void rcu_cblist_init(struct rcu_cblist *rclp);
56long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim);
57struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
58
59/*
60 * Is the specified rcu_segcblist structure empty?
61 *
62 * But careful! The fact that the ->head field is NULL does not
63 * necessarily imply that there are no callbacks associated with
64 * this structure. When callbacks are being invoked, they are
65 * removed as a group. If callback invocation must be preempted,
66 * the remaining callbacks will be added back to the list. Either
67 * way, the counts are updated later.
68 *
69 * So it is often the case that rcu_segcblist_n_cbs() should be used
70 * instead.
71 */
72static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp)
73{
74 return !rsclp->head;
75}
76
77/* Return number of callbacks in segmented callback list. */
78static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
79{
80 return READ_ONCE(rsclp->len);
81}
82
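
As the comment above rcu_segcblist_empty() warns, code that wants to know whether callbacks are still outstanding should look at the count, not at ->head. A purely illustrative helper (not part of this patch) that relies only on the accessors defined here:

	/* Illustration only: prefer the count over the transiently-NULL ->head. */
	static bool my_cbs_outstanding(struct rcu_segcblist *rsclp)
	{
		/*
		 * ->head can be NULL while a batch of callbacks is being
		 * invoked even though those callbacks are still accounted
		 * for, because the counts are only adjusted afterwards.
		 */
		return rcu_segcblist_n_cbs(rsclp) != 0;
	}
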
83/* Return number of lazy callbacks in segmented callback list. */
84static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)
85{
86 return rsclp->len_lazy;
87}
88
 89/* Return number of non-lazy callbacks in segmented callback list. */
90static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)
91{
92 return rsclp->len - rsclp->len_lazy;
93}
94
95/*
96 * Is the specified rcu_segcblist enabled, for example, not corresponding
97 * to an offline or callback-offloaded CPU?
98 */
99static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
100{
101 return !!rsclp->tails[RCU_NEXT_TAIL];
102}
103
104/*
105 * Are all segments following the specified segment of the specified
106 * rcu_segcblist structure empty of callbacks? (The specified
107 * segment might well contain callbacks.)
108 */
109static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
110{
111 return !*rsclp->tails[seg];
112}
113
114/*
115 * Interim function to return rcu_segcblist head pointer. Longer term, the
116 * rcu_segcblist will be used more pervasively, removing the need for this
117 * function.
118 */
119static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp)
120{
121 return rsclp->head;
122}
123
124/*
 125 * Interim function to return rcu_segcblist tail pointer. Longer term, the
126 * rcu_segcblist will be used more pervasively, removing the need for this
127 * function.
128 */
129static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
130{
131 WARN_ON_ONCE(rcu_segcblist_empty(rsclp));
132 return rsclp->tails[RCU_NEXT_TAIL];
133}
134
135void rcu_segcblist_init(struct rcu_segcblist *rsclp);
136void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
137bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg);
138bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
139bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
140struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp);
141void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp);
142struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
143struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
144bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp);
145void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
146 struct rcu_head *rhp, bool lazy);
147bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
148 struct rcu_head *rhp, bool lazy);
149void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
150 struct rcu_cblist *rclp);
151void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
152 struct rcu_cblist *rclp);
153void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
154 struct rcu_cblist *rclp);
155void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
156 struct rcu_cblist *rclp);
157void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
158 struct rcu_cblist *rclp);
159void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
160 struct rcu_cblist *rclp);
161void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
162bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
163bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
164 unsigned long seq);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index cccc417a8135..ae6e574d4cf5 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -559,19 +559,34 @@ static void srcu_torture_barrier(void)
559 559
560static void srcu_torture_stats(void) 560static void srcu_torture_stats(void)
561{ 561{
562 int cpu; 562 int __maybe_unused cpu;
563 int idx = srcu_ctlp->completed & 0x1; 563 int idx;
564 564
565 pr_alert("%s%s per-CPU(idx=%d):", 565#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU)
566#ifdef CONFIG_TREE_SRCU
567 idx = srcu_ctlp->srcu_idx & 0x1;
568#else /* #ifdef CONFIG_TREE_SRCU */
569 idx = srcu_ctlp->completed & 0x1;
570#endif /* #else #ifdef CONFIG_TREE_SRCU */
571 pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
566 torture_type, TORTURE_FLAG, idx); 572 torture_type, TORTURE_FLAG, idx);
567 for_each_possible_cpu(cpu) { 573 for_each_possible_cpu(cpu) {
568 unsigned long l0, l1; 574 unsigned long l0, l1;
569 unsigned long u0, u1; 575 unsigned long u0, u1;
570 long c0, c1; 576 long c0, c1;
571 struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); 577#ifdef CONFIG_TREE_SRCU
578 struct srcu_data *counts;
572 579
580 counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
581 u0 = counts->srcu_unlock_count[!idx];
582 u1 = counts->srcu_unlock_count[idx];
583#else /* #ifdef CONFIG_TREE_SRCU */
584 struct srcu_array *counts;
585
586 counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
573 u0 = counts->unlock_count[!idx]; 587 u0 = counts->unlock_count[!idx];
574 u1 = counts->unlock_count[idx]; 588 u1 = counts->unlock_count[idx];
589#endif /* #else #ifdef CONFIG_TREE_SRCU */
575 590
576 /* 591 /*
577 * Make sure that a lock is always counted if the corresponding 592 * Make sure that a lock is always counted if the corresponding
@@ -579,14 +594,26 @@ static void srcu_torture_stats(void)
579 */ 594 */
580 smp_rmb(); 595 smp_rmb();
581 596
597#ifdef CONFIG_TREE_SRCU
598 l0 = counts->srcu_lock_count[!idx];
599 l1 = counts->srcu_lock_count[idx];
600#else /* #ifdef CONFIG_TREE_SRCU */
582 l0 = counts->lock_count[!idx]; 601 l0 = counts->lock_count[!idx];
583 l1 = counts->lock_count[idx]; 602 l1 = counts->lock_count[idx];
603#endif /* #else #ifdef CONFIG_TREE_SRCU */
584 604
585 c0 = l0 - u0; 605 c0 = l0 - u0;
586 c1 = l1 - u1; 606 c1 = l1 - u1;
587 pr_cont(" %d(%ld,%ld)", cpu, c0, c1); 607 pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
588 } 608 }
589 pr_cont("\n"); 609 pr_cont("\n");
610#elif defined(CONFIG_TINY_SRCU)
611 idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
612 pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n",
613 torture_type, TORTURE_FLAG, idx,
614 READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
615 READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
616#endif
590} 617}
591 618
592static void srcu_torture_synchronize_expedited(void) 619static void srcu_torture_synchronize_expedited(void)
@@ -1333,12 +1360,14 @@ rcu_torture_stats_print(void)
1333 cur_ops->stats(); 1360 cur_ops->stats();
1334 if (rtcv_snap == rcu_torture_current_version && 1361 if (rtcv_snap == rcu_torture_current_version &&
1335 rcu_torture_current != NULL) { 1362 rcu_torture_current != NULL) {
1336 int __maybe_unused flags; 1363 int __maybe_unused flags = 0;
1337 unsigned long __maybe_unused gpnum; 1364 unsigned long __maybe_unused gpnum = 0;
1338 unsigned long __maybe_unused completed; 1365 unsigned long __maybe_unused completed = 0;
1339 1366
1340 rcutorture_get_gp_data(cur_ops->ttype, 1367 rcutorture_get_gp_data(cur_ops->ttype,
1341 &flags, &gpnum, &completed); 1368 &flags, &gpnum, &completed);
1369 srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
1370 &flags, &gpnum, &completed);
1342 wtp = READ_ONCE(writer_task); 1371 wtp = READ_ONCE(writer_task);
1343 pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", 1372 pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n",
1344 rcu_torture_writer_state_getname(), 1373 rcu_torture_writer_state_getname(),
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index ef3bcfb15b39..584d8a983883 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -22,7 +22,7 @@
22 * Lai Jiangshan <laijs@cn.fujitsu.com> 22 * Lai Jiangshan <laijs@cn.fujitsu.com>
23 * 23 *
24 * For detailed explanation of Read-Copy Update mechanism see - 24 * For detailed explanation of Read-Copy Update mechanism see -
25 * Documentation/RCU/ *.txt 25 * Documentation/RCU/ *.txt
26 * 26 *
27 */ 27 */
28 28
@@ -243,8 +243,14 @@ static bool srcu_readers_active(struct srcu_struct *sp)
243 * cleanup_srcu_struct - deconstruct a sleep-RCU structure 243 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
244 * @sp: structure to clean up. 244 * @sp: structure to clean up.
245 * 245 *
246 * Must invoke this after you are finished using a given srcu_struct that 246 * Must invoke this only after you are finished using a given srcu_struct
247 * was initialized via init_srcu_struct(), else you leak memory. 247 * that was initialized via init_srcu_struct(). This code does some
 248 * probabilistic checking, spotting late uses of srcu_read_lock(),
249 * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu().
250 * If any such late uses are detected, the per-CPU memory associated with
251 * the srcu_struct is simply leaked and WARN_ON() is invoked. If the
252 * caller frees the srcu_struct itself, a use-after-free crash will likely
253 * ensue, but at least there will be a warning printed.
248 */ 254 */
249void cleanup_srcu_struct(struct srcu_struct *sp) 255void cleanup_srcu_struct(struct srcu_struct *sp)
250{ 256{
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
new file mode 100644
index 000000000000..36e1f82faed1
--- /dev/null
+++ b/kernel/rcu/srcutiny.c
@@ -0,0 +1,216 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion,
3 * tiny version for non-preemptible single-CPU use.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, you can access it online at
17 * http://www.gnu.org/licenses/gpl-2.0.html.
18 *
19 * Copyright (C) IBM Corporation, 2017
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 */
23
24#include <linux/export.h>
25#include <linux/mutex.h>
26#include <linux/preempt.h>
27#include <linux/rcupdate_wait.h>
28#include <linux/sched.h>
29#include <linux/delay.h>
30#include <linux/srcu.h>
31
32#include <linux/rcu_node_tree.h>
33#include "rcu_segcblist.h"
34#include "rcu.h"
35
36static int init_srcu_struct_fields(struct srcu_struct *sp)
37{
38 sp->srcu_lock_nesting[0] = 0;
39 sp->srcu_lock_nesting[1] = 0;
40 init_swait_queue_head(&sp->srcu_wq);
41 sp->srcu_gp_seq = 0;
42 rcu_segcblist_init(&sp->srcu_cblist);
43 sp->srcu_gp_running = false;
44 sp->srcu_gp_waiting = false;
45 sp->srcu_idx = 0;
46 INIT_WORK(&sp->srcu_work, srcu_drive_gp);
47 return 0;
48}
49
50#ifdef CONFIG_DEBUG_LOCK_ALLOC
51
52int __init_srcu_struct(struct srcu_struct *sp, const char *name,
53 struct lock_class_key *key)
54{
55 /* Don't re-initialize a lock while it is held. */
56 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
57 lockdep_init_map(&sp->dep_map, name, key, 0);
58 return init_srcu_struct_fields(sp);
59}
60EXPORT_SYMBOL_GPL(__init_srcu_struct);
61
62#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
63
64/*
65 * init_srcu_struct - initialize a sleep-RCU structure
66 * @sp: structure to initialize.
67 *
68 * Must invoke this on a given srcu_struct before passing that srcu_struct
69 * to any other function. Each srcu_struct represents a separate domain
70 * of SRCU protection.
71 */
72int init_srcu_struct(struct srcu_struct *sp)
73{
74 return init_srcu_struct_fields(sp);
75}
76EXPORT_SYMBOL_GPL(init_srcu_struct);
77
78#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
79
80/*
81 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
82 * @sp: structure to clean up.
83 *
84 * Must invoke this after you are finished using a given srcu_struct that
85 * was initialized via init_srcu_struct(), else you leak memory.
86 */
87void cleanup_srcu_struct(struct srcu_struct *sp)
88{
89 WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
90 flush_work(&sp->srcu_work);
91 WARN_ON(rcu_seq_state(sp->srcu_gp_seq));
92 WARN_ON(sp->srcu_gp_running);
93 WARN_ON(sp->srcu_gp_waiting);
94 WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist));
95}
96EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
97
98/*
 99 * Counts the new reader in the appropriate element of the
100 * srcu_struct. Must be called from process context.
101 * Returns an index that must be passed to the matching srcu_read_unlock().
102 */
103int __srcu_read_lock(struct srcu_struct *sp)
104{
105 int idx;
106
107 idx = READ_ONCE(sp->srcu_idx);
108 WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1);
109 return idx;
110}
111EXPORT_SYMBOL_GPL(__srcu_read_lock);
112
113/*
114 * Removes the count for the old reader from the appropriate element of
115 * the srcu_struct. Must be called from process context.
116 */
117void __srcu_read_unlock(struct srcu_struct *sp, int idx)
118{
119 int newval = sp->srcu_lock_nesting[idx] - 1;
120
121 WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
122 if (!newval && READ_ONCE(sp->srcu_gp_waiting))
123 swake_up(&sp->srcu_wq);
124}
125EXPORT_SYMBOL_GPL(__srcu_read_unlock);
126
127/*
128 * Workqueue handler to drive one grace period and invoke any callbacks
129 * that become ready as a result. Single-CPU and !PREEMPT operation
130 * means that we get away with murder on synchronization. ;-)
131 */
132void srcu_drive_gp(struct work_struct *wp)
133{
134 int idx;
135 struct rcu_cblist ready_cbs;
136 struct srcu_struct *sp;
137 struct rcu_head *rhp;
138
139 sp = container_of(wp, struct srcu_struct, srcu_work);
140 if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist))
141 return; /* Already running or nothing to do. */
142
143 /* Tag recently arrived callbacks and wait for readers. */
144 WRITE_ONCE(sp->srcu_gp_running, true);
145 rcu_segcblist_accelerate(&sp->srcu_cblist,
146 rcu_seq_snap(&sp->srcu_gp_seq));
147 rcu_seq_start(&sp->srcu_gp_seq);
148 idx = sp->srcu_idx;
149 WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
150 WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
151 swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
152 WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
153 rcu_seq_end(&sp->srcu_gp_seq);
154
155 /* Update callback list based on GP, and invoke ready callbacks. */
156 rcu_segcblist_advance(&sp->srcu_cblist,
157 rcu_seq_current(&sp->srcu_gp_seq));
158 if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) {
159 rcu_cblist_init(&ready_cbs);
160 local_irq_disable();
161 rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs);
162 local_irq_enable();
163 rhp = rcu_cblist_dequeue(&ready_cbs);
164 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
165 local_bh_disable();
166 rhp->func(rhp);
167 local_bh_enable();
168 }
169 local_irq_disable();
170 rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs);
171 local_irq_enable();
172 }
173 WRITE_ONCE(sp->srcu_gp_running, false);
174
175 /*
176 * If more callbacks, reschedule ourselves. This can race with
177 * a call_srcu() at interrupt level, but the ->srcu_gp_running
178 * checks will straighten that out.
179 */
180 if (!rcu_segcblist_empty(&sp->srcu_cblist))
181 schedule_work(&sp->srcu_work);
182}
183EXPORT_SYMBOL_GPL(srcu_drive_gp);
184
185/*
186 * Enqueue an SRCU callback on the specified srcu_struct structure,
187 * initiating grace-period processing if it is not already running.
188 */
189void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
190 rcu_callback_t func)
191{
192 unsigned long flags;
193
194 head->func = func;
195 local_irq_save(flags);
196 rcu_segcblist_enqueue(&sp->srcu_cblist, head, false);
197 local_irq_restore(flags);
198 if (!READ_ONCE(sp->srcu_gp_running))
199 schedule_work(&sp->srcu_work);
200}
201EXPORT_SYMBOL_GPL(call_srcu);
202
203/*
204 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
205 */
206void synchronize_srcu(struct srcu_struct *sp)
207{
208 struct rcu_synchronize rs;
209
210 init_rcu_head_on_stack(&rs.head);
211 init_completion(&rs.completion);
212 call_srcu(sp, &rs.head, wakeme_after_rcu);
213 wait_for_completion(&rs.completion);
214 destroy_rcu_head_on_stack(&rs.head);
215}
216EXPORT_SYMBOL_GPL(synchronize_srcu);
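
For context, here is a hedged sketch of how a caller might use this API; the reader side goes through the usual srcu_read_lock()/srcu_read_unlock() wrappers from linux/srcu.h, which land in the __srcu_read_lock()/__srcu_read_unlock() functions above. struct my_data, my_srcu, my_reader(), my_update() and the free callback are invented for illustration and are not part of this patch.

	#include <linux/kernel.h>
	#include <linux/slab.h>
	#include <linux/srcu.h>

	struct my_data {
		int value;
		struct rcu_head rh;
	};

	DEFINE_STATIC_SRCU(my_srcu);
	static struct my_data __rcu *my_ptr;

	/* Reader: sleeping is allowed inside the SRCU read-side critical section. */
	static int my_reader(void)
	{
		struct my_data *p;
		int idx, val = -1;

		idx = srcu_read_lock(&my_srcu);
		p = srcu_dereference(my_ptr, &my_srcu);
		if (p)
			val = p->value;
		srcu_read_unlock(&my_srcu, idx);
		return val;
	}

	static void my_free_cb(struct rcu_head *rh)
	{
		kfree(container_of(rh, struct my_data, rh));
	}

	/* Updater: publish the new version, then let a grace period reclaim the old. */
	static void my_update(struct my_data *newp)
	{
		struct my_data *old;

		old = rcu_dereference_protected(my_ptr, 1);	/* updates serialized by caller */
		rcu_assign_pointer(my_ptr, newp);
		if (old)
			call_srcu(&my_srcu, &old->rh, my_free_cb);
		/* A synchronous updater would instead do: synchronize_srcu(&my_srcu); kfree(old); */
	}
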
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
new file mode 100644
index 000000000000..3ae8474557df
--- /dev/null
+++ b/kernel/rcu/srcutree.c
@@ -0,0 +1,1155 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
23 *
24 * For detailed explanation of Read-Copy Update mechanism see -
25 * Documentation/RCU/ *.txt
26 *
27 */
28
29#include <linux/export.h>
30#include <linux/mutex.h>
31#include <linux/percpu.h>
32#include <linux/preempt.h>
33#include <linux/rcupdate_wait.h>
34#include <linux/sched.h>
35#include <linux/smp.h>
36#include <linux/delay.h>
37#include <linux/module.h>
38#include <linux/srcu.h>
39
40#include "rcu.h"
41#include "rcu_segcblist.h"
42
43ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */
44module_param(exp_holdoff, ulong, 0444);
45
46static void srcu_invoke_callbacks(struct work_struct *work);
47static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
48
49/*
50 * Initialize SRCU combining tree. Note that statically allocated
51 * srcu_struct structures might already have srcu_read_lock() and
52 * srcu_read_unlock() running against them. So if the is_static parameter
53 * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
54 */
55static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
56{
57 int cpu;
58 int i;
59 int level = 0;
60 int levelspread[RCU_NUM_LVLS];
61 struct srcu_data *sdp;
62 struct srcu_node *snp;
63 struct srcu_node *snp_first;
64
65 /* Work out the overall tree geometry. */
66 sp->level[0] = &sp->node[0];
67 for (i = 1; i < rcu_num_lvls; i++)
68 sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1];
69 rcu_init_levelspread(levelspread, num_rcu_lvl);
70
71 /* Each pass through this loop initializes one srcu_node structure. */
72 rcu_for_each_node_breadth_first(sp, snp) {
73 spin_lock_init(&snp->lock);
74 WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
75 ARRAY_SIZE(snp->srcu_data_have_cbs));
76 for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
77 snp->srcu_have_cbs[i] = 0;
78 snp->srcu_data_have_cbs[i] = 0;
79 }
80 snp->srcu_gp_seq_needed_exp = 0;
81 snp->grplo = -1;
82 snp->grphi = -1;
83 if (snp == &sp->node[0]) {
84 /* Root node, special case. */
85 snp->srcu_parent = NULL;
86 continue;
87 }
88
89 /* Non-root node. */
90 if (snp == sp->level[level + 1])
91 level++;
92 snp->srcu_parent = sp->level[level - 1] +
93 (snp - sp->level[level]) /
94 levelspread[level - 1];
95 }
96
97 /*
98 * Initialize the per-CPU srcu_data array, which feeds into the
99 * leaves of the srcu_node tree.
100 */
101 WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
102 ARRAY_SIZE(sdp->srcu_unlock_count));
103 level = rcu_num_lvls - 1;
104 snp_first = sp->level[level];
105 for_each_possible_cpu(cpu) {
106 sdp = per_cpu_ptr(sp->sda, cpu);
107 spin_lock_init(&sdp->lock);
108 rcu_segcblist_init(&sdp->srcu_cblist);
109 sdp->srcu_cblist_invoking = false;
110 sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
111 sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq;
112 sdp->mynode = &snp_first[cpu / levelspread[level]];
113 for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
114 if (snp->grplo < 0)
115 snp->grplo = cpu;
116 snp->grphi = cpu;
117 }
118 sdp->cpu = cpu;
119 INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks);
120 sdp->sp = sp;
121 sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
122 if (is_static)
123 continue;
124
125 /* Dynamically allocated, better be no srcu_read_locks()! */
126 for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) {
127 sdp->srcu_lock_count[i] = 0;
128 sdp->srcu_unlock_count[i] = 0;
129 }
130 }
131}
132
133/*
134 * Initialize non-compile-time initialized fields, including the
135 * associated srcu_node and srcu_data structures. The is_static
136 * parameter is passed through to init_srcu_struct_nodes(), and
137 * also tells us that ->sda has already been wired up to srcu_data.
138 */
139static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static)
140{
141 mutex_init(&sp->srcu_cb_mutex);
142 mutex_init(&sp->srcu_gp_mutex);
143 sp->srcu_idx = 0;
144 sp->srcu_gp_seq = 0;
145 sp->srcu_barrier_seq = 0;
146 mutex_init(&sp->srcu_barrier_mutex);
147 atomic_set(&sp->srcu_barrier_cpu_cnt, 0);
148 INIT_DELAYED_WORK(&sp->work, process_srcu);
149 if (!is_static)
150 sp->sda = alloc_percpu(struct srcu_data);
151 init_srcu_struct_nodes(sp, is_static);
152 sp->srcu_gp_seq_needed_exp = 0;
153 sp->srcu_last_gp_end = ktime_get_mono_fast_ns();
154 smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */
155 return sp->sda ? 0 : -ENOMEM;
156}
157
158#ifdef CONFIG_DEBUG_LOCK_ALLOC
159
160int __init_srcu_struct(struct srcu_struct *sp, const char *name,
161 struct lock_class_key *key)
162{
163 /* Don't re-initialize a lock while it is held. */
164 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
165 lockdep_init_map(&sp->dep_map, name, key, 0);
166 spin_lock_init(&sp->gp_lock);
167 return init_srcu_struct_fields(sp, false);
168}
169EXPORT_SYMBOL_GPL(__init_srcu_struct);
170
171#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
172
173/**
174 * init_srcu_struct - initialize a sleep-RCU structure
175 * @sp: structure to initialize.
176 *
177 * Must invoke this on a given srcu_struct before passing that srcu_struct
178 * to any other function. Each srcu_struct represents a separate domain
179 * of SRCU protection.
180 */
181int init_srcu_struct(struct srcu_struct *sp)
182{
183 spin_lock_init(&sp->gp_lock);
184 return init_srcu_struct_fields(sp, false);
185}
186EXPORT_SYMBOL_GPL(init_srcu_struct);
187
188#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
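/*
 * Lifecycle sketch for a dynamically allocated SRCU domain (illustrative
 * only, not part of the implementation; the names below are hypothetical).
 */
#if 0	/* Example only; never built. */
static struct srcu_struct example_srcu;

static int __init example_init(void)
{
	/* Must complete before example_srcu is passed to any other SRCU API. */
	return init_srcu_struct(&example_srcu);
}

static void __exit example_exit(void)
{
	/*
	 * All readers and all call_srcu() callbacks must have finished
	 * before cleanup, otherwise memory is leaked (see the WARN_ON()s
	 * in cleanup_srcu_struct()).
	 */
	cleanup_srcu_struct(&example_srcu);
}
#endif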
189
190/*
191 * First-use initialization of statically allocated srcu_struct
192 * structure. Wiring up the combining tree is more than can be
193 * done with compile-time initialization, so this check is added
194 * to each update-side SRCU primitive. Use ->gp_lock, which -is-
195 * compile-time initialized, to resolve races involving multiple
196 * CPUs trying to garner first-use privileges.
197 */
198static void check_init_srcu_struct(struct srcu_struct *sp)
199{
200 unsigned long flags;
201
202 WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT);
203 /* The smp_load_acquire() pairs with the smp_store_release(). */
204 if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
205 return; /* Already initialized. */
206 spin_lock_irqsave(&sp->gp_lock, flags);
207 if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
208 spin_unlock_irqrestore(&sp->gp_lock, flags);
209 return;
210 }
211 init_srcu_struct_fields(sp, true);
212 spin_unlock_irqrestore(&sp->gp_lock, flags);
213}
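/*
 * For comparison, a statically allocated domain is normally created with
 * DEFINE_SRCU() or DEFINE_STATIC_SRCU() from the SRCU headers; the first
 * update-side call then runs check_init_srcu_struct() above to finish
 * wiring up the combining tree. The sketch below is illustrative only.
 */
#if 0	/* Example only; never built. */
DEFINE_STATIC_SRCU(example_static_srcu);	/* No init_srcu_struct() needed. */
#endif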
214
215/*
216 * Returns approximate total of the readers' ->srcu_lock_count[] values
217 * for the rank of per-CPU counters specified by idx.
218 */
219static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
220{
221 int cpu;
222 unsigned long sum = 0;
223
224 for_each_possible_cpu(cpu) {
225 struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
226
227 sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
228 }
229 return sum;
230}
231
232/*
233 * Returns approximate total of the readers' ->srcu_unlock_count[] values
234 * for the rank of per-CPU counters specified by idx.
235 */
236static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
237{
238 int cpu;
239 unsigned long sum = 0;
240
241 for_each_possible_cpu(cpu) {
242 struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
243
244 sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
245 }
246 return sum;
247}
248
249/*
250 * Return true if the number of pre-existing readers is determined to
251 * be zero.
252 */
253static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
254{
255 unsigned long unlocks;
256
257 unlocks = srcu_readers_unlock_idx(sp, idx);
258
259 /*
260 * Make sure that a lock is always counted if the corresponding
261 * unlock is counted. Needs to be a smp_mb() as the read side may
262 * contain a read from a variable that is written to before the
263 * synchronize_srcu() in the write side. In this case smp_mb()s
264 * A and B act like the store buffering pattern.
265 *
266 * This smp_mb() also pairs with smp_mb() C to prevent accesses
267 * after the synchronize_srcu() from being executed before the
268 * grace period ends.
269 */
270 smp_mb(); /* A */
271
272 /*
273 * If the locks are the same as the unlocks, then there must have
274 * been no readers on this index at some time in between. This does
275 * not mean that there are no more readers, as one could have read
276 * the current index but not have incremented the lock counter yet.
277 *
278 * Possible bug: There is no guarantee that there haven't been
279 * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were
280 * counted, meaning that this could return true even if there are
281 * still active readers. Since there are no memory barriers around
282 * srcu_flip(), the CPU is not required to increment ->srcu_idx
283 * before running srcu_readers_unlock_idx(), which means that there
284 * could be an arbitrarily large number of critical sections that
285 * execute after srcu_readers_unlock_idx() but use the old value
286 * of ->srcu_idx.
287 */
288 return srcu_readers_lock_idx(sp, idx) == unlocks;
289}
290
291/**
292 * srcu_readers_active - returns true if there are readers, and false
293 * otherwise.
294 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
295 *
296 * Note that this is not an atomic primitive, and can therefore suffer
297 * severe errors when invoked on an active srcu_struct. That said, it
298 * can be useful as an error check at cleanup time.
299 */
300static bool srcu_readers_active(struct srcu_struct *sp)
301{
302 int cpu;
303 unsigned long sum = 0;
304
305 for_each_possible_cpu(cpu) {
306 struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
307
308 sum += READ_ONCE(cpuc->srcu_lock_count[0]);
309 sum += READ_ONCE(cpuc->srcu_lock_count[1]);
310 sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
311 sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
312 }
313 return sum;
314}
315
316#define SRCU_INTERVAL 1
317
318/*
319 * Return grace-period delay, zero if there are expedited grace
320 * periods pending, SRCU_INTERVAL otherwise.
321 */
322static unsigned long srcu_get_delay(struct srcu_struct *sp)
323{
324 if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq),
325 READ_ONCE(sp->srcu_gp_seq_needed_exp)))
326 return 0;
327 return SRCU_INTERVAL;
328}
329
330/**
331 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
332 * @sp: structure to clean up.
333 *
334 * Must invoke this after you are finished using a given srcu_struct that
335 * was initialized via init_srcu_struct(), else you leak memory.
336 */
337void cleanup_srcu_struct(struct srcu_struct *sp)
338{
339 int cpu;
340
341 if (WARN_ON(!srcu_get_delay(sp)))
342 return; /* Leakage unless caller handles error. */
343 if (WARN_ON(srcu_readers_active(sp)))
344 return; /* Leakage unless caller handles error. */
345 flush_delayed_work(&sp->work);
346 for_each_possible_cpu(cpu)
347 flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
348 if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
349 WARN_ON(srcu_readers_active(sp))) {
350 pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
351 return; /* Caller forgot to stop doing call_srcu()? */
352 }
353 free_percpu(sp->sda);
354 sp->sda = NULL;
355}
356EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
357
358/*
359 * Counts the new reader in the appropriate per-CPU element of the
360 * srcu_struct. Must be called from process context.
361 * Returns an index that must be passed to the matching srcu_read_unlock().
362 */
363int __srcu_read_lock(struct srcu_struct *sp)
364{
365 int idx;
366
367 idx = READ_ONCE(sp->srcu_idx) & 0x1;
368 __this_cpu_inc(sp->sda->srcu_lock_count[idx]);
369 smp_mb(); /* B */ /* Avoid leaking the critical section. */
370 return idx;
371}
372EXPORT_SYMBOL_GPL(__srcu_read_lock);
373
374/*
375 * Removes the count for the old reader from the appropriate per-CPU
376 * element of the srcu_struct. Note that this may well be a different
377 * CPU than that which was incremented by the corresponding srcu_read_lock().
378 * Must be called from process context.
379 */
380void __srcu_read_unlock(struct srcu_struct *sp, int idx)
381{
382 smp_mb(); /* C */ /* Avoid leaking the critical section. */
383 this_cpu_inc(sp->sda->srcu_unlock_count[idx]);
384}
385EXPORT_SYMBOL_GPL(__srcu_read_unlock);
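/*
 * Read-side usage sketch (illustrative only): callers normally use the
 * srcu_read_lock()/srcu_read_unlock() wrappers from the SRCU headers,
 * which add lockdep annotations around the two functions above. The
 * structure and variable names below are hypothetical.
 */
#if 0	/* Example only; never built. */
struct example_item {
	int value;
};

static struct example_item __rcu *example_item_ptr;

static int example_read_value(struct srcu_struct *ssp)
{
	struct example_item *p;
	int idx;
	int ret = 0;

	idx = srcu_read_lock(ssp);	/* Bumps ->srcu_lock_count[idx]. */
	p = srcu_dereference(example_item_ptr, ssp);
	if (p)
		ret = p->value;
	srcu_read_unlock(ssp, idx);	/* Bumps ->srcu_unlock_count[idx]. */
	return ret;
}
#endif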
386
387/*
388 * We use an adaptive strategy for synchronize_srcu() and especially for
389 * synchronize_srcu_expedited(). We spin for a fixed time period
390 * (defined below) to allow SRCU readers to exit their read-side critical
391 * sections. If there are still some readers after a few microseconds,
392 * we repeatedly block for 1-millisecond time periods.
393 */
394#define SRCU_RETRY_CHECK_DELAY 5
395
396/*
397 * Start an SRCU grace period.
398 */
399static void srcu_gp_start(struct srcu_struct *sp)
400{
401 struct srcu_data *sdp = this_cpu_ptr(sp->sda);
402 int state;
403
404 RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock),
405 "Invoked srcu_gp_start() without ->gp_lock!");
406 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
407 rcu_segcblist_advance(&sdp->srcu_cblist,
408 rcu_seq_current(&sp->srcu_gp_seq));
409 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
410 rcu_seq_snap(&sp->srcu_gp_seq));
411 smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
412 rcu_seq_start(&sp->srcu_gp_seq);
413 state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
414 WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
415}
416
417/*
418 * Track online CPUs to guide callback workqueue placement.
419 */
420DEFINE_PER_CPU(bool, srcu_online);
421
422void srcu_online_cpu(unsigned int cpu)
423{
424 WRITE_ONCE(per_cpu(srcu_online, cpu), true);
425}
426
427void srcu_offline_cpu(unsigned int cpu)
428{
429 WRITE_ONCE(per_cpu(srcu_online, cpu), false);
430}
431
432/*
433 * Place the workqueue handler on the specified CPU if online, otherwise
434 * just run it wherever. This is useful for placing workqueue handlers
435 * that are to invoke the specified CPU's callbacks.
436 */
437static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
438 struct delayed_work *dwork,
439 unsigned long delay)
440{
441 bool ret;
442
443 preempt_disable();
444 if (READ_ONCE(per_cpu(srcu_online, cpu)))
445 ret = queue_delayed_work_on(cpu, wq, dwork, delay);
446 else
447 ret = queue_delayed_work(wq, dwork, delay);
448 preempt_enable();
449 return ret;
450}
451
452/*
453 * Schedule callback invocation for the specified srcu_data structure,
454 * if possible, on the corresponding CPU.
455 */
456static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
457{
458 srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq,
459 &sdp->work, delay);
460}
461
462/*
463 * Schedule callback invocation for all srcu_data structures associated
464 * with the specified srcu_node structure that have callbacks for the
465 * just-completed grace period, the one corresponding to idx. If possible,
466 * schedule this invocation on the corresponding CPUs.
467 */
468static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp,
469 unsigned long mask, unsigned long delay)
470{
471 int cpu;
472
473 for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
474 if (!(mask & (1 << (cpu - snp->grplo))))
475 continue;
476 srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay);
477 }
478}
479
480/*
481 * Note the end of an SRCU grace period. Initiates callback invocation
482 * and starts a new grace period if needed.
483 *
484 * The ->srcu_cb_mutex acquisition does not protect any data, but
485 * instead prevents more than one grace period from starting while we
486 * are initiating callback invocation. This allows the ->srcu_have_cbs[]
487 * array to have a finite number of elements.
488 */
489static void srcu_gp_end(struct srcu_struct *sp)
490{
491 unsigned long cbdelay;
492 bool cbs;
493 unsigned long gpseq;
494 int idx;
495 int idxnext;
496 unsigned long mask;
497 struct srcu_node *snp;
498
499 /* Prevent more than one additional grace period. */
500 mutex_lock(&sp->srcu_cb_mutex);
501
502 /* End the current grace period. */
503 spin_lock_irq(&sp->gp_lock);
504 idx = rcu_seq_state(sp->srcu_gp_seq);
505 WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
506 cbdelay = srcu_get_delay(sp);
507 sp->srcu_last_gp_end = ktime_get_mono_fast_ns();
508 rcu_seq_end(&sp->srcu_gp_seq);
509 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
510 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
511 sp->srcu_gp_seq_needed_exp = gpseq;
512 spin_unlock_irq(&sp->gp_lock);
513 mutex_unlock(&sp->srcu_gp_mutex);
514 /* A new grace period can start at this point. But only one. */
515
516 /* Initiate callback invocation as needed. */
517 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
518 idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
519 rcu_for_each_node_breadth_first(sp, snp) {
520 spin_lock_irq(&snp->lock);
521 cbs = false;
522 if (snp >= sp->level[rcu_num_lvls - 1])
523 cbs = snp->srcu_have_cbs[idx] == gpseq;
524 snp->srcu_have_cbs[idx] = gpseq;
525 rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
526 if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
527 snp->srcu_gp_seq_needed_exp = gpseq;
528 mask = snp->srcu_data_have_cbs[idx];
529 snp->srcu_data_have_cbs[idx] = 0;
530 spin_unlock_irq(&snp->lock);
531 if (cbs) {
532 smp_mb(); /* GP end before CB invocation. */
533 srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
534 }
535 }
536
537 /* Callback initiation done, allow grace periods after next. */
538 mutex_unlock(&sp->srcu_cb_mutex);
539
540 /* Start a new grace period if needed. */
541 spin_lock_irq(&sp->gp_lock);
542 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
543 if (!rcu_seq_state(gpseq) &&
544 ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
545 srcu_gp_start(sp);
546 spin_unlock_irq(&sp->gp_lock);
547 /* Throttle expedited grace periods: Should be rare! */
548 srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
549 ? 0 : SRCU_INTERVAL);
550 } else {
551 spin_unlock_irq(&sp->gp_lock);
552 }
553}
554
555/*
556 * Funnel-locking scheme to scalably mediate many concurrent expedited
557 * grace-period requests. This function is invoked for the first known
558 * expedited request for a grace period that has already been requested,
559 * but without expediting. To start a completely new grace period,
560 * whether expedited or not, use srcu_funnel_gp_start() instead.
561 */
562static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
563 unsigned long s)
564{
565 unsigned long flags;
566
567 for (; snp != NULL; snp = snp->srcu_parent) {
568 if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
569 ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
570 return;
571 spin_lock_irqsave(&snp->lock, flags);
572 if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
573 spin_unlock_irqrestore(&snp->lock, flags);
574 return;
575 }
576 WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
577 spin_unlock_irqrestore(&snp->lock, flags);
578 }
579 spin_lock_irqsave(&sp->gp_lock, flags);
580 if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
581 sp->srcu_gp_seq_needed_exp = s;
582 spin_unlock_irqrestore(&sp->gp_lock, flags);
583}
584
585/*
586 * Funnel-locking scheme to scalably mediate many concurrent grace-period
587 * requests. The winner has to do the work of actually starting grace
588 * period s. Losers must either ensure that their desired grace-period
589 * number is recorded on at least their leaf srcu_node structure, or they
590 * must take steps to invoke their own callbacks.
591 */
592static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
593 unsigned long s, bool do_norm)
594{
595 unsigned long flags;
596 int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
597 struct srcu_node *snp = sdp->mynode;
598 unsigned long snp_seq;
599
600 /* Each pass through the loop does one level of the srcu_node tree. */
601 for (; snp != NULL; snp = snp->srcu_parent) {
602 if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
603 return; /* GP already done and CBs recorded. */
604 spin_lock_irqsave(&snp->lock, flags);
605 if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
606 snp_seq = snp->srcu_have_cbs[idx];
607 if (snp == sdp->mynode && snp_seq == s)
608 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
609 spin_unlock_irqrestore(&snp->lock, flags);
610 if (snp == sdp->mynode && snp_seq != s) {
611 smp_mb(); /* CBs after GP! */
612 srcu_schedule_cbs_sdp(sdp, do_norm
613 ? SRCU_INTERVAL
614 : 0);
615 return;
616 }
617 if (!do_norm)
618 srcu_funnel_exp_start(sp, snp, s);
619 return;
620 }
621 snp->srcu_have_cbs[idx] = s;
622 if (snp == sdp->mynode)
623 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
624 if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
625 snp->srcu_gp_seq_needed_exp = s;
626 spin_unlock_irqrestore(&snp->lock, flags);
627 }
628
629 /* Top of tree, must ensure the grace period will be started. */
630 spin_lock_irqsave(&sp->gp_lock, flags);
631 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
632 /*
633 * Record need for grace period s. Pair with load
634 * acquire setting up for initialization.
635 */
636 smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/
637 }
638 if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
639 sp->srcu_gp_seq_needed_exp = s;
640
641 /* If grace period not already done and none in progress, start it. */
642 if (!rcu_seq_done(&sp->srcu_gp_seq, s) &&
643 rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
644 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
645 srcu_gp_start(sp);
646 queue_delayed_work(system_power_efficient_wq, &sp->work,
647 srcu_get_delay(sp));
648 }
649 spin_unlock_irqrestore(&sp->gp_lock, flags);
650}
651
652/*
653 * Wait until all readers counted by array index idx complete, but
654 * loop an additional time if there is an expedited grace period pending.
655 * The caller must ensure that ->srcu_idx is not changed while checking.
656 */
657static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
658{
659 for (;;) {
660 if (srcu_readers_active_idx_check(sp, idx))
661 return true;
662 if (--trycount + !srcu_get_delay(sp) <= 0)
663 return false;
664 udelay(SRCU_RETRY_CHECK_DELAY);
665 }
666}
667
668/*
669 * Increment the ->srcu_idx counter so that future SRCU readers will
670 * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
671 * us to wait for pre-existing readers in a starvation-free manner.
672 */
673static void srcu_flip(struct srcu_struct *sp)
674{
675 WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
676
677 /*
678 * Ensure that if the updater misses an __srcu_read_unlock()
679 * increment, that task's next __srcu_read_lock() will see the
680 * above counter update. Note that both this memory barrier
681 * and the one in srcu_readers_active_idx_check() provide the
682 * guarantee for __srcu_read_lock().
683 */
684 smp_mb(); /* D */ /* Pairs with C. */
685}
686
687/*
688 * If SRCU is likely idle, return true, otherwise return false.
689 *
690 * Note that it is OK for several concurrent from-idle requests for a new
691 * grace period to specify expediting, because they will all end up
692 * requesting the same grace period anyhow. So no loss.
693 *
694 * Note also that if any CPU (including the current one) is still invoking
695 * callbacks, this function will nevertheless say "idle". This is not
696 * ideal, but the overhead of checking all CPUs' callback lists is even
697 * less ideal, especially on large systems. Furthermore, the wakeup
698 * can happen before the callback is fully removed, so we have no choice
699 * but to accept this type of error.
700 *
701 * This function is also subject to counter-wrap errors, but let's face
702 * it, if this function was preempted for enough time for the counters
703 * to wrap, it really doesn't matter whether or not we expedite the grace
704 * period. The extra overhead of a needlessly expedited grace period is
705 * negligible when amortized over that time period, and the extra latency
706 * of a needlessly non-expedited grace period is similarly negligible.
707 */
708static bool srcu_might_be_idle(struct srcu_struct *sp)
709{
710 unsigned long curseq;
711 unsigned long flags;
712 struct srcu_data *sdp;
713 unsigned long t;
714
715 /* If the local srcu_data structure has callbacks, not idle. */
716 local_irq_save(flags);
717 sdp = this_cpu_ptr(sp->sda);
718 if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
719 local_irq_restore(flags);
720 return false; /* Callbacks already present, so not idle. */
721 }
722 local_irq_restore(flags);
723
724 /*
725 * No local callbacks, so probabilistically probe global state.
726 * Exact information would require acquiring locks, which would
727 * kill scalability, hence the probabilistic nature of the probe.
728 */
729
730 /* First, see if enough time has passed since the last GP. */
731 t = ktime_get_mono_fast_ns();
732 if (exp_holdoff == 0 ||
733 time_in_range_open(t, sp->srcu_last_gp_end,
734 sp->srcu_last_gp_end + exp_holdoff))
735 return false; /* Too soon after last GP. */
736
737 /* Next, check for probable idleness. */
738 curseq = rcu_seq_current(&sp->srcu_gp_seq);
739 smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
740 if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed)))
741 return false; /* Grace period in progress, so not idle. */
742 smp_mb(); /* Order ->srcu_gp_seq with prior access. */
743 if (curseq != rcu_seq_current(&sp->srcu_gp_seq))
744 return false; /* GP # changed, so not idle. */
745 return true; /* With reasonable probability, idle! */
746}
747
748/*
749 * Enqueue an SRCU callback on the srcu_data structure associated with
750 * the current CPU and the specified srcu_struct structure, initiating
751 * grace-period processing if it is not already running.
752 *
753 * Note that all CPUs must agree that the grace period extended beyond
754 * all pre-existing SRCU read-side critical sections. On systems with
755 * more than one CPU, this means that when "func()" is invoked, each CPU
756 * is guaranteed to have executed a full memory barrier since the end of
757 * its last corresponding SRCU read-side critical section whose beginning
758 * preceded the call to call_srcu(). It also means that each CPU executing
759 * an SRCU read-side critical section that continues beyond the start of
760 * "func()" must have executed a memory barrier after the call_rcu()
761 * but before the beginning of that SRCU read-side critical section.
762 * Note that these guarantees include CPUs that are offline, idle, or
763 * executing in user mode, as well as CPUs that are executing in the kernel.
764 *
765 * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the
766 * resulting SRCU callback function "func()", then both CPU A and CPU
767 * B are guaranteed to execute a full memory barrier during the time
768 * interval between the call to call_srcu() and the invocation of "func()".
769 * This guarantee applies even if CPU A and CPU B are the same CPU (but
770 * again only if the system has more than one CPU).
771 *
772 * Of course, these guarantees apply only for invocations of call_srcu(),
773 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
774 * srcu_struct structure.
775 */
776void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
777 rcu_callback_t func, bool do_norm)
778{
779 unsigned long flags;
780 bool needexp = false;
781 bool needgp = false;
782 unsigned long s;
783 struct srcu_data *sdp;
784
785 check_init_srcu_struct(sp);
786 rhp->func = func;
787 local_irq_save(flags);
788 sdp = this_cpu_ptr(sp->sda);
789 spin_lock(&sdp->lock);
790 rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
791 rcu_segcblist_advance(&sdp->srcu_cblist,
792 rcu_seq_current(&sp->srcu_gp_seq));
793 s = rcu_seq_snap(&sp->srcu_gp_seq);
794 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
795 if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
796 sdp->srcu_gp_seq_needed = s;
797 needgp = true;
798 }
799 if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
800 sdp->srcu_gp_seq_needed_exp = s;
801 needexp = true;
802 }
803 spin_unlock_irqrestore(&sdp->lock, flags);
804 if (needgp)
805 srcu_funnel_gp_start(sp, sdp, s, do_norm);
806 else if (needexp)
807 srcu_funnel_exp_start(sp, sdp->mynode, s);
808}
809
810void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
811 rcu_callback_t func)
812{
813 __call_srcu(sp, rhp, func, true);
814}
815EXPORT_SYMBOL_GPL(call_srcu);
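/*
 * call_srcu() usage sketch (illustrative only): free an object once all
 * readers that might still reference it have finished. Assumes the
 * hypothetical struct example_item from the read-side sketch above, plus
 * an embedded rcu_head; kfree() comes from <linux/slab.h>.
 */
#if 0	/* Example only; never built. */
struct example_obj {
	struct rcu_head rh;
	struct example_item item;
};

static void example_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct example_obj, rh));
}

static void example_retire(struct srcu_struct *ssp, struct example_obj *p)
{
	/* Freeing is deferred until a full SRCU grace period has elapsed. */
	call_srcu(ssp, &p->rh, example_free_cb);
}
#endif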
816
817/*
818 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
819 */
820static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
821{
822 struct rcu_synchronize rcu;
823
824 RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
825 lock_is_held(&rcu_bh_lock_map) ||
826 lock_is_held(&rcu_lock_map) ||
827 lock_is_held(&rcu_sched_lock_map),
828 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
829
830 if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
831 return;
832 might_sleep();
833 check_init_srcu_struct(sp);
834 init_completion(&rcu.completion);
835 init_rcu_head_on_stack(&rcu.head);
836 __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
837 wait_for_completion(&rcu.completion);
838 destroy_rcu_head_on_stack(&rcu.head);
839}
840
841/**
842 * synchronize_srcu_expedited - Brute-force SRCU grace period
843 * @sp: srcu_struct with which to synchronize.
844 *
845 * Wait for an SRCU grace period to elapse, but be more aggressive about
846 * spinning rather than blocking when waiting.
847 *
848 * Note that synchronize_srcu_expedited() has the same deadlock and
849 * memory-ordering properties as does synchronize_srcu().
850 */
851void synchronize_srcu_expedited(struct srcu_struct *sp)
852{
853 __synchronize_srcu(sp, rcu_gp_is_normal());
854}
855EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
856
857/**
858 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
859 * @sp: srcu_struct with which to synchronize.
860 *
861 * Wait for the count of both index ranks to drain to zero. To avoid
862 * possible starvation of synchronize_srcu(), it first waits for the
863 * count of index ((->srcu_idx & 1) ^ 1) to drain to zero, and then
864 * flips ->srcu_idx and waits for the count of the other index.
865 *
866 * Can block; must be called from process context.
867 *
868 * Note that it is illegal to call synchronize_srcu() from the corresponding
869 * SRCU read-side critical section; doing so will result in deadlock.
870 * However, it is perfectly legal to call synchronize_srcu() on one
871 * srcu_struct from some other srcu_struct's read-side critical section,
872 * as long as the resulting graph of srcu_structs is acyclic.
873 *
874 * There are memory-ordering constraints implied by synchronize_srcu().
875 * On systems with more than one CPU, when synchronize_srcu() returns,
876 * each CPU is guaranteed to have executed a full memory barrier since
877 * the end of its last corresponding SRCU read-side critical section
878 * whose beginning preceded the call to synchronize_srcu(). In addition,
879 * each CPU having an SRCU read-side critical section that extends beyond
880 * the return from synchronize_srcu() is guaranteed to have executed a
881 * full memory barrier after the beginning of synchronize_srcu() and before
882 * the beginning of that SRCU read-side critical section. Note that these
883 * guarantees include CPUs that are offline, idle, or executing in user mode,
884 * as well as CPUs that are executing in the kernel.
885 *
886 * Furthermore, if CPU A invoked synchronize_srcu(), which returned
887 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
888 * to have executed a full memory barrier during the execution of
889 * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
890 * are the same CPU, but again only if the system has more than one CPU.
891 *
892 * Of course, these memory-ordering guarantees apply only when
893 * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
894 * passed the same srcu_struct structure.
895 *
896 * If SRCU is likely idle, expedite the first request. This semantic
897 * was provided by Classic SRCU, and is relied upon by its users, so TREE
898 * SRCU must also provide it. Note that detecting idleness is heuristic
899 * and subject to both false positives and negatives.
900 */
901void synchronize_srcu(struct srcu_struct *sp)
902{
903 if (srcu_might_be_idle(sp) || rcu_gp_is_expedited())
904 synchronize_srcu_expedited(sp);
905 else
906 __synchronize_srcu(sp, true);
907}
908EXPORT_SYMBOL_GPL(synchronize_srcu);
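/*
 * Synchronous updater sketch (illustrative only): unpublish an object,
 * wait for pre-existing readers with synchronize_srcu(), then free it
 * directly instead of deferring via call_srcu(). The caller is assumed
 * to hold the update-side lock; names are hypothetical.
 */
#if 0	/* Example only; never built. */
static void example_remove(struct srcu_struct *ssp,
			   struct example_item __rcu **slot,
			   struct example_item *old)
{
	rcu_assign_pointer(*slot, NULL);	/* Unpublish the object. */
	synchronize_srcu(ssp);			/* Wait out pre-existing readers. */
	kfree(old);				/* No reader can still reference it. */
}
#endif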
909
910/*
911 * Callback function for srcu_barrier() use.
912 */
913static void srcu_barrier_cb(struct rcu_head *rhp)
914{
915 struct srcu_data *sdp;
916 struct srcu_struct *sp;
917
918 sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
919 sp = sdp->sp;
920 if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
921 complete(&sp->srcu_barrier_completion);
922}
923
924/**
925 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
926 * @sp: srcu_struct on which to wait for in-flight callbacks.
927 */
928void srcu_barrier(struct srcu_struct *sp)
929{
930 int cpu;
931 struct srcu_data *sdp;
932 unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq);
933
934 check_init_srcu_struct(sp);
935 mutex_lock(&sp->srcu_barrier_mutex);
936 if (rcu_seq_done(&sp->srcu_barrier_seq, s)) {
937 smp_mb(); /* Force ordering following return. */
938 mutex_unlock(&sp->srcu_barrier_mutex);
939 return; /* Someone else did our work for us. */
940 }
941 rcu_seq_start(&sp->srcu_barrier_seq);
942 init_completion(&sp->srcu_barrier_completion);
943
944 /* Initial count prevents reaching zero until all CBs are posted. */
945 atomic_set(&sp->srcu_barrier_cpu_cnt, 1);
946
947 /*
948 * Each pass through this loop enqueues a callback, but only
949 * on CPUs already having callbacks enqueued. Note that if
950 * a CPU already has callbacks enqueued, it must have already
951 * registered the need for a future grace period, so all we
952 * need do is enqueue a callback that will use the same
953 * grace period as the last callback already in the queue.
954 */
955 for_each_possible_cpu(cpu) {
956 sdp = per_cpu_ptr(sp->sda, cpu);
957 spin_lock_irq(&sdp->lock);
958 atomic_inc(&sp->srcu_barrier_cpu_cnt);
959 sdp->srcu_barrier_head.func = srcu_barrier_cb;
960 if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
961 &sdp->srcu_barrier_head, 0))
962 atomic_dec(&sp->srcu_barrier_cpu_cnt);
963 spin_unlock_irq(&sdp->lock);
964 }
965
966 /* Remove the initial count, at which point reaching zero can happen. */
967 if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
968 complete(&sp->srcu_barrier_completion);
969 wait_for_completion(&sp->srcu_barrier_completion);
970
971 rcu_seq_end(&sp->srcu_barrier_seq);
972 mutex_unlock(&sp->srcu_barrier_mutex);
973}
974EXPORT_SYMBOL_GPL(srcu_barrier);
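/*
 * Teardown-ordering sketch (illustrative only): once no new call_srcu()
 * invocations can occur, srcu_barrier() waits for the callbacks already
 * posted, after which cleanup_srcu_struct() is safe. Names hypothetical.
 */
#if 0	/* Example only; never built. */
static void example_domain_teardown(struct srcu_struct *ssp)
{
	/* Caller guarantees no further call_srcu() on ssp. */
	srcu_barrier(ssp);		/* Wait for in-flight callbacks. */
	cleanup_srcu_struct(ssp);	/* Now safe to tear down the domain. */
}
#endif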
975
976/**
977 * srcu_batches_completed - return batches completed.
978 * @sp: srcu_struct on which to report batch completion.
979 *
980 * Report the number of batches, correlated with, but not necessarily
981 * precisely the same as, the number of grace periods that have elapsed.
982 */
983unsigned long srcu_batches_completed(struct srcu_struct *sp)
984{
985 return sp->srcu_idx;
986}
987EXPORT_SYMBOL_GPL(srcu_batches_completed);
988
989/*
990 * Core SRCU state machine. Push state bits of ->srcu_gp_seq
991 * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has
992 * completed in that state.
993 */
994static void srcu_advance_state(struct srcu_struct *sp)
995{
996 int idx;
997
998 mutex_lock(&sp->srcu_gp_mutex);
999
1000 /*
1001 * Because readers might be delayed for an extended period after
1002 * fetching ->srcu_idx for their index, at any point in time there
1003 * might well be readers using both idx=0 and idx=1. We therefore
1004 * need to wait for readers to clear from both index values before
1005 * invoking a callback.
1006 *
1007 * The load-acquire ensures that we see the accesses performed
1008 * by the prior grace period.
1009 */
1010 idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
1011 if (idx == SRCU_STATE_IDLE) {
1012 spin_lock_irq(&sp->gp_lock);
1013 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
1014 WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
1015 spin_unlock_irq(&sp->gp_lock);
1016 mutex_unlock(&sp->srcu_gp_mutex);
1017 return;
1018 }
1019 idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
1020 if (idx == SRCU_STATE_IDLE)
1021 srcu_gp_start(sp);
1022 spin_unlock_irq(&sp->gp_lock);
1023 if (idx != SRCU_STATE_IDLE) {
1024 mutex_unlock(&sp->srcu_gp_mutex);
1025 return; /* Someone else started the grace period. */
1026 }
1027 }
1028
1029 if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
1030 idx = 1 ^ (sp->srcu_idx & 1);
1031 if (!try_check_zero(sp, idx, 1)) {
1032 mutex_unlock(&sp->srcu_gp_mutex);
1033 return; /* readers present, retry later. */
1034 }
1035 srcu_flip(sp);
1036 rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2);
1037 }
1038
1039 if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
1040
1041 /*
1042 * SRCU read-side critical sections are normally short,
1043 * so check at least twice in quick succession after a flip.
1044 */
1045 idx = 1 ^ (sp->srcu_idx & 1);
1046 if (!try_check_zero(sp, idx, 2)) {
1047 mutex_unlock(&sp->srcu_gp_mutex);
1048 return; /* readers present, retry later. */
1049 }
1050 srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */
1051 }
1052}
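/*
 * Informational summary of the state machine driven by srcu_advance_state()
 * (derived from the code above):
 *
 *   SRCU_STATE_IDLE  -- No grace period in progress; srcu_gp_start()
 *                       moves to SRCU_STATE_SCAN1 when one is needed.
 *   SRCU_STATE_SCAN1 -- Wait for readers on the rank of
 *                       ->srcu_(un)lock_count[] that is not currently in
 *                       use, then srcu_flip() ->srcu_idx and move to
 *                       SRCU_STATE_SCAN2.
 *   SRCU_STATE_SCAN2 -- Wait for readers on the rank that was in use at
 *                       grace-period start, then srcu_gp_end() completes
 *                       the grace period and returns to SRCU_STATE_IDLE.
 */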
1053
1054/*
1055 * Invoke a limited number of SRCU callbacks that have passed through
1056 * their grace period. If there are more to do, SRCU will reschedule
1057 * the workqueue. Note that needed memory barriers have been executed
1058 * in this task's context by srcu_readers_active_idx_check().
1059 */
1060static void srcu_invoke_callbacks(struct work_struct *work)
1061{
1062 bool more;
1063 struct rcu_cblist ready_cbs;
1064 struct rcu_head *rhp;
1065 struct srcu_data *sdp;
1066 struct srcu_struct *sp;
1067
1068 sdp = container_of(work, struct srcu_data, work.work);
1069 sp = sdp->sp;
1070 rcu_cblist_init(&ready_cbs);
1071 spin_lock_irq(&sdp->lock);
1072 smp_mb(); /* Old grace periods before callback invocation! */
1073 rcu_segcblist_advance(&sdp->srcu_cblist,
1074 rcu_seq_current(&sp->srcu_gp_seq));
1075 if (sdp->srcu_cblist_invoking ||
1076 !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
1077 spin_unlock_irq(&sdp->lock);
1078 return; /* Someone else on the job or nothing to do. */
1079 }
1080
1081 /* We are on the job! Extract and invoke ready callbacks. */
1082 sdp->srcu_cblist_invoking = true;
1083 rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
1084 spin_unlock_irq(&sdp->lock);
1085 rhp = rcu_cblist_dequeue(&ready_cbs);
1086 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
1087 local_bh_disable();
1088 rhp->func(rhp);
1089 local_bh_enable();
1090 }
1091
1092 /*
1093 * Update counts, accelerate new callbacks, and if needed,
1094 * schedule another round of callback invocation.
1095 */
1096 spin_lock_irq(&sdp->lock);
1097 rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
1098 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
1099 rcu_seq_snap(&sp->srcu_gp_seq));
1100 sdp->srcu_cblist_invoking = false;
1101 more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
1102 spin_unlock_irq(&sdp->lock);
1103 if (more)
1104 srcu_schedule_cbs_sdp(sdp, 0);
1105}
1106
1107/*
1108 * Finished one round of SRCU grace period. Start another if there are
1109 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
1110 */
1111static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
1112{
1113 bool pushgp = true;
1114
1115 spin_lock_irq(&sp->gp_lock);
1116 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
1117 if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
1118 /* All requests fulfilled, time to go idle. */
1119 pushgp = false;
1120 }
1121 } else if (!rcu_seq_state(sp->srcu_gp_seq)) {
1122 /* Outstanding request and no GP. Start one. */
1123 srcu_gp_start(sp);
1124 }
1125 spin_unlock_irq(&sp->gp_lock);
1126
1127 if (pushgp)
1128 queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
1129}
1130
1131/*
1132 * This is the work-queue function that handles SRCU grace periods.
1133 */
1134void process_srcu(struct work_struct *work)
1135{
1136 struct srcu_struct *sp;
1137
1138 sp = container_of(work, struct srcu_struct, work.work);
1139
1140 srcu_advance_state(sp);
1141 srcu_reschedule(sp, srcu_get_delay(sp));
1142}
1143EXPORT_SYMBOL_GPL(process_srcu);
1144
1145void srcutorture_get_gp_data(enum rcutorture_type test_type,
1146 struct srcu_struct *sp, int *flags,
1147 unsigned long *gpnum, unsigned long *completed)
1148{
1149 if (test_type != SRCU_FLAVOR)
1150 return;
1151 *flags = 0;
1152 *completed = rcu_seq_ctr(sp->srcu_gp_seq);
1153 *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed);
1154}
1155EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 6ad330dbbae2..e5385731e391 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -79,7 +79,7 @@ EXPORT_SYMBOL(__rcu_is_watching);
79 */ 79 */
80static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 80static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
81{ 81{
82 RCU_TRACE(reset_cpu_stall_ticks(rcp)); 82 RCU_TRACE(reset_cpu_stall_ticks(rcp);)
83 if (rcp->donetail != rcp->curtail) { 83 if (rcp->donetail != rcp->curtail) {
84 rcp->donetail = rcp->curtail; 84 rcp->donetail = rcp->curtail;
85 return 1; 85 return 1;
@@ -125,7 +125,7 @@ void rcu_bh_qs(void)
125 */ 125 */
126void rcu_check_callbacks(int user) 126void rcu_check_callbacks(int user)
127{ 127{
128 RCU_TRACE(check_cpu_stalls()); 128 RCU_TRACE(check_cpu_stalls();)
129 if (user) 129 if (user)
130 rcu_sched_qs(); 130 rcu_sched_qs();
131 else if (!in_softirq()) 131 else if (!in_softirq())
@@ -143,7 +143,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
143 const char *rn = NULL; 143 const char *rn = NULL;
144 struct rcu_head *next, *list; 144 struct rcu_head *next, *list;
145 unsigned long flags; 145 unsigned long flags;
146 RCU_TRACE(int cb_count = 0); 146 RCU_TRACE(int cb_count = 0;)
147 147
148 /* Move the ready-to-invoke callbacks to a local list. */ 148 /* Move the ready-to-invoke callbacks to a local list. */
149 local_irq_save(flags); 149 local_irq_save(flags);
@@ -152,7 +152,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
152 local_irq_restore(flags); 152 local_irq_restore(flags);
153 return; 153 return;
154 } 154 }
155 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); 155 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);)
156 list = rcp->rcucblist; 156 list = rcp->rcucblist;
157 rcp->rcucblist = *rcp->donetail; 157 rcp->rcucblist = *rcp->donetail;
158 *rcp->donetail = NULL; 158 *rcp->donetail = NULL;
@@ -162,7 +162,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
162 local_irq_restore(flags); 162 local_irq_restore(flags);
163 163
164 /* Invoke the callbacks on the local list. */ 164 /* Invoke the callbacks on the local list. */
165 RCU_TRACE(rn = rcp->name); 165 RCU_TRACE(rn = rcp->name;)
166 while (list) { 166 while (list) {
167 next = list->next; 167 next = list->next;
168 prefetch(next); 168 prefetch(next);
@@ -171,9 +171,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
171 __rcu_reclaim(rn, list); 171 __rcu_reclaim(rn, list);
172 local_bh_enable(); 172 local_bh_enable();
173 list = next; 173 list = next;
174 RCU_TRACE(cb_count++); 174 RCU_TRACE(cb_count++;)
175 } 175 }
176 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 176 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);)
177 RCU_TRACE(trace_rcu_batch_end(rcp->name, 177 RCU_TRACE(trace_rcu_batch_end(rcp->name,
178 cb_count, 0, need_resched(), 178 cb_count, 0, need_resched(),
179 is_idle_task(current), 179 is_idle_task(current),
@@ -221,7 +221,7 @@ static void __call_rcu(struct rcu_head *head,
221 local_irq_save(flags); 221 local_irq_save(flags);
222 *rcp->curtail = head; 222 *rcp->curtail = head;
223 rcp->curtail = &head->next; 223 rcp->curtail = &head->next;
224 RCU_TRACE(rcp->qlen++); 224 RCU_TRACE(rcp->qlen++;)
225 local_irq_restore(flags); 225 local_irq_restore(flags);
226 226
227 if (unlikely(is_idle_task(current))) { 227 if (unlikely(is_idle_task(current))) {
@@ -254,8 +254,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
254void __init rcu_init(void) 254void __init rcu_init(void)
255{ 255{
256 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 256 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
257 RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); 257 RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);)
258 RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); 258 RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);)
259 259
260 rcu_early_boot_tests(); 260 rcu_early_boot_tests();
261} 261}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index c64b827ecbca..371034e77f87 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -52,7 +52,7 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
52 RCU_TRACE(.name = "rcu_bh") 52 RCU_TRACE(.name = "rcu_bh")
53}; 53};
54 54
55#ifdef CONFIG_DEBUG_LOCK_ALLOC 55#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
56#include <linux/kernel_stat.h> 56#include <linux/kernel_stat.h>
57 57
58int rcu_scheduler_active __read_mostly; 58int rcu_scheduler_active __read_mostly;
@@ -65,15 +65,16 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
65 * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. 65 * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
66 * The reason for this is that Tiny RCU does not need kthreads, so does 66 * The reason for this is that Tiny RCU does not need kthreads, so does
67 * not have to care about the fact that the scheduler is half-initialized 67 * not have to care about the fact that the scheduler is half-initialized
68 * at a certain phase of the boot process. 68 * at a certain phase of the boot process. Unless SRCU is in the mix.
69 */ 69 */
70void __init rcu_scheduler_starting(void) 70void __init rcu_scheduler_starting(void)
71{ 71{
72 WARN_ON(nr_context_switches() > 0); 72 WARN_ON(nr_context_switches() > 0);
73 rcu_scheduler_active = RCU_SCHEDULER_RUNNING; 73 rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
74 ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
74} 75}
75 76
76#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 77#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
77 78
78#ifdef CONFIG_RCU_TRACE 79#ifdef CONFIG_RCU_TRACE
79 80
@@ -162,8 +163,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
162 163
163static void check_cpu_stalls(void) 164static void check_cpu_stalls(void)
164{ 165{
165 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); 166 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);)
166 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); 167 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);)
167} 168}
168 169
169#endif /* #ifdef CONFIG_RCU_TRACE */ 170#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 50fee7689e71..e354e475e645 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -57,6 +57,7 @@
57#include <linux/random.h> 57#include <linux/random.h>
58#include <linux/trace_events.h> 58#include <linux/trace_events.h>
59#include <linux/suspend.h> 59#include <linux/suspend.h>
60#include <linux/ftrace.h>
60 61
61#include "tree.h" 62#include "tree.h"
62#include "rcu.h" 63#include "rcu.h"
@@ -97,8 +98,8 @@ struct rcu_state sname##_state = { \
97 .gpnum = 0UL - 300UL, \ 98 .gpnum = 0UL - 300UL, \
98 .completed = 0UL - 300UL, \ 99 .completed = 0UL - 300UL, \
99 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ 100 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
100 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 101 .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
101 .orphan_donetail = &sname##_state.orphan_donelist, \ 102 .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
102 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 103 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
103 .name = RCU_STATE_NAME(sname), \ 104 .name = RCU_STATE_NAME(sname), \
104 .abbr = sabbr, \ 105 .abbr = sabbr, \
@@ -123,7 +124,7 @@ static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
123module_param(rcu_fanout_leaf, int, 0444); 124module_param(rcu_fanout_leaf, int, 0444);
124int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; 125int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
125/* Number of rcu_nodes at specified level. */ 126/* Number of rcu_nodes at specified level. */
126static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; 127int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
127int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ 128int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
128/* panic() on RCU Stall sysctl. */ 129/* panic() on RCU Stall sysctl. */
129int sysctl_panic_on_rcu_stall __read_mostly; 130int sysctl_panic_on_rcu_stall __read_mostly;
@@ -199,7 +200,7 @@ static const int gp_cleanup_delay;
199 200
200/* 201/*
201 * Number of grace periods between delays, normalized by the duration of 202 * Number of grace periods between delays, normalized by the duration of
202 * the delay. The longer the the delay, the more the grace periods between 203 * the delay. The longer the delay, the more the grace periods between
203 * each delay. The reason for this normalization is that it means that, 204 * each delay. The reason for this normalization is that it means that,
204 * for non-zero delays, the overall slowdown of grace periods is constant 205 * for non-zero delays, the overall slowdown of grace periods is constant
205 * regardless of the duration of the delay. This arrangement balances 206 * regardless of the duration of the delay. This arrangement balances
@@ -272,11 +273,19 @@ void rcu_bh_qs(void)
272 } 273 }
273} 274}
274 275
275static DEFINE_PER_CPU(int, rcu_sched_qs_mask); 276/*
277 * Steal a bit from the bottom of ->dynticks for idle entry/exit
278 * control. Initially this is for TLB flushing.
279 */
280#define RCU_DYNTICK_CTRL_MASK 0x1
281#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
282#ifndef rcu_eqs_special_exit
283#define rcu_eqs_special_exit() do { } while (0)
284#endif
276 285
277static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 286static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
278 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 287 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
279 .dynticks = ATOMIC_INIT(1), 288 .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
280#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 289#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
281 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, 290 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
282 .dynticks_idle = ATOMIC_INIT(1), 291 .dynticks_idle = ATOMIC_INIT(1),
@@ -284,21 +293,40 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
284}; 293};
285 294
286/* 295/*
296 * There are a few places, currently just in the tracing infrastructure,
297 * that use rcu_irq_enter() to make sure RCU is watching. But there's
298 * a small location where that will not even work. In those cases
299 * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter()
300 * can be called.
301 */
302static DEFINE_PER_CPU(bool, disable_rcu_irq_enter);
303
304bool rcu_irq_enter_disabled(void)
305{
306 return this_cpu_read(disable_rcu_irq_enter);
307}
308
309/*
287 * Record entry into an extended quiescent state. This is only to be 310 * Record entry into an extended quiescent state. This is only to be
288 * called when not already in an extended quiescent state. 311 * called when not already in an extended quiescent state.
289 */ 312 */
290static void rcu_dynticks_eqs_enter(void) 313static void rcu_dynticks_eqs_enter(void)
291{ 314{
292 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 315 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
293 int special; 316 int seq;
294 317
295 /* 318 /*
296 * CPUs seeing atomic_inc_return() must see prior RCU read-side 319 * CPUs seeing atomic_add_return() must see prior RCU read-side
297 * critical sections, and we also must force ordering with the 320 * critical sections, and we also must force ordering with the
298 * next idle sojourn. 321 * next idle sojourn.
299 */ 322 */
300 special = atomic_inc_return(&rdtp->dynticks); 323 seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
301 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1); 324 /* Better be in an extended quiescent state! */
325 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
326 (seq & RCU_DYNTICK_CTRL_CTR));
327 /* Better not have special action (TLB flush) pending! */
328 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
329 (seq & RCU_DYNTICK_CTRL_MASK));
302} 330}
303 331
304/* 332/*
@@ -308,15 +336,22 @@ static void rcu_dynticks_eqs_enter(void)
308static void rcu_dynticks_eqs_exit(void) 336static void rcu_dynticks_eqs_exit(void)
309{ 337{
310 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 338 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
311 int special; 339 int seq;
312 340
313 /* 341 /*
314 * CPUs seeing atomic_inc_return() must see prior idle sojourns, 342 * CPUs seeing atomic_add_return() must see prior idle sojourns,
315 * and we also must force ordering with the next RCU read-side 343 * and we also must force ordering with the next RCU read-side
316 * critical section. 344 * critical section.
317 */ 345 */
318 special = atomic_inc_return(&rdtp->dynticks); 346 seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
319 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); 347 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
348 !(seq & RCU_DYNTICK_CTRL_CTR));
349 if (seq & RCU_DYNTICK_CTRL_MASK) {
350 atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks);
351 smp_mb__after_atomic(); /* _exit after clearing mask. */
352 /* Prefer duplicate flushes to losing a flush. */
353 rcu_eqs_special_exit();
354 }
320} 355}
321 356
322/* 357/*
@@ -333,9 +368,9 @@ static void rcu_dynticks_eqs_online(void)
333{ 368{
334 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 369 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
335 370
336 if (atomic_read(&rdtp->dynticks) & 0x1) 371 if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR)
337 return; 372 return;
338 atomic_add(0x1, &rdtp->dynticks); 373 atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
339} 374}
340 375
341/* 376/*
@@ -347,7 +382,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
347{ 382{
348 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 383 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
349 384
350 return !(atomic_read(&rdtp->dynticks) & 0x1); 385 return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR);
351} 386}
352 387
353/* 388/*
@@ -358,7 +393,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
358{ 393{
359 int snap = atomic_add_return(0, &rdtp->dynticks); 394 int snap = atomic_add_return(0, &rdtp->dynticks);
360 395
361 return snap; 396 return snap & ~RCU_DYNTICK_CTRL_MASK;
362} 397}
363 398
364/* 399/*
@@ -367,7 +402,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
367 */ 402 */
368static bool rcu_dynticks_in_eqs(int snap) 403static bool rcu_dynticks_in_eqs(int snap)
369{ 404{
370 return !(snap & 0x1); 405 return !(snap & RCU_DYNTICK_CTRL_CTR);
371} 406}
372 407
373/* 408/*
@@ -387,14 +422,34 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap)
387static void rcu_dynticks_momentary_idle(void) 422static void rcu_dynticks_momentary_idle(void)
388{ 423{
389 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 424 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
390 int special = atomic_add_return(2, &rdtp->dynticks); 425 int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
426 &rdtp->dynticks);
391 427
392 /* It is illegal to call this from idle state. */ 428 /* It is illegal to call this from idle state. */
393 WARN_ON_ONCE(!(special & 0x1)); 429 WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
394} 430}
395 431
396DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); 432/*
397EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); 433 * Set the special (bottom) bit of the specified CPU so that it
434 * will take special action (such as flushing its TLB) on the
435 * next exit from an extended quiescent state. Returns true if
436 * the bit was successfully set, or false if the CPU was not in
437 * an extended quiescent state.
438 */
439bool rcu_eqs_special_set(int cpu)
440{
441 int old;
442 int new;
443 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
444
445 do {
446 old = atomic_read(&rdtp->dynticks);
447 if (old & RCU_DYNTICK_CTRL_CTR)
448 return false;
449 new = old | RCU_DYNTICK_CTRL_MASK;
450 } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old);
451 return true;
452}
398 453
399/* 454/*
400 * Let the RCU core know that this CPU has gone through the scheduler, 455 * Let the RCU core know that this CPU has gone through the scheduler,
@@ -403,44 +458,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
403 * memory barriers to let the RCU core know about it, regardless of what 458 * memory barriers to let the RCU core know about it, regardless of what
404 * this CPU might (or might not) do in the near future. 459 * this CPU might (or might not) do in the near future.
405 * 460 *
406 * We inform the RCU core by emulating a zero-duration dyntick-idle 461 * We inform the RCU core by emulating a zero-duration dyntick-idle period.
407 * period, which we in turn do by incrementing the ->dynticks counter
408 * by two.
409 * 462 *
410 * The caller must have disabled interrupts. 463 * The caller must have disabled interrupts.
411 */ 464 */
412static void rcu_momentary_dyntick_idle(void) 465static void rcu_momentary_dyntick_idle(void)
413{ 466{
414 struct rcu_data *rdp; 467 raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false);
415 int resched_mask; 468 rcu_dynticks_momentary_idle();
416 struct rcu_state *rsp;
417
418 /*
419 * Yes, we can lose flag-setting operations. This is OK, because
420 * the flag will be set again after some delay.
421 */
422 resched_mask = raw_cpu_read(rcu_sched_qs_mask);
423 raw_cpu_write(rcu_sched_qs_mask, 0);
424
425 /* Find the flavor that needs a quiescent state. */
426 for_each_rcu_flavor(rsp) {
427 rdp = raw_cpu_ptr(rsp->rda);
428 if (!(resched_mask & rsp->flavor_mask))
429 continue;
430 smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
431 if (READ_ONCE(rdp->mynode->completed) !=
432 READ_ONCE(rdp->cond_resched_completed))
433 continue;
434
435 /*
436 * Pretend to be momentarily idle for the quiescent state.
437 * This allows the grace-period kthread to record the
438 * quiescent state, with no need for this CPU to do anything
439 * further.
440 */
441 rcu_dynticks_momentary_idle();
442 break;
443 }
444} 469}
445 470
446/* 471/*
@@ -448,14 +473,22 @@ static void rcu_momentary_dyntick_idle(void)
448 * and requires special handling for preemptible RCU. 473 * and requires special handling for preemptible RCU.
449 * The caller must have disabled interrupts. 474 * The caller must have disabled interrupts.
450 */ 475 */
451void rcu_note_context_switch(void) 476void rcu_note_context_switch(bool preempt)
452{ 477{
453 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 478 barrier(); /* Avoid RCU read-side critical sections leaking down. */
454 trace_rcu_utilization(TPS("Start context switch")); 479 trace_rcu_utilization(TPS("Start context switch"));
455 rcu_sched_qs(); 480 rcu_sched_qs();
456 rcu_preempt_note_context_switch(); 481 rcu_preempt_note_context_switch();
457 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 482 /* Load rcu_urgent_qs before other flags. */
483 if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs)))
484 goto out;
485 this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
486 if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs)))
458 rcu_momentary_dyntick_idle(); 487 rcu_momentary_dyntick_idle();
488 this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
489 if (!preempt)
490 rcu_note_voluntary_context_switch_lite(current);
491out:
459 trace_rcu_utilization(TPS("End context switch")); 492 trace_rcu_utilization(TPS("End context switch"));
460 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 493 barrier(); /* Avoid RCU read-side critical sections leaking up. */
461} 494}
@@ -478,29 +511,26 @@ void rcu_all_qs(void)
478{ 511{
479 unsigned long flags; 512 unsigned long flags;
480 513
514 if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs))
515 return;
516 preempt_disable();
517 /* Load rcu_urgent_qs before other flags. */
518 if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) {
519 preempt_enable();
520 return;
521 }
522 this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
481 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 523 barrier(); /* Avoid RCU read-side critical sections leaking down. */
482 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { 524 if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) {
483 local_irq_save(flags); 525 local_irq_save(flags);
484 rcu_momentary_dyntick_idle(); 526 rcu_momentary_dyntick_idle();
485 local_irq_restore(flags); 527 local_irq_restore(flags);
486 } 528 }
487 if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { 529 if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)))
488 /*
489 * Yes, we just checked a per-CPU variable with preemption
490 * enabled, so we might be migrated to some other CPU at
491 * this point. That is OK because in that case, the
492 * migration will supply the needed quiescent state.
493 * We might end up needlessly disabling preemption and
494 * invoking rcu_sched_qs() on the destination CPU, but
495 * the probability and cost are both quite low, so this
496 * should not be a problem in practice.
497 */
498 preempt_disable();
499 rcu_sched_qs(); 530 rcu_sched_qs();
500 preempt_enable(); 531 this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
501 }
502 this_cpu_inc(rcu_qs_ctr);
503 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 532 barrier(); /* Avoid RCU read-side critical sections leaking up. */
533 preempt_enable();
504} 534}
505EXPORT_SYMBOL_GPL(rcu_all_qs); 535EXPORT_SYMBOL_GPL(rcu_all_qs);
506 536
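
The rewritten fast paths in rcu_note_context_switch() and rcu_all_qs() above hinge on one ordering rule: rcu_need_heavy_qs is written first and rcu_urgent_qs is then published with a release store, while readers load rcu_urgent_qs with acquire semantics before consulting anything else. Below is a minimal userspace sketch of that publish/consume pairing, using C11 atomics instead of the kernel's smp_store_release()/smp_load_acquire() and with hypothetical names; it is an illustration of the idea, not the kernel code.

	#include <stdatomic.h>
	#include <stdbool.h>

	struct qs_flags {
		_Atomic bool need_heavy_qs;	/* payload flag, relaxed accesses */
		_Atomic bool urgent_qs;		/* publication flag */
	};

	/* Requester side: ask a CPU for a heavy quiescent state. */
	static void request_heavy_qs(struct qs_flags *f)
	{
		atomic_store_explicit(&f->need_heavy_qs, true, memory_order_relaxed);
		/* Release: the need_heavy_qs store above is ordered before this. */
		atomic_store_explicit(&f->urgent_qs, true, memory_order_release);
	}

	/* Context-switch side: cheap check first, then consume the flags. */
	static bool note_context_switch(struct qs_flags *f)
	{
		/* Acquire pairs with the release store in request_heavy_qs(). */
		if (!atomic_load_explicit(&f->urgent_qs, memory_order_acquire))
			return false;		/* fast path: nothing urgent */
		atomic_store_explicit(&f->urgent_qs, false, memory_order_relaxed);
		if (atomic_load_explicit(&f->need_heavy_qs, memory_order_relaxed)) {
			atomic_store_explicit(&f->need_heavy_qs, false,
					      memory_order_relaxed);
			/* the kernel would do rcu_momentary_dyntick_idle() here */
		}
		return true;
	}
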
@@ -689,15 +719,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
689 default: 719 default:
690 break; 720 break;
691 } 721 }
692 if (rsp != NULL) { 722 if (rsp == NULL)
693 *flags = READ_ONCE(rsp->gp_flags);
694 *gpnum = READ_ONCE(rsp->gpnum);
695 *completed = READ_ONCE(rsp->completed);
696 return; 723 return;
697 } 724 *flags = READ_ONCE(rsp->gp_flags);
698 *flags = 0; 725 *gpnum = READ_ONCE(rsp->gpnum);
699 *gpnum = 0; 726 *completed = READ_ONCE(rsp->completed);
700 *completed = 0;
701} 727}
702EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); 728EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
703 729
@@ -713,16 +739,6 @@ void rcutorture_record_progress(unsigned long vernum)
713EXPORT_SYMBOL_GPL(rcutorture_record_progress); 739EXPORT_SYMBOL_GPL(rcutorture_record_progress);
714 740
715/* 741/*
716 * Does the CPU have callbacks ready to be invoked?
717 */
718static int
719cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
720{
721 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
722 rdp->nxttail[RCU_NEXT_TAIL] != NULL;
723}
724
725/*
726 * Return the root node of the specified rcu_state structure. 742 * Return the root node of the specified rcu_state structure.
727 */ 743 */
728static struct rcu_node *rcu_get_root(struct rcu_state *rsp) 744static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
@@ -752,44 +768,39 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
752static bool 768static bool
753cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 769cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
754{ 770{
755 int i;
756
757 if (rcu_gp_in_progress(rsp)) 771 if (rcu_gp_in_progress(rsp))
758 return false; /* No, a grace period is already in progress. */ 772 return false; /* No, a grace period is already in progress. */
759 if (rcu_future_needs_gp(rsp)) 773 if (rcu_future_needs_gp(rsp))
760 return true; /* Yes, a no-CBs CPU needs one. */ 774 return true; /* Yes, a no-CBs CPU needs one. */
761 if (!rdp->nxttail[RCU_NEXT_TAIL]) 775 if (!rcu_segcblist_is_enabled(&rdp->cblist))
762 return false; /* No, this is a no-CBs (or offline) CPU. */ 776 return false; /* No, this is a no-CBs (or offline) CPU. */
763 if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) 777 if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
764 return true; /* Yes, CPU has newly registered callbacks. */ 778 return true; /* Yes, CPU has newly registered callbacks. */
765 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) 779 if (rcu_segcblist_future_gp_needed(&rdp->cblist,
766 if (rdp->nxttail[i - 1] != rdp->nxttail[i] && 780 READ_ONCE(rsp->completed)))
767 ULONG_CMP_LT(READ_ONCE(rsp->completed), 781 return true; /* Yes, CBs for future grace period. */
768 rdp->nxtcompleted[i]))
769 return true; /* Yes, CBs for future grace period. */
770 return false; /* No grace period needed. */ 782 return false; /* No grace period needed. */
771} 783}
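
cpu_needs_another_gp() now reads its answers off the segmented callback list type pulled in by tree.h below (rcu_segcblist.h). The following is a heavily simplified model of that structure's conventions, with made-up names: a single linked list carved into four segments by tail pointers, a NULL final tail meaning the list is disabled, and "restempty" meaning nothing follows a given segment's tail.

	#include <stdbool.h>
	#include <stddef.h>

	enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, SEG_COUNT };

	struct cb { struct cb *next; };

	/* Toy segmented callback list: tails[i] points at the ->next field of
	 * the last callback in segment i (or at an earlier tail when empty). */
	struct seg_cblist {
		struct cb *head;
		struct cb **tails[SEG_COUNT];
		unsigned long gp_seq[SEG_COUNT]; /* grace period each segment waits on */
	};

	/* Offloaded/offline CPUs disable their list by nulling the last tail. */
	static bool seg_is_enabled(const struct seg_cblist *l)
	{
		return l->tails[SEG_NEXT] != NULL;
	}

	/* Are all segments *after* @seg empty, i.e. does nothing follow its tail? */
	static bool seg_restempty(const struct seg_cblist *l, int seg)
	{
		return *l->tails[seg] == NULL;
	}
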
772 784
773/* 785/*
774 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state 786 * rcu_eqs_enter_common - current CPU is entering an extended quiescent state
775 * 787 *
776 * If the new value of the ->dynticks_nesting counter now is zero, 788 * Enter idle, doing appropriate accounting. The caller must have
777 * we really have entered idle, and must do the appropriate accounting. 789 * disabled interrupts.
778 * The caller must have disabled interrupts.
779 */ 790 */
780static void rcu_eqs_enter_common(long long oldval, bool user) 791static void rcu_eqs_enter_common(bool user)
781{ 792{
782 struct rcu_state *rsp; 793 struct rcu_state *rsp;
783 struct rcu_data *rdp; 794 struct rcu_data *rdp;
784 RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) 795 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
785 796
786 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); 797 trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
787 if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 798 if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
788 !user && !is_idle_task(current)) { 799 !user && !is_idle_task(current)) {
789 struct task_struct *idle __maybe_unused = 800 struct task_struct *idle __maybe_unused =
790 idle_task(smp_processor_id()); 801 idle_task(smp_processor_id());
791 802
792 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); 803 trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0);
793 rcu_ftrace_dump(DUMP_ORIG); 804 rcu_ftrace_dump(DUMP_ORIG);
794 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 805 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
795 current->pid, current->comm, 806 current->pid, current->comm,
@@ -800,7 +811,10 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
800 do_nocb_deferred_wakeup(rdp); 811 do_nocb_deferred_wakeup(rdp);
801 } 812 }
802 rcu_prepare_for_idle(); 813 rcu_prepare_for_idle();
803 rcu_dynticks_eqs_enter(); 814 __this_cpu_inc(disable_rcu_irq_enter);
815 rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */
816 rcu_dynticks_eqs_enter(); /* After this, tracing works again. */
817 __this_cpu_dec(disable_rcu_irq_enter);
804 rcu_dynticks_task_enter(); 818 rcu_dynticks_task_enter();
805 819
806 /* 820 /*
@@ -821,19 +835,15 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
821 */ 835 */
822static void rcu_eqs_enter(bool user) 836static void rcu_eqs_enter(bool user)
823{ 837{
824 long long oldval;
825 struct rcu_dynticks *rdtp; 838 struct rcu_dynticks *rdtp;
826 839
827 rdtp = this_cpu_ptr(&rcu_dynticks); 840 rdtp = this_cpu_ptr(&rcu_dynticks);
828 oldval = rdtp->dynticks_nesting;
829 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 841 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
830 (oldval & DYNTICK_TASK_NEST_MASK) == 0); 842 (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
831 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { 843 if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
832 rdtp->dynticks_nesting = 0; 844 rcu_eqs_enter_common(user);
833 rcu_eqs_enter_common(oldval, user); 845 else
834 } else {
835 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; 846 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
836 }
837} 847}
838 848
839/** 849/**
@@ -892,19 +902,18 @@ void rcu_user_enter(void)
892 */ 902 */
893void rcu_irq_exit(void) 903void rcu_irq_exit(void)
894{ 904{
895 long long oldval;
896 struct rcu_dynticks *rdtp; 905 struct rcu_dynticks *rdtp;
897 906
898 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); 907 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
899 rdtp = this_cpu_ptr(&rcu_dynticks); 908 rdtp = this_cpu_ptr(&rcu_dynticks);
900 oldval = rdtp->dynticks_nesting;
901 rdtp->dynticks_nesting--;
902 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 909 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
903 rdtp->dynticks_nesting < 0); 910 rdtp->dynticks_nesting < 1);
904 if (rdtp->dynticks_nesting) 911 if (rdtp->dynticks_nesting <= 1) {
905 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); 912 rcu_eqs_enter_common(true);
906 else 913 } else {
907 rcu_eqs_enter_common(oldval, true); 914 trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1);
915 rdtp->dynticks_nesting--;
916 }
908 rcu_sysidle_enter(1); 917 rcu_sysidle_enter(1);
909} 918}
910 919
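
The reworked rcu_irq_exit() drops the separate oldval snapshot: when the nesting count is about to reach zero it calls rcu_eqs_enter_common() (which itself zeroes ->dynticks_nesting), otherwise it only decrements. A toy userspace model of that rule, with invented names and none of the real tracing or dyntick bookkeeping:

	#include <stdbool.h>
	#include <stdio.h>

	static long nesting = 1;	/* process context counts as one level */
	static bool in_eqs;

	static void toy_irq_enter(void)
	{
		if (in_eqs) {		/* first irq while idle: leave EQS */
			in_eqs = false;
			nesting = 1;
		} else {
			nesting++;
		}
	}

	static void toy_irq_exit(void)
	{
		if (nesting <= 1) {	/* outermost exit: really go idle */
			nesting = 0;
			in_eqs = true;
		} else {
			nesting--;
		}
	}

	int main(void)
	{
		toy_irq_enter();	/* nested: nesting == 2 */
		toy_irq_exit();		/* back to 1, still not idle */
		printf("nesting=%ld in_eqs=%d\n", nesting, in_eqs);
		toy_irq_exit();		/* outermost exit: enter EQS */
		printf("nesting=%ld in_eqs=%d\n", nesting, in_eqs);
		return 0;
	}
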
@@ -1150,6 +1159,24 @@ bool notrace rcu_is_watching(void)
1150} 1159}
1151EXPORT_SYMBOL_GPL(rcu_is_watching); 1160EXPORT_SYMBOL_GPL(rcu_is_watching);
1152 1161
1162/*
1163 * If a holdout task is actually running, request an urgent quiescent
1164 * state from its CPU. This is unsynchronized, so migrations can cause
1165 * the request to go to the wrong CPU. Which is OK, all that will happen
1166 * is that the CPU's next context switch will be a bit slower and next
1167 * time around this task will generate another request.
1168 */
1169void rcu_request_urgent_qs_task(struct task_struct *t)
1170{
1171 int cpu;
1172
1173 barrier();
1174 cpu = task_cpu(t);
1175 if (!task_curr(t))
1176 return; /* This task is not running on that CPU. */
1177 smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true);
1178}
1179
1153#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 1180#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
1154 1181
1155/* 1182/*
@@ -1235,7 +1262,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1235 bool *isidle, unsigned long *maxj) 1262 bool *isidle, unsigned long *maxj)
1236{ 1263{
1237 unsigned long jtsq; 1264 unsigned long jtsq;
1238 int *rcrmp; 1265 bool *rnhqp;
1266 bool *ruqp;
1239 unsigned long rjtsc; 1267 unsigned long rjtsc;
1240 struct rcu_node *rnp; 1268 struct rcu_node *rnp;
1241 1269
@@ -1271,11 +1299,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1271 * might not be the case for nohz_full CPUs looping in the kernel. 1299 * might not be the case for nohz_full CPUs looping in the kernel.
1272 */ 1300 */
1273 rnp = rdp->mynode; 1301 rnp = rdp->mynode;
1302 ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
1274 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && 1303 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
1275 READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && 1304 READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) &&
1276 READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { 1305 READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
1277 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); 1306 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
1278 return 1; 1307 return 1;
1308 } else {
1309 /* Load rcu_qs_ctr before store to rcu_urgent_qs. */
1310 smp_store_release(ruqp, true);
1279 } 1311 }
1280 1312
1281 /* Check for the CPU being offline. */ 1313 /* Check for the CPU being offline. */
@@ -1292,7 +1324,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1292 * in-kernel CPU-bound tasks cannot advance grace periods. 1324 * in-kernel CPU-bound tasks cannot advance grace periods.
1293 * So if the grace period is old enough, make the CPU pay attention. 1325 * So if the grace period is old enough, make the CPU pay attention.
1294 * Note that the unsynchronized assignments to the per-CPU 1326 * Note that the unsynchronized assignments to the per-CPU
1295 * rcu_sched_qs_mask variable are safe. Yes, setting of 1327 * rcu_need_heavy_qs variable are safe. Yes, setting of
1296 * bits can be lost, but they will be set again on the next 1328 * bits can be lost, but they will be set again on the next
1297 * force-quiescent-state pass. So lost bit sets do not result 1329 * force-quiescent-state pass. So lost bit sets do not result
1298 * in incorrect behavior, merely in a grace period lasting 1330 * in incorrect behavior, merely in a grace period lasting
@@ -1306,16 +1338,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1306 * is set too high, we override with half of the RCU CPU stall 1338 * is set too high, we override with half of the RCU CPU stall
1307 * warning delay. 1339 * warning delay.
1308 */ 1340 */
1309 rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); 1341 rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu);
1310 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || 1342 if (!READ_ONCE(*rnhqp) &&
1311 time_after(jiffies, rdp->rsp->jiffies_resched)) { 1343 (time_after(jiffies, rdp->rsp->gp_start + jtsq) ||
1312 if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { 1344 time_after(jiffies, rdp->rsp->jiffies_resched))) {
1313 WRITE_ONCE(rdp->cond_resched_completed, 1345 WRITE_ONCE(*rnhqp, true);
1314 READ_ONCE(rdp->mynode->completed)); 1346 /* Store rcu_need_heavy_qs before rcu_urgent_qs. */
1315 smp_mb(); /* ->cond_resched_completed before *rcrmp. */ 1347 smp_store_release(ruqp, true);
1316 WRITE_ONCE(*rcrmp,
1317 READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
1318 }
1319 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ 1348 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
1320 } 1349 }
1321 1350
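
Both "is the grace period old enough?" checks above rely on time_after(), which stays correct across jiffies wraparound because it subtracts first and only then does a signed comparison. A standalone illustration of the idiom follows; the kernel's macro lives in include/linux/jiffies.h, and this copy exists only to show the arithmetic.

	#include <stdio.h>

	/* time_after(a, b): true when a is later than b, wraparound-safe. */
	#define sketch_time_after(a, b)  ((long)((b) - (a)) < 0)

	int main(void)
	{
		unsigned long gp_start = (unsigned long)-10; /* 10 ticks before wrap */
		unsigned long jtsq = 20;
		unsigned long now = 15;			/* 25 ticks later, post-wrap */

		/* gp_start + jtsq wraps to 10; 25 ticks > 20, so this prints 1. */
		printf("%d\n", sketch_time_after(now, gp_start + jtsq));
		return 0;
	}
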
@@ -1475,7 +1504,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1475 1504
1476 print_cpu_stall_info_end(); 1505 print_cpu_stall_info_end();
1477 for_each_possible_cpu(cpu) 1506 for_each_possible_cpu(cpu)
1478 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1507 totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
1508 cpu)->cblist);
1479 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", 1509 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
1480 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1510 smp_processor_id(), (long)(jiffies - rsp->gp_start),
1481 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1511 (long)rsp->gpnum, (long)rsp->completed, totqlen);
@@ -1529,7 +1559,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
1529 print_cpu_stall_info(rsp, smp_processor_id()); 1559 print_cpu_stall_info(rsp, smp_processor_id());
1530 print_cpu_stall_info_end(); 1560 print_cpu_stall_info_end();
1531 for_each_possible_cpu(cpu) 1561 for_each_possible_cpu(cpu)
1532 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1562 totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
1563 cpu)->cblist);
1533 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", 1564 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
1534 jiffies - rsp->gp_start, 1565 jiffies - rsp->gp_start,
1535 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1566 (long)rsp->gpnum, (long)rsp->completed, totqlen);
@@ -1632,30 +1663,6 @@ void rcu_cpu_stall_reset(void)
1632} 1663}
1633 1664
1634/* 1665/*
1635 * Initialize the specified rcu_data structure's default callback list
1636 * to empty. The default callback list is the one that is not used by
1637 * no-callbacks CPUs.
1638 */
1639static void init_default_callback_list(struct rcu_data *rdp)
1640{
1641 int i;
1642
1643 rdp->nxtlist = NULL;
1644 for (i = 0; i < RCU_NEXT_SIZE; i++)
1645 rdp->nxttail[i] = &rdp->nxtlist;
1646}
1647
1648/*
1649 * Initialize the specified rcu_data structure's callback list to empty.
1650 */
1651static void init_callback_list(struct rcu_data *rdp)
1652{
1653 if (init_nocb_callback_list(rdp))
1654 return;
1655 init_default_callback_list(rdp);
1656}
1657
1658/*
1659 * Determine the value that ->completed will have at the end of the 1666 * Determine the value that ->completed will have at the end of the
1660 * next subsequent grace period. This is used to tag callbacks so that 1667 * next subsequent grace period. This is used to tag callbacks so that
1661 * a CPU can invoke callbacks in a timely fashion even if that CPU has 1668 * a CPU can invoke callbacks in a timely fashion even if that CPU has
@@ -1709,7 +1716,6 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1709 unsigned long *c_out) 1716 unsigned long *c_out)
1710{ 1717{
1711 unsigned long c; 1718 unsigned long c;
1712 int i;
1713 bool ret = false; 1719 bool ret = false;
1714 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); 1720 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1715 1721
@@ -1755,13 +1761,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1755 /* 1761 /*
1756 * Get a new grace-period number. If there really is no grace 1762 * Get a new grace-period number. If there really is no grace
1757 * period in progress, it will be smaller than the one we obtained 1763 * period in progress, it will be smaller than the one we obtained
1758 * earlier. Adjust callbacks as needed. Note that even no-CBs 1764 * earlier. Adjust callbacks as needed.
1759 * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
1760 */ 1765 */
1761 c = rcu_cbs_completed(rdp->rsp, rnp_root); 1766 c = rcu_cbs_completed(rdp->rsp, rnp_root);
1762 for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) 1767 if (!rcu_is_nocb_cpu(rdp->cpu))
1763 if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) 1768 (void)rcu_segcblist_accelerate(&rdp->cblist, c);
1764 rdp->nxtcompleted[i] = c;
1765 1769
1766 /* 1770 /*
1767 * If the needed for the required grace period is already 1771 * If the needed for the required grace period is already
@@ -1793,9 +1797,7 @@ out:
1793 1797
1794/* 1798/*
1795 * Clean up any old requests for the just-ended grace period. Also return 1799 * Clean up any old requests for the just-ended grace period. Also return
1796 * whether any additional grace periods have been requested. Also invoke 1800 * whether any additional grace periods have been requested.
1797 * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
1798 * waiting for this grace period to complete.
1799 */ 1801 */
1800static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 1802static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1801{ 1803{
@@ -1841,57 +1843,27 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
1841static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1843static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1842 struct rcu_data *rdp) 1844 struct rcu_data *rdp)
1843{ 1845{
1844 unsigned long c; 1846 bool ret = false;
1845 int i;
1846 bool ret;
1847
1848 /* If the CPU has no callbacks, nothing to do. */
1849 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1850 return false;
1851
1852 /*
1853 * Starting from the sublist containing the callbacks most
1854 * recently assigned a ->completed number and working down, find the
1855 * first sublist that is not assignable to an upcoming grace period.
1856 * Such a sublist has something in it (first two tests) and has
1857 * a ->completed number assigned that will complete sooner than
1858 * the ->completed number for newly arrived callbacks (last test).
1859 *
1860 * The key point is that any later sublist can be assigned the
1861 * same ->completed number as the newly arrived callbacks, which
1862 * means that the callbacks in any of these later sublist can be
1863 * grouped into a single sublist, whether or not they have already
1864 * been assigned a ->completed number.
1865 */
1866 c = rcu_cbs_completed(rsp, rnp);
1867 for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
1868 if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
1869 !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
1870 break;
1871 1847
1872 /* 1848 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1873 * If there are no sublist for unassigned callbacks, leave. 1849 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1874 * At the same time, advance "i" one sublist, so that "i" will
1875 * index into the sublist where all the remaining callbacks should
1876 * be grouped into.
1877 */
1878 if (++i >= RCU_NEXT_TAIL)
1879 return false; 1850 return false;
1880 1851
1881 /* 1852 /*
1882 * Assign all subsequent callbacks' ->completed number to the next 1853 * Callbacks are often registered with incomplete grace-period
1883 * full grace period and group them all in the sublist initially 1854 * information. Something about the fact that getting exact
1884 * indexed by "i". 1855 * information requires acquiring a global lock... RCU therefore
1856 * makes a conservative estimate of the grace period number at which
1857 * a given callback will become ready to invoke. The following
1858 * code checks this estimate and improves it when possible, thus
1859 * accelerating callback invocation to an earlier grace-period
1860 * number.
1885 */ 1861 */
1886 for (; i <= RCU_NEXT_TAIL; i++) { 1862 if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp)))
1887 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; 1863 ret = rcu_start_future_gp(rnp, rdp, NULL);
1888 rdp->nxtcompleted[i] = c;
1889 }
1890 /* Record any needed additional grace periods. */
1891 ret = rcu_start_future_gp(rnp, rdp, NULL);
1892 1864
1893 /* Trace depending on how much we were able to accelerate. */ 1865 /* Trace depending on how much we were able to accelerate. */
1894 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1866 if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
1895 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); 1867 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
1896 else 1868 else
1897 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); 1869 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
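
rcu_segcblist_accelerate() replaces the open-coded sublist walk above: callbacks not yet tagged with a grace-period number receive the conservative estimate from rcu_cbs_completed(), and the return value tells the caller whether a future grace period must now be requested. A toy flat-array model of that idea (the real helper slides segment tail pointers rather than walking an array):

	#include <stdbool.h>

	struct toy_cb { unsigned long completes_at; }; /* 0 == not yet assigned */

	/* Tag every unassigned callback with the conservative estimate
	 * @gp_next; report whether anything was newly tagged. */
	static bool toy_accelerate(struct toy_cb *cbs, int n, unsigned long gp_next)
	{
		bool accelerated = false;
		int i;

		for (i = 0; i < n; i++) {
			if (cbs[i].completes_at == 0) {
				cbs[i].completes_at = gp_next;
				accelerated = true;
			}
		}
		return accelerated;
	}
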
@@ -1911,32 +1883,15 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1911static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1883static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1912 struct rcu_data *rdp) 1884 struct rcu_data *rdp)
1913{ 1885{
1914 int i, j; 1886 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1915 1887 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1916 /* If the CPU has no callbacks, nothing to do. */
1917 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1918 return false; 1888 return false;
1919 1889
1920 /* 1890 /*
1921 * Find all callbacks whose ->completed numbers indicate that they 1891 * Find all callbacks whose ->completed numbers indicate that they
1922 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. 1892 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1923 */ 1893 */
1924 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { 1894 rcu_segcblist_advance(&rdp->cblist, rnp->completed);
1925 if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
1926 break;
1927 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
1928 }
1929 /* Clean up any sublist tail pointers that were misordered above. */
1930 for (j = RCU_WAIT_TAIL; j < i; j++)
1931 rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
1932
1933 /* Copy down callbacks to fill in empty sublists. */
1934 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
1935 if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
1936 break;
1937 rdp->nxttail[j] = rdp->nxttail[i];
1938 rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
1939 }
1940 1895
1941 /* Classify any remaining callbacks. */ 1896 /* Classify any remaining callbacks. */
1942 return rcu_accelerate_cbs(rsp, rnp, rdp); 1897 return rcu_accelerate_cbs(rsp, rnp, rdp);
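
rcu_segcblist_advance() is the matching operation on the other side of a grace period: every callback whose assigned number is now covered by rnp->completed becomes ready to invoke, and whatever remains is re-classified by rcu_accelerate_cbs(). Continuing the toy flat-array model (repeated here so the sketch stands alone; wraparound-safe comparisons are omitted):

	#include <stddef.h>

	struct toy_cb { unsigned long completes_at; }; /* 0 == not yet assigned */

	/* Count how many callbacks the completed grace period @completed makes
	 * invocable; the real helper instead moves them to the DONE segment. */
	static size_t toy_advance(const struct toy_cb *cbs, size_t n,
				  unsigned long completed)
	{
		size_t ready = 0, i;

		for (i = 0; i < n; i++)
			if (cbs[i].completes_at && cbs[i].completes_at <= completed)
				ready++;
		return ready;
	}
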
@@ -1981,7 +1936,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1981 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1936 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1982 need_gp = !!(rnp->qsmask & rdp->grpmask); 1937 need_gp = !!(rnp->qsmask & rdp->grpmask);
1983 rdp->cpu_no_qs.b.norm = need_gp; 1938 rdp->cpu_no_qs.b.norm = need_gp;
1984 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 1939 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
1985 rdp->core_needs_qs = need_gp; 1940 rdp->core_needs_qs = need_gp;
1986 zero_cpu_stall_ticks(rdp); 1941 zero_cpu_stall_ticks(rdp);
1987 WRITE_ONCE(rdp->gpwrap, false); 1942 WRITE_ONCE(rdp->gpwrap, false);
@@ -2579,7 +2534,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2579 * within the current grace period. 2534 * within the current grace period.
2580 */ 2535 */
2581 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ 2536 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
2582 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 2537 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
2583 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2538 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2584 return; 2539 return;
2585 } 2540 }
@@ -2653,13 +2608,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2653 * because _rcu_barrier() excludes CPU-hotplug operations, so it 2608 * because _rcu_barrier() excludes CPU-hotplug operations, so it
2654 * cannot be running now. Thus no memory barrier is required. 2609 * cannot be running now. Thus no memory barrier is required.
2655 */ 2610 */
2656 if (rdp->nxtlist != NULL) { 2611 rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist);
2657 rsp->qlen_lazy += rdp->qlen_lazy; 2612 rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done);
2658 rsp->qlen += rdp->qlen;
2659 rdp->n_cbs_orphaned += rdp->qlen;
2660 rdp->qlen_lazy = 0;
2661 WRITE_ONCE(rdp->qlen, 0);
2662 }
2663 2613
2664 /* 2614 /*
2665 * Next, move those callbacks still needing a grace period to 2615 * Next, move those callbacks still needing a grace period to
@@ -2667,31 +2617,18 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2667 * Some of the callbacks might have gone partway through a grace 2617 * Some of the callbacks might have gone partway through a grace
2668 * period, but that is too bad. They get to start over because we 2618 * period, but that is too bad. They get to start over because we
2669 * cannot assume that grace periods are synchronized across CPUs. 2619 * cannot assume that grace periods are synchronized across CPUs.
2670 * We don't bother updating the ->nxttail[] array yet, instead
2671 * we just reset the whole thing later on.
2672 */ 2620 */
2673 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { 2621 rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
2674 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
2675 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
2676 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
2677 }
2678 2622
2679 /* 2623 /*
2680 * Then move the ready-to-invoke callbacks to the orphanage, 2624 * Then move the ready-to-invoke callbacks to the orphanage,
2681 * where some other CPU will pick them up. These will not be 2625 * where some other CPU will pick them up. These will not be
 2682 * required to pass through another grace period: They are done. 2626 * required to pass through another grace period: They are done.
 2682 * required to pass through another grace period: They are done. 2626 * required to pass through another grace period: They are done.
2683 */ 2627 */
2684 if (rdp->nxtlist != NULL) { 2628 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done);
2685 *rsp->orphan_donetail = rdp->nxtlist;
2686 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
2687 }
2688 2629
2689 /* 2630 /* Finally, disallow further callbacks on this CPU. */
2690 * Finally, initialize the rcu_data structure's list to empty and 2631 rcu_segcblist_disable(&rdp->cblist);
2691 * disallow further callbacks on this CPU.
2692 */
2693 init_callback_list(rdp);
2694 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2695} 2632}
2696 2633
2697/* 2634/*
@@ -2700,7 +2637,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2700 */ 2637 */
2701static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) 2638static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2702{ 2639{
2703 int i;
2704 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 2640 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
2705 2641
2706 /* No-CBs CPUs are handled specially. */ 2642 /* No-CBs CPUs are handled specially. */
@@ -2709,13 +2645,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2709 return; 2645 return;
2710 2646
2711 /* Do the accounting first. */ 2647 /* Do the accounting first. */
2712 rdp->qlen_lazy += rsp->qlen_lazy; 2648 rdp->n_cbs_adopted += rsp->orphan_done.len;
2713 rdp->qlen += rsp->qlen; 2649 if (rsp->orphan_done.len_lazy != rsp->orphan_done.len)
2714 rdp->n_cbs_adopted += rsp->qlen;
2715 if (rsp->qlen_lazy != rsp->qlen)
2716 rcu_idle_count_callbacks_posted(); 2650 rcu_idle_count_callbacks_posted();
2717 rsp->qlen_lazy = 0; 2651 rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done);
2718 rsp->qlen = 0;
2719 2652
2720 /* 2653 /*
2721 * We do not need a memory barrier here because the only way we 2654 * We do not need a memory barrier here because the only way we
@@ -2723,24 +2656,13 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2723 * we are the task doing the rcu_barrier(). 2656 * we are the task doing the rcu_barrier().
2724 */ 2657 */
2725 2658
2726 /* First adopt the ready-to-invoke callbacks. */ 2659 /* First adopt the ready-to-invoke callbacks, then the done ones. */
2727 if (rsp->orphan_donelist != NULL) { 2660 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done);
2728 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; 2661 WARN_ON_ONCE(rsp->orphan_done.head);
2729 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; 2662 rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
2730 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) 2663 WARN_ON_ONCE(rsp->orphan_pend.head);
2731 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) 2664 WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) !=
2732 rdp->nxttail[i] = rsp->orphan_donetail; 2665 !rcu_segcblist_n_cbs(&rdp->cblist));
2733 rsp->orphan_donelist = NULL;
2734 rsp->orphan_donetail = &rsp->orphan_donelist;
2735 }
2736
2737 /* And then adopt the callbacks that still need a grace period. */
2738 if (rsp->orphan_nxtlist != NULL) {
2739 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
2740 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
2741 rsp->orphan_nxtlist = NULL;
2742 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2743 }
2744} 2666}
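
The orphanage path now moves whole callback-list pieces instead of splicing ->nxttail[] by hand, but the bookkeeping invariant is unchanged: the counts travel with the callbacks, pulled out of the dying CPU's list into rsp->orphan_done and later folded into the adopter, so nothing is ever dropped from the totals. A toy sketch of that count hand-off, with hypothetical names:

	/* Toy count hand-off: the dying CPU's totals migrate to the global
	 * orphan structure first and are folded into the adopting CPU later,
	 * so the sum over all lists is preserved at every step. */
	struct toy_count { long len; long len_lazy; };

	static void toy_extract_count(struct toy_count *donor, struct toy_count *orphan)
	{
		orphan->len += donor->len;
		orphan->len_lazy += donor->len_lazy;
		donor->len = 0;
		donor->len_lazy = 0;
	}

	static void toy_insert_count(struct toy_count *adopter, struct toy_count *orphan)
	{
		adopter->len += orphan->len;
		adopter->len_lazy += orphan->len_lazy;
		orphan->len = 0;
		orphan->len_lazy = 0;
	}
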
2745 2667
2746/* 2668/*
@@ -2748,14 +2670,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2748 */ 2670 */
2749static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 2671static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2750{ 2672{
2751 RCU_TRACE(unsigned long mask); 2673 RCU_TRACE(unsigned long mask;)
2752 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); 2674 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);)
2753 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); 2675 RCU_TRACE(struct rcu_node *rnp = rdp->mynode;)
2754 2676
2755 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 2677 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
2756 return; 2678 return;
2757 2679
2758 RCU_TRACE(mask = rdp->grpmask); 2680 RCU_TRACE(mask = rdp->grpmask;)
2759 trace_rcu_grace_period(rsp->name, 2681 trace_rcu_grace_period(rsp->name,
2760 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 2682 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
2761 TPS("cpuofl")); 2683 TPS("cpuofl"));
@@ -2828,9 +2750,11 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2828 rcu_adopt_orphan_cbs(rsp, flags); 2750 rcu_adopt_orphan_cbs(rsp, flags);
2829 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); 2751 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
2830 2752
2831 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 2753 WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
2832 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 2754 !rcu_segcblist_empty(&rdp->cblist),
2833 cpu, rdp->qlen, rdp->nxtlist); 2755 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
2756 cpu, rcu_segcblist_n_cbs(&rdp->cblist),
2757 rcu_segcblist_first_cb(&rdp->cblist));
2834} 2758}
2835 2759
2836/* 2760/*
@@ -2840,14 +2764,17 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2840static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) 2764static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2841{ 2765{
2842 unsigned long flags; 2766 unsigned long flags;
2843 struct rcu_head *next, *list, **tail; 2767 struct rcu_head *rhp;
2844 long bl, count, count_lazy; 2768 struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
2845 int i; 2769 long bl, count;
2846 2770
2847 /* If no callbacks are ready, just return. */ 2771 /* If no callbacks are ready, just return. */
2848 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 2772 if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
2849 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); 2773 trace_rcu_batch_start(rsp->name,
2850 trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), 2774 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2775 rcu_segcblist_n_cbs(&rdp->cblist), 0);
2776 trace_rcu_batch_end(rsp->name, 0,
2777 !rcu_segcblist_empty(&rdp->cblist),
2851 need_resched(), is_idle_task(current), 2778 need_resched(), is_idle_task(current),
2852 rcu_is_callbacks_kthread()); 2779 rcu_is_callbacks_kthread());
2853 return; 2780 return;
@@ -2855,73 +2782,61 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2855 2782
2856 /* 2783 /*
2857 * Extract the list of ready callbacks, disabling to prevent 2784 * Extract the list of ready callbacks, disabling to prevent
2858 * races with call_rcu() from interrupt handlers. 2785 * races with call_rcu() from interrupt handlers. Leave the
2786 * callback counts, as rcu_barrier() needs to be conservative.
2859 */ 2787 */
2860 local_irq_save(flags); 2788 local_irq_save(flags);
2861 WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); 2789 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
2862 bl = rdp->blimit; 2790 bl = rdp->blimit;
2863 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); 2791 trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2864 list = rdp->nxtlist; 2792 rcu_segcblist_n_cbs(&rdp->cblist), bl);
2865 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 2793 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
2866 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
2867 tail = rdp->nxttail[RCU_DONE_TAIL];
2868 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
2869 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
2870 rdp->nxttail[i] = &rdp->nxtlist;
2871 local_irq_restore(flags); 2794 local_irq_restore(flags);
2872 2795
2873 /* Invoke callbacks. */ 2796 /* Invoke callbacks. */
2874 count = count_lazy = 0; 2797 rhp = rcu_cblist_dequeue(&rcl);
2875 while (list) { 2798 for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
2876 next = list->next; 2799 debug_rcu_head_unqueue(rhp);
2877 prefetch(next); 2800 if (__rcu_reclaim(rsp->name, rhp))
2878 debug_rcu_head_unqueue(list); 2801 rcu_cblist_dequeued_lazy(&rcl);
2879 if (__rcu_reclaim(rsp->name, list)) 2802 /*
2880 count_lazy++; 2803 * Stop only if limit reached and CPU has something to do.
2881 list = next; 2804 * Note: The rcl structure counts down from zero.
2882 /* Stop only if limit reached and CPU has something to do. */ 2805 */
2883 if (++count >= bl && 2806 if (-rcl.len >= bl &&
2884 (need_resched() || 2807 (need_resched() ||
2885 (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) 2808 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
2886 break; 2809 break;
2887 } 2810 }
2888 2811
2889 local_irq_save(flags); 2812 local_irq_save(flags);
2890 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), 2813 count = -rcl.len;
2891 is_idle_task(current), 2814 trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(),
2892 rcu_is_callbacks_kthread()); 2815 is_idle_task(current), rcu_is_callbacks_kthread());
2893 2816
2894 /* Update count, and requeue any remaining callbacks. */ 2817 /* Update counts and requeue any remaining callbacks. */
2895 if (list != NULL) { 2818 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
2896 *tail = rdp->nxtlist;
2897 rdp->nxtlist = list;
2898 for (i = 0; i < RCU_NEXT_SIZE; i++)
2899 if (&rdp->nxtlist == rdp->nxttail[i])
2900 rdp->nxttail[i] = tail;
2901 else
2902 break;
2903 }
2904 smp_mb(); /* List handling before counting for rcu_barrier(). */ 2819 smp_mb(); /* List handling before counting for rcu_barrier(). */
2905 rdp->qlen_lazy -= count_lazy;
2906 WRITE_ONCE(rdp->qlen, rdp->qlen - count);
2907 rdp->n_cbs_invoked += count; 2820 rdp->n_cbs_invoked += count;
2821 rcu_segcblist_insert_count(&rdp->cblist, &rcl);
2908 2822
2909 /* Reinstate batch limit if we have worked down the excess. */ 2823 /* Reinstate batch limit if we have worked down the excess. */
2910 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 2824 count = rcu_segcblist_n_cbs(&rdp->cblist);
2825 if (rdp->blimit == LONG_MAX && count <= qlowmark)
2911 rdp->blimit = blimit; 2826 rdp->blimit = blimit;
2912 2827
2913 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ 2828 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
2914 if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { 2829 if (count == 0 && rdp->qlen_last_fqs_check != 0) {
2915 rdp->qlen_last_fqs_check = 0; 2830 rdp->qlen_last_fqs_check = 0;
2916 rdp->n_force_qs_snap = rsp->n_force_qs; 2831 rdp->n_force_qs_snap = rsp->n_force_qs;
2917 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) 2832 } else if (count < rdp->qlen_last_fqs_check - qhimark)
2918 rdp->qlen_last_fqs_check = rdp->qlen; 2833 rdp->qlen_last_fqs_check = count;
2919 WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); 2834 WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
2920 2835
2921 local_irq_restore(flags); 2836 local_irq_restore(flags);
2922 2837
2923 /* Re-invoke RCU core processing if there are callbacks remaining. */ 2838 /* Re-invoke RCU core processing if there are callbacks remaining. */
2924 if (cpu_has_callbacks_ready_to_invoke(rdp)) 2839 if (rcu_segcblist_ready_cbs(&rdp->cblist))
2925 invoke_rcu_core(); 2840 invoke_rcu_core();
2926} 2841}
2927 2842
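
The on-stack rcu_cblist used while invoking callbacks above starts with len == 0 and is only ever decremented by rcu_cblist_dequeue(), so -rcl.len is the number invoked so far, and adding the (negative) remainder back into the per-CPU list via rcu_segcblist_insert_count() settles the totals only after the batch finishes. A toy demonstration of the count-down-from-zero convention:

	#include <stdio.h>

	struct toy_cblist { long len; };

	static void toy_dequeue(struct toy_cblist *l)
	{
		l->len--;		/* goes negative: one more callback invoked */
	}

	int main(void)
	{
		struct toy_cblist rcl = { .len = 0 };
		long percpu_count = 5;		/* five callbacks were ready */
		int i;

		for (i = 0; i < 3; i++)
			toy_dequeue(&rcl);	/* invoke three of them */

		printf("invoked=%ld\n", -rcl.len);	/* 3 */
		percpu_count += rcl.len;		/* 5 + (-3) = 2 still queued */
		printf("remaining=%ld\n", percpu_count);
		return 0;
	}
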
@@ -3087,7 +3002,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
3087 bool needwake; 3002 bool needwake;
3088 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 3003 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
3089 3004
3090 WARN_ON_ONCE(rdp->beenonline == 0); 3005 WARN_ON_ONCE(!rdp->beenonline);
3091 3006
3092 /* Update RCU state based on any recent quiescent states. */ 3007 /* Update RCU state based on any recent quiescent states. */
3093 rcu_check_quiescent_state(rsp, rdp); 3008 rcu_check_quiescent_state(rsp, rdp);
@@ -3105,7 +3020,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
3105 } 3020 }
3106 3021
3107 /* If there are callbacks ready, invoke them. */ 3022 /* If there are callbacks ready, invoke them. */
3108 if (cpu_has_callbacks_ready_to_invoke(rdp)) 3023 if (rcu_segcblist_ready_cbs(&rdp->cblist))
3109 invoke_rcu_callbacks(rsp, rdp); 3024 invoke_rcu_callbacks(rsp, rdp);
3110 3025
3111 /* Do any needed deferred wakeups of rcuo kthreads. */ 3026 /* Do any needed deferred wakeups of rcuo kthreads. */
@@ -3177,7 +3092,8 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
3177 * invoking force_quiescent_state() if the newly enqueued callback 3092 * invoking force_quiescent_state() if the newly enqueued callback
3178 * is the only one waiting for a grace period to complete. 3093 * is the only one waiting for a grace period to complete.
3179 */ 3094 */
3180 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 3095 if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
3096 rdp->qlen_last_fqs_check + qhimark)) {
3181 3097
3182 /* Are we ignoring a completed grace period? */ 3098 /* Are we ignoring a completed grace period? */
3183 note_gp_changes(rsp, rdp); 3099 note_gp_changes(rsp, rdp);
@@ -3195,10 +3111,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
3195 /* Give the grace period a kick. */ 3111 /* Give the grace period a kick. */
3196 rdp->blimit = LONG_MAX; 3112 rdp->blimit = LONG_MAX;
3197 if (rsp->n_force_qs == rdp->n_force_qs_snap && 3113 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
3198 *rdp->nxttail[RCU_DONE_TAIL] != head) 3114 rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
3199 force_quiescent_state(rsp); 3115 force_quiescent_state(rsp);
3200 rdp->n_force_qs_snap = rsp->n_force_qs; 3116 rdp->n_force_qs_snap = rsp->n_force_qs;
3201 rdp->qlen_last_fqs_check = rdp->qlen; 3117 rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
3202 } 3118 }
3203 } 3119 }
3204} 3120}
@@ -3238,7 +3154,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
3238 rdp = this_cpu_ptr(rsp->rda); 3154 rdp = this_cpu_ptr(rsp->rda);
3239 3155
3240 /* Add the callback to our list. */ 3156 /* Add the callback to our list. */
3241 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { 3157 if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) {
3242 int offline; 3158 int offline;
3243 3159
3244 if (cpu != -1) 3160 if (cpu != -1)
@@ -3257,23 +3173,21 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
3257 */ 3173 */
3258 BUG_ON(cpu != -1); 3174 BUG_ON(cpu != -1);
3259 WARN_ON_ONCE(!rcu_is_watching()); 3175 WARN_ON_ONCE(!rcu_is_watching());
3260 if (!likely(rdp->nxtlist)) 3176 if (rcu_segcblist_empty(&rdp->cblist))
3261 init_default_callback_list(rdp); 3177 rcu_segcblist_init(&rdp->cblist);
3262 } 3178 }
3263 WRITE_ONCE(rdp->qlen, rdp->qlen + 1); 3179 rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
3264 if (lazy) 3180 if (!lazy)
3265 rdp->qlen_lazy++;
3266 else
3267 rcu_idle_count_callbacks_posted(); 3181 rcu_idle_count_callbacks_posted();
3268 smp_mb(); /* Count before adding callback for rcu_barrier(). */
3269 *rdp->nxttail[RCU_NEXT_TAIL] = head;
3270 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
3271 3182
3272 if (__is_kfree_rcu_offset((unsigned long)func)) 3183 if (__is_kfree_rcu_offset((unsigned long)func))
3273 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 3184 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
3274 rdp->qlen_lazy, rdp->qlen); 3185 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
3186 rcu_segcblist_n_cbs(&rdp->cblist));
3275 else 3187 else
3276 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); 3188 trace_rcu_callback(rsp->name, head,
3189 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
3190 rcu_segcblist_n_cbs(&rdp->cblist));
3277 3191
3278 /* Go handle any RCU core processing required. */ 3192 /* Go handle any RCU core processing required. */
3279 __call_rcu_core(rsp, rdp, head, flags); 3193 __call_rcu_core(rsp, rdp, head, flags);
@@ -3519,41 +3433,6 @@ void cond_synchronize_sched(unsigned long oldstate)
3519} 3433}
3520EXPORT_SYMBOL_GPL(cond_synchronize_sched); 3434EXPORT_SYMBOL_GPL(cond_synchronize_sched);
3521 3435
3522/* Adjust sequence number for start of update-side operation. */
3523static void rcu_seq_start(unsigned long *sp)
3524{
3525 WRITE_ONCE(*sp, *sp + 1);
3526 smp_mb(); /* Ensure update-side operation after counter increment. */
3527 WARN_ON_ONCE(!(*sp & 0x1));
3528}
3529
3530/* Adjust sequence number for end of update-side operation. */
3531static void rcu_seq_end(unsigned long *sp)
3532{
3533 smp_mb(); /* Ensure update-side operation before counter increment. */
3534 WRITE_ONCE(*sp, *sp + 1);
3535 WARN_ON_ONCE(*sp & 0x1);
3536}
3537
3538/* Take a snapshot of the update side's sequence number. */
3539static unsigned long rcu_seq_snap(unsigned long *sp)
3540{
3541 unsigned long s;
3542
3543 s = (READ_ONCE(*sp) + 3) & ~0x1;
3544 smp_mb(); /* Above access must not bleed into critical section. */
3545 return s;
3546}
3547
3548/*
3549 * Given a snapshot from rcu_seq_snap(), determine whether or not a
3550 * full update-side operation has occurred.
3551 */
3552static bool rcu_seq_done(unsigned long *sp, unsigned long s)
3553{
3554 return ULONG_CMP_GE(READ_ONCE(*sp), s);
3555}
3556
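
The rcu_seq_*() helpers removed from tree.c here implement a simple odd/even sequence counter: the low bit is set while an update is in flight, rcu_seq_snap() picks the first value that guarantees a complete update after the snapshot, and rcu_seq_done() polls for it. A compressed userspace sketch of the same formulas, with the memory barriers and wraparound-safe comparison omitted:

	#include <stdbool.h>
	#include <stdio.h>

	static unsigned long seq;		/* even: idle, odd: update running */

	static void seq_start(void) { seq++; }	/* becomes odd */
	static void seq_end(void)   { seq++; }	/* becomes even again */

	/* First value that guarantees a full update completes after now. */
	static unsigned long seq_snap(void) { return (seq + 3) & ~0x1UL; }

	static bool seq_done(unsigned long snap) { return seq >= snap; }

	int main(void)
	{
		unsigned long snap = seq_snap();	/* seq == 0 -> snap == 2 */

		printf("done before update? %d\n", seq_done(snap));	/* 0 */
		seq_start();
		seq_end();					/* seq == 2 */
		printf("done after update?  %d\n", seq_done(snap));	/* 1 */
		return 0;
	}
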
3557/* 3436/*
3558 * Check to see if there is any immediate RCU-related work to be done 3437 * Check to see if there is any immediate RCU-related work to be done
3559 * by the current CPU, for the specified type of RCU, returning 1 if so. 3438 * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -3577,7 +3456,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3577 /* Is the RCU core waiting for a quiescent state from this CPU? */ 3456 /* Is the RCU core waiting for a quiescent state from this CPU? */
3578 if (rcu_scheduler_fully_active && 3457 if (rcu_scheduler_fully_active &&
3579 rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && 3458 rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
3580 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { 3459 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) {
3581 rdp->n_rp_core_needs_qs++; 3460 rdp->n_rp_core_needs_qs++;
3582 } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { 3461 } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) {
3583 rdp->n_rp_report_qs++; 3462 rdp->n_rp_report_qs++;
@@ -3585,7 +3464,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3585 } 3464 }
3586 3465
3587 /* Does this CPU have callbacks ready to invoke? */ 3466 /* Does this CPU have callbacks ready to invoke? */
3588 if (cpu_has_callbacks_ready_to_invoke(rdp)) { 3467 if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
3589 rdp->n_rp_cb_ready++; 3468 rdp->n_rp_cb_ready++;
3590 return 1; 3469 return 1;
3591 } 3470 }
@@ -3649,10 +3528,10 @@ static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
3649 3528
3650 for_each_rcu_flavor(rsp) { 3529 for_each_rcu_flavor(rsp) {
3651 rdp = this_cpu_ptr(rsp->rda); 3530 rdp = this_cpu_ptr(rsp->rda);
3652 if (!rdp->nxtlist) 3531 if (rcu_segcblist_empty(&rdp->cblist))
3653 continue; 3532 continue;
3654 hc = true; 3533 hc = true;
3655 if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { 3534 if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) {
3656 al = false; 3535 al = false;
3657 break; 3536 break;
3658 } 3537 }
@@ -3761,7 +3640,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3761 __call_rcu(&rdp->barrier_head, 3640 __call_rcu(&rdp->barrier_head,
3762 rcu_barrier_callback, rsp, cpu, 0); 3641 rcu_barrier_callback, rsp, cpu, 0);
3763 } 3642 }
3764 } else if (READ_ONCE(rdp->qlen)) { 3643 } else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
3765 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 3644 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
3766 rsp->barrier_sequence); 3645 rsp->barrier_sequence);
3767 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 3646 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
@@ -3870,8 +3749,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3870 rdp->qlen_last_fqs_check = 0; 3749 rdp->qlen_last_fqs_check = 0;
3871 rdp->n_force_qs_snap = rsp->n_force_qs; 3750 rdp->n_force_qs_snap = rsp->n_force_qs;
3872 rdp->blimit = blimit; 3751 rdp->blimit = blimit;
3873 if (!rdp->nxtlist) 3752 if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
3874 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 3753 !init_nocb_callback_list(rdp))
3754 rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
3875 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3755 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
3876 rcu_sysidle_init_percpu_data(rdp->dynticks); 3756 rcu_sysidle_init_percpu_data(rdp->dynticks);
3877 rcu_dynticks_eqs_online(); 3757 rcu_dynticks_eqs_online();
@@ -3890,12 +3770,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3890 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ 3770 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
3891 rdp->completed = rnp->completed; 3771 rdp->completed = rnp->completed;
3892 rdp->cpu_no_qs.b.norm = true; 3772 rdp->cpu_no_qs.b.norm = true;
3893 rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); 3773 rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu);
3894 rdp->core_needs_qs = false; 3774 rdp->core_needs_qs = false;
3895 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); 3775 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3896 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3776 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3897} 3777}
3898 3778
3779/*
3780 * Invoked early in the CPU-online process, when pretty much all
3781 * services are available. The incoming CPU is not present.
3782 */
3899int rcutree_prepare_cpu(unsigned int cpu) 3783int rcutree_prepare_cpu(unsigned int cpu)
3900{ 3784{
3901 struct rcu_state *rsp; 3785 struct rcu_state *rsp;
@@ -3909,6 +3793,9 @@ int rcutree_prepare_cpu(unsigned int cpu)
3909 return 0; 3793 return 0;
3910} 3794}
3911 3795
3796/*
3797 * Update RCU priority boot kthread affinity for CPU-hotplug changes.
3798 */
3912static void rcutree_affinity_setting(unsigned int cpu, int outgoing) 3799static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
3913{ 3800{
3914 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); 3801 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
@@ -3916,20 +3803,34 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
3916 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); 3803 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
3917} 3804}
3918 3805
3806/*
3807 * Near the end of the CPU-online process. Pretty much all services
3808 * enabled, and the CPU is now very much alive.
3809 */
3919int rcutree_online_cpu(unsigned int cpu) 3810int rcutree_online_cpu(unsigned int cpu)
3920{ 3811{
3921 sync_sched_exp_online_cleanup(cpu); 3812 sync_sched_exp_online_cleanup(cpu);
3922 rcutree_affinity_setting(cpu, -1); 3813 rcutree_affinity_setting(cpu, -1);
3814 if (IS_ENABLED(CONFIG_TREE_SRCU))
3815 srcu_online_cpu(cpu);
3923 return 0; 3816 return 0;
3924} 3817}
3925 3818
3819/*
3820 * Near the beginning of the process. The CPU is still very much alive
3821 * with pretty much all services enabled.
3822 */
3926int rcutree_offline_cpu(unsigned int cpu) 3823int rcutree_offline_cpu(unsigned int cpu)
3927{ 3824{
3928 rcutree_affinity_setting(cpu, cpu); 3825 rcutree_affinity_setting(cpu, cpu);
3826 if (IS_ENABLED(CONFIG_TREE_SRCU))
3827 srcu_offline_cpu(cpu);
3929 return 0; 3828 return 0;
3930} 3829}
3931 3830
3932 3831/*
3832 * Near the end of the offline process. We do only tracing here.
3833 */
3933int rcutree_dying_cpu(unsigned int cpu) 3834int rcutree_dying_cpu(unsigned int cpu)
3934{ 3835{
3935 struct rcu_state *rsp; 3836 struct rcu_state *rsp;
@@ -3939,6 +3840,9 @@ int rcutree_dying_cpu(unsigned int cpu)
3939 return 0; 3840 return 0;
3940} 3841}
3941 3842
3843/*
3844 * The outgoing CPU is gone and we are running elsewhere.
3845 */
3942int rcutree_dead_cpu(unsigned int cpu) 3846int rcutree_dead_cpu(unsigned int cpu)
3943{ 3847{
3944 struct rcu_state *rsp; 3848 struct rcu_state *rsp;
@@ -3956,6 +3860,10 @@ int rcutree_dead_cpu(unsigned int cpu)
3956 * incoming CPUs are not allowed to use RCU read-side critical sections 3860 * incoming CPUs are not allowed to use RCU read-side critical sections
3957 * until this function is called. Failing to observe this restriction 3861 * until this function is called. Failing to observe this restriction
3958 * will result in lockdep splats. 3862 * will result in lockdep splats.
3863 *
3864 * Note that this function is special in that it is invoked directly
3865 * from the incoming CPU rather than from the cpuhp_step mechanism.
3866 * This is because this function must be invoked at a precise location.
3959 */ 3867 */
3960void rcu_cpu_starting(unsigned int cpu) 3868void rcu_cpu_starting(unsigned int cpu)
3961{ 3869{
@@ -3981,9 +3889,6 @@ void rcu_cpu_starting(unsigned int cpu)
3981 * The CPU is exiting the idle loop into the arch_cpu_idle_dead() 3889 * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
3982 * function. We now remove it from the rcu_node tree's ->qsmaskinit 3890 * function. We now remove it from the rcu_node tree's ->qsmaskinit
3983 * bit masks. 3891 * bit masks.
3984 * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
3985 * function. We now remove it from the rcu_node tree's ->qsmaskinit
3986 * bit masks.
3987 */ 3892 */
3988static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) 3893static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
3989{ 3894{
@@ -3999,6 +3904,14 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
3999 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3904 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4000} 3905}
4001 3906
3907/*
 3908 * The outgoing CPU has no further need of RCU, so remove it from
3909 * the list of CPUs that RCU must track.
3910 *
3911 * Note that this function is special in that it is invoked directly
3912 * from the outgoing CPU rather than from the cpuhp_step mechanism.
3913 * This is because this function must be invoked at a precise location.
3914 */
4002void rcu_report_dead(unsigned int cpu) 3915void rcu_report_dead(unsigned int cpu)
4003{ 3916{
4004 struct rcu_state *rsp; 3917 struct rcu_state *rsp;
@@ -4013,6 +3926,10 @@ void rcu_report_dead(unsigned int cpu)
4013} 3926}
4014#endif 3927#endif
4015 3928
3929/*
3930 * On non-huge systems, use expedited RCU grace periods to make suspend
3931 * and hibernation run faster.
3932 */
4016static int rcu_pm_notify(struct notifier_block *self, 3933static int rcu_pm_notify(struct notifier_block *self,
4017 unsigned long action, void *hcpu) 3934 unsigned long action, void *hcpu)
4018{ 3935{
@@ -4083,7 +4000,7 @@ early_initcall(rcu_spawn_gp_kthread);
4083 * task is booting the system, and such primitives are no-ops). After this 4000 * task is booting the system, and such primitives are no-ops). After this
4084 * function is called, any synchronous grace-period primitives are run as 4001 * function is called, any synchronous grace-period primitives are run as
4085 * expedited, with the requesting task driving the grace period forward. 4002 * expedited, with the requesting task driving the grace period forward.
4086 * A later core_initcall() rcu_exp_runtime_mode() will switch to full 4003 * A later core_initcall() rcu_set_runtime_mode() will switch to full
4087 * runtime RCU functionality. 4004 * runtime RCU functionality.
4088 */ 4005 */
4089void rcu_scheduler_starting(void) 4006void rcu_scheduler_starting(void)
@@ -4096,31 +4013,6 @@ void rcu_scheduler_starting(void)
4096} 4013}
4097 4014
4098/* 4015/*
4099 * Compute the per-level fanout, either using the exact fanout specified
4100 * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
4101 */
4102static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
4103{
4104 int i;
4105
4106 if (rcu_fanout_exact) {
4107 levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
4108 for (i = rcu_num_lvls - 2; i >= 0; i--)
4109 levelspread[i] = RCU_FANOUT;
4110 } else {
4111 int ccur;
4112 int cprv;
4113
4114 cprv = nr_cpu_ids;
4115 for (i = rcu_num_lvls - 1; i >= 0; i--) {
4116 ccur = levelcnt[i];
4117 levelspread[i] = (cprv + ccur - 1) / ccur;
4118 cprv = ccur;
4119 }
4120 }
4121}
4122
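
rcu_init_levelspread(), removed from tree.c here, computes the per-level fanout from the leaves upward: at each level the balanced branch takes the ceiling of "CPUs or children to cover" divided by "nodes at this level". A worked example with a hypothetical two-level tree of 96 CPUs and six leaf rcu_node structures:

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical shape: 96 CPUs, two levels, one root, six leaves. */
		int levelcnt[2] = { 1, 6 };
		int levelspread[2];
		int cprv = 96, i;

		for (i = 1; i >= 0; i--) {
			int ccur = levelcnt[i];

			levelspread[i] = (cprv + ccur - 1) / ccur; /* ceiling division */
			cprv = ccur;
		}
		printf("%d CPUs per leaf, %d leaves under the root\n",
		       levelspread[1], levelspread[0]);	/* 16 and 6 */
		return 0;
	}
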
4123/*
4124 * Helper function for rcu_init() that initializes one rcu_state structure. 4016 * Helper function for rcu_init() that initializes one rcu_state structure.
4125 */ 4017 */
4126static void __init rcu_init_one(struct rcu_state *rsp) 4018static void __init rcu_init_one(struct rcu_state *rsp)
@@ -4129,9 +4021,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
4129 static const char * const fqs[] = RCU_FQS_NAME_INIT; 4021 static const char * const fqs[] = RCU_FQS_NAME_INIT;
4130 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 4022 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
4131 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 4023 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
4132 static u8 fl_mask = 0x1;
4133 4024
4134 int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
4135 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 4025 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
4136 int cpustride = 1; 4026 int cpustride = 1;
4137 int i; 4027 int i;
@@ -4146,20 +4036,16 @@ static void __init rcu_init_one(struct rcu_state *rsp)
4146 4036
4147 /* Initialize the level-tracking arrays. */ 4037 /* Initialize the level-tracking arrays. */
4148 4038
4149 for (i = 0; i < rcu_num_lvls; i++)
4150 levelcnt[i] = num_rcu_lvl[i];
4151 for (i = 1; i < rcu_num_lvls; i++) 4039 for (i = 1; i < rcu_num_lvls; i++)
4152 rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; 4040 rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1];
4153 rcu_init_levelspread(levelspread, levelcnt); 4041 rcu_init_levelspread(levelspread, num_rcu_lvl);
4154 rsp->flavor_mask = fl_mask;
4155 fl_mask <<= 1;
4156 4042
4157 /* Initialize the elements themselves, starting from the leaves. */ 4043 /* Initialize the elements themselves, starting from the leaves. */
4158 4044
4159 for (i = rcu_num_lvls - 1; i >= 0; i--) { 4045 for (i = rcu_num_lvls - 1; i >= 0; i--) {
4160 cpustride *= levelspread[i]; 4046 cpustride *= levelspread[i];
4161 rnp = rsp->level[i]; 4047 rnp = rsp->level[i];
4162 for (j = 0; j < levelcnt[i]; j++, rnp++) { 4048 for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
4163 raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); 4049 raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
4164 lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), 4050 lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
4165 &rcu_node_class[i], buf[i]); 4051 &rcu_node_class[i], buf[i]);
@@ -4332,6 +4218,8 @@ void __init rcu_init(void)
4332 for_each_online_cpu(cpu) { 4218 for_each_online_cpu(cpu) {
4333 rcutree_prepare_cpu(cpu); 4219 rcutree_prepare_cpu(cpu);
4334 rcu_cpu_starting(cpu); 4220 rcu_cpu_starting(cpu);
4221 if (IS_ENABLED(CONFIG_TREE_SRCU))
4222 srcu_online_cpu(cpu);
4335 } 4223 }
4336} 4224}
4337 4225
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index ec62a05bfdb3..ba38262c3554 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -30,80 +30,9 @@
30#include <linux/seqlock.h> 30#include <linux/seqlock.h>
31#include <linux/swait.h> 31#include <linux/swait.h>
32#include <linux/stop_machine.h> 32#include <linux/stop_machine.h>
33#include <linux/rcu_node_tree.h>
33 34
34/* 35#include "rcu_segcblist.h"
35 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
36 * CONFIG_RCU_FANOUT_LEAF.
37 * In theory, it should be possible to add more levels straightforwardly.
38 * In practice, this did work well going from three levels to four.
39 * Of course, your mileage may vary.
40 */
41
42#ifdef CONFIG_RCU_FANOUT
43#define RCU_FANOUT CONFIG_RCU_FANOUT
44#else /* #ifdef CONFIG_RCU_FANOUT */
45# ifdef CONFIG_64BIT
46# define RCU_FANOUT 64
47# else
48# define RCU_FANOUT 32
49# endif
50#endif /* #else #ifdef CONFIG_RCU_FANOUT */
51
52#ifdef CONFIG_RCU_FANOUT_LEAF
53#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
54#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
55# ifdef CONFIG_64BIT
56# define RCU_FANOUT_LEAF 64
57# else
58# define RCU_FANOUT_LEAF 32
59# endif
60#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
61
62#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
63#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
64#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
65#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
66
67#if NR_CPUS <= RCU_FANOUT_1
68# define RCU_NUM_LVLS 1
69# define NUM_RCU_LVL_0 1
70# define NUM_RCU_NODES NUM_RCU_LVL_0
71# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
72# define RCU_NODE_NAME_INIT { "rcu_node_0" }
73# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
74#elif NR_CPUS <= RCU_FANOUT_2
75# define RCU_NUM_LVLS 2
76# define NUM_RCU_LVL_0 1
77# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
78# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
79# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
80# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
81# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
82#elif NR_CPUS <= RCU_FANOUT_3
83# define RCU_NUM_LVLS 3
84# define NUM_RCU_LVL_0 1
85# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
86# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
87# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
88# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
89# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
90# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
91#elif NR_CPUS <= RCU_FANOUT_4
92# define RCU_NUM_LVLS 4
93# define NUM_RCU_LVL_0 1
94# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
95# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
96# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
97# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
98# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
99# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
100# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
101#else
102# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
103#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
104
105extern int rcu_num_lvls;
106extern int rcu_num_nodes;
107 36
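
The NUM_RCU_LVL_* block removed above fixes the tree geometry at compile time: given NR_CPUS, the leaf fanout, and the interior fanout, it selects how many rcu_node levels are needed and how many nodes sit on each level. A rough runtime sketch of the same arithmetic, with invented example values (4096 CPUs, leaf fanout 16, interior fanout 64) standing in for NR_CPUS/CONFIG_RCU_FANOUT*:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpus = 4096;	/* hypothetical NR_CPUS */
	int fanout_leaf = 16;	/* hypothetical RCU_FANOUT_LEAF */
	int fanout = 64;	/* hypothetical RCU_FANOUT */
	int capacity = fanout_leaf;
	int levels = 1;

	/* Add levels until a single tree can cover every CPU. */
	while (capacity < nr_cpus) {
		capacity *= fanout;
		levels++;
	}
	printf("%d CPUs need %d level(s)\n", nr_cpus, levels);

	/* Node count per level, root first, mirroring NUM_RCU_LVL_INIT. */
	for (int l = 0; l < levels; l++) {
		int covered = fanout_leaf;	/* CPUs under one node at level l */

		for (int i = 0; i < levels - 1 - l; i++)
			covered *= fanout;
		printf("level %d: %d node(s)\n", l,
		       l ? DIV_ROUND_UP(nr_cpus, covered) : 1);
	}
	return 0;
}
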
108/* 37/*
109 * Dynticks per-CPU state. 38 * Dynticks per-CPU state.
@@ -113,6 +42,9 @@ struct rcu_dynticks {
113 /* Process level is worth LLONG_MAX/2. */ 42 /* Process level is worth LLONG_MAX/2. */
114 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 43 int dynticks_nmi_nesting; /* Track NMI nesting level. */
115 atomic_t dynticks; /* Even value for idle, else odd. */ 44 atomic_t dynticks; /* Even value for idle, else odd. */
45 bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
46 unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
47 bool rcu_urgent_qs; /* GP old need light quiescent state. */
116#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 48#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
117 long long dynticks_idle_nesting; 49 long long dynticks_idle_nesting;
118 /* irq/process nesting level from idle. */ 50 /* irq/process nesting level from idle. */
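
The retained ->dynticks comment ("Even value for idle, else odd") is what lets a grace period judge a remote CPU without interrupting it: the counter moves on every idle transition, so a CPU whose snapshot is even, or whose counter has moved since the snapshot, cannot still be inside an old RCU read-side critical section. A purely illustrative userspace sketch of that convention; none of these helpers are the kernel's:

#include <stdio.h>
#include <stdbool.h>

static unsigned int dynticks;			/* even: idle, odd: running */

static void idle_exit(void)  { dynticks++; }	/* even -> odd */
static void idle_enter(void) { dynticks++; }	/* odd -> even */

static bool idle_at(unsigned int snap) { return !(snap & 1); }

/* Idle at the snapshot, or has been idle at least once since it. */
static bool quiesced_since(unsigned int snap, unsigned int cur)
{
	return idle_at(snap) || cur != snap;
}

int main(void)
{
	unsigned int snap;

	idle_exit();				/* CPU starts running: odd */
	snap = dynticks;
	printf("idle at snapshot? %d\n", idle_at(snap));	/* 0 */

	idle_enter();				/* dozes off ... */
	idle_exit();				/* ... and wakes up again */
	printf("quiesced since snapshot? %d\n",
	       quiesced_since(snap, dynticks));			/* 1 */
	return 0;
}
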
@@ -262,41 +194,6 @@ struct rcu_node {
262#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) 194#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo))
263 195
264/* 196/*
265 * Do a full breadth-first scan of the rcu_node structures for the
266 * specified rcu_state structure.
267 */
268#define rcu_for_each_node_breadth_first(rsp, rnp) \
269 for ((rnp) = &(rsp)->node[0]; \
270 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
271
272/*
273 * Do a breadth-first scan of the non-leaf rcu_node structures for the
274 * specified rcu_state structure. Note that if there is a singleton
275 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
276 */
277#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
278 for ((rnp) = &(rsp)->node[0]; \
279 (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
280
281/*
282 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
283 * structure. Note that if there is a singleton rcu_node tree with but
284 * one rcu_node structure, this loop -will- visit the rcu_node structure.
285 * It is still a leaf node, even if it is also the root node.
286 */
287#define rcu_for_each_leaf_node(rsp, rnp) \
288 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
289 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
290
291/*
292 * Iterate over all possible CPUs in a leaf RCU node.
293 */
294#define for_each_leaf_node_possible_cpu(rnp, cpu) \
295 for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
296 cpu <= rnp->grphi; \
297 cpu = cpumask_next((cpu), cpu_possible_mask))
298
299/*
300 * Union to allow "aggregate OR" operation on the need for a quiescent 197 * Union to allow "aggregate OR" operation on the need for a quiescent
301 * state by the normal and expedited grace periods. 198 * state by the normal and expedited grace periods.
302 */ 199 */
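
The rcu_for_each_*() macros removed above lean on the layout of the rcu_node hierarchy: all nodes live in one contiguous breadth-first array, ->level[] points at the first node of each level, and the leaves are the tail of the array, so every traversal is plain pointer arithmetic. A small stand-in illustration of the three traversals (the struct, level count, and node count are invented, not the kernel's):

#include <stdio.h>

struct node { int id; };

#define NUM_LVLS	2
#define NUM_NODES	5	/* 1 root + 4 leaves, example shape */

int main(void)
{
	struct node nodes[NUM_NODES];
	struct node *level[NUM_LVLS] = { &nodes[0], &nodes[1] };
	struct node *np;

	for (int i = 0; i < NUM_NODES; i++)
		nodes[i].id = i;

	/* Breadth-first over everything: node[0] .. node[NUM_NODES - 1]. */
	for (np = &nodes[0]; np < &nodes[NUM_NODES]; np++)
		printf("all: %d\n", np->id);

	/* Non-leaf nodes only: stop at the first node of the last level. */
	for (np = &nodes[0]; np < level[NUM_LVLS - 1]; np++)
		printf("non-leaf: %d\n", np->id);

	/* Leaves only: start at the first node of the last level. */
	for (np = level[NUM_LVLS - 1]; np < &nodes[NUM_NODES]; np++)
		printf("leaf: %d\n", np->id);
	return 0;
}
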
@@ -336,34 +233,9 @@ struct rcu_data {
336 /* period it is aware of. */ 233 /* period it is aware of. */
337 234
338 /* 2) batch handling */ 235 /* 2) batch handling */
339 /* 236 struct rcu_segcblist cblist; /* Segmented callback list, with */
340 * If nxtlist is not NULL, it is partitioned as follows. 237 /* different callbacks waiting for */
341 * Any of the partitions might be empty, in which case the 238 /* different grace periods. */
342 * pointer to that partition will be equal to the pointer for
343 * the following partition. When the list is empty, all of
344 * the nxttail elements point to the ->nxtlist pointer itself,
345 * which in that case is NULL.
346 *
347 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
348 * Entries that batch # <= ->completed
349 * The grace period for these entries has completed, and
350 * the other grace-period-completed entries may be moved
351 * here temporarily in rcu_process_callbacks().
352 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
353 * Entries that batch # <= ->completed - 1: waiting for current GP
354 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
355 * Entries known to have arrived before current GP ended
356 * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
357 * Entries that might have arrived after current GP ended
358 * Note that the value of *nxttail[RCU_NEXT_TAIL] will
359 * always be NULL, as this is the end of the list.
360 */
361 struct rcu_head *nxtlist;
362 struct rcu_head **nxttail[RCU_NEXT_SIZE];
363 unsigned long nxtcompleted[RCU_NEXT_SIZE];
364 /* grace periods for sublists. */
365 long qlen_lazy; /* # of lazy queued callbacks */
366 long qlen; /* # of queued callbacks, incl lazy */
367 long qlen_last_fqs_check; 239 long qlen_last_fqs_check;
368 /* qlen at last check for QS forcing */ 240 /* qlen at last check for QS forcing */
369 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 241 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
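
The long ->nxtlist/->nxttail[] comment removed above describes a single singly linked list partitioned by an array of tail pointers, and struct rcu_segcblist packages that same idea behind accessors. A userspace sketch of the "one list, several tail pointers" layout; the segment names follow the old comment, everything else is illustrative rather than kernel code:

#include <stdio.h>
#include <stddef.h>

struct cb { struct cb *next; };

enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, SEG_COUNT };

struct seglist {
	struct cb *head;
	struct cb **tails[SEG_COUNT];	/* each points into the chain */
};

static void seglist_init(struct seglist *sl)
{
	sl->head = NULL;
	for (int i = 0; i < SEG_COUNT; i++)
		sl->tails[i] = &sl->head;	/* every segment empty */
}

/* New callbacks always land in the final (SEG_NEXT) segment. */
static void seglist_enqueue(struct seglist *sl, struct cb *c)
{
	c->next = NULL;
	*sl->tails[SEG_NEXT] = c;
	sl->tails[SEG_NEXT] = &c->next;
}

/* A segment is empty when its tail equals the previous segment's tail. */
static int seg_empty(struct seglist *sl, int seg)
{
	return sl->tails[seg] == (seg ? sl->tails[seg - 1] : &sl->head);
}

int main(void)
{
	struct seglist sl;
	struct cb a, b;

	seglist_init(&sl);
	seglist_enqueue(&sl, &a);
	seglist_enqueue(&sl, &b);
	printf("DONE empty: %d, NEXT empty: %d\n",
	       seg_empty(&sl, SEG_DONE), seg_empty(&sl, SEG_NEXT));
	return 0;
}
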
@@ -482,7 +354,6 @@ struct rcu_state {
482 struct rcu_node *level[RCU_NUM_LVLS + 1]; 354 struct rcu_node *level[RCU_NUM_LVLS + 1];
483 /* Hierarchy levels (+1 to */ 355 /* Hierarchy levels (+1 to */
484 /* shut bogus gcc warning) */ 356 /* shut bogus gcc warning) */
485 u8 flavor_mask; /* bit in flavor mask. */
486 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ 357 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
486 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ 357 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
487 call_rcu_func_t call; /* call_rcu() flavor. */ 358 call_rcu_func_t call; /* call_rcu() flavor. */
488 int ncpus; /* # CPUs seen so far. */ 359 int ncpus; /* # CPUs seen so far. */
@@ -502,14 +373,11 @@ struct rcu_state {
502 373
503 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; 374 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
504 /* Protect following fields. */ 375 /* Protect following fields. */
505 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 376 struct rcu_cblist orphan_pend; /* Orphaned callbacks that */
506 /* need a grace period. */ 377 /* need a grace period. */
507 struct rcu_head **orphan_nxttail; /* Tail of above. */ 378 struct rcu_cblist orphan_done; /* Orphaned callbacks that */
508 struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
509 /* are ready to invoke. */ 379 /* are ready to invoke. */
510 struct rcu_head **orphan_donetail; /* Tail of above. */ 380 /* (Contains counts.) */
511 long qlen_lazy; /* Number of lazy callbacks. */
512 long qlen; /* Total number of callbacks. */
513 /* End of fields guarded by orphan_lock. */ 381 /* End of fields guarded by orphan_lock. */
514 382
515 struct mutex barrier_mutex; /* Guards barrier fields. */ 383 struct mutex barrier_mutex; /* Guards barrier fields. */
@@ -596,6 +464,7 @@ extern struct rcu_state rcu_preempt_state;
596#endif /* #ifdef CONFIG_PREEMPT_RCU */ 464#endif /* #ifdef CONFIG_PREEMPT_RCU */
597 465
598int rcu_dynticks_snap(struct rcu_dynticks *rdtp); 466int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
467bool rcu_eqs_special_set(int cpu);
599 468
600#ifdef CONFIG_RCU_BOOST 469#ifdef CONFIG_RCU_BOOST
601DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 470DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -673,6 +542,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
673static void rcu_dynticks_task_enter(void); 542static void rcu_dynticks_task_enter(void);
674static void rcu_dynticks_task_exit(void); 543static void rcu_dynticks_task_exit(void);
675 544
545#ifdef CONFIG_SRCU
546void srcu_online_cpu(unsigned int cpu);
547void srcu_offline_cpu(unsigned int cpu);
548#else /* #ifdef CONFIG_SRCU */
549void srcu_online_cpu(unsigned int cpu) { }
550void srcu_offline_cpu(unsigned int cpu) { }
551#endif /* #else #ifdef CONFIG_SRCU */
552
676#endif /* #ifndef RCU_TREE_NONCORE */ 553#endif /* #ifndef RCU_TREE_NONCORE */
677 554
678#ifdef CONFIG_RCU_TRACE 555#ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index a7b639ccd46e..e513b4ab1197 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -292,7 +292,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
292 trace_rcu_exp_funnel_lock(rsp->name, rnp->level, 292 trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
293 rnp->grplo, rnp->grphi, 293 rnp->grplo, rnp->grphi,
294 TPS("wait")); 294 TPS("wait"));
295 wait_event(rnp->exp_wq[(s >> 1) & 0x3], 295 wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
296 sync_exp_work_done(rsp, 296 sync_exp_work_done(rsp,
297 &rdp->exp_workdone2, s)); 297 &rdp->exp_workdone2, s));
298 return true; 298 return true;
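
Replacing the open-coded "(s >> 1) & 0x3" with "rcu_seq_ctr(s) & 0x3" spells out how the expedited sequence number is encoded: the low bit marks a grace period in progress while the remaining bits count completed grace periods, so the counter is the value shifted right by one, and its two low bits pick one of the four per-node wait queues. A small illustrative sketch of that encoding; the helper names below are stand-ins:

#include <stdio.h>

static unsigned long seq_start(unsigned long s) { return s + 1; }	/* odd: in progress */
static unsigned long seq_end(unsigned long s)   { return (s | 1) + 1; }	/* even: idle */
static unsigned long seq_ctr(unsigned long s)   { return s >> 1; }	/* completed GPs */

int main(void)
{
	unsigned long s = 0;

	for (int gp = 0; gp < 5; gp++) {
		s = seq_start(s);
		s = seq_end(s);
	}
	/* The four wait queues are used round-robin by the counter. */
	printf("counter = %lu, wait-queue index = %lu\n",
	       seq_ctr(s), seq_ctr(s) & 0x3);
	return 0;
}
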
@@ -331,6 +331,8 @@ static void sync_sched_exp_handler(void *data)
331 return; 331 return;
332 } 332 }
333 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); 333 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
334 /* Store .exp before .rcu_urgent_qs. */
335 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
334 resched_cpu(smp_processor_id()); 336 resched_cpu(smp_processor_id());
335} 337}
336 338
@@ -531,7 +533,8 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
531 rnp->exp_seq_rq = s; 533 rnp->exp_seq_rq = s;
532 spin_unlock(&rnp->exp_lock); 534 spin_unlock(&rnp->exp_lock);
533 } 535 }
534 wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); 536 smp_mb(); /* All above changes before wakeup. */
537 wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]);
535 } 538 }
536 trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); 539 trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
537 mutex_unlock(&rsp->exp_wake_mutex); 540 mutex_unlock(&rsp->exp_wake_mutex);
@@ -609,9 +612,9 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
609 /* Wait for expedited grace period to complete. */ 612 /* Wait for expedited grace period to complete. */
610 rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); 613 rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
611 rnp = rcu_get_root(rsp); 614 rnp = rcu_get_root(rsp);
612 wait_event(rnp->exp_wq[(s >> 1) & 0x3], 615 wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
613 sync_exp_work_done(rsp, 616 sync_exp_work_done(rsp, &rdp->exp_workdone0, s));
614 &rdp->exp_workdone0, s)); 617 smp_mb(); /* Workqueue actions happen before return. */
615 618
616 /* Let the next expedited grace period start. */ 619 /* Let the next expedited grace period start. */
617 mutex_unlock(&rsp->exp_mutex); 620 mutex_unlock(&rsp->exp_mutex);
@@ -735,15 +738,3 @@ void synchronize_rcu_expedited(void)
735EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 738EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
736 739
737#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ 740#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
738
739/*
740 * Switch to run-time mode once Tree RCU has fully initialized.
741 */
742static int __init rcu_exp_runtime_mode(void)
743{
744 rcu_test_sync_prims();
745 rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
746 rcu_test_sync_prims();
747 return 0;
748}
749core_initcall(rcu_exp_runtime_mode);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a62a8f1caac..c9a48657512a 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1350,10 +1350,10 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
1350 */ 1350 */
1351 if ((rdp->completed != rnp->completed || 1351 if ((rdp->completed != rnp->completed ||
1352 unlikely(READ_ONCE(rdp->gpwrap))) && 1352 unlikely(READ_ONCE(rdp->gpwrap))) &&
1353 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) 1353 rcu_segcblist_pend_cbs(&rdp->cblist))
1354 note_gp_changes(rsp, rdp); 1354 note_gp_changes(rsp, rdp);
1355 1355
1356 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1356 if (rcu_segcblist_ready_cbs(&rdp->cblist))
1357 cbs_ready = true; 1357 cbs_ready = true;
1358 } 1358 }
1359 return cbs_ready; 1359 return cbs_ready;
@@ -1461,7 +1461,7 @@ static void rcu_prepare_for_idle(void)
1461 rdtp->last_accelerate = jiffies; 1461 rdtp->last_accelerate = jiffies;
1462 for_each_rcu_flavor(rsp) { 1462 for_each_rcu_flavor(rsp) {
1463 rdp = this_cpu_ptr(rsp->rda); 1463 rdp = this_cpu_ptr(rsp->rda);
1464 if (!*rdp->nxttail[RCU_DONE_TAIL]) 1464 if (rcu_segcblist_pend_cbs(&rdp->cblist))
1465 continue; 1465 continue;
1466 rnp = rdp->mynode; 1466 rnp = rdp->mynode;
1467 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 1467 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
@@ -1529,7 +1529,7 @@ static void rcu_oom_notify_cpu(void *unused)
1529 1529
1530 for_each_rcu_flavor(rsp) { 1530 for_each_rcu_flavor(rsp) {
1531 rdp = raw_cpu_ptr(rsp->rda); 1531 rdp = raw_cpu_ptr(rsp->rda);
1532 if (rdp->qlen_lazy != 0) { 1532 if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) {
1533 atomic_inc(&oom_callback_count); 1533 atomic_inc(&oom_callback_count);
1534 rsp->call(&rdp->oom_head, rcu_oom_callback); 1534 rsp->call(&rdp->oom_head, rcu_oom_callback);
1535 } 1535 }
@@ -1709,7 +1709,7 @@ __setup("rcu_nocbs=", rcu_nocb_setup);
1709 1709
1710static int __init parse_rcu_nocb_poll(char *arg) 1710static int __init parse_rcu_nocb_poll(char *arg)
1711{ 1711{
1712 rcu_nocb_poll = 1; 1712 rcu_nocb_poll = true;
1713 return 0; 1713 return 0;
1714} 1714}
1715early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 1715early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
@@ -1860,7 +1860,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1860 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1860 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1861 TPS("WakeEmpty")); 1861 TPS("WakeEmpty"));
1862 } else { 1862 } else {
1863 rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; 1863 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE);
1864 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
1865 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
1864 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1866 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1865 TPS("WakeEmptyIsDeferred")); 1867 TPS("WakeEmptyIsDeferred"));
1866 } 1868 }
@@ -1872,7 +1874,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1872 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1874 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1873 TPS("WakeOvf")); 1875 TPS("WakeOvf"));
1874 } else { 1876 } else {
1875 rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; 1877 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE);
1878 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
1879 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
1876 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1880 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1877 TPS("WakeOvfIsDeferred")); 1881 TPS("WakeOvfIsDeferred"));
1878 } 1882 }
@@ -1930,30 +1934,26 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
1930 struct rcu_data *rdp, 1934 struct rcu_data *rdp,
1931 unsigned long flags) 1935 unsigned long flags)
1932{ 1936{
1933 long ql = rsp->qlen; 1937 long ql = rsp->orphan_done.len;
1934 long qll = rsp->qlen_lazy; 1938 long qll = rsp->orphan_done.len_lazy;
1935 1939
1936 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ 1940 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
1937 if (!rcu_is_nocb_cpu(smp_processor_id())) 1941 if (!rcu_is_nocb_cpu(smp_processor_id()))
1938 return false; 1942 return false;
1939 rsp->qlen = 0;
1940 rsp->qlen_lazy = 0;
1941 1943
1942 /* First, enqueue the donelist, if any. This preserves CB ordering. */ 1944 /* First, enqueue the donelist, if any. This preserves CB ordering. */
1943 if (rsp->orphan_donelist != NULL) { 1945 if (rsp->orphan_done.head) {
1944 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, 1946 __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done),
1945 rsp->orphan_donetail, ql, qll, flags); 1947 rcu_cblist_tail(&rsp->orphan_done),
1946 ql = qll = 0; 1948 ql, qll, flags);
1947 rsp->orphan_donelist = NULL;
1948 rsp->orphan_donetail = &rsp->orphan_donelist;
1949 } 1949 }
1950 if (rsp->orphan_nxtlist != NULL) { 1950 if (rsp->orphan_pend.head) {
1951 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, 1951 __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
1952 rsp->orphan_nxttail, ql, qll, flags); 1952 rcu_cblist_tail(&rsp->orphan_pend),
1953 ql = qll = 0; 1953 ql, qll, flags);
1954 rsp->orphan_nxtlist = NULL;
1955 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
1956 } 1954 }
1955 rcu_cblist_init(&rsp->orphan_done);
1956 rcu_cblist_init(&rsp->orphan_pend);
1957 return true; 1957 return true;
1958} 1958}
1959 1959
@@ -2395,16 +2395,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2395 return false; 2395 return false;
2396 2396
2397 /* If there are early-boot callbacks, move them to nocb lists. */ 2397 /* If there are early-boot callbacks, move them to nocb lists. */
2398 if (rdp->nxtlist) { 2398 if (!rcu_segcblist_empty(&rdp->cblist)) {
2399 rdp->nocb_head = rdp->nxtlist; 2399 rdp->nocb_head = rcu_segcblist_head(&rdp->cblist);
2400 rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; 2400 rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist);
2401 atomic_long_set(&rdp->nocb_q_count, rdp->qlen); 2401 atomic_long_set(&rdp->nocb_q_count,
2402 atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); 2402 rcu_segcblist_n_cbs(&rdp->cblist));
2403 rdp->nxtlist = NULL; 2403 atomic_long_set(&rdp->nocb_q_count_lazy,
2404 rdp->qlen = 0; 2404 rcu_segcblist_n_lazy_cbs(&rdp->cblist));
2405 rdp->qlen_lazy = 0; 2405 rcu_segcblist_init(&rdp->cblist);
2406 } 2406 }
2407 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2407 rcu_segcblist_disable(&rdp->cblist);
2408 return true; 2408 return true;
2409} 2409}
2410 2410
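
The smp_store_release() calls added in this file (and the matching one in tree_exp.h) publish ->rcu_urgent_qs only after the store that precedes them (->nocb_defer_wakeup here, the .exp flag there) is visible, so any reader that observes the urgent-QS flag with an acquire-ordered load is guaranteed to also see that earlier store. A userspace C11 analogue of the release/acquire pairing; the kernel primitives are smp_store_release()/smp_load_acquire(), and everything else below is a stand-in:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static int payload;			/* stands in for the earlier plain store */
static atomic_bool urgent;		/* stands in for ->rcu_urgent_qs */

static void *producer(void *arg)
{
	(void)arg;
	payload = 42;						/* 1: write data */
	atomic_store_explicit(&urgent, true,
			      memory_order_release);		/* 2: publish */
	return NULL;
}

static void *consumer(void *arg)
{
	(void)arg;
	while (!atomic_load_explicit(&urgent, memory_order_acquire))
		;				/* spin until published */
	printf("payload = %d\n", payload);	/* guaranteed to print 42 */
	return NULL;
}

int main(void)
{
	pthread_t p, c;

	pthread_create(&c, NULL, consumer, NULL);
	pthread_create(&p, NULL, producer, NULL);
	pthread_join(p, NULL);
	pthread_join(c, NULL);
	return 0;
}
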
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 8751a748499a..6cea17a1ea30 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -41,11 +41,11 @@
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/debugfs.h> 42#include <linux/debugfs.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/prefetch.h>
44 45
45#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
46#include "tree.h" 47#include "tree.h"
47 48#include "rcu.h"
48DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
49 49
50static int r_open(struct inode *inode, struct file *file, 50static int r_open(struct inode *inode, struct file *file,
51 const struct seq_operations *op) 51 const struct seq_operations *op)
@@ -121,7 +121,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
121 cpu_is_offline(rdp->cpu) ? '!' : ' ', 121 cpu_is_offline(rdp->cpu) ? '!' : ' ',
122 ulong2long(rdp->completed), ulong2long(rdp->gpnum), 122 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
123 rdp->cpu_no_qs.b.norm, 123 rdp->cpu_no_qs.b.norm,
124 rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), 124 rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu),
125 rdp->core_needs_qs); 125 rdp->core_needs_qs);
126 seq_printf(m, " dt=%d/%llx/%d df=%lu", 126 seq_printf(m, " dt=%d/%llx/%d df=%lu",
127 rcu_dynticks_snap(rdp->dynticks), 127 rcu_dynticks_snap(rdp->dynticks),
@@ -130,17 +130,15 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
130 rdp->dynticks_fqs); 130 rdp->dynticks_fqs);
131 seq_printf(m, " of=%lu", rdp->offline_fqs); 131 seq_printf(m, " of=%lu", rdp->offline_fqs);
132 rcu_nocb_q_lengths(rdp, &ql, &qll); 132 rcu_nocb_q_lengths(rdp, &ql, &qll);
133 qll += rdp->qlen_lazy; 133 qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist);
134 ql += rdp->qlen; 134 ql += rcu_segcblist_n_cbs(&rdp->cblist);
135 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", 135 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
136 qll, ql, 136 qll, ql,
137 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 137 ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)],
138 rdp->nxttail[RCU_NEXT_TAIL]], 138 ".R"[!rcu_segcblist_segempty(&rdp->cblist,
139 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 139 RCU_NEXT_READY_TAIL)],
140 rdp->nxttail[RCU_NEXT_READY_TAIL]], 140 ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)],
141 ".W"[rdp->nxttail[RCU_DONE_TAIL] != 141 ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]);
142 rdp->nxttail[RCU_WAIT_TAIL]],
143 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
144#ifdef CONFIG_RCU_BOOST 142#ifdef CONFIG_RCU_BOOST
145 seq_printf(m, " kt=%d/%c ktl=%x", 143 seq_printf(m, " kt=%d/%c ktl=%x",
146 per_cpu(rcu_cpu_has_work, rdp->cpu), 144 per_cpu(rcu_cpu_has_work, rdp->cpu),
@@ -278,7 +276,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
278 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 276 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
279 rsp->n_force_qs, rsp->n_force_qs_ngp, 277 rsp->n_force_qs, rsp->n_force_qs_ngp,
280 rsp->n_force_qs - rsp->n_force_qs_ngp, 278 rsp->n_force_qs - rsp->n_force_qs_ngp,
281 READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); 279 READ_ONCE(rsp->n_force_qs_lh),
280 rsp->orphan_done.len_lazy,
281 rsp->orphan_done.len);
282 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { 282 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
283 if (rnp->level != level) { 283 if (rnp->level != level) {
284 seq_puts(m, "\n"); 284 seq_puts(m, "\n");
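
The rewritten qs=%c%c%c%c output keeps the ".N"[cond] construct: indexing a two-character string literal with a 0-or-1 value yields '.' when the condition is false and the flag letter when it is true. A minimal demonstration with made-up flag values:

#include <stdio.h>

int main(void)
{
	int next = 1, ready = 0, wait = 1, done = 0;

	printf("qs=%c%c%c%c\n",
	       ".N"[!!next], ".R"[!!ready], ".W"[!!wait], ".D"[!!done]);
	/* prints: qs=N.W. */
	return 0;
}
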
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 55c8530316c7..273e869ca21d 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
124 * non-expedited counterparts? Intended for use within RCU. Note 124 * non-expedited counterparts? Intended for use within RCU. Note
125 * that if the user specifies both rcu_expedited and rcu_normal, then 125 * that if the user specifies both rcu_expedited and rcu_normal, then
126 * rcu_normal wins. (Except during the time period during boot from 126 * rcu_normal wins. (Except during the time period during boot from
127 * when the first task is spawned until the rcu_exp_runtime_mode() 127 * when the first task is spawned until the rcu_set_runtime_mode()
128 * core_initcall() is invoked, at which point everything is expedited.) 128 * core_initcall() is invoked, at which point everything is expedited.)
129 */ 129 */
130bool rcu_gp_is_normal(void) 130bool rcu_gp_is_normal(void)
@@ -190,6 +190,39 @@ void rcu_end_inkernel_boot(void)
190 190
191#endif /* #ifndef CONFIG_TINY_RCU */ 191#endif /* #ifndef CONFIG_TINY_RCU */
192 192
193/*
194 * Test each non-SRCU synchronous grace-period wait API. This is
195 * useful just after a change in mode for these primitives, and
196 * during early boot.
197 */
198void rcu_test_sync_prims(void)
199{
200 if (!IS_ENABLED(CONFIG_PROVE_RCU))
201 return;
202 synchronize_rcu();
203 synchronize_rcu_bh();
204 synchronize_sched();
205 synchronize_rcu_expedited();
206 synchronize_rcu_bh_expedited();
207 synchronize_sched_expedited();
208}
209
210#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU)
211
212/*
213 * Switch to run-time mode once RCU has fully initialized.
214 */
215static int __init rcu_set_runtime_mode(void)
216{
217 rcu_test_sync_prims();
218 rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
219 rcu_test_sync_prims();
220 return 0;
221}
222core_initcall(rcu_set_runtime_mode);
223
224#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
225
193#ifdef CONFIG_PREEMPT_RCU 226#ifdef CONFIG_PREEMPT_RCU
194 227
195/* 228/*
@@ -632,6 +665,7 @@ static void check_holdout_task(struct task_struct *t,
632 put_task_struct(t); 665 put_task_struct(t);
633 return; 666 return;
634 } 667 }
668 rcu_request_urgent_qs_task(t);
635 if (!needreport) 669 if (!needreport)
636 return; 670 return;
637 if (*firstreport) { 671 if (*firstreport) {
@@ -817,23 +851,6 @@ static void rcu_spawn_tasks_kthread(void)
817 851
818#endif /* #ifdef CONFIG_TASKS_RCU */ 852#endif /* #ifdef CONFIG_TASKS_RCU */
819 853
820/*
821 * Test each non-SRCU synchronous grace-period wait API. This is
822 * useful just after a change in mode for these primitives, and
823 * during early boot.
824 */
825void rcu_test_sync_prims(void)
826{
827 if (!IS_ENABLED(CONFIG_PROVE_RCU))
828 return;
829 synchronize_rcu();
830 synchronize_rcu_bh();
831 synchronize_sched();
832 synchronize_rcu_expedited();
833 synchronize_rcu_bh_expedited();
834 synchronize_sched_expedited();
835}
836
837#ifdef CONFIG_PROVE_RCU 854#ifdef CONFIG_PROVE_RCU
838 855
839/* 856/*
diff --git a/kernel/relay.c b/kernel/relay.c
index 0e413d9eec8a..39a9dfc69486 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1212,7 +1212,6 @@ static ssize_t subbuf_splice_actor(struct file *in,
1212 .nr_pages = 0, 1212 .nr_pages = 0,
1213 .nr_pages_max = PIPE_DEF_BUFFERS, 1213 .nr_pages_max = PIPE_DEF_BUFFERS,
1214 .partial = partial, 1214 .partial = partial,
1215 .flags = flags,
1216 .ops = &relay_pipe_buf_ops, 1215 .ops = &relay_pipe_buf_ops,
1217 .spd_release = relay_page_release, 1216 .spd_release = relay_page_release,
1218 }; 1217 };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b31fc05a0f1..803c3bc274c4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -86,21 +86,6 @@ int sysctl_sched_rt_runtime = 950000;
86cpumask_var_t cpu_isolated_map; 86cpumask_var_t cpu_isolated_map;
87 87
88/* 88/*
89 * this_rq_lock - lock this runqueue and disable interrupts.
90 */
91static struct rq *this_rq_lock(void)
92 __acquires(rq->lock)
93{
94 struct rq *rq;
95
96 local_irq_disable();
97 rq = this_rq();
98 raw_spin_lock(&rq->lock);
99
100 return rq;
101}
102
103/*
104 * __task_rq_lock - lock the rq @p resides on. 89 * __task_rq_lock - lock the rq @p resides on.
105 */ 90 */
106struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) 91struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
@@ -233,8 +218,11 @@ void update_rq_clock(struct rq *rq)
233 return; 218 return;
234 219
235#ifdef CONFIG_SCHED_DEBUG 220#ifdef CONFIG_SCHED_DEBUG
221 if (sched_feat(WARN_DOUBLE_CLOCK))
222 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
236 rq->clock_update_flags |= RQCF_UPDATED; 223 rq->clock_update_flags |= RQCF_UPDATED;
237#endif 224#endif
225
238 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 226 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
239 if (delta < 0) 227 if (delta < 0)
240 return; 228 return;
@@ -261,13 +249,14 @@ static void hrtick_clear(struct rq *rq)
261static enum hrtimer_restart hrtick(struct hrtimer *timer) 249static enum hrtimer_restart hrtick(struct hrtimer *timer)
262{ 250{
263 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 251 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
252 struct rq_flags rf;
264 253
265 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 254 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
266 255
267 raw_spin_lock(&rq->lock); 256 rq_lock(rq, &rf);
268 update_rq_clock(rq); 257 update_rq_clock(rq);
269 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 258 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
270 raw_spin_unlock(&rq->lock); 259 rq_unlock(rq, &rf);
271 260
272 return HRTIMER_NORESTART; 261 return HRTIMER_NORESTART;
273} 262}
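
The conversions in this file replace matched raw_spin_lock(&rq->lock)/rq_pin_lock() and rq_unpin_lock()/raw_spin_unlock() pairs with single rq_lock()/rq_unlock() calls (plus _irq/_irqsave variants). Judging only from the pairs being replaced in this diff, such a wrapper amounts to lock-then-pin on entry and unpin-then-unlock on exit; the sketch below is a userspace schematic of that pairing, not the kernel's actual sched.h definitions:

#include <pthread.h>
#include <stdio.h>

struct rq { pthread_mutex_t lock; int clock; };
struct rq_flags { int pinned; };

static void rq_lock(struct rq *rq, struct rq_flags *rf)
{
	pthread_mutex_lock(&rq->lock);	/* raw_spin_lock(&rq->lock)   */
	rf->pinned = 1;			/* rq_pin_lock(rq, rf)        */
}

static void rq_unlock(struct rq *rq, struct rq_flags *rf)
{
	rf->pinned = 0;			/* rq_unpin_lock(rq, rf)      */
	pthread_mutex_unlock(&rq->lock);/* raw_spin_unlock(&rq->lock) */
}

int main(void)
{
	struct rq rq = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct rq_flags rf;

	rq_lock(&rq, &rf);
	rq.clock++;			/* e.g. update_rq_clock(rq)   */
	rq_unlock(&rq, &rf);
	printf("clock=%d pinned=%d\n", rq.clock, rf.pinned);
	return 0;
}
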
@@ -287,11 +276,12 @@ static void __hrtick_restart(struct rq *rq)
287static void __hrtick_start(void *arg) 276static void __hrtick_start(void *arg)
288{ 277{
289 struct rq *rq = arg; 278 struct rq *rq = arg;
279 struct rq_flags rf;
290 280
291 raw_spin_lock(&rq->lock); 281 rq_lock(rq, &rf);
292 __hrtick_restart(rq); 282 __hrtick_restart(rq);
293 rq->hrtick_csd_pending = 0; 283 rq->hrtick_csd_pending = 0;
294 raw_spin_unlock(&rq->lock); 284 rq_unlock(rq, &rf);
295} 285}
296 286
297/* 287/*
@@ -762,17 +752,23 @@ static void set_load_weight(struct task_struct *p)
762 752
763static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 753static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
764{ 754{
765 update_rq_clock(rq); 755 if (!(flags & ENQUEUE_NOCLOCK))
756 update_rq_clock(rq);
757
766 if (!(flags & ENQUEUE_RESTORE)) 758 if (!(flags & ENQUEUE_RESTORE))
767 sched_info_queued(rq, p); 759 sched_info_queued(rq, p);
760
768 p->sched_class->enqueue_task(rq, p, flags); 761 p->sched_class->enqueue_task(rq, p, flags);
769} 762}
770 763
771static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 764static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
772{ 765{
773 update_rq_clock(rq); 766 if (!(flags & DEQUEUE_NOCLOCK))
767 update_rq_clock(rq);
768
774 if (!(flags & DEQUEUE_SAVE)) 769 if (!(flags & DEQUEUE_SAVE))
775 sched_info_dequeued(rq, p); 770 sched_info_dequeued(rq, p);
771
776 p->sched_class->dequeue_task(rq, p, flags); 772 p->sched_class->dequeue_task(rq, p, flags);
777} 773}
778 774
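
With the new NOCLOCK flags, the contract is: either enqueue_task()/dequeue_task() refreshes the runqueue clock itself, or the caller passes *_NOCLOCK because it already called update_rq_clock() once under the same lock hold and wants to avoid a redundant (and, with WARN_DOUBLE_CLOCK, warnable) update. A schematic of that caller pattern; only the flag-gating shape is taken from the diff, the rest is stand-in code:

#include <stdio.h>

#define DEQUEUE_NOCLOCK	0x01
#define ENQUEUE_NOCLOCK	0x01

static int clock_updates;

static void update_rq_clock(void) { clock_updates++; }

static void dequeue_task(int flags)
{
	if (!(flags & DEQUEUE_NOCLOCK))
		update_rq_clock();
	/* ... the sched class dequeue would run here ... */
}

static void enqueue_task(int flags)
{
	if (!(flags & ENQUEUE_NOCLOCK))
		update_rq_clock();
	/* ... the sched class enqueue would run here ... */
}

int main(void)
{
	/* Caller refreshes the clock once, then passes NOCLOCK twice. */
	update_rq_clock();
	dequeue_task(DEQUEUE_NOCLOCK);
	enqueue_task(ENQUEUE_NOCLOCK);
	printf("clock updated %d time(s)\n", clock_updates);	/* 1 */
	return 0;
}
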
@@ -946,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
946 * 942 *
947 * Returns (locked) new rq. Old rq's lock is released. 943 * Returns (locked) new rq. Old rq's lock is released.
948 */ 944 */
949static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu) 945static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
946 struct task_struct *p, int new_cpu)
950{ 947{
951 lockdep_assert_held(&rq->lock); 948 lockdep_assert_held(&rq->lock);
952 949
953 p->on_rq = TASK_ON_RQ_MIGRATING; 950 p->on_rq = TASK_ON_RQ_MIGRATING;
954 dequeue_task(rq, p, 0); 951 dequeue_task(rq, p, DEQUEUE_NOCLOCK);
955 set_task_cpu(p, new_cpu); 952 set_task_cpu(p, new_cpu);
956 raw_spin_unlock(&rq->lock); 953 rq_unlock(rq, rf);
957 954
958 rq = cpu_rq(new_cpu); 955 rq = cpu_rq(new_cpu);
959 956
960 raw_spin_lock(&rq->lock); 957 rq_lock(rq, rf);
961 BUG_ON(task_cpu(p) != new_cpu); 958 BUG_ON(task_cpu(p) != new_cpu);
962 enqueue_task(rq, p, 0); 959 enqueue_task(rq, p, 0);
963 p->on_rq = TASK_ON_RQ_QUEUED; 960 p->on_rq = TASK_ON_RQ_QUEUED;
@@ -980,7 +977,8 @@ struct migration_arg {
980 * So we race with normal scheduler movements, but that's OK, as long 977 * So we race with normal scheduler movements, but that's OK, as long
981 * as the task is no longer on this CPU. 978 * as the task is no longer on this CPU.
982 */ 979 */
983static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) 980static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
981 struct task_struct *p, int dest_cpu)
984{ 982{
985 if (unlikely(!cpu_active(dest_cpu))) 983 if (unlikely(!cpu_active(dest_cpu)))
986 return rq; 984 return rq;
@@ -989,7 +987,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
989 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 987 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
990 return rq; 988 return rq;
991 989
992 rq = move_queued_task(rq, p, dest_cpu); 990 update_rq_clock(rq);
991 rq = move_queued_task(rq, rf, p, dest_cpu);
993 992
994 return rq; 993 return rq;
995} 994}
@@ -1004,6 +1003,7 @@ static int migration_cpu_stop(void *data)
1004 struct migration_arg *arg = data; 1003 struct migration_arg *arg = data;
1005 struct task_struct *p = arg->task; 1004 struct task_struct *p = arg->task;
1006 struct rq *rq = this_rq(); 1005 struct rq *rq = this_rq();
1006 struct rq_flags rf;
1007 1007
1008 /* 1008 /*
1009 * The original target CPU might have gone down and we might 1009 * The original target CPU might have gone down and we might
@@ -1018,7 +1018,7 @@ static int migration_cpu_stop(void *data)
1018 sched_ttwu_pending(); 1018 sched_ttwu_pending();
1019 1019
1020 raw_spin_lock(&p->pi_lock); 1020 raw_spin_lock(&p->pi_lock);
1021 raw_spin_lock(&rq->lock); 1021 rq_lock(rq, &rf);
1022 /* 1022 /*
1023 * If task_rq(p) != rq, it cannot be migrated here, because we're 1023 * If task_rq(p) != rq, it cannot be migrated here, because we're
1024 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because 1024 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@@ -1026,11 +1026,11 @@ static int migration_cpu_stop(void *data)
1026 */ 1026 */
1027 if (task_rq(p) == rq) { 1027 if (task_rq(p) == rq) {
1028 if (task_on_rq_queued(p)) 1028 if (task_on_rq_queued(p))
1029 rq = __migrate_task(rq, p, arg->dest_cpu); 1029 rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
1030 else 1030 else
1031 p->wake_cpu = arg->dest_cpu; 1031 p->wake_cpu = arg->dest_cpu;
1032 } 1032 }
1033 raw_spin_unlock(&rq->lock); 1033 rq_unlock(rq, &rf);
1034 raw_spin_unlock(&p->pi_lock); 1034 raw_spin_unlock(&p->pi_lock);
1035 1035
1036 local_irq_enable(); 1036 local_irq_enable();
@@ -1063,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1063 * holding rq->lock. 1063 * holding rq->lock.
1064 */ 1064 */
1065 lockdep_assert_held(&rq->lock); 1065 lockdep_assert_held(&rq->lock);
1066 dequeue_task(rq, p, DEQUEUE_SAVE); 1066 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
1067 } 1067 }
1068 if (running) 1068 if (running)
1069 put_prev_task(rq, p); 1069 put_prev_task(rq, p);
@@ -1071,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1071 p->sched_class->set_cpus_allowed(p, new_mask); 1071 p->sched_class->set_cpus_allowed(p, new_mask);
1072 1072
1073 if (queued) 1073 if (queued)
1074 enqueue_task(rq, p, ENQUEUE_RESTORE); 1074 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
1075 if (running) 1075 if (running)
1076 set_curr_task(rq, p); 1076 set_curr_task(rq, p);
1077} 1077}
@@ -1150,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
1150 * OK, since we're going to drop the lock immediately 1150 * OK, since we're going to drop the lock immediately
1151 * afterwards anyway. 1151 * afterwards anyway.
1152 */ 1152 */
1153 rq_unpin_lock(rq, &rf); 1153 rq = move_queued_task(rq, &rf, p, dest_cpu);
1154 rq = move_queued_task(rq, p, dest_cpu);
1155 rq_repin_lock(rq, &rf);
1156 } 1154 }
1157out: 1155out:
1158 task_rq_unlock(rq, p, &rf); 1156 task_rq_unlock(rq, p, &rf);
@@ -1217,16 +1215,24 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
1217{ 1215{
1218 if (task_on_rq_queued(p)) { 1216 if (task_on_rq_queued(p)) {
1219 struct rq *src_rq, *dst_rq; 1217 struct rq *src_rq, *dst_rq;
1218 struct rq_flags srf, drf;
1220 1219
1221 src_rq = task_rq(p); 1220 src_rq = task_rq(p);
1222 dst_rq = cpu_rq(cpu); 1221 dst_rq = cpu_rq(cpu);
1223 1222
1223 rq_pin_lock(src_rq, &srf);
1224 rq_pin_lock(dst_rq, &drf);
1225
1224 p->on_rq = TASK_ON_RQ_MIGRATING; 1226 p->on_rq = TASK_ON_RQ_MIGRATING;
1225 deactivate_task(src_rq, p, 0); 1227 deactivate_task(src_rq, p, 0);
1226 set_task_cpu(p, cpu); 1228 set_task_cpu(p, cpu);
1227 activate_task(dst_rq, p, 0); 1229 activate_task(dst_rq, p, 0);
1228 p->on_rq = TASK_ON_RQ_QUEUED; 1230 p->on_rq = TASK_ON_RQ_QUEUED;
1229 check_preempt_curr(dst_rq, p, 0); 1231 check_preempt_curr(dst_rq, p, 0);
1232
1233 rq_unpin_lock(dst_rq, &drf);
1234 rq_unpin_lock(src_rq, &srf);
1235
1230 } else { 1236 } else {
1231 /* 1237 /*
1232 * Task isn't running anymore; make it appear like we migrated 1238 * Task isn't running anymore; make it appear like we migrated
@@ -1680,7 +1686,7 @@ static void
1680ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, 1686ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
1681 struct rq_flags *rf) 1687 struct rq_flags *rf)
1682{ 1688{
1683 int en_flags = ENQUEUE_WAKEUP; 1689 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
1684 1690
1685 lockdep_assert_held(&rq->lock); 1691 lockdep_assert_held(&rq->lock);
1686 1692
@@ -1726,14 +1732,13 @@ void sched_ttwu_pending(void)
1726 struct rq *rq = this_rq(); 1732 struct rq *rq = this_rq();
1727 struct llist_node *llist = llist_del_all(&rq->wake_list); 1733 struct llist_node *llist = llist_del_all(&rq->wake_list);
1728 struct task_struct *p; 1734 struct task_struct *p;
1729 unsigned long flags;
1730 struct rq_flags rf; 1735 struct rq_flags rf;
1731 1736
1732 if (!llist) 1737 if (!llist)
1733 return; 1738 return;
1734 1739
1735 raw_spin_lock_irqsave(&rq->lock, flags); 1740 rq_lock_irqsave(rq, &rf);
1736 rq_pin_lock(rq, &rf); 1741 update_rq_clock(rq);
1737 1742
1738 while (llist) { 1743 while (llist) {
1739 int wake_flags = 0; 1744 int wake_flags = 0;
@@ -1747,8 +1752,7 @@ void sched_ttwu_pending(void)
1747 ttwu_do_activate(rq, p, wake_flags, &rf); 1752 ttwu_do_activate(rq, p, wake_flags, &rf);
1748 } 1753 }
1749 1754
1750 rq_unpin_lock(rq, &rf); 1755 rq_unlock_irqrestore(rq, &rf);
1751 raw_spin_unlock_irqrestore(&rq->lock, flags);
1752} 1756}
1753 1757
1754void scheduler_ipi(void) 1758void scheduler_ipi(void)
@@ -1806,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
1806void wake_up_if_idle(int cpu) 1810void wake_up_if_idle(int cpu)
1807{ 1811{
1808 struct rq *rq = cpu_rq(cpu); 1812 struct rq *rq = cpu_rq(cpu);
1809 unsigned long flags; 1813 struct rq_flags rf;
1810 1814
1811 rcu_read_lock(); 1815 rcu_read_lock();
1812 1816
@@ -1816,11 +1820,11 @@ void wake_up_if_idle(int cpu)
1816 if (set_nr_if_polling(rq->idle)) { 1820 if (set_nr_if_polling(rq->idle)) {
1817 trace_sched_wake_idle_without_ipi(cpu); 1821 trace_sched_wake_idle_without_ipi(cpu);
1818 } else { 1822 } else {
1819 raw_spin_lock_irqsave(&rq->lock, flags); 1823 rq_lock_irqsave(rq, &rf);
1820 if (is_idle_task(rq->curr)) 1824 if (is_idle_task(rq->curr))
1821 smp_send_reschedule(cpu); 1825 smp_send_reschedule(cpu);
1822 /* Else CPU is not idle, do nothing here: */ 1826 /* Else CPU is not idle, do nothing here: */
1823 raw_spin_unlock_irqrestore(&rq->lock, flags); 1827 rq_unlock_irqrestore(rq, &rf);
1824 } 1828 }
1825 1829
1826out: 1830out:
@@ -1846,11 +1850,10 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1846 } 1850 }
1847#endif 1851#endif
1848 1852
1849 raw_spin_lock(&rq->lock); 1853 rq_lock(rq, &rf);
1850 rq_pin_lock(rq, &rf); 1854 update_rq_clock(rq);
1851 ttwu_do_activate(rq, p, wake_flags, &rf); 1855 ttwu_do_activate(rq, p, wake_flags, &rf);
1852 rq_unpin_lock(rq, &rf); 1856 rq_unlock(rq, &rf);
1853 raw_spin_unlock(&rq->lock);
1854} 1857}
1855 1858
1856/* 1859/*
@@ -2097,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2097 * disabled avoiding further scheduler activity on it and we've 2100 * disabled avoiding further scheduler activity on it and we've
2098 * not yet picked a replacement task. 2101 * not yet picked a replacement task.
2099 */ 2102 */
2100 rq_unpin_lock(rq, rf); 2103 rq_unlock(rq, rf);
2101 raw_spin_unlock(&rq->lock);
2102 raw_spin_lock(&p->pi_lock); 2104 raw_spin_lock(&p->pi_lock);
2103 raw_spin_lock(&rq->lock); 2105 rq_relock(rq, rf);
2104 rq_repin_lock(rq, rf);
2105 } 2106 }
2106 2107
2107 if (!(p->state & TASK_NORMAL)) 2108 if (!(p->state & TASK_NORMAL))
@@ -2114,7 +2115,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2114 delayacct_blkio_end(); 2115 delayacct_blkio_end();
2115 atomic_dec(&rq->nr_iowait); 2116 atomic_dec(&rq->nr_iowait);
2116 } 2117 }
2117 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2118 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
2118 } 2119 }
2119 2120
2120 ttwu_do_wakeup(rq, p, 0, rf); 2121 ttwu_do_wakeup(rq, p, 0, rf);
@@ -2555,7 +2556,7 @@ void wake_up_new_task(struct task_struct *p)
2555 update_rq_clock(rq); 2556 update_rq_clock(rq);
2556 post_init_entity_util_avg(&p->se); 2557 post_init_entity_util_avg(&p->se);
2557 2558
2558 activate_task(rq, p, 0); 2559 activate_task(rq, p, ENQUEUE_NOCLOCK);
2559 p->on_rq = TASK_ON_RQ_QUEUED; 2560 p->on_rq = TASK_ON_RQ_QUEUED;
2560 trace_sched_wakeup_new(p); 2561 trace_sched_wakeup_new(p);
2561 check_preempt_curr(rq, p, WF_FORK); 2562 check_preempt_curr(rq, p, WF_FORK);
@@ -3093,15 +3094,18 @@ void scheduler_tick(void)
3093 int cpu = smp_processor_id(); 3094 int cpu = smp_processor_id();
3094 struct rq *rq = cpu_rq(cpu); 3095 struct rq *rq = cpu_rq(cpu);
3095 struct task_struct *curr = rq->curr; 3096 struct task_struct *curr = rq->curr;
3097 struct rq_flags rf;
3096 3098
3097 sched_clock_tick(); 3099 sched_clock_tick();
3098 3100
3099 raw_spin_lock(&rq->lock); 3101 rq_lock(rq, &rf);
3102
3100 update_rq_clock(rq); 3103 update_rq_clock(rq);
3101 curr->sched_class->task_tick(rq, curr, 0); 3104 curr->sched_class->task_tick(rq, curr, 0);
3102 cpu_load_update_active(rq); 3105 cpu_load_update_active(rq);
3103 calc_global_load_tick(rq); 3106 calc_global_load_tick(rq);
3104 raw_spin_unlock(&rq->lock); 3107
3108 rq_unlock(rq, &rf);
3105 3109
3106 perf_event_task_tick(); 3110 perf_event_task_tick();
3107 3111
@@ -3378,7 +3382,7 @@ static void __sched notrace __schedule(bool preempt)
3378 hrtick_clear(rq); 3382 hrtick_clear(rq);
3379 3383
3380 local_irq_disable(); 3384 local_irq_disable();
3381 rcu_note_context_switch(); 3385 rcu_note_context_switch(preempt);
3382 3386
3383 /* 3387 /*
3384 * Make sure that signal_pending_state()->signal_pending() below 3388 * Make sure that signal_pending_state()->signal_pending() below
@@ -3386,18 +3390,18 @@ static void __sched notrace __schedule(bool preempt)
3386 * done by the caller to avoid the race with signal_wake_up(). 3390 * done by the caller to avoid the race with signal_wake_up().
3387 */ 3391 */
3388 smp_mb__before_spinlock(); 3392 smp_mb__before_spinlock();
3389 raw_spin_lock(&rq->lock); 3393 rq_lock(rq, &rf);
3390 rq_pin_lock(rq, &rf);
3391 3394
3392 /* Promote REQ to ACT */ 3395 /* Promote REQ to ACT */
3393 rq->clock_update_flags <<= 1; 3396 rq->clock_update_flags <<= 1;
3397 update_rq_clock(rq);
3394 3398
3395 switch_count = &prev->nivcsw; 3399 switch_count = &prev->nivcsw;
3396 if (!preempt && prev->state) { 3400 if (!preempt && prev->state) {
3397 if (unlikely(signal_pending_state(prev->state, prev))) { 3401 if (unlikely(signal_pending_state(prev->state, prev))) {
3398 prev->state = TASK_RUNNING; 3402 prev->state = TASK_RUNNING;
3399 } else { 3403 } else {
3400 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3404 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
3401 prev->on_rq = 0; 3405 prev->on_rq = 0;
3402 3406
3403 if (prev->in_iowait) { 3407 if (prev->in_iowait) {
@@ -3421,9 +3425,6 @@ static void __sched notrace __schedule(bool preempt)
3421 switch_count = &prev->nvcsw; 3425 switch_count = &prev->nvcsw;
3422 } 3426 }
3423 3427
3424 if (task_on_rq_queued(prev))
3425 update_rq_clock(rq);
3426
3427 next = pick_next_task(rq, prev, &rf); 3428 next = pick_next_task(rq, prev, &rf);
3428 clear_tsk_need_resched(prev); 3429 clear_tsk_need_resched(prev);
3429 clear_preempt_need_resched(); 3430 clear_preempt_need_resched();
@@ -3439,8 +3440,7 @@ static void __sched notrace __schedule(bool preempt)
3439 rq = context_switch(rq, prev, next, &rf); 3440 rq = context_switch(rq, prev, next, &rf);
3440 } else { 3441 } else {
3441 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); 3442 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
3442 rq_unpin_lock(rq, &rf); 3443 rq_unlock_irq(rq, &rf);
3443 raw_spin_unlock_irq(&rq->lock);
3444 } 3444 }
3445 3445
3446 balance_callback(rq); 3446 balance_callback(rq);
@@ -3502,6 +3502,31 @@ asmlinkage __visible void __sched schedule(void)
3502} 3502}
3503EXPORT_SYMBOL(schedule); 3503EXPORT_SYMBOL(schedule);
3504 3504
3505/*
3506 * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
3507 * state (have scheduled out non-voluntarily) by making sure that all
3508 * tasks have either left the run queue or have gone into user space.
3509 * As idle tasks do not do either, they must not ever be preempted
3510 * (schedule out non-voluntarily).
3511 *
3512 * schedule_idle() is similar to schedule_preempt_disabled() except that it
3513 * never enables preemption because it does not call sched_submit_work().
3514 */
3515void __sched schedule_idle(void)
3516{
3517 /*
3518 * As this skips calling sched_submit_work(), which the idle task does
3519 * regardless because that function is a nop when the task is in a
3520 * TASK_RUNNING state, make sure this isn't used someplace that the
3521 * current task can be in any other state. Note, idle is always in the
3522 * TASK_RUNNING state.
3523 */
3524 WARN_ON_ONCE(current->state);
3525 do {
3526 __schedule(false);
3527 } while (need_resched());
3528}
3529
3505#ifdef CONFIG_CONTEXT_TRACKING 3530#ifdef CONFIG_CONTEXT_TRACKING
3506asmlinkage __visible void __sched schedule_user(void) 3531asmlinkage __visible void __sched schedule_user(void)
3507{ 3532{
@@ -3671,10 +3696,25 @@ EXPORT_SYMBOL(default_wake_function);
3671 3696
3672#ifdef CONFIG_RT_MUTEXES 3697#ifdef CONFIG_RT_MUTEXES
3673 3698
3699static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
3700{
3701 if (pi_task)
3702 prio = min(prio, pi_task->prio);
3703
3704 return prio;
3705}
3706
3707static inline int rt_effective_prio(struct task_struct *p, int prio)
3708{
3709 struct task_struct *pi_task = rt_mutex_get_top_task(p);
3710
3711 return __rt_effective_prio(pi_task, prio);
3712}
3713
3674/* 3714/*
3675 * rt_mutex_setprio - set the current priority of a task 3715 * rt_mutex_setprio - set the current priority of a task
3676 * @p: task 3716 * @p: task to boost
3677 * @prio: prio value (kernel-internal form) 3717 * @pi_task: donor task
3678 * 3718 *
3679 * This function changes the 'effective' priority of a task. It does 3719 * This function changes the 'effective' priority of a task. It does
3680 * not touch ->normal_prio like __setscheduler(). 3720 * not touch ->normal_prio like __setscheduler().
@@ -3682,17 +3722,42 @@ EXPORT_SYMBOL(default_wake_function);
3682 * Used by the rt_mutex code to implement priority inheritance 3722 * Used by the rt_mutex code to implement priority inheritance
3683 * logic. Call site only calls if the priority of the task changed. 3723 * logic. Call site only calls if the priority of the task changed.
3684 */ 3724 */
3685void rt_mutex_setprio(struct task_struct *p, int prio) 3725void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
3686{ 3726{
3687 int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; 3727 int prio, oldprio, queued, running, queue_flag =
3728 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
3688 const struct sched_class *prev_class; 3729 const struct sched_class *prev_class;
3689 struct rq_flags rf; 3730 struct rq_flags rf;
3690 struct rq *rq; 3731 struct rq *rq;
3691 3732
3692 BUG_ON(prio > MAX_PRIO); 3733 /* XXX used to be waiter->prio, not waiter->task->prio */
3734 prio = __rt_effective_prio(pi_task, p->normal_prio);
3735
3736 /*
3737 * If nothing changed; bail early.
3738 */
3739 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
3740 return;
3693 3741
3694 rq = __task_rq_lock(p, &rf); 3742 rq = __task_rq_lock(p, &rf);
3695 update_rq_clock(rq); 3743 update_rq_clock(rq);
3744 /*
3745 * Set under pi_lock && rq->lock, such that the value can be used under
3746 * either lock.
3747 *
3748 * Note that it takes loads of trickiness to make this pointer cache work
3749 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
3750 * ensure a task is de-boosted (pi_task is set to NULL) before the
3751 * task is allowed to run again (and can exit). This ensures the pointer
3752 * points to a blocked task -- which guarantees the task is present.
3753 */
3754 p->pi_top_task = pi_task;
3755
3756 /*
3757 * For FIFO/RR we only need to set prio, if that matches we're done.
3758 */
3759 if (prio == p->prio && !dl_prio(prio))
3760 goto out_unlock;
3696 3761
3697 /* 3762 /*
3698 * Idle task boosting is a nono in general. There is one 3763 * Idle task boosting is a nono in general. There is one
@@ -3712,7 +3777,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3712 goto out_unlock; 3777 goto out_unlock;
3713 } 3778 }
3714 3779
3715 trace_sched_pi_setprio(p, prio); 3780 trace_sched_pi_setprio(p, pi_task);
3716 oldprio = p->prio; 3781 oldprio = p->prio;
3717 3782
3718 if (oldprio == prio) 3783 if (oldprio == prio)
@@ -3736,7 +3801,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3736 * running task 3801 * running task
3737 */ 3802 */
3738 if (dl_prio(prio)) { 3803 if (dl_prio(prio)) {
3739 struct task_struct *pi_task = rt_mutex_get_top_task(p);
3740 if (!dl_prio(p->normal_prio) || 3804 if (!dl_prio(p->normal_prio) ||
3741 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { 3805 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3742 p->dl.dl_boosted = 1; 3806 p->dl.dl_boosted = 1;
@@ -3774,6 +3838,11 @@ out_unlock:
3774 balance_callback(rq); 3838 balance_callback(rq);
3775 preempt_enable(); 3839 preempt_enable();
3776} 3840}
3841#else
3842static inline int rt_effective_prio(struct task_struct *p, int prio)
3843{
3844 return prio;
3845}
3777#endif 3846#endif
3778 3847
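
The __rt_effective_prio()/rt_effective_prio() helpers added above reduce the priority-inheritance rule to one line: a lock holder runs at the minimum of its own priority and its top donor's priority (lower numbers mean higher priority), and with no donor it keeps its own. A tiny worked example with arbitrary kernel-internal-style priority values:

#include <stdio.h>

static int effective_prio(int own_prio, int top_waiter_prio)
{
	/* A negative waiter priority stands for "no donor" (pi_task == NULL). */
	if (top_waiter_prio < 0)
		return own_prio;
	return top_waiter_prio < own_prio ? top_waiter_prio : own_prio;
}

int main(void)
{
	printf("%d\n", effective_prio(120, 98));	/* boosted to 98   */
	printf("%d\n", effective_prio(98, 120));	/* stays at 98     */
	printf("%d\n", effective_prio(120, -1));	/* no donor: 120   */
	return 0;
}
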
3779void set_user_nice(struct task_struct *p, long nice) 3848void set_user_nice(struct task_struct *p, long nice)
@@ -3805,7 +3874,7 @@ void set_user_nice(struct task_struct *p, long nice)
3805 queued = task_on_rq_queued(p); 3874 queued = task_on_rq_queued(p);
3806 running = task_current(rq, p); 3875 running = task_current(rq, p);
3807 if (queued) 3876 if (queued)
3808 dequeue_task(rq, p, DEQUEUE_SAVE); 3877 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
3809 if (running) 3878 if (running)
3810 put_prev_task(rq, p); 3879 put_prev_task(rq, p);
3811 3880
@@ -3816,7 +3885,7 @@ void set_user_nice(struct task_struct *p, long nice)
3816 delta = p->prio - old_prio; 3885 delta = p->prio - old_prio;
3817 3886
3818 if (queued) { 3887 if (queued) {
3819 enqueue_task(rq, p, ENQUEUE_RESTORE); 3888 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
3820 /* 3889 /*
3821 * If the task increased its priority or is running and 3890 * If the task increased its priority or is running and
3822 * lowered its priority, then reschedule its CPU: 3891 * lowered its priority, then reschedule its CPU:
@@ -4020,10 +4089,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
4020 * Keep a potential priority boosting if called from 4089 * Keep a potential priority boosting if called from
4021 * sched_setscheduler(). 4090 * sched_setscheduler().
4022 */ 4091 */
4092 p->prio = normal_prio(p);
4023 if (keep_boost) 4093 if (keep_boost)
4024 p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); 4094 p->prio = rt_effective_prio(p, p->prio);
4025 else
4026 p->prio = normal_prio(p);
4027 4095
4028 if (dl_prio(p->prio)) 4096 if (dl_prio(p->prio))
4029 p->sched_class = &dl_sched_class; 4097 p->sched_class = &dl_sched_class;
@@ -4126,7 +4194,7 @@ static int __sched_setscheduler(struct task_struct *p,
4126 const struct sched_class *prev_class; 4194 const struct sched_class *prev_class;
4127 struct rq_flags rf; 4195 struct rq_flags rf;
4128 int reset_on_fork; 4196 int reset_on_fork;
4129 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 4197 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
4130 struct rq *rq; 4198 struct rq *rq;
4131 4199
4132 /* May grab non-irq protected spin_locks: */ 4200 /* May grab non-irq protected spin_locks: */
@@ -4310,7 +4378,7 @@ change:
4310 * the runqueue. This will be done when the task deboost 4378 * the runqueue. This will be done when the task deboost
4311 * itself. 4379 * itself.
4312 */ 4380 */
4313 new_effective_prio = rt_mutex_get_effective_prio(p, newprio); 4381 new_effective_prio = rt_effective_prio(p, newprio);
4314 if (new_effective_prio == oldprio) 4382 if (new_effective_prio == oldprio)
4315 queue_flags &= ~DEQUEUE_MOVE; 4383 queue_flags &= ~DEQUEUE_MOVE;
4316 } 4384 }
@@ -4923,7 +4991,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4923 */ 4991 */
4924SYSCALL_DEFINE0(sched_yield) 4992SYSCALL_DEFINE0(sched_yield)
4925{ 4993{
4926 struct rq *rq = this_rq_lock(); 4994 struct rq_flags rf;
4995 struct rq *rq;
4996
4997 local_irq_disable();
4998 rq = this_rq();
4999 rq_lock(rq, &rf);
4927 5000
4928 schedstat_inc(rq->yld_count); 5001 schedstat_inc(rq->yld_count);
4929 current->sched_class->yield_task(rq); 5002 current->sched_class->yield_task(rq);
@@ -4932,9 +5005,8 @@ SYSCALL_DEFINE0(sched_yield)
4932 * Since we are going to call schedule() anyway, there's 5005 * Since we are going to call schedule() anyway, there's
4933 * no need to preempt or enable interrupts: 5006 * no need to preempt or enable interrupts:
4934 */ 5007 */
4935 __release(rq->lock); 5008 preempt_disable();
4936 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 5009 rq_unlock(rq, &rf);
4937 do_raw_spin_unlock(&rq->lock);
4938 sched_preempt_enable_no_resched(); 5010 sched_preempt_enable_no_resched();
4939 5011
4940 schedule(); 5012 schedule();
@@ -5514,7 +5586,7 @@ void sched_setnuma(struct task_struct *p, int nid)
5514 p->numa_preferred_nid = nid; 5586 p->numa_preferred_nid = nid;
5515 5587
5516 if (queued) 5588 if (queued)
5517 enqueue_task(rq, p, ENQUEUE_RESTORE); 5589 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
5518 if (running) 5590 if (running)
5519 set_curr_task(rq, p); 5591 set_curr_task(rq, p);
5520 task_rq_unlock(rq, p, &rf); 5592 task_rq_unlock(rq, p, &rf);
@@ -5579,11 +5651,11 @@ static struct task_struct fake_task = {
5579 * there's no concurrency possible, we hold the required locks anyway 5651 * there's no concurrency possible, we hold the required locks anyway
5580 * because of lock validation efforts. 5652 * because of lock validation efforts.
5581 */ 5653 */
5582static void migrate_tasks(struct rq *dead_rq) 5654static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
5583{ 5655{
5584 struct rq *rq = dead_rq; 5656 struct rq *rq = dead_rq;
5585 struct task_struct *next, *stop = rq->stop; 5657 struct task_struct *next, *stop = rq->stop;
5586 struct rq_flags rf; 5658 struct rq_flags orf = *rf;
5587 int dest_cpu; 5659 int dest_cpu;
5588 5660
5589 /* 5661 /*
@@ -5602,9 +5674,7 @@ static void migrate_tasks(struct rq *dead_rq)
5602 * class method both need to have an up-to-date 5674 * class method both need to have an up-to-date
5603 * value of rq->clock[_task] 5675 * value of rq->clock[_task]
5604 */ 5676 */
5605 rq_pin_lock(rq, &rf);
5606 update_rq_clock(rq); 5677 update_rq_clock(rq);
5607 rq_unpin_lock(rq, &rf);
5608 5678
5609 for (;;) { 5679 for (;;) {
5610 /* 5680 /*
@@ -5617,8 +5687,7 @@ static void migrate_tasks(struct rq *dead_rq)
5617 /* 5687 /*
5618 * pick_next_task() assumes pinned rq->lock: 5688 * pick_next_task() assumes pinned rq->lock:
5619 */ 5689 */
5620 rq_repin_lock(rq, &rf); 5690 next = pick_next_task(rq, &fake_task, rf);
5621 next = pick_next_task(rq, &fake_task, &rf);
5622 BUG_ON(!next); 5691 BUG_ON(!next);
5623 next->sched_class->put_prev_task(rq, next); 5692 next->sched_class->put_prev_task(rq, next);
5624 5693
@@ -5631,10 +5700,9 @@ static void migrate_tasks(struct rq *dead_rq)
5631 * because !cpu_active at this point, which means load-balance 5700 * because !cpu_active at this point, which means load-balance
5632 * will not interfere. Also, stop-machine. 5701 * will not interfere. Also, stop-machine.
5633 */ 5702 */
5634 rq_unpin_lock(rq, &rf); 5703 rq_unlock(rq, rf);
5635 raw_spin_unlock(&rq->lock);
5636 raw_spin_lock(&next->pi_lock); 5704 raw_spin_lock(&next->pi_lock);
5637 raw_spin_lock(&rq->lock); 5705 rq_relock(rq, rf);
5638 5706
5639 /* 5707 /*
5640 * Since we're inside stop-machine, _nothing_ should have 5708 * Since we're inside stop-machine, _nothing_ should have
@@ -5648,12 +5716,12 @@ static void migrate_tasks(struct rq *dead_rq)
5648 5716
5649 /* Find suitable destination for @next, with force if needed. */ 5717 /* Find suitable destination for @next, with force if needed. */
5650 dest_cpu = select_fallback_rq(dead_rq->cpu, next); 5718 dest_cpu = select_fallback_rq(dead_rq->cpu, next);
5651 5719 rq = __migrate_task(rq, rf, next, dest_cpu);
5652 rq = __migrate_task(rq, next, dest_cpu);
5653 if (rq != dead_rq) { 5720 if (rq != dead_rq) {
5654 raw_spin_unlock(&rq->lock); 5721 rq_unlock(rq, rf);
5655 rq = dead_rq; 5722 rq = dead_rq;
5656 raw_spin_lock(&rq->lock); 5723 *rf = orf;
5724 rq_relock(rq, rf);
5657 } 5725 }
5658 raw_spin_unlock(&next->pi_lock); 5726 raw_spin_unlock(&next->pi_lock);
5659 } 5727 }
@@ -5732,7 +5800,7 @@ static void cpuset_cpu_active(void)
5732 * cpuset configurations. 5800 * cpuset configurations.
5733 */ 5801 */
5734 } 5802 }
5735 cpuset_update_active_cpus(true); 5803 cpuset_update_active_cpus();
5736} 5804}
5737 5805
5738static int cpuset_cpu_inactive(unsigned int cpu) 5806static int cpuset_cpu_inactive(unsigned int cpu)
@@ -5755,7 +5823,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
5755 5823
5756 if (overflow) 5824 if (overflow)
5757 return -EBUSY; 5825 return -EBUSY;
5758 cpuset_update_active_cpus(false); 5826 cpuset_update_active_cpus();
5759 } else { 5827 } else {
5760 num_cpus_frozen++; 5828 num_cpus_frozen++;
5761 partition_sched_domains(1, NULL, NULL); 5829 partition_sched_domains(1, NULL, NULL);
@@ -5766,7 +5834,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
5766int sched_cpu_activate(unsigned int cpu) 5834int sched_cpu_activate(unsigned int cpu)
5767{ 5835{
5768 struct rq *rq = cpu_rq(cpu); 5836 struct rq *rq = cpu_rq(cpu);
5769 unsigned long flags; 5837 struct rq_flags rf;
5770 5838
5771 set_cpu_active(cpu, true); 5839 set_cpu_active(cpu, true);
5772 5840
@@ -5784,12 +5852,12 @@ int sched_cpu_activate(unsigned int cpu)
5784 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the 5852 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
5785 * domains. 5853 * domains.
5786 */ 5854 */
5787 raw_spin_lock_irqsave(&rq->lock, flags); 5855 rq_lock_irqsave(rq, &rf);
5788 if (rq->rd) { 5856 if (rq->rd) {
5789 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5857 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5790 set_rq_online(rq); 5858 set_rq_online(rq);
5791 } 5859 }
5792 raw_spin_unlock_irqrestore(&rq->lock, flags); 5860 rq_unlock_irqrestore(rq, &rf);
5793 5861
5794 update_max_interval(); 5862 update_max_interval();
5795 5863
@@ -5847,18 +5915,20 @@ int sched_cpu_starting(unsigned int cpu)
5847int sched_cpu_dying(unsigned int cpu) 5915int sched_cpu_dying(unsigned int cpu)
5848{ 5916{
5849 struct rq *rq = cpu_rq(cpu); 5917 struct rq *rq = cpu_rq(cpu);
5850 unsigned long flags; 5918 struct rq_flags rf;
5851 5919
5852 /* Handle pending wakeups and then migrate everything off */ 5920 /* Handle pending wakeups and then migrate everything off */
5853 sched_ttwu_pending(); 5921 sched_ttwu_pending();
5854 raw_spin_lock_irqsave(&rq->lock, flags); 5922
5923 rq_lock_irqsave(rq, &rf);
5855 if (rq->rd) { 5924 if (rq->rd) {
5856 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5925 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5857 set_rq_offline(rq); 5926 set_rq_offline(rq);
5858 } 5927 }
5859 migrate_tasks(rq); 5928 migrate_tasks(rq, &rf);
5860 BUG_ON(rq->nr_running != 1); 5929 BUG_ON(rq->nr_running != 1);
5861 raw_spin_unlock_irqrestore(&rq->lock, flags); 5930 rq_unlock_irqrestore(rq, &rf);
5931
5862 calc_load_migrate(rq); 5932 calc_load_migrate(rq);
5863 update_max_interval(); 5933 update_max_interval();
5864 nohz_balance_exit_idle(cpu); 5934 nohz_balance_exit_idle(cpu);
@@ -6412,7 +6482,8 @@ static void sched_change_group(struct task_struct *tsk, int type)
6412 */ 6482 */
6413void sched_move_task(struct task_struct *tsk) 6483void sched_move_task(struct task_struct *tsk)
6414{ 6484{
6415 int queued, running; 6485 int queued, running, queue_flags =
6486 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
6416 struct rq_flags rf; 6487 struct rq_flags rf;
6417 struct rq *rq; 6488 struct rq *rq;
6418 6489
@@ -6423,14 +6494,14 @@ void sched_move_task(struct task_struct *tsk)
6423 queued = task_on_rq_queued(tsk); 6494 queued = task_on_rq_queued(tsk);
6424 6495
6425 if (queued) 6496 if (queued)
6426 dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); 6497 dequeue_task(rq, tsk, queue_flags);
6427 if (running) 6498 if (running)
6428 put_prev_task(rq, tsk); 6499 put_prev_task(rq, tsk);
6429 6500
6430 sched_change_group(tsk, TASK_MOVE_GROUP); 6501 sched_change_group(tsk, TASK_MOVE_GROUP);
6431 6502
6432 if (queued) 6503 if (queued)
6433 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); 6504 enqueue_task(rq, tsk, queue_flags);
6434 if (running) 6505 if (running)
6435 set_curr_task(rq, tsk); 6506 set_curr_task(rq, tsk);
6436 6507
@@ -7008,14 +7079,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7008 for_each_online_cpu(i) { 7079 for_each_online_cpu(i) {
7009 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7080 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7010 struct rq *rq = cfs_rq->rq; 7081 struct rq *rq = cfs_rq->rq;
7082 struct rq_flags rf;
7011 7083
7012 raw_spin_lock_irq(&rq->lock); 7084 rq_lock_irq(rq, &rf);
7013 cfs_rq->runtime_enabled = runtime_enabled; 7085 cfs_rq->runtime_enabled = runtime_enabled;
7014 cfs_rq->runtime_remaining = 0; 7086 cfs_rq->runtime_remaining = 0;
7015 7087
7016 if (cfs_rq->throttled) 7088 if (cfs_rq->throttled)
7017 unthrottle_cfs_rq(cfs_rq); 7089 unthrottle_cfs_rq(cfs_rq);
7018 raw_spin_unlock_irq(&rq->lock); 7090 rq_unlock_irq(rq, &rf);
7019 } 7091 }
7020 if (runtime_was_enabled && !runtime_enabled) 7092 if (runtime_was_enabled && !runtime_enabled)
7021 cfs_bandwidth_usage_dec(); 7093 cfs_bandwidth_usage_dec();
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 54c577578da6..622eed1b7658 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -61,6 +61,11 @@ struct sugov_cpu {
61 unsigned long util; 61 unsigned long util;
62 unsigned long max; 62 unsigned long max;
63 unsigned int flags; 63 unsigned int flags;
64
65 /* The field below is for single-CPU policies only. */
66#ifdef CONFIG_NO_HZ_COMMON
67 unsigned long saved_idle_calls;
68#endif
64}; 69};
65 70
66static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); 71static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -93,22 +98,23 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
93{ 98{
94 struct cpufreq_policy *policy = sg_policy->policy; 99 struct cpufreq_policy *policy = sg_policy->policy;
95 100
101 if (sg_policy->next_freq == next_freq)
102 return;
103
104 if (sg_policy->next_freq > next_freq)
105 next_freq = (sg_policy->next_freq + next_freq) >> 1;
106
107 sg_policy->next_freq = next_freq;
96 sg_policy->last_freq_update_time = time; 108 sg_policy->last_freq_update_time = time;
97 109
98 if (policy->fast_switch_enabled) { 110 if (policy->fast_switch_enabled) {
99 if (sg_policy->next_freq == next_freq) {
100 trace_cpu_frequency(policy->cur, smp_processor_id());
101 return;
102 }
103 sg_policy->next_freq = next_freq;
104 next_freq = cpufreq_driver_fast_switch(policy, next_freq); 111 next_freq = cpufreq_driver_fast_switch(policy, next_freq);
105 if (next_freq == CPUFREQ_ENTRY_INVALID) 112 if (next_freq == CPUFREQ_ENTRY_INVALID)
106 return; 113 return;
107 114
108 policy->cur = next_freq; 115 policy->cur = next_freq;
109 trace_cpu_frequency(next_freq, smp_processor_id()); 116 trace_cpu_frequency(next_freq, smp_processor_id());
110 } else if (sg_policy->next_freq != next_freq) { 117 } else {
111 sg_policy->next_freq = next_freq;
112 sg_policy->work_in_progress = true; 118 sg_policy->work_in_progress = true;
113 irq_work_queue(&sg_policy->irq_work); 119 irq_work_queue(&sg_policy->irq_work);
114 } 120 }
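
The rewritten sugov_update_commit() above returns early when the requested frequency equals the cached next_freq and, when the request is lower, only goes half-way down. A small stand-alone sketch of that ramp-down behaviour (the kHz values are made up):

#include <stdio.h>

static unsigned int cached_freq = 2000000;   /* kHz, hypothetical */

static unsigned int commit(unsigned int next_freq)
{
    if (cached_freq == next_freq)
        return cached_freq;                  /* nothing to do */

    if (cached_freq > next_freq)             /* reducing: only go half-way */
        next_freq = (cached_freq + next_freq) >> 1;

    cached_freq = next_freq;
    return cached_freq;
}

int main(void)
{
    unsigned int target = 800000;

    for (int i = 0; i < 6; i++)
        printf("step %d -> %u kHz\n", i, commit(target));
    /* prints 1400000, 1100000, 950000, 875000, 837500, 818750 */
    return 0;
}

Each repeated low request halves the remaining gap, so one noisy sample can no longer drop the frequency all the way to the target in a single step.
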
@@ -192,6 +198,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
192 sg_cpu->iowait_boost >>= 1; 198 sg_cpu->iowait_boost >>= 1;
193} 199}
194 200
201#ifdef CONFIG_NO_HZ_COMMON
202static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
203{
204 unsigned long idle_calls = tick_nohz_get_idle_calls();
205 bool ret = idle_calls == sg_cpu->saved_idle_calls;
206
207 sg_cpu->saved_idle_calls = idle_calls;
208 return ret;
209}
210#else
211static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
212#endif /* CONFIG_NO_HZ_COMMON */
213
195static void sugov_update_single(struct update_util_data *hook, u64 time, 214static void sugov_update_single(struct update_util_data *hook, u64 time,
196 unsigned int flags) 215 unsigned int flags)
197{ 216{
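
sugov_cpu_is_busy() above decides "busy" purely from whether the CPU's idle-loop entry count moved since the previous sample. A user-space model of the same comparison (the counter values are invented; the kernel obtains them via tick_nohz_get_idle_calls()):

#include <stdio.h>
#include <stdbool.h>

static unsigned long saved_idle_calls;

static bool cpu_is_busy(unsigned long idle_calls)
{
    bool busy = (idle_calls == saved_idle_calls);  /* no new idle entries -> busy */

    saved_idle_calls = idle_calls;                 /* remember for the next sample */
    return busy;
}

int main(void)
{
    unsigned long samples[] = { 10, 10, 11, 11, 11, 12 };

    for (int i = 0; i < 6; i++)
        printf("sample %lu -> %s\n", samples[i],
               cpu_is_busy(samples[i]) ? "busy" : "was idle");
    return 0;
}

In the hunk that follows, a busy CPU simply refuses to pick a frequency below the cached next_freq.
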
@@ -200,6 +219,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
200 struct cpufreq_policy *policy = sg_policy->policy; 219 struct cpufreq_policy *policy = sg_policy->policy;
201 unsigned long util, max; 220 unsigned long util, max;
202 unsigned int next_f; 221 unsigned int next_f;
222 bool busy;
203 223
204 sugov_set_iowait_boost(sg_cpu, time, flags); 224 sugov_set_iowait_boost(sg_cpu, time, flags);
205 sg_cpu->last_update = time; 225 sg_cpu->last_update = time;
@@ -207,40 +227,36 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
207 if (!sugov_should_update_freq(sg_policy, time)) 227 if (!sugov_should_update_freq(sg_policy, time))
208 return; 228 return;
209 229
230 busy = sugov_cpu_is_busy(sg_cpu);
231
210 if (flags & SCHED_CPUFREQ_RT_DL) { 232 if (flags & SCHED_CPUFREQ_RT_DL) {
211 next_f = policy->cpuinfo.max_freq; 233 next_f = policy->cpuinfo.max_freq;
212 } else { 234 } else {
213 sugov_get_util(&util, &max); 235 sugov_get_util(&util, &max);
214 sugov_iowait_boost(sg_cpu, &util, &max); 236 sugov_iowait_boost(sg_cpu, &util, &max);
215 next_f = get_next_freq(sg_policy, util, max); 237 next_f = get_next_freq(sg_policy, util, max);
238 /*
239 * Do not reduce the frequency if the CPU has not been idle
240 * recently, as the reduction is likely to be premature then.
241 */
242 if (busy && next_f < sg_policy->next_freq)
243 next_f = sg_policy->next_freq;
216 } 244 }
217 sugov_update_commit(sg_policy, time, next_f); 245 sugov_update_commit(sg_policy, time, next_f);
218} 246}
219 247
220static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, 248static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
221 unsigned long util, unsigned long max,
222 unsigned int flags)
223{ 249{
224 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 250 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
225 struct cpufreq_policy *policy = sg_policy->policy; 251 struct cpufreq_policy *policy = sg_policy->policy;
226 unsigned int max_f = policy->cpuinfo.max_freq; 252 unsigned long util = 0, max = 1;
227 u64 last_freq_update_time = sg_policy->last_freq_update_time;
228 unsigned int j; 253 unsigned int j;
229 254
230 if (flags & SCHED_CPUFREQ_RT_DL)
231 return max_f;
232
233 sugov_iowait_boost(sg_cpu, &util, &max);
234
235 for_each_cpu(j, policy->cpus) { 255 for_each_cpu(j, policy->cpus) {
236 struct sugov_cpu *j_sg_cpu; 256 struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
237 unsigned long j_util, j_max; 257 unsigned long j_util, j_max;
238 s64 delta_ns; 258 s64 delta_ns;
239 259
240 if (j == smp_processor_id())
241 continue;
242
243 j_sg_cpu = &per_cpu(sugov_cpu, j);
244 /* 260 /*
245 * If the CPU utilization was last updated before the previous 261 * If the CPU utilization was last updated before the previous
246 * frequency update and the time elapsed between the last update 262 * frequency update and the time elapsed between the last update
@@ -248,13 +264,13 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
248 * enough, don't take the CPU into account as it probably is 264 * enough, don't take the CPU into account as it probably is
249 * idle now (and clear iowait_boost for it). 265 * idle now (and clear iowait_boost for it).
250 */ 266 */
251 delta_ns = last_freq_update_time - j_sg_cpu->last_update; 267 delta_ns = time - j_sg_cpu->last_update;
252 if (delta_ns > TICK_NSEC) { 268 if (delta_ns > TICK_NSEC) {
253 j_sg_cpu->iowait_boost = 0; 269 j_sg_cpu->iowait_boost = 0;
254 continue; 270 continue;
255 } 271 }
256 if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) 272 if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
257 return max_f; 273 return policy->cpuinfo.max_freq;
258 274
259 j_util = j_sg_cpu->util; 275 j_util = j_sg_cpu->util;
260 j_max = j_sg_cpu->max; 276 j_max = j_sg_cpu->max;
@@ -289,7 +305,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
289 sg_cpu->last_update = time; 305 sg_cpu->last_update = time;
290 306
291 if (sugov_should_update_freq(sg_policy, time)) { 307 if (sugov_should_update_freq(sg_policy, time)) {
292 next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); 308 if (flags & SCHED_CPUFREQ_RT_DL)
309 next_f = sg_policy->policy->cpuinfo.max_freq;
310 else
311 next_f = sugov_next_freq_shared(sg_cpu, time);
312
293 sugov_update_commit(sg_policy, time, next_f); 313 sugov_update_commit(sg_policy, time, next_f);
294 } 314 }
295 315
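
The reworked shared-policy path above walks every CPU of the policy, drops samples older than a tick, returns the maximum frequency as soon as one CPU carries RT/DL load, and otherwise keeps the CPU whose utilization is largest relative to its capacity. A rough stand-alone sketch under those assumptions; the final frequency formula here is a plain proportion rather than the kernel's get_next_freq(), and all constants are made up.

#include <stdio.h>

#define TICK_NS    4000000ULL    /* 250 Hz tick, for the sketch */
#define MAX_FREQ   2000000U      /* kHz */
#define FLAG_RT_DL 0x1

struct cpu_sample {
    unsigned long util, max;             /* last reported utilization and capacity */
    unsigned long long last_update;
    unsigned int flags;
};

static unsigned int next_freq_shared(struct cpu_sample *s, int nr,
                                     unsigned long long now)
{
    unsigned long util = 0, max = 1;

    for (int j = 0; j < nr; j++) {
        if (now - s[j].last_update > TICK_NS)
            continue;                    /* stale: the CPU is probably idle */
        if (s[j].flags & FLAG_RT_DL)
            return MAX_FREQ;             /* RT/DL always gets the maximum */
        if (s[j].util * max > util * s[j].max) {  /* compare util/max ratios */
            util = s[j].util;
            max = s[j].max;
        }
    }
    return (unsigned int)(MAX_FREQ * util / max); /* proportional stand-in */
}

int main(void)
{
    struct cpu_sample s[] = {
        { 300, 1024, 8000000ULL, 0 },    /* stale: older than a tick */
        { 512, 1024, 12800000ULL, 0 },   /* 50% busy */
        { 256, 1024, 12900000ULL, 0 },   /* 25% busy */
    };

    printf("next freq: %u kHz\n", next_freq_shared(s, 3, 13000000ULL));
    return 0;
}
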
@@ -473,7 +493,6 @@ static int sugov_init(struct cpufreq_policy *policy)
473{ 493{
474 struct sugov_policy *sg_policy; 494 struct sugov_policy *sg_policy;
475 struct sugov_tunables *tunables; 495 struct sugov_tunables *tunables;
476 unsigned int lat;
477 int ret = 0; 496 int ret = 0;
478 497
479 /* State should be equivalent to EXIT */ 498 /* State should be equivalent to EXIT */
@@ -512,10 +531,16 @@ static int sugov_init(struct cpufreq_policy *policy)
512 goto stop_kthread; 531 goto stop_kthread;
513 } 532 }
514 533
515 tunables->rate_limit_us = LATENCY_MULTIPLIER; 534 if (policy->transition_delay_us) {
516 lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; 535 tunables->rate_limit_us = policy->transition_delay_us;
517 if (lat) 536 } else {
518 tunables->rate_limit_us *= lat; 537 unsigned int lat;
538
539 tunables->rate_limit_us = LATENCY_MULTIPLIER;
540 lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
541 if (lat)
542 tunables->rate_limit_us *= lat;
543 }
519 544
520 policy->governor_data = sg_policy; 545 policy->governor_data = sg_policy;
521 sg_policy->tunables = tunables; 546 sg_policy->tunables = tunables;
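
The sugov_init() hunk above now prefers a driver-supplied transition_delay_us and only falls back to LATENCY_MULTIPLIER times the transition latency in microseconds. A tiny model of that selection, using 1000 for LATENCY_MULTIPLIER (an assumption for the sketch):

#include <stdio.h>

#define LATENCY_MULTIPLIER 1000
#define NSEC_PER_USEC      1000

static unsigned int pick_rate_limit_us(unsigned int transition_delay_us,
                                       unsigned int transition_latency_ns)
{
    if (transition_delay_us)
        return transition_delay_us;          /* driver gave an explicit delay */

    unsigned int lat = transition_latency_ns / NSEC_PER_USEC;
    return lat ? LATENCY_MULTIPLIER * lat : LATENCY_MULTIPLIER;
}

int main(void)
{
    printf("%u\n", pick_rate_limit_us(500, 0));     /* driver says 500 us */
    printf("%u\n", pick_rate_limit_us(0, 20000));   /* 20 us latency -> 20000 */
    printf("%u\n", pick_rate_limit_us(0, 0));       /* fallback: 1000 */
    return 0;
}
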
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b46c8..aea3135c5d90 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
34 sched_clock_irqtime = 0; 34 sched_clock_irqtime = 0;
35} 35}
36 36
37static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
38 enum cpu_usage_stat idx)
39{
40 u64 *cpustat = kcpustat_this_cpu->cpustat;
41
42 u64_stats_update_begin(&irqtime->sync);
43 cpustat[idx] += delta;
44 irqtime->total += delta;
45 irqtime->tick_delta += delta;
46 u64_stats_update_end(&irqtime->sync);
47}
48
37/* 49/*
38 * Called before incrementing preempt_count on {soft,}irq_enter 50 * Called before incrementing preempt_count on {soft,}irq_enter
39 * and before decrementing preempt_count on {soft,}irq_exit. 51 * and before decrementing preempt_count on {soft,}irq_exit.
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
41void irqtime_account_irq(struct task_struct *curr) 53void irqtime_account_irq(struct task_struct *curr)
42{ 54{
43 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); 55 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
44 u64 *cpustat = kcpustat_this_cpu->cpustat;
45 s64 delta; 56 s64 delta;
46 int cpu; 57 int cpu;
47 58
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
52 delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; 63 delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
53 irqtime->irq_start_time += delta; 64 irqtime->irq_start_time += delta;
54 65
55 u64_stats_update_begin(&irqtime->sync);
56 /* 66 /*
57 * We do not account for softirq time from ksoftirqd here. 67 * We do not account for softirq time from ksoftirqd here.
58 * We want to continue accounting softirq time to ksoftirqd thread 68 * We want to continue accounting softirq time to ksoftirqd thread
59 * in that case, so as not to confuse scheduler with a special task 69 * in that case, so as not to confuse scheduler with a special task
60 * that do not consume any time, but still wants to run. 70 * that do not consume any time, but still wants to run.
61 */ 71 */
62 if (hardirq_count()) { 72 if (hardirq_count())
63 cpustat[CPUTIME_IRQ] += delta; 73 irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
64 irqtime->tick_delta += delta; 74 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
65 } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) { 75 irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
66 cpustat[CPUTIME_SOFTIRQ] += delta;
67 irqtime->tick_delta += delta;
68 }
69
70 u64_stats_update_end(&irqtime->sync);
71} 76}
72EXPORT_SYMBOL_GPL(irqtime_account_irq); 77EXPORT_SYMBOL_GPL(irqtime_account_irq);
73 78
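
The cputime.c change above is a pure refactor: the duplicated "add delta to the cpustat bucket, the running total and the tick delta under the stats sequence lock" block moves into irqtime_account_delta(). A user-space analogue with a plain struct (no seqcount, names invented):

#include <stdio.h>

enum usage_stat { STAT_IRQ, STAT_SOFTIRQ, STAT_NR };

struct irqtime_model {
    unsigned long long cpustat[STAT_NR];
    unsigned long long total;
    unsigned long long tick_delta;
};

static void account_delta(struct irqtime_model *it, unsigned long long delta,
                          enum usage_stat idx)
{
    /* one place updates all three counters consistently */
    it->cpustat[idx] += delta;
    it->total       += delta;
    it->tick_delta  += delta;
}

int main(void)
{
    struct irqtime_model it = { { 0 }, 0, 0 };

    account_delta(&it, 1200, STAT_IRQ);       /* hardirq time */
    account_delta(&it, 300, STAT_SOFTIRQ);    /* softirq time (not ksoftirqd) */
    printf("irq=%llu softirq=%llu total=%llu\n",
           it.cpustat[STAT_IRQ], it.cpustat[STAT_SOFTIRQ], it.total);
    return 0;
}
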
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dea138964b91..d71109321841 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -717,18 +717,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
717} 717}
718 718
719#ifdef CONFIG_SMP 719#ifdef CONFIG_SMP
720
721#include "sched-pelt.h"
722
720static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); 723static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
721static unsigned long task_h_load(struct task_struct *p); 724static unsigned long task_h_load(struct task_struct *p);
722 725
723/*
724 * We choose a half-life close to 1 scheduling period.
725 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
726 * dependent on this value.
727 */
728#define LOAD_AVG_PERIOD 32
729#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
730#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
731
732/* Give new sched_entity start runnable values to heavy its load in infant time */ 726/* Give new sched_entity start runnable values to heavy its load in infant time */
733void init_entity_runnable_average(struct sched_entity *se) 727void init_entity_runnable_average(struct sched_entity *se)
734{ 728{
@@ -2733,47 +2727,15 @@ static inline void update_cfs_shares(struct sched_entity *se)
2733#endif /* CONFIG_FAIR_GROUP_SCHED */ 2727#endif /* CONFIG_FAIR_GROUP_SCHED */
2734 2728
2735#ifdef CONFIG_SMP 2729#ifdef CONFIG_SMP
2736/* Precomputed fixed inverse multiplies for multiplication by y^n */
2737static const u32 runnable_avg_yN_inv[] = {
2738 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2739 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2740 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2741 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2742 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2743 0x85aac367, 0x82cd8698,
2744};
2745
2746/*
2747 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
2748 * over-estimates when re-combining.
2749 */
2750static const u32 runnable_avg_yN_sum[] = {
2751 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2752 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2753 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2754};
2755
2756/*
2757 * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
2758 * lower integers. See Documentation/scheduler/sched-avg.txt how these
2759 * were generated:
2760 */
2761static const u32 __accumulated_sum_N32[] = {
2762 0, 23371, 35056, 40899, 43820, 45281,
2763 46011, 46376, 46559, 46650, 46696, 46719,
2764};
2765
2766/* 2730/*
2767 * Approximate: 2731 * Approximate:
2768 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) 2732 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
2769 */ 2733 */
2770static __always_inline u64 decay_load(u64 val, u64 n) 2734static u64 decay_load(u64 val, u64 n)
2771{ 2735{
2772 unsigned int local_n; 2736 unsigned int local_n;
2773 2737
2774 if (!n) 2738 if (unlikely(n > LOAD_AVG_PERIOD * 63))
2775 return val;
2776 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2777 return 0; 2739 return 0;
2778 2740
2779 /* after bounds checking we can collapse to 32-bit */ 2741 /* after bounds checking we can collapse to 32-bit */
@@ -2795,30 +2757,97 @@ static __always_inline u64 decay_load(u64 val, u64 n)
2795 return val; 2757 return val;
2796} 2758}
2797 2759
2760static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
2761{
2762 u32 c1, c2, c3 = d3; /* y^0 == 1 */
2763
2764 /*
2765 * c1 = d1 y^p
2766 */
2767 c1 = decay_load((u64)d1, periods);
2768
2769 /*
2770 * p-1
2771 * c2 = 1024 \Sum y^n
2772 * n=1
2773 *
2774 * inf inf
2775 * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
2776 * n=0 n=p
2777 */
2778 c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
2779
2780 return c1 + c2 + c3;
2781}
2782
2783#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2784
2798/* 2785/*
2799 * For updates fully spanning n periods, the contribution to runnable 2786 * Accumulate the three separate parts of the sum; d1 the remainder
2800 * average will be: \Sum 1024*y^n 2787 * of the last (incomplete) period, d2 the span of full periods and d3
2788 * the remainder of the (incomplete) current period.
2789 *
2790 * d1 d2 d3
2791 * ^ ^ ^
2792 * | | |
2793 * |<->|<----------------->|<--->|
2794 * ... |---x---|------| ... |------|-----x (now)
2795 *
2796 * p-1
2797 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
2798 * n=1
2801 * 2799 *
2802 * We can compute this reasonably efficiently by combining: 2800 * = u y^p + (Step 1)
2803 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} 2801 *
2802 * p-1
2803 * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
2804 * n=1
2804 */ 2805 */
2805static u32 __compute_runnable_contrib(u64 n) 2806static __always_inline u32
2807accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
2808 unsigned long weight, int running, struct cfs_rq *cfs_rq)
2806{ 2809{
2807 u32 contrib = 0; 2810 unsigned long scale_freq, scale_cpu;
2811 u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
2812 u64 periods;
2808 2813
2809 if (likely(n <= LOAD_AVG_PERIOD)) 2814 scale_freq = arch_scale_freq_capacity(NULL, cpu);
2810 return runnable_avg_yN_sum[n]; 2815 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2811 else if (unlikely(n >= LOAD_AVG_MAX_N))
2812 return LOAD_AVG_MAX;
2813 2816
2814 /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ 2817 delta += sa->period_contrib;
2815 contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; 2818 periods = delta / 1024; /* A period is 1024us (~1ms) */
2816 n %= LOAD_AVG_PERIOD;
2817 contrib = decay_load(contrib, n);
2818 return contrib + runnable_avg_yN_sum[n];
2819}
2820 2819
2821#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) 2820 /*
2821 * Step 1: decay old *_sum if we crossed period boundaries.
2822 */
2823 if (periods) {
2824 sa->load_sum = decay_load(sa->load_sum, periods);
2825 if (cfs_rq) {
2826 cfs_rq->runnable_load_sum =
2827 decay_load(cfs_rq->runnable_load_sum, periods);
2828 }
2829 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
2830
2831 /*
2832 * Step 2
2833 */
2834 delta %= 1024;
2835 contrib = __accumulate_pelt_segments(periods,
2836 1024 - sa->period_contrib, delta);
2837 }
2838 sa->period_contrib = delta;
2839
2840 contrib = cap_scale(contrib, scale_freq);
2841 if (weight) {
2842 sa->load_sum += weight * contrib;
2843 if (cfs_rq)
2844 cfs_rq->runnable_load_sum += weight * contrib;
2845 }
2846 if (running)
2847 sa->util_sum += contrib * scale_cpu;
2848
2849 return periods;
2850}
2822 2851
2823/* 2852/*
2824 * We can represent the historical contribution to runnable average as the 2853 * We can represent the historical contribution to runnable average as the
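
The new accumulate_sum()/__accumulate_pelt_segments() above split the elapsed time into d1 (what was left of the old partially-filled period), p-1 whole 1024 us periods, and d3 (the start of the current period), and evaluate the middle geometric series in closed form as LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, p) - 1024. The stand-alone check below reuses the runnable_avg_yN_inv[] table from the new sched-pelt.h further down and compares the closed form against a brute-force floating-point sum; the kernel's mul_u64_u32_shr() is replaced by a plain 64-bit multiply, which is safe at these magnitudes. Build with -lm.

#include <stdio.h>
#include <stdint.h>
#include <math.h>

#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX    47742

/* runnable_avg_yN_inv[] from sched-pelt.h: floor(y^n * 2^32), y^32 == 1/2 */
static const uint32_t runnable_avg_yN_inv[] = {
    0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
    0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
    0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
    0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
    0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
    0x85aac367, 0x82cd8698,
};

/* val * y^n, as in the decay_load() hunk above */
static uint64_t decay_load(uint64_t val, uint64_t n)
{
    if (n > LOAD_AVG_PERIOD * 63)
        return 0;
    val >>= n / LOAD_AVG_PERIOD;              /* whole half-lives */
    n %= LOAD_AVG_PERIOD;
    return (val * runnable_avg_yN_inv[n]) >> 32;
}

/* c1 + c2 + c3, mirroring __accumulate_pelt_segments() above */
static uint32_t accumulate_pelt_segments(uint64_t periods, uint32_t d1, uint32_t d3)
{
    uint32_t c1 = decay_load(d1, periods);
    uint32_t c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;

    return c1 + c2 + d3;
}

int main(void)
{
    uint64_t periods = 4;
    uint32_t d1 = 300, d3 = 200;              /* microseconds in the boundary periods */
    double y = pow(0.5, 1.0 / LOAD_AVG_PERIOD);

    /* brute force: d1*y^p + 1024*(y + y^2 + ... + y^(p-1)) + d3 */
    double exact = d1 * pow(y, periods) + d3;
    for (uint64_t n = 1; n < periods; n++)
        exact += 1024.0 * pow(y, n);

    printf("closed form: %u\n", accumulate_pelt_segments(periods, d1, d3));
    printf("brute force: %.1f\n", exact);
    return 0;
}

The two results agree up to the fixed-point rounding baked into the table and into LOAD_AVG_MAX.
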
@@ -2849,13 +2878,10 @@ static u32 __compute_runnable_contrib(u64 n)
2849 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] 2878 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2850 */ 2879 */
2851static __always_inline int 2880static __always_inline int
2852__update_load_avg(u64 now, int cpu, struct sched_avg *sa, 2881___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2853 unsigned long weight, int running, struct cfs_rq *cfs_rq) 2882 unsigned long weight, int running, struct cfs_rq *cfs_rq)
2854{ 2883{
2855 u64 delta, scaled_delta, periods; 2884 u64 delta;
2856 u32 contrib;
2857 unsigned int delta_w, scaled_delta_w, decayed = 0;
2858 unsigned long scale_freq, scale_cpu;
2859 2885
2860 delta = now - sa->last_update_time; 2886 delta = now - sa->last_update_time;
2861 /* 2887 /*
@@ -2874,83 +2900,52 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2874 delta >>= 10; 2900 delta >>= 10;
2875 if (!delta) 2901 if (!delta)
2876 return 0; 2902 return 0;
2877 sa->last_update_time = now;
2878
2879 scale_freq = arch_scale_freq_capacity(NULL, cpu);
2880 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2881
2882 /* delta_w is the amount already accumulated against our next period */
2883 delta_w = sa->period_contrib;
2884 if (delta + delta_w >= 1024) {
2885 decayed = 1;
2886 2903
2887 /* how much left for next period will start over, we don't know yet */ 2904 sa->last_update_time += delta << 10;
2888 sa->period_contrib = 0;
2889 2905
2890 /* 2906 /*
2891 * Now that we know we're crossing a period boundary, figure 2907 * Now we know we crossed measurement unit boundaries. The *_avg
2892 * out how much from delta we need to complete the current 2908 * accrues by two steps:
2893 * period and accrue it. 2909 *
2894 */ 2910 * Step 1: accumulate *_sum since last_update_time. If we haven't
2895 delta_w = 1024 - delta_w; 2911 * crossed period boundaries, finish.
2896 scaled_delta_w = cap_scale(delta_w, scale_freq); 2912 */
2897 if (weight) { 2913 if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
2898 sa->load_sum += weight * scaled_delta_w; 2914 return 0;
2899 if (cfs_rq) {
2900 cfs_rq->runnable_load_sum +=
2901 weight * scaled_delta_w;
2902 }
2903 }
2904 if (running)
2905 sa->util_sum += scaled_delta_w * scale_cpu;
2906
2907 delta -= delta_w;
2908
2909 /* Figure out how many additional periods this update spans */
2910 periods = delta / 1024;
2911 delta %= 1024;
2912 2915
2913 sa->load_sum = decay_load(sa->load_sum, periods + 1); 2916 /*
2914 if (cfs_rq) { 2917 * Step 2: update *_avg.
2915 cfs_rq->runnable_load_sum = 2918 */
2916 decay_load(cfs_rq->runnable_load_sum, periods + 1); 2919 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2917 } 2920 if (cfs_rq) {
2918 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); 2921 cfs_rq->runnable_load_avg =
2919 2922 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2920 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
2921 contrib = __compute_runnable_contrib(periods);
2922 contrib = cap_scale(contrib, scale_freq);
2923 if (weight) {
2924 sa->load_sum += weight * contrib;
2925 if (cfs_rq)
2926 cfs_rq->runnable_load_sum += weight * contrib;
2927 }
2928 if (running)
2929 sa->util_sum += contrib * scale_cpu;
2930 } 2923 }
2924 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2931 2925
2932 /* Remainder of delta accrued against u_0` */ 2926 return 1;
2933 scaled_delta = cap_scale(delta, scale_freq); 2927}
2934 if (weight) {
2935 sa->load_sum += weight * scaled_delta;
2936 if (cfs_rq)
2937 cfs_rq->runnable_load_sum += weight * scaled_delta;
2938 }
2939 if (running)
2940 sa->util_sum += scaled_delta * scale_cpu;
2941 2928
2942 sa->period_contrib += delta; 2929static int
2930__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
2931{
2932 return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
2933}
2943 2934
2944 if (decayed) { 2935static int
2945 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); 2936__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
2946 if (cfs_rq) { 2937{
2947 cfs_rq->runnable_load_avg = 2938 return ___update_load_avg(now, cpu, &se->avg,
2948 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); 2939 se->on_rq * scale_load_down(se->load.weight),
2949 } 2940 cfs_rq->curr == se, NULL);
2950 sa->util_avg = sa->util_sum / LOAD_AVG_MAX; 2941}
2951 }
2952 2942
2953 return decayed; 2943static int
2944__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
2945{
2946 return ___update_load_avg(now, cpu, &cfs_rq->avg,
2947 scale_load_down(cfs_rq->load.weight),
2948 cfs_rq->curr != NULL, cfs_rq);
2954} 2949}
2955 2950
2956/* 2951/*
@@ -3014,6 +3009,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3014void set_task_rq_fair(struct sched_entity *se, 3009void set_task_rq_fair(struct sched_entity *se,
3015 struct cfs_rq *prev, struct cfs_rq *next) 3010 struct cfs_rq *prev, struct cfs_rq *next)
3016{ 3011{
3012 u64 p_last_update_time;
3013 u64 n_last_update_time;
3014
3017 if (!sched_feat(ATTACH_AGE_LOAD)) 3015 if (!sched_feat(ATTACH_AGE_LOAD))
3018 return; 3016 return;
3019 3017
@@ -3024,11 +3022,11 @@ void set_task_rq_fair(struct sched_entity *se,
3024 * time. This will result in the wakee task is less decayed, but giving 3022 * time. This will result in the wakee task is less decayed, but giving
3025 * the wakee more load sounds not bad. 3023 * the wakee more load sounds not bad.
3026 */ 3024 */
3027 if (se->avg.last_update_time && prev) { 3025 if (!(se->avg.last_update_time && prev))
3028 u64 p_last_update_time; 3026 return;
3029 u64 n_last_update_time;
3030 3027
3031#ifndef CONFIG_64BIT 3028#ifndef CONFIG_64BIT
3029 {
3032 u64 p_last_update_time_copy; 3030 u64 p_last_update_time_copy;
3033 u64 n_last_update_time_copy; 3031 u64 n_last_update_time_copy;
3034 3032
@@ -3043,14 +3041,13 @@ void set_task_rq_fair(struct sched_entity *se,
3043 3041
3044 } while (p_last_update_time != p_last_update_time_copy || 3042 } while (p_last_update_time != p_last_update_time_copy ||
3045 n_last_update_time != n_last_update_time_copy); 3043 n_last_update_time != n_last_update_time_copy);
3044 }
3046#else 3045#else
3047 p_last_update_time = prev->avg.last_update_time; 3046 p_last_update_time = prev->avg.last_update_time;
3048 n_last_update_time = next->avg.last_update_time; 3047 n_last_update_time = next->avg.last_update_time;
3049#endif 3048#endif
3050 __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), 3049 __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
3051 &se->avg, 0, 0, NULL); 3050 se->avg.last_update_time = n_last_update_time;
3052 se->avg.last_update_time = n_last_update_time;
3053 }
3054} 3051}
3055 3052
3056/* Take into account change of utilization of a child task group */ 3053/* Take into account change of utilization of a child task group */
@@ -3173,6 +3170,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
3173 return 1; 3170 return 1;
3174} 3171}
3175 3172
3173/*
3174 * Check if we need to update the load and the utilization of a blocked
3175 * group_entity:
3176 */
3177static inline bool skip_blocked_update(struct sched_entity *se)
3178{
3179 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3180
3181 /*
 3182 * If the sched_entity still has non-zero load or utilization, we have to
3183 * decay it:
3184 */
3185 if (se->avg.load_avg || se->avg.util_avg)
3186 return false;
3187
3188 /*
3189 * If there is a pending propagation, we have to update the load and
3190 * the utilization of the sched_entity:
3191 */
3192 if (gcfs_rq->propagate_avg)
3193 return false;
3194
3195 /*
3196 * Otherwise, the load and the utilization of the sched_entity is
3197 * already zero and there is no pending propagation, so it will be a
3198 * waste of time to try to decay it:
3199 */
3200 return true;
3201}
3202
3176#else /* CONFIG_FAIR_GROUP_SCHED */ 3203#else /* CONFIG_FAIR_GROUP_SCHED */
3177 3204
3178static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} 3205static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
@@ -3265,8 +3292,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3265 set_tg_cfs_propagate(cfs_rq); 3292 set_tg_cfs_propagate(cfs_rq);
3266 } 3293 }
3267 3294
3268 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, 3295 decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
3269 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
3270 3296
3271#ifndef CONFIG_64BIT 3297#ifndef CONFIG_64BIT
3272 smp_wmb(); 3298 smp_wmb();
@@ -3298,11 +3324,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
3298 * Track task load average for carrying it to new CPU after migrated, and 3324 * Track task load average for carrying it to new CPU after migrated, and
3299 * track group sched_entity load average for task_h_load calc in migration 3325 * track group sched_entity load average for task_h_load calc in migration
3300 */ 3326 */
3301 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { 3327 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3302 __update_load_avg(now, cpu, &se->avg, 3328 __update_load_avg_se(now, cpu, cfs_rq, se);
3303 se->on_rq * scale_load_down(se->load.weight),
3304 cfs_rq->curr == se, NULL);
3305 }
3306 3329
3307 decayed = update_cfs_rq_load_avg(now, cfs_rq, true); 3330 decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
3308 decayed |= propagate_entity_load_avg(se); 3331 decayed |= propagate_entity_load_avg(se);
@@ -3407,7 +3430,7 @@ void sync_entity_load_avg(struct sched_entity *se)
3407 u64 last_update_time; 3430 u64 last_update_time;
3408 3431
3409 last_update_time = cfs_rq_last_update_time(cfs_rq); 3432 last_update_time = cfs_rq_last_update_time(cfs_rq);
3410 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); 3433 __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
3411} 3434}
3412 3435
3413/* 3436/*
@@ -4271,8 +4294,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4271 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, 4294 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4272 throttled_list) { 4295 throttled_list) {
4273 struct rq *rq = rq_of(cfs_rq); 4296 struct rq *rq = rq_of(cfs_rq);
4297 struct rq_flags rf;
4274 4298
4275 raw_spin_lock(&rq->lock); 4299 rq_lock(rq, &rf);
4276 if (!cfs_rq_throttled(cfs_rq)) 4300 if (!cfs_rq_throttled(cfs_rq))
4277 goto next; 4301 goto next;
4278 4302
@@ -4289,7 +4313,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4289 unthrottle_cfs_rq(cfs_rq); 4313 unthrottle_cfs_rq(cfs_rq);
4290 4314
4291next: 4315next:
4292 raw_spin_unlock(&rq->lock); 4316 rq_unlock(rq, &rf);
4293 4317
4294 if (!remaining) 4318 if (!remaining)
4295 break; 4319 break;
@@ -5097,15 +5121,16 @@ void cpu_load_update_nohz_stop(void)
5097 unsigned long curr_jiffies = READ_ONCE(jiffies); 5121 unsigned long curr_jiffies = READ_ONCE(jiffies);
5098 struct rq *this_rq = this_rq(); 5122 struct rq *this_rq = this_rq();
5099 unsigned long load; 5123 unsigned long load;
5124 struct rq_flags rf;
5100 5125
5101 if (curr_jiffies == this_rq->last_load_update_tick) 5126 if (curr_jiffies == this_rq->last_load_update_tick)
5102 return; 5127 return;
5103 5128
5104 load = weighted_cpuload(cpu_of(this_rq)); 5129 load = weighted_cpuload(cpu_of(this_rq));
5105 raw_spin_lock(&this_rq->lock); 5130 rq_lock(this_rq, &rf);
5106 update_rq_clock(this_rq); 5131 update_rq_clock(this_rq);
5107 cpu_load_update_nohz(this_rq, curr_jiffies, load); 5132 cpu_load_update_nohz(this_rq, curr_jiffies, load);
5108 raw_spin_unlock(&this_rq->lock); 5133 rq_unlock(this_rq, &rf);
5109} 5134}
5110#else /* !CONFIG_NO_HZ_COMMON */ 5135#else /* !CONFIG_NO_HZ_COMMON */
5111static inline void cpu_load_update_nohz(struct rq *this_rq, 5136static inline void cpu_load_update_nohz(struct rq *this_rq,
@@ -6769,7 +6794,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
6769 lockdep_assert_held(&env->src_rq->lock); 6794 lockdep_assert_held(&env->src_rq->lock);
6770 6795
6771 p->on_rq = TASK_ON_RQ_MIGRATING; 6796 p->on_rq = TASK_ON_RQ_MIGRATING;
6772 deactivate_task(env->src_rq, p, 0); 6797 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
6773 set_task_cpu(p, env->dst_cpu); 6798 set_task_cpu(p, env->dst_cpu);
6774} 6799}
6775 6800
@@ -6902,7 +6927,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
6902 lockdep_assert_held(&rq->lock); 6927 lockdep_assert_held(&rq->lock);
6903 6928
6904 BUG_ON(task_rq(p) != rq); 6929 BUG_ON(task_rq(p) != rq);
6905 activate_task(rq, p, 0); 6930 activate_task(rq, p, ENQUEUE_NOCLOCK);
6906 p->on_rq = TASK_ON_RQ_QUEUED; 6931 p->on_rq = TASK_ON_RQ_QUEUED;
6907 check_preempt_curr(rq, p, 0); 6932 check_preempt_curr(rq, p, 0);
6908} 6933}
@@ -6913,9 +6938,12 @@ static void attach_task(struct rq *rq, struct task_struct *p)
6913 */ 6938 */
6914static void attach_one_task(struct rq *rq, struct task_struct *p) 6939static void attach_one_task(struct rq *rq, struct task_struct *p)
6915{ 6940{
6916 raw_spin_lock(&rq->lock); 6941 struct rq_flags rf;
6942
6943 rq_lock(rq, &rf);
6944 update_rq_clock(rq);
6917 attach_task(rq, p); 6945 attach_task(rq, p);
6918 raw_spin_unlock(&rq->lock); 6946 rq_unlock(rq, &rf);
6919} 6947}
6920 6948
6921/* 6949/*
@@ -6926,8 +6954,10 @@ static void attach_tasks(struct lb_env *env)
6926{ 6954{
6927 struct list_head *tasks = &env->tasks; 6955 struct list_head *tasks = &env->tasks;
6928 struct task_struct *p; 6956 struct task_struct *p;
6957 struct rq_flags rf;
6929 6958
6930 raw_spin_lock(&env->dst_rq->lock); 6959 rq_lock(env->dst_rq, &rf);
6960 update_rq_clock(env->dst_rq);
6931 6961
6932 while (!list_empty(tasks)) { 6962 while (!list_empty(tasks)) {
6933 p = list_first_entry(tasks, struct task_struct, se.group_node); 6963 p = list_first_entry(tasks, struct task_struct, se.group_node);
@@ -6936,7 +6966,7 @@ static void attach_tasks(struct lb_env *env)
6936 attach_task(env->dst_rq, p); 6966 attach_task(env->dst_rq, p);
6937 } 6967 }
6938 6968
6939 raw_spin_unlock(&env->dst_rq->lock); 6969 rq_unlock(env->dst_rq, &rf);
6940} 6970}
6941 6971
6942#ifdef CONFIG_FAIR_GROUP_SCHED 6972#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6944,9 +6974,9 @@ static void update_blocked_averages(int cpu)
6944{ 6974{
6945 struct rq *rq = cpu_rq(cpu); 6975 struct rq *rq = cpu_rq(cpu);
6946 struct cfs_rq *cfs_rq; 6976 struct cfs_rq *cfs_rq;
6947 unsigned long flags; 6977 struct rq_flags rf;
6948 6978
6949 raw_spin_lock_irqsave(&rq->lock, flags); 6979 rq_lock_irqsave(rq, &rf);
6950 update_rq_clock(rq); 6980 update_rq_clock(rq);
6951 6981
6952 /* 6982 /*
@@ -6954,6 +6984,8 @@ static void update_blocked_averages(int cpu)
6954 * list_add_leaf_cfs_rq() for details. 6984 * list_add_leaf_cfs_rq() for details.
6955 */ 6985 */
6956 for_each_leaf_cfs_rq(rq, cfs_rq) { 6986 for_each_leaf_cfs_rq(rq, cfs_rq) {
6987 struct sched_entity *se;
6988
6957 /* throttled entities do not contribute to load */ 6989 /* throttled entities do not contribute to load */
6958 if (throttled_hierarchy(cfs_rq)) 6990 if (throttled_hierarchy(cfs_rq))
6959 continue; 6991 continue;
@@ -6961,11 +6993,12 @@ static void update_blocked_averages(int cpu)
6961 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) 6993 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
6962 update_tg_load_avg(cfs_rq, 0); 6994 update_tg_load_avg(cfs_rq, 0);
6963 6995
6964 /* Propagate pending load changes to the parent */ 6996 /* Propagate pending load changes to the parent, if any: */
6965 if (cfs_rq->tg->se[cpu]) 6997 se = cfs_rq->tg->se[cpu];
6966 update_load_avg(cfs_rq->tg->se[cpu], 0); 6998 if (se && !skip_blocked_update(se))
6999 update_load_avg(se, 0);
6967 } 7000 }
6968 raw_spin_unlock_irqrestore(&rq->lock, flags); 7001 rq_unlock_irqrestore(rq, &rf);
6969} 7002}
6970 7003
6971/* 7004/*
@@ -7019,12 +7052,12 @@ static inline void update_blocked_averages(int cpu)
7019{ 7052{
7020 struct rq *rq = cpu_rq(cpu); 7053 struct rq *rq = cpu_rq(cpu);
7021 struct cfs_rq *cfs_rq = &rq->cfs; 7054 struct cfs_rq *cfs_rq = &rq->cfs;
7022 unsigned long flags; 7055 struct rq_flags rf;
7023 7056
7024 raw_spin_lock_irqsave(&rq->lock, flags); 7057 rq_lock_irqsave(rq, &rf);
7025 update_rq_clock(rq); 7058 update_rq_clock(rq);
7026 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); 7059 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
7027 raw_spin_unlock_irqrestore(&rq->lock, flags); 7060 rq_unlock_irqrestore(rq, &rf);
7028} 7061}
7029 7062
7030static unsigned long task_h_load(struct task_struct *p) 7063static unsigned long task_h_load(struct task_struct *p)
@@ -7525,6 +7558,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
7525{ 7558{
7526 struct sched_domain *child = env->sd->child; 7559 struct sched_domain *child = env->sd->child;
7527 struct sched_group *sg = env->sd->groups; 7560 struct sched_group *sg = env->sd->groups;
7561 struct sg_lb_stats *local = &sds->local_stat;
7528 struct sg_lb_stats tmp_sgs; 7562 struct sg_lb_stats tmp_sgs;
7529 int load_idx, prefer_sibling = 0; 7563 int load_idx, prefer_sibling = 0;
7530 bool overload = false; 7564 bool overload = false;
@@ -7541,7 +7575,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
7541 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); 7575 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
7542 if (local_group) { 7576 if (local_group) {
7543 sds->local = sg; 7577 sds->local = sg;
7544 sgs = &sds->local_stat; 7578 sgs = local;
7545 7579
7546 if (env->idle != CPU_NEWLY_IDLE || 7580 if (env->idle != CPU_NEWLY_IDLE ||
7547 time_after_eq(jiffies, sg->sgc->next_update)) 7581 time_after_eq(jiffies, sg->sgc->next_update))
@@ -7565,8 +7599,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
7565 * the tasks on the system). 7599 * the tasks on the system).
7566 */ 7600 */
7567 if (prefer_sibling && sds->local && 7601 if (prefer_sibling && sds->local &&
7568 group_has_capacity(env, &sds->local_stat) && 7602 group_has_capacity(env, local) &&
7569 (sgs->sum_nr_running > 1)) { 7603 (sgs->sum_nr_running > local->sum_nr_running + 1)) {
7570 sgs->group_no_capacity = 1; 7604 sgs->group_no_capacity = 1;
7571 sgs->group_type = group_classify(sg, sgs); 7605 sgs->group_type = group_classify(sg, sgs);
7572 } 7606 }
@@ -7597,7 +7631,7 @@ next_group:
7597 7631
7598/** 7632/**
7599 * check_asym_packing - Check to see if the group is packed into the 7633 * check_asym_packing - Check to see if the group is packed into the
7600 * sched doman. 7634 * sched domain.
7601 * 7635 *
7602 * This is primarily intended to used at the sibling level. Some 7636 * This is primarily intended to used at the sibling level. Some
7603 * cores like POWER7 prefer to use lower numbered SMT threads. In the 7637 * cores like POWER7 prefer to use lower numbered SMT threads. In the
@@ -8042,7 +8076,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
8042 struct sched_domain *sd_parent = sd->parent; 8076 struct sched_domain *sd_parent = sd->parent;
8043 struct sched_group *group; 8077 struct sched_group *group;
8044 struct rq *busiest; 8078 struct rq *busiest;
8045 unsigned long flags; 8079 struct rq_flags rf;
8046 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); 8080 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
8047 8081
8048 struct lb_env env = { 8082 struct lb_env env = {
@@ -8105,7 +8139,7 @@ redo:
8105 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 8139 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
8106 8140
8107more_balance: 8141more_balance:
8108 raw_spin_lock_irqsave(&busiest->lock, flags); 8142 rq_lock_irqsave(busiest, &rf);
8109 update_rq_clock(busiest); 8143 update_rq_clock(busiest);
8110 8144
8111 /* 8145 /*
@@ -8122,14 +8156,14 @@ more_balance:
8122 * See task_rq_lock() family for the details. 8156 * See task_rq_lock() family for the details.
8123 */ 8157 */
8124 8158
8125 raw_spin_unlock(&busiest->lock); 8159 rq_unlock(busiest, &rf);
8126 8160
8127 if (cur_ld_moved) { 8161 if (cur_ld_moved) {
8128 attach_tasks(&env); 8162 attach_tasks(&env);
8129 ld_moved += cur_ld_moved; 8163 ld_moved += cur_ld_moved;
8130 } 8164 }
8131 8165
8132 local_irq_restore(flags); 8166 local_irq_restore(rf.flags);
8133 8167
8134 if (env.flags & LBF_NEED_BREAK) { 8168 if (env.flags & LBF_NEED_BREAK) {
8135 env.flags &= ~LBF_NEED_BREAK; 8169 env.flags &= ~LBF_NEED_BREAK;
@@ -8207,6 +8241,8 @@ more_balance:
8207 sd->nr_balance_failed++; 8241 sd->nr_balance_failed++;
8208 8242
8209 if (need_active_balance(&env)) { 8243 if (need_active_balance(&env)) {
8244 unsigned long flags;
8245
8210 raw_spin_lock_irqsave(&busiest->lock, flags); 8246 raw_spin_lock_irqsave(&busiest->lock, flags);
8211 8247
8212 /* don't kick the active_load_balance_cpu_stop, 8248 /* don't kick the active_load_balance_cpu_stop,
@@ -8444,8 +8480,9 @@ static int active_load_balance_cpu_stop(void *data)
8444 struct rq *target_rq = cpu_rq(target_cpu); 8480 struct rq *target_rq = cpu_rq(target_cpu);
8445 struct sched_domain *sd; 8481 struct sched_domain *sd;
8446 struct task_struct *p = NULL; 8482 struct task_struct *p = NULL;
8483 struct rq_flags rf;
8447 8484
8448 raw_spin_lock_irq(&busiest_rq->lock); 8485 rq_lock_irq(busiest_rq, &rf);
8449 8486
8450 /* make sure the requested cpu hasn't gone down in the meantime */ 8487 /* make sure the requested cpu hasn't gone down in the meantime */
8451 if (unlikely(busiest_cpu != smp_processor_id() || 8488 if (unlikely(busiest_cpu != smp_processor_id() ||
@@ -8496,7 +8533,7 @@ static int active_load_balance_cpu_stop(void *data)
8496 rcu_read_unlock(); 8533 rcu_read_unlock();
8497out_unlock: 8534out_unlock:
8498 busiest_rq->active_balance = 0; 8535 busiest_rq->active_balance = 0;
8499 raw_spin_unlock(&busiest_rq->lock); 8536 rq_unlock(busiest_rq, &rf);
8500 8537
8501 if (p) 8538 if (p)
8502 attach_one_task(target_rq, p); 8539 attach_one_task(target_rq, p);
@@ -8794,10 +8831,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
8794 * do the balance. 8831 * do the balance.
8795 */ 8832 */
8796 if (time_after_eq(jiffies, rq->next_balance)) { 8833 if (time_after_eq(jiffies, rq->next_balance)) {
8797 raw_spin_lock_irq(&rq->lock); 8834 struct rq_flags rf;
8835
8836 rq_lock_irq(rq, &rf);
8798 update_rq_clock(rq); 8837 update_rq_clock(rq);
8799 cpu_load_update_idle(rq); 8838 cpu_load_update_idle(rq);
8800 raw_spin_unlock_irq(&rq->lock); 8839 rq_unlock_irq(rq, &rf);
8840
8801 rebalance_domains(rq, CPU_IDLE); 8841 rebalance_domains(rq, CPU_IDLE);
8802 } 8842 }
8803 8843
@@ -8988,8 +9028,9 @@ static void task_fork_fair(struct task_struct *p)
8988 struct cfs_rq *cfs_rq; 9028 struct cfs_rq *cfs_rq;
8989 struct sched_entity *se = &p->se, *curr; 9029 struct sched_entity *se = &p->se, *curr;
8990 struct rq *rq = this_rq(); 9030 struct rq *rq = this_rq();
9031 struct rq_flags rf;
8991 9032
8992 raw_spin_lock(&rq->lock); 9033 rq_lock(rq, &rf);
8993 update_rq_clock(rq); 9034 update_rq_clock(rq);
8994 9035
8995 cfs_rq = task_cfs_rq(current); 9036 cfs_rq = task_cfs_rq(current);
@@ -9010,7 +9051,7 @@ static void task_fork_fair(struct task_struct *p)
9010 } 9051 }
9011 9052
9012 se->vruntime -= cfs_rq->min_vruntime; 9053 se->vruntime -= cfs_rq->min_vruntime;
9013 raw_spin_unlock(&rq->lock); 9054 rq_unlock(rq, &rf);
9014} 9055}
9015 9056
9016/* 9057/*
@@ -9372,7 +9413,6 @@ static DEFINE_MUTEX(shares_mutex);
9372int sched_group_set_shares(struct task_group *tg, unsigned long shares) 9413int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9373{ 9414{
9374 int i; 9415 int i;
9375 unsigned long flags;
9376 9416
9377 /* 9417 /*
9378 * We can't change the weight of the root cgroup. 9418 * We can't change the weight of the root cgroup.
@@ -9389,19 +9429,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9389 tg->shares = shares; 9429 tg->shares = shares;
9390 for_each_possible_cpu(i) { 9430 for_each_possible_cpu(i) {
9391 struct rq *rq = cpu_rq(i); 9431 struct rq *rq = cpu_rq(i);
9392 struct sched_entity *se; 9432 struct sched_entity *se = tg->se[i];
9433 struct rq_flags rf;
9393 9434
9394 se = tg->se[i];
9395 /* Propagate contribution to hierarchy */ 9435 /* Propagate contribution to hierarchy */
9396 raw_spin_lock_irqsave(&rq->lock, flags); 9436 rq_lock_irqsave(rq, &rf);
9397
9398 /* Possible calls to update_curr() need rq clock */
9399 update_rq_clock(rq); 9437 update_rq_clock(rq);
9400 for_each_sched_entity(se) { 9438 for_each_sched_entity(se) {
9401 update_load_avg(se, UPDATE_TG); 9439 update_load_avg(se, UPDATE_TG);
9402 update_cfs_shares(se); 9440 update_cfs_shares(se);
9403 } 9441 }
9404 raw_spin_unlock_irqrestore(&rq->lock, flags); 9442 rq_unlock_irqrestore(rq, &rf);
9405 } 9443 }
9406 9444
9407done: 9445done:
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1b3c8189b286..11192e0cb122 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true)
56 */ 56 */
57SCHED_FEAT(SIS_AVG_CPU, false) 57SCHED_FEAT(SIS_AVG_CPU, false)
58 58
59/*
60 * Issue a WARN when we do multiple update_rq_clock() calls
61 * in a single rq->lock section. Default disabled because the
62 * annotations are not complete.
63 */
64SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
65
59#ifdef HAVE_RT_PUSH_IPI 66#ifdef HAVE_RT_PUSH_IPI
60/* 67/*
61 * In order to avoid a thundering herd attack of CPUs that are 68 * In order to avoid a thundering herd attack of CPUs that are
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index ac6d5176463d..ef63adce0c9c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -10,6 +10,7 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/stackprotector.h> 11#include <linux/stackprotector.h>
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/livepatch.h>
13 14
14#include <asm/tlb.h> 15#include <asm/tlb.h>
15 16
@@ -264,7 +265,10 @@ static void do_idle(void)
264 smp_mb__after_atomic(); 265 smp_mb__after_atomic();
265 266
266 sched_ttwu_pending(); 267 sched_ttwu_pending();
267 schedule_preempt_disabled(); 268 schedule_idle();
269
270 if (unlikely(klp_patch_pending(current)))
271 klp_update_patch_state(current);
268} 272}
269 273
270bool cpu_in_idle(unsigned long pc) 274bool cpu_in_idle(unsigned long pc)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9f3e40226dec..979b7341008a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq)
1927#define RT_PUSH_IPI_EXECUTING 1 1927#define RT_PUSH_IPI_EXECUTING 1
1928#define RT_PUSH_IPI_RESTART 2 1928#define RT_PUSH_IPI_RESTART 2
1929 1929
1930/*
1931 * When a high priority task schedules out from a CPU and a lower priority
1932 * task is scheduled in, a check is made to see if there's any RT tasks
1933 * on other CPUs that are waiting to run because a higher priority RT task
1934 * is currently running on its CPU. In this case, the CPU with multiple RT
1935 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
1936 * up that may be able to run one of its non-running queued RT tasks.
1937 *
1938 * On large CPU boxes, there's the case that several CPUs could schedule
1939 * a lower priority task at the same time, in which case it will look for
1940 * any overloaded CPUs that it could pull a task from. To do this, the runqueue
1941 * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
1942 * for a single overloaded CPU's runqueue lock can produce a large latency.
1943 * (This has actually been observed on large boxes running cyclictest).
1944 * Instead of taking the runqueue lock of the overloaded CPU, each of the
1945 * CPUs that scheduled a lower priority task simply sends an IPI to the
 1946 * overloaded CPU. An IPI is much cheaper than taking a runqueue lock with
1947 * lots of contention. The overloaded CPU will look to push its non-running
1948 * RT task off, and if it does, it can then ignore the other IPIs coming
1949 * in, and just pass those IPIs off to any other overloaded CPU.
1950 *
1951 * When a CPU schedules a lower priority task, it only sends an IPI to
1952 * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
1953 * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
1954 * RT overloaded tasks, would cause 100 IPIs to go out at once.
1955 *
1956 * The overloaded RT CPU, when receiving an IPI, will try to push off its
1957 * overloaded RT tasks and then send an IPI to the next CPU that has
1958 * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
1959 * have completed. Just because a CPU may have pushed off its own overloaded
1960 * RT task does not mean it should stop sending the IPI around to other
1961 * overloaded CPUs. There may be another RT task waiting to run on one of
1962 * those CPUs that are of higher priority than the one that was just
1963 * pushed.
1964 *
1965 * An optimization that could possibly be made is to make a CPU array similar
1966 * to the cpupri array mask of all running RT tasks, but for the overloaded
1967 * case, then the IPI could be sent to only the CPU with the highest priority
1968 * RT task waiting, and that CPU could send off further IPIs to the CPU with
1969 * the next highest waiting task. Since the overloaded case is much less likely
1970 * to happen, the complexity of this implementation may not be worth it.
1971 * Instead, just send an IPI around to all overloaded CPUs.
1972 *
1973 * The rq->rt.push_flags holds the status of the IPI that is going around.
1974 * A run queue can only send out a single IPI at a time. The possible flags
1975 * for rq->rt.push_flags are:
1976 *
1977 * (None or zero): No IPI is going around for the current rq
1978 * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around
1979 * RT_PUSH_IPI_RESTART: The priority of the running task for the rq
1980 * has changed, and the IPI should restart
1981 * circulating the overloaded CPUs again.
1982 *
1983 * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
1984 * before sending to the next CPU.
1985 *
1986 * Instead of having all CPUs that schedule a lower priority task send
1987 * an IPI to the same "first" CPU in the RT overload mask, they send it
1988 * to the next overloaded CPU after their own CPU. This helps distribute
1989 * the work when there's more than one overloaded CPU and multiple CPUs
1990 * scheduling in lower priority tasks.
1991 *
1992 * When a rq schedules a lower priority task than what was currently
1993 * running, the next CPU with overloaded RT tasks is examined first.
1994 * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
1995 * priority task, it will send an IPI first to CPU 5, then CPU 5 will
1996 * send to CPU 1 if it is still overloaded. CPU 1 will clear the
1997 * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
1998 *
1999 * The first CPU to notice IPI_RESTART is set will clear that flag and then
2000 * send an IPI to the next overloaded CPU after the rq->cpu and not the next
2001 * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
2002 * schedules a lower priority task, and the IPI_RESTART gets set while the
2003 * handling is being done on CPU 5, it will clear the flag and send it back to
2004 * CPU 4 instead of CPU 1.
2005 *
2006 * Note, the above logic can be disabled by turning off the sched_feature
2007 * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
2008 * taken by the CPU requesting a pull and the waiting RT task will be pulled
2009 * by that CPU. This may be fine for machines with few CPUs.
2010 */
1930static void tell_cpu_to_push(struct rq *rq) 2011static void tell_cpu_to_push(struct rq *rq)
1931{ 2012{
1932 int cpu; 2013 int cpu;
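The comment block above describes the IPI handoff as a walk over the set of RT-overloaded CPUs, starting just after the CPU that scheduled the lower-priority task and wrapping around. A minimal, self-contained C sketch of that wraparound search (not kernel code; NR_CPUS, rto_mask and next_overloaded_cpu() here are illustrative stand-ins for the real cpumask machinery) reproduces the CPU 1/5/3 example from the comment:

#include <stdio.h>

#define NR_CPUS 8

/* Hypothetical stand-in for the rto_mask: bit n set => CPU n is RT-overloaded. */
static unsigned int rto_mask = (1u << 1) | (1u << 5);

/*
 * Find the next overloaded CPU strictly after 'cpu', wrapping around and
 * excluding 'cpu' itself.  Returns -1 if no other overloaded CPU exists.
 */
static int next_overloaded_cpu(unsigned int mask, int cpu)
{
	for (int i = 1; i <= NR_CPUS; i++) {
		int candidate = (cpu + i) % NR_CPUS;

		if (candidate == cpu)
			break;
		if (mask & (1u << candidate))
			return candidate;
	}
	return -1;
}

int main(void)
{
	/* CPU 3 schedules a lower-priority task: the IPI goes to CPU 5 first... */
	int first = next_overloaded_cpu(rto_mask, 3);
	/* ...and CPU 5, once done pushing, passes it on to CPU 1. */
	int second = next_overloaded_cpu(rto_mask, first);

	printf("IPI chain starting from CPU 3: %d -> %d\n", first, second);
	return 0;
}

Starting each requester just after its own CPU, rather than always at the first overloaded CPU, is what spreads the IPI work across requesters as the comment describes.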
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
new file mode 100644
index 000000000000..cd200d16529e
--- /dev/null
+++ b/kernel/sched/sched-pelt.h
@@ -0,0 +1,13 @@
1/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
2
3static const u32 runnable_avg_yN_inv[] = {
4 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
5 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
6 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
7 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
8 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
9 0x85aac367, 0x82cd8698,
10};
11
12#define LOAD_AVG_PERIOD 32
13#define LOAD_AVG_MAX 47742
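The table above encodes y^n scaled to 32 bits, with y chosen so that y^32 = 0.5 (one LOAD_AVG_PERIOD halves a contribution), which lets the PELT code apply decay with an integer multiply and shift. A rough standalone sketch of how such constants can be re-derived (approximate only; the authoritative generator is the Documentation/scheduler/sched-pelt program named in the header and its rounding differs slightly; compile with -lm):

#include <math.h>
#include <stdio.h>

/*
 * Illustrative re-derivation of the constants above.  runnable_avg_yN_inv[n]
 * is roughly 2^32 * y^n, and LOAD_AVG_MAX is roughly the geometric series
 * 1024 * (1 + y + y^2 + ...), summed until a term rounds to zero.
 */
int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* y^32 == 0.5 */
	double acc = 0.0;

	for (int n = 0; n < 32; n++)
		printf("0x%08x%s", (unsigned)(pow(y, n) * 0xffffffffu),
		       (n % 6 == 5) ? ",\n" : ", ");
	printf("\n");

	for (int n = 0; pow(y, n) * 1024.0 >= 0.5; n++)
		acc += 1024.0 * pow(y, n);
	printf("LOAD_AVG_MAX ~= %.0f\n", acc);

	return 0;
}

The printed inverse multipliers should land very close to the table entries; the in-tree generator's fixed-point arithmetic accounts for the small difference in the exact LOAD_AVG_MAX value.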
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf92214ad8..6dda2aab731e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40];
1331#define DEQUEUE_SLEEP 0x01 1331#define DEQUEUE_SLEEP 0x01
1332#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ 1332#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
1333#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ 1333#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
1334#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */
1334 1335
1335#define ENQUEUE_WAKEUP 0x01 1336#define ENQUEUE_WAKEUP 0x01
1336#define ENQUEUE_RESTORE 0x02 1337#define ENQUEUE_RESTORE 0x02
1337#define ENQUEUE_MOVE 0x04 1338#define ENQUEUE_MOVE 0x04
1339#define ENQUEUE_NOCLOCK 0x08
1338 1340
1339#define ENQUEUE_HEAD 0x08 1341#define ENQUEUE_HEAD 0x10
1340#define ENQUEUE_REPLENISH 0x10 1342#define ENQUEUE_REPLENISH 0x20
1341#ifdef CONFIG_SMP 1343#ifdef CONFIG_SMP
1342#define ENQUEUE_MIGRATED 0x20 1344#define ENQUEUE_MIGRATED 0x40
1343#else 1345#else
1344#define ENQUEUE_MIGRATED 0x00 1346#define ENQUEUE_MIGRATED 0x00
1345#endif 1347#endif
@@ -1465,6 +1467,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1465} 1467}
1466#endif 1468#endif
1467 1469
1470extern void schedule_idle(void);
1471
1468extern void sysrq_sched_debug_show(void); 1472extern void sysrq_sched_debug_show(void);
1469extern void sched_init_granularity(void); 1473extern void sched_init_granularity(void);
1470extern void update_max_interval(void); 1474extern void update_max_interval(void);
@@ -1624,6 +1628,7 @@ static inline void sched_avg_update(struct rq *rq) { }
1624 1628
1625struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) 1629struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1626 __acquires(rq->lock); 1630 __acquires(rq->lock);
1631
1627struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) 1632struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1628 __acquires(p->pi_lock) 1633 __acquires(p->pi_lock)
1629 __acquires(rq->lock); 1634 __acquires(rq->lock);
@@ -1645,6 +1650,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1645 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); 1650 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
1646} 1651}
1647 1652
1653static inline void
1654rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
1655 __acquires(rq->lock)
1656{
1657 raw_spin_lock_irqsave(&rq->lock, rf->flags);
1658 rq_pin_lock(rq, rf);
1659}
1660
1661static inline void
1662rq_lock_irq(struct rq *rq, struct rq_flags *rf)
1663 __acquires(rq->lock)
1664{
1665 raw_spin_lock_irq(&rq->lock);
1666 rq_pin_lock(rq, rf);
1667}
1668
1669static inline void
1670rq_lock(struct rq *rq, struct rq_flags *rf)
1671 __acquires(rq->lock)
1672{
1673 raw_spin_lock(&rq->lock);
1674 rq_pin_lock(rq, rf);
1675}
1676
1677static inline void
1678rq_relock(struct rq *rq, struct rq_flags *rf)
1679 __acquires(rq->lock)
1680{
1681 raw_spin_lock(&rq->lock);
1682 rq_repin_lock(rq, rf);
1683}
1684
1685static inline void
1686rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
1687 __releases(rq->lock)
1688{
1689 rq_unpin_lock(rq, rf);
1690 raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
1691}
1692
1693static inline void
1694rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
1695 __releases(rq->lock)
1696{
1697 rq_unpin_lock(rq, rf);
1698 raw_spin_unlock_irq(&rq->lock);
1699}
1700
1701static inline void
1702rq_unlock(struct rq *rq, struct rq_flags *rf)
1703 __releases(rq->lock)
1704{
1705 rq_unpin_lock(rq, rf);
1706 raw_spin_unlock(&rq->lock);
1707}
1708
1648#ifdef CONFIG_SMP 1709#ifdef CONFIG_SMP
1649#ifdef CONFIG_PREEMPT 1710#ifdef CONFIG_PREEMPT
1650 1711
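The wrappers above exist so that taking rq->lock and the rq_pin_lock() bookkeeping always happen together, and unpinning always precedes the unlock. As a toy, standalone illustration of that pairing discipline (a pthread mutex plus a flag; none of this is kernel code):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

/*
 * Toy model of the invariant the rq_lock()/rq_unlock() helpers encode:
 * lock and pin are taken together, and unpin always happens before unlock.
 */
struct toy_rq {
	pthread_mutex_t lock;
	int pinned;		/* stands in for the rq_pin_lock() bookkeeping */
};

static void toy_rq_lock(struct toy_rq *rq)
{
	pthread_mutex_lock(&rq->lock);
	assert(!rq->pinned);
	rq->pinned = 1;		/* rq_pin_lock() */
}

static void toy_rq_unlock(struct toy_rq *rq)
{
	assert(rq->pinned);
	rq->pinned = 0;		/* rq_unpin_lock() */
	pthread_mutex_unlock(&rq->lock);
}

int main(void)
{
	struct toy_rq rq = { .lock = PTHREAD_MUTEX_INITIALIZER, .pinned = 0 };

	toy_rq_lock(&rq);
	/* ... work that relies on the pinned runqueue lock ... */
	toy_rq_unlock(&rq);

	printf("lock/pin pairing held\n");
	return 0;
}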
@@ -1869,6 +1930,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
1869 1930
1870#ifdef CONFIG_IRQ_TIME_ACCOUNTING 1931#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1871struct irqtime { 1932struct irqtime {
1933 u64 total;
1872 u64 tick_delta; 1934 u64 tick_delta;
1873 u64 irq_start_time; 1935 u64 irq_start_time;
1874 struct u64_stats_sync sync; 1936 struct u64_stats_sync sync;
@@ -1876,16 +1938,20 @@ struct irqtime {
1876 1938
1877DECLARE_PER_CPU(struct irqtime, cpu_irqtime); 1939DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
1878 1940
1941/*
1942 * Returns the irqtime minus the softirq time computed by ksoftirqd.
1943 * Otherwise ksoftirqd's own runtime would be subtracted from its
1944 * sum_exec_runtime, which would then never move forward.
1945 */
1879static inline u64 irq_time_read(int cpu) 1946static inline u64 irq_time_read(int cpu)
1880{ 1947{
1881 struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); 1948 struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
1882 u64 *cpustat = kcpustat_cpu(cpu).cpustat;
1883 unsigned int seq; 1949 unsigned int seq;
1884 u64 total; 1950 u64 total;
1885 1951
1886 do { 1952 do {
1887 seq = __u64_stats_fetch_begin(&irqtime->sync); 1953 seq = __u64_stats_fetch_begin(&irqtime->sync);
1888 total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ]; 1954 total = irqtime->total;
1889 } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); 1955 } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
1890 1956
1891 return total; 1957 return total;
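irq_time_read() above follows the usual seqcount pattern: re-read the 64-bit total until the sequence counter is even and unchanged across the read. A single-threaded toy sketch of just that retry control flow (it does not reproduce the memory-ordering guarantees of the kernel's u64_stats/seqcount primitives, which real concurrent use requires):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Toy counter protected by a sequence number: odd means a write is in flight. */
struct irqtime_demo {
	atomic_uint seq;
	uint64_t total;
};

static void writer_add(struct irqtime_demo *it, uint64_t delta)
{
	atomic_fetch_add_explicit(&it->seq, 1, memory_order_release);	/* odd */
	it->total += delta;
	atomic_fetch_add_explicit(&it->seq, 1, memory_order_release);	/* even */
}

static uint64_t reader_read(const struct irqtime_demo *it)
{
	unsigned int start;
	uint64_t val;

	do {
		start = atomic_load_explicit(&it->seq, memory_order_acquire);
		val = it->total;
	} while ((start & 1) ||
		 start != atomic_load_explicit(&it->seq, memory_order_acquire));

	return val;
}

int main(void)
{
	struct irqtime_demo it = { .seq = 0, .total = 0 };

	writer_add(&it, 1000);
	printf("irq time: %llu ns\n", (unsigned long long)reader_read(&it));
	return 0;
}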
diff --git a/kernel/signal.c b/kernel/signal.c
index 7e59ebc2c25e..ca92bcfeb322 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1237,7 +1237,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1237 } 1237 }
1238 /* 1238 /*
1239 * This sighand can be already freed and even reused, but 1239 * This sighand can be already freed and even reused, but
1240 * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which 1240 * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
1241 * initializes ->siglock: this slab can't go away, it has 1241 * initializes ->siglock: this slab can't go away, it has
1242 * the same object type, ->siglock can't be reinitialized. 1242 * the same object type, ->siglock can't be reinitialized.
1243 * 1243 *
@@ -1318,7 +1318,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1318 } 1318 }
1319} 1319}
1320 1320
1321int kill_proc_info(int sig, struct siginfo *info, pid_t pid) 1321static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1322{ 1322{
1323 int error; 1323 int error;
1324 rcu_read_lock(); 1324 rcu_read_lock();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 744fa611cae0..4e09821f9d9e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -309,7 +309,7 @@ restart:
309 account_irq_exit_time(current); 309 account_irq_exit_time(current);
310 __local_bh_enable(SOFTIRQ_OFFSET); 310 __local_bh_enable(SOFTIRQ_OFFSET);
311 WARN_ON_ONCE(in_interrupt()); 311 WARN_ON_ONCE(in_interrupt());
312 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 312 current_restore_flags(old_flags, PF_MEMALLOC);
313} 313}
314 314
315asmlinkage __visible void do_softirq(void) 315asmlinkage __visible void do_softirq(void)
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9c15a9124e83..f8edee9c792d 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -54,8 +54,8 @@ int snprint_stack_trace(char *buf, size_t size,
54EXPORT_SYMBOL_GPL(snprint_stack_trace); 54EXPORT_SYMBOL_GPL(snprint_stack_trace);
55 55
56/* 56/*
57 * Architectures that do not implement save_stack_trace_tsk or 57 * Architectures that do not implement save_stack_trace_*()
58 * save_stack_trace_regs get this weak alias and a once-per-bootup warning 58 * get these weak aliases and once-per-bootup warnings
59 * (whenever this facility is utilized - for example by procfs): 59 * (whenever this facility is utilized - for example by procfs):
60 */ 60 */
61__weak void 61__weak void
@@ -69,3 +69,11 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
69{ 69{
70 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); 70 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
71} 71}
72
73__weak int
74save_stack_trace_tsk_reliable(struct task_struct *tsk,
75 struct stack_trace *trace)
76{
77 WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n");
78 return -ENOSYS;
79}
diff --git a/kernel/sys.c b/kernel/sys.c
index 7ff6d1b10cec..8a94b4eabcaa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1396,8 +1396,7 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
1396 !capable(CAP_SYS_RESOURCE)) 1396 !capable(CAP_SYS_RESOURCE))
1397 retval = -EPERM; 1397 retval = -EPERM;
1398 if (!retval) 1398 if (!retval)
1399 retval = security_task_setrlimit(tsk->group_leader, 1399 retval = security_task_setrlimit(tsk, resource, new_rlim);
1400 resource, new_rlim);
1401 if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { 1400 if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
1402 /* 1401 /*
1403 * The caller is asking for an immediate RLIMIT_CPU 1402 * The caller is asking for an immediate RLIMIT_CPU
@@ -1432,25 +1431,26 @@ out:
1432} 1431}
1433 1432
1434/* rcu lock must be held */ 1433/* rcu lock must be held */
1435static int check_prlimit_permission(struct task_struct *task) 1434static int check_prlimit_permission(struct task_struct *task,
1435 unsigned int flags)
1436{ 1436{
1437 const struct cred *cred = current_cred(), *tcred; 1437 const struct cred *cred = current_cred(), *tcred;
1438 bool id_match;
1438 1439
1439 if (current == task) 1440 if (current == task)
1440 return 0; 1441 return 0;
1441 1442
1442 tcred = __task_cred(task); 1443 tcred = __task_cred(task);
1443 if (uid_eq(cred->uid, tcred->euid) && 1444 id_match = (uid_eq(cred->uid, tcred->euid) &&
1444 uid_eq(cred->uid, tcred->suid) && 1445 uid_eq(cred->uid, tcred->suid) &&
1445 uid_eq(cred->uid, tcred->uid) && 1446 uid_eq(cred->uid, tcred->uid) &&
1446 gid_eq(cred->gid, tcred->egid) && 1447 gid_eq(cred->gid, tcred->egid) &&
1447 gid_eq(cred->gid, tcred->sgid) && 1448 gid_eq(cred->gid, tcred->sgid) &&
1448 gid_eq(cred->gid, tcred->gid)) 1449 gid_eq(cred->gid, tcred->gid));
1449 return 0; 1450 if (!id_match && !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
1450 if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) 1451 return -EPERM;
1451 return 0;
1452 1452
1453 return -EPERM; 1453 return security_task_prlimit(cred, tcred, flags);
1454} 1454}
1455 1455
1456SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1456SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
@@ -1460,12 +1460,17 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
1460 struct rlimit64 old64, new64; 1460 struct rlimit64 old64, new64;
1461 struct rlimit old, new; 1461 struct rlimit old, new;
1462 struct task_struct *tsk; 1462 struct task_struct *tsk;
1463 unsigned int checkflags = 0;
1463 int ret; 1464 int ret;
1464 1465
1466 if (old_rlim)
1467 checkflags |= LSM_PRLIMIT_READ;
1468
1465 if (new_rlim) { 1469 if (new_rlim) {
1466 if (copy_from_user(&new64, new_rlim, sizeof(new64))) 1470 if (copy_from_user(&new64, new_rlim, sizeof(new64)))
1467 return -EFAULT; 1471 return -EFAULT;
1468 rlim64_to_rlim(&new64, &new); 1472 rlim64_to_rlim(&new64, &new);
1473 checkflags |= LSM_PRLIMIT_WRITE;
1469 } 1474 }
1470 1475
1471 rcu_read_lock(); 1476 rcu_read_lock();
@@ -1474,7 +1479,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
1474 rcu_read_unlock(); 1479 rcu_read_unlock();
1475 return -ESRCH; 1480 return -ESRCH;
1476 } 1481 }
1477 ret = check_prlimit_permission(tsk); 1482 ret = check_prlimit_permission(tsk, checkflags);
1478 if (ret) { 1483 if (ret) {
1479 rcu_read_unlock(); 1484 rcu_read_unlock();
1480 return ret; 1485 return ret;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8c8714fcb53c..4dfba1a76cc3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1176,6 +1176,8 @@ static struct ctl_table kern_table[] = {
1176 .maxlen = sizeof(unsigned int), 1176 .maxlen = sizeof(unsigned int),
1177 .mode = 0644, 1177 .mode = 0644,
1178 .proc_handler = timer_migration_handler, 1178 .proc_handler = timer_migration_handler,
1179 .extra1 = &zero,
1180 .extra2 = &one,
1179 }, 1181 },
1180#endif 1182#endif
1181#ifdef CONFIG_BPF_SYSCALL 1183#ifdef CONFIG_BPF_SYSCALL
@@ -2574,7 +2576,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
2574 int write, void *data) 2576 int write, void *data)
2575{ 2577{
2576 if (write) { 2578 if (write) {
2577 if (*lvalp > LONG_MAX / HZ) 2579 if (*lvalp > INT_MAX / HZ)
2578 return 1; 2580 return 1;
2579 *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); 2581 *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
2580 } else { 2582 } else {
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 8a5e44236f78..4559e914452b 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -30,6 +30,7 @@
30#include <linux/pid_namespace.h> 30#include <linux/pid_namespace.h>
31#include <net/genetlink.h> 31#include <net/genetlink.h>
32#include <linux/atomic.h> 32#include <linux/atomic.h>
33#include <linux/sched/cputime.h>
33 34
34/* 35/*
35 * Maximum length of a cpumask that can be specified in 36 * Maximum length of a cpumask that can be specified in
@@ -210,6 +211,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
210 struct task_struct *tsk, *first; 211 struct task_struct *tsk, *first;
211 unsigned long flags; 212 unsigned long flags;
212 int rc = -ESRCH; 213 int rc = -ESRCH;
214 u64 delta, utime, stime;
215 u64 start_time;
213 216
214 /* 217 /*
215 * Add additional stats from live tasks except zombie thread group 218 * Add additional stats from live tasks except zombie thread group
@@ -227,6 +230,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
227 memset(stats, 0, sizeof(*stats)); 230 memset(stats, 0, sizeof(*stats));
228 231
229 tsk = first; 232 tsk = first;
233 start_time = ktime_get_ns();
230 do { 234 do {
231 if (tsk->exit_state) 235 if (tsk->exit_state)
232 continue; 236 continue;
@@ -238,6 +242,16 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
238 */ 242 */
239 delayacct_add_tsk(stats, tsk); 243 delayacct_add_tsk(stats, tsk);
240 244
245 /* calculate task elapsed time in nsec */
246 delta = start_time - tsk->start_time;
247 /* Convert to micro seconds */
248 do_div(delta, NSEC_PER_USEC);
249 stats->ac_etime += delta;
250
251 task_cputime(tsk, &utime, &stime);
252 stats->ac_utime += div_u64(utime, NSEC_PER_USEC);
253 stats->ac_stime += div_u64(stime, NSEC_PER_USEC);
254
241 stats->nvcsw += tsk->nvcsw; 255 stats->nvcsw += tsk->nvcsw;
242 stats->nivcsw += tsk->nivcsw; 256 stats->nivcsw += tsk->nivcsw;
243 } while_each_thread(first, tsk); 257 } while_each_thread(first, tsk);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index ce3a31e8eb36..5cb5b0008d97 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -541,7 +541,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
541 * 541 *
542 * Returns the granularity of underlying alarm base clock 542 * Returns the granularity of underlying alarm base clock
543 */ 543 */
544static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) 544static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
545{ 545{
546 if (!alarmtimer_get_rtcdev()) 546 if (!alarmtimer_get_rtcdev())
547 return -EINVAL; 547 return -EINVAL;
@@ -558,14 +558,14 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
558 * 558 *
559 * Provides the underlying alarm base time. 559 * Provides the underlying alarm base time.
560 */ 560 */
561static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) 561static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp)
562{ 562{
563 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; 563 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
564 564
565 if (!alarmtimer_get_rtcdev()) 565 if (!alarmtimer_get_rtcdev())
566 return -EINVAL; 566 return -EINVAL;
567 567
568 *tp = ktime_to_timespec(base->gettime()); 568 *tp = ktime_to_timespec64(base->gettime());
569 return 0; 569 return 0;
570} 570}
571 571
@@ -598,19 +598,19 @@ static int alarm_timer_create(struct k_itimer *new_timer)
598 * Copies out the current itimerspec data 598 * Copies out the current itimerspec data
599 */ 599 */
600static void alarm_timer_get(struct k_itimer *timr, 600static void alarm_timer_get(struct k_itimer *timr,
601 struct itimerspec *cur_setting) 601 struct itimerspec64 *cur_setting)
602{ 602{
603 ktime_t relative_expiry_time = 603 ktime_t relative_expiry_time =
604 alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); 604 alarm_expires_remaining(&(timr->it.alarm.alarmtimer));
605 605
606 if (ktime_to_ns(relative_expiry_time) > 0) { 606 if (ktime_to_ns(relative_expiry_time) > 0) {
607 cur_setting->it_value = ktime_to_timespec(relative_expiry_time); 607 cur_setting->it_value = ktime_to_timespec64(relative_expiry_time);
608 } else { 608 } else {
609 cur_setting->it_value.tv_sec = 0; 609 cur_setting->it_value.tv_sec = 0;
610 cur_setting->it_value.tv_nsec = 0; 610 cur_setting->it_value.tv_nsec = 0;
611 } 611 }
612 612
613 cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval); 613 cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval);
614} 614}
615 615
616/** 616/**
@@ -640,8 +640,8 @@ static int alarm_timer_del(struct k_itimer *timr)
640 * Sets the timer to new_setting, and starts the timer. 640 * Sets the timer to new_setting, and starts the timer.
641 */ 641 */
642static int alarm_timer_set(struct k_itimer *timr, int flags, 642static int alarm_timer_set(struct k_itimer *timr, int flags,
643 struct itimerspec *new_setting, 643 struct itimerspec64 *new_setting,
644 struct itimerspec *old_setting) 644 struct itimerspec64 *old_setting)
645{ 645{
646 ktime_t exp; 646 ktime_t exp;
647 647
@@ -659,8 +659,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
659 return TIMER_RETRY; 659 return TIMER_RETRY;
660 660
661 /* start the timer */ 661 /* start the timer */
662 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); 662 timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval);
663 exp = timespec_to_ktime(new_setting->it_value); 663 exp = timespec64_to_ktime(new_setting->it_value);
664 /* Convert (if necessary) to absolute time */ 664 /* Convert (if necessary) to absolute time */
665 if (flags != TIMER_ABSTIME) { 665 if (flags != TIMER_ABSTIME) {
666 ktime_t now; 666 ktime_t now;
@@ -790,13 +790,14 @@ out:
790 * Handles clock_nanosleep calls against _ALARM clockids 790 * Handles clock_nanosleep calls against _ALARM clockids
791 */ 791 */
792static int alarm_timer_nsleep(const clockid_t which_clock, int flags, 792static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
793 struct timespec *tsreq, struct timespec __user *rmtp) 793 struct timespec64 *tsreq,
794 struct timespec __user *rmtp)
794{ 795{
795 enum alarmtimer_type type = clock2alarm(which_clock); 796 enum alarmtimer_type type = clock2alarm(which_clock);
797 struct restart_block *restart;
796 struct alarm alarm; 798 struct alarm alarm;
797 ktime_t exp; 799 ktime_t exp;
798 int ret = 0; 800 int ret = 0;
799 struct restart_block *restart;
800 801
801 if (!alarmtimer_get_rtcdev()) 802 if (!alarmtimer_get_rtcdev())
802 return -ENOTSUPP; 803 return -ENOTSUPP;
@@ -809,7 +810,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
809 810
810 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); 811 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
811 812
812 exp = timespec_to_ktime(*tsreq); 813 exp = timespec64_to_ktime(*tsreq);
813 /* Convert (if necessary) to absolute time */ 814 /* Convert (if necessary) to absolute time */
814 if (flags != TIMER_ABSTIME) { 815 if (flags != TIMER_ABSTIME) {
815 ktime_t now = alarm_bases[type].gettime(); 816 ktime_t now = alarm_bases[type].gettime();
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 97ac0951f164..4237e0744e26 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -468,7 +468,7 @@ void clockevents_register_device(struct clock_event_device *dev)
468} 468}
469EXPORT_SYMBOL_GPL(clockevents_register_device); 469EXPORT_SYMBOL_GPL(clockevents_register_device);
470 470
471void clockevents_config(struct clock_event_device *dev, u32 freq) 471static void clockevents_config(struct clock_event_device *dev, u32 freq)
472{ 472{
473 u64 sec; 473 u64 sec;
474 474
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index ec08f527d7ee..ac053bb5296e 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -987,7 +987,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
987 * Returns: 987 * Returns:
988 * 0 when the timer was not active 988 * 0 when the timer was not active
989 * 1 when the timer was active 989 * 1 when the timer was active
990 * -1 when the timer is currently excuting the callback function and 990 * -1 when the timer is currently executing the callback function and
991 * cannot be stopped 991 * cannot be stopped
992 */ 992 */
993int hrtimer_try_to_cancel(struct hrtimer *timer) 993int hrtimer_try_to_cancel(struct hrtimer *timer)
@@ -1368,10 +1368,7 @@ retry:
1368 ktime_to_ns(delta)); 1368 ktime_to_ns(delta));
1369} 1369}
1370 1370
1371/* 1371/* called with interrupts disabled */
1372 * local version of hrtimer_peek_ahead_timers() called with interrupts
1373 * disabled.
1374 */
1375static inline void __hrtimer_peek_ahead_timers(void) 1372static inline void __hrtimer_peek_ahead_timers(void)
1376{ 1373{
1377 struct tick_device *td; 1374 struct tick_device *td;
@@ -1506,7 +1503,7 @@ out:
1506 return ret; 1503 return ret;
1507} 1504}
1508 1505
1509long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, 1506long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp,
1510 const enum hrtimer_mode mode, const clockid_t clockid) 1507 const enum hrtimer_mode mode, const clockid_t clockid)
1511{ 1508{
1512 struct restart_block *restart; 1509 struct restart_block *restart;
@@ -1519,7 +1516,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1519 slack = 0; 1516 slack = 0;
1520 1517
1521 hrtimer_init_on_stack(&t.timer, clockid, mode); 1518 hrtimer_init_on_stack(&t.timer, clockid, mode);
1522 hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); 1519 hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
1523 if (do_nanosleep(&t, mode)) 1520 if (do_nanosleep(&t, mode))
1524 goto out; 1521 goto out;
1525 1522
@@ -1550,15 +1547,17 @@ out:
1550SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, 1547SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1551 struct timespec __user *, rmtp) 1548 struct timespec __user *, rmtp)
1552{ 1549{
1550 struct timespec64 tu64;
1553 struct timespec tu; 1551 struct timespec tu;
1554 1552
1555 if (copy_from_user(&tu, rqtp, sizeof(tu))) 1553 if (copy_from_user(&tu, rqtp, sizeof(tu)))
1556 return -EFAULT; 1554 return -EFAULT;
1557 1555
1558 if (!timespec_valid(&tu)) 1556 tu64 = timespec_to_timespec64(tu);
1557 if (!timespec64_valid(&tu64))
1559 return -EINVAL; 1558 return -EINVAL;
1560 1559
1561 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); 1560 return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
1562} 1561}
1563 1562
1564/* 1563/*
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 9cff0ab82b63..31d588d37a17 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -297,7 +297,7 @@ out:
297 return err; 297 return err;
298} 298}
299 299
300static int pc_clock_gettime(clockid_t id, struct timespec *ts) 300static int pc_clock_gettime(clockid_t id, struct timespec64 *ts)
301{ 301{
302 struct posix_clock_desc cd; 302 struct posix_clock_desc cd;
303 int err; 303 int err;
@@ -316,7 +316,7 @@ static int pc_clock_gettime(clockid_t id, struct timespec *ts)
316 return err; 316 return err;
317} 317}
318 318
319static int pc_clock_getres(clockid_t id, struct timespec *ts) 319static int pc_clock_getres(clockid_t id, struct timespec64 *ts)
320{ 320{
321 struct posix_clock_desc cd; 321 struct posix_clock_desc cd;
322 int err; 322 int err;
@@ -335,7 +335,7 @@ static int pc_clock_getres(clockid_t id, struct timespec *ts)
335 return err; 335 return err;
336} 336}
337 337
338static int pc_clock_settime(clockid_t id, const struct timespec *ts) 338static int pc_clock_settime(clockid_t id, const struct timespec64 *ts)
339{ 339{
340 struct posix_clock_desc cd; 340 struct posix_clock_desc cd;
341 int err; 341 int err;
@@ -399,7 +399,7 @@ static int pc_timer_delete(struct k_itimer *kit)
399 return err; 399 return err;
400} 400}
401 401
402static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) 402static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts)
403{ 403{
404 clockid_t id = kit->it_clock; 404 clockid_t id = kit->it_clock;
405 struct posix_clock_desc cd; 405 struct posix_clock_desc cd;
@@ -414,7 +414,7 @@ static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
414} 414}
415 415
416static int pc_timer_settime(struct k_itimer *kit, int flags, 416static int pc_timer_settime(struct k_itimer *kit, int flags,
417 struct itimerspec *ts, struct itimerspec *old) 417 struct itimerspec64 *ts, struct itimerspec64 *old)
418{ 418{
419 clockid_t id = kit->it_clock; 419 clockid_t id = kit->it_clock;
420 struct posix_clock_desc cd; 420 struct posix_clock_desc cd;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 4513ad16a253..1370f067fb51 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -116,7 +116,7 @@ static inline u64 virt_ticks(struct task_struct *p)
116} 116}
117 117
118static int 118static int
119posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 119posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
120{ 120{
121 int error = check_clock(which_clock); 121 int error = check_clock(which_clock);
122 if (!error) { 122 if (!error) {
@@ -135,7 +135,7 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
135} 135}
136 136
137static int 137static int
138posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) 138posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp)
139{ 139{
140 /* 140 /*
141 * You can never reset a CPU clock, but we check for other errors 141 * You can never reset a CPU clock, but we check for other errors
@@ -261,7 +261,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
261 261
262static int posix_cpu_clock_get_task(struct task_struct *tsk, 262static int posix_cpu_clock_get_task(struct task_struct *tsk,
263 const clockid_t which_clock, 263 const clockid_t which_clock,
264 struct timespec *tp) 264 struct timespec64 *tp)
265{ 265{
266 int err = -EINVAL; 266 int err = -EINVAL;
267 u64 rtn; 267 u64 rtn;
@@ -275,13 +275,13 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
275 } 275 }
276 276
277 if (!err) 277 if (!err)
278 *tp = ns_to_timespec(rtn); 278 *tp = ns_to_timespec64(rtn);
279 279
280 return err; 280 return err;
281} 281}
282 282
283 283
284static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 284static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp)
285{ 285{
286 const pid_t pid = CPUCLOCK_PID(which_clock); 286 const pid_t pid = CPUCLOCK_PID(which_clock);
287 int err = -EINVAL; 287 int err = -EINVAL;
@@ -562,7 +562,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
562 * and try again. (This happens when the timer is in the middle of firing.) 562 * and try again. (This happens when the timer is in the middle of firing.)
563 */ 563 */
564static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, 564static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
565 struct itimerspec *new, struct itimerspec *old) 565 struct itimerspec64 *new, struct itimerspec64 *old)
566{ 566{
567 unsigned long flags; 567 unsigned long flags;
568 struct sighand_struct *sighand; 568 struct sighand_struct *sighand;
@@ -572,7 +572,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
572 572
573 WARN_ON_ONCE(p == NULL); 573 WARN_ON_ONCE(p == NULL);
574 574
575 new_expires = timespec_to_ns(&new->it_value); 575 new_expires = timespec64_to_ns(&new->it_value);
576 576
577 /* 577 /*
578 * Protect against sighand release/switch in exit/exec and p->cpu_timers 578 * Protect against sighand release/switch in exit/exec and p->cpu_timers
@@ -633,7 +633,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
633 bump_cpu_timer(timer, val); 633 bump_cpu_timer(timer, val);
634 if (val < timer->it.cpu.expires) { 634 if (val < timer->it.cpu.expires) {
635 old_expires = timer->it.cpu.expires - val; 635 old_expires = timer->it.cpu.expires - val;
636 old->it_value = ns_to_timespec(old_expires); 636 old->it_value = ns_to_timespec64(old_expires);
637 } else { 637 } else {
638 old->it_value.tv_nsec = 1; 638 old->it_value.tv_nsec = 1;
639 old->it_value.tv_sec = 0; 639 old->it_value.tv_sec = 0;
@@ -671,7 +671,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
671 * Install the new reload setting, and 671 * Install the new reload setting, and
672 * set up the signal and overrun bookkeeping. 672 * set up the signal and overrun bookkeeping.
673 */ 673 */
674 timer->it.cpu.incr = timespec_to_ns(&new->it_interval); 674 timer->it.cpu.incr = timespec64_to_ns(&new->it_interval);
675 675
676 /* 676 /*
677 * This acts as a modification timestamp for the timer, 677 * This acts as a modification timestamp for the timer,
@@ -695,12 +695,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
695 ret = 0; 695 ret = 0;
696 out: 696 out:
697 if (old) 697 if (old)
698 old->it_interval = ns_to_timespec(old_incr); 698 old->it_interval = ns_to_timespec64(old_incr);
699 699
700 return ret; 700 return ret;
701} 701}
702 702
703static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 703static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
704{ 704{
705 u64 now; 705 u64 now;
706 struct task_struct *p = timer->it.cpu.task; 706 struct task_struct *p = timer->it.cpu.task;
@@ -710,7 +710,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
710 /* 710 /*
711 * Easy part: convert the reload time. 711 * Easy part: convert the reload time.
712 */ 712 */
713 itp->it_interval = ns_to_timespec(timer->it.cpu.incr); 713 itp->it_interval = ns_to_timespec64(timer->it.cpu.incr);
714 714
715 if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ 715 if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
716 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; 716 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
@@ -739,7 +739,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
739 * Call the timer disarmed, nothing else to do. 739 * Call the timer disarmed, nothing else to do.
740 */ 740 */
741 timer->it.cpu.expires = 0; 741 timer->it.cpu.expires = 0;
742 itp->it_value = ns_to_timespec(timer->it.cpu.expires); 742 itp->it_value = ns_to_timespec64(timer->it.cpu.expires);
743 return; 743 return;
744 } else { 744 } else {
745 cpu_timer_sample_group(timer->it_clock, p, &now); 745 cpu_timer_sample_group(timer->it_clock, p, &now);
@@ -748,7 +748,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
748 } 748 }
749 749
750 if (now < timer->it.cpu.expires) { 750 if (now < timer->it.cpu.expires) {
751 itp->it_value = ns_to_timespec(timer->it.cpu.expires - now); 751 itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now);
752 } else { 752 } else {
753 /* 753 /*
754 * The timer should have expired already, but the firing 754 * The timer should have expired already, but the firing
@@ -825,6 +825,8 @@ static void check_thread_timers(struct task_struct *tsk,
825 * At the hard limit, we just die. 825 * At the hard limit, we just die.
826 * No need to calculate anything else now. 826 * No need to calculate anything else now.
827 */ 827 */
828 pr_info("CPU Watchdog Timeout (hard): %s[%d]\n",
829 tsk->comm, task_pid_nr(tsk));
828 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 830 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
829 return; 831 return;
830 } 832 }
@@ -836,8 +838,7 @@ static void check_thread_timers(struct task_struct *tsk,
836 soft += USEC_PER_SEC; 838 soft += USEC_PER_SEC;
837 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; 839 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
838 } 840 }
839 printk(KERN_INFO 841 pr_info("RT Watchdog Timeout (soft): %s[%d]\n",
840 "RT Watchdog Timeout: %s[%d]\n",
841 tsk->comm, task_pid_nr(tsk)); 842 tsk->comm, task_pid_nr(tsk));
842 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 843 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
843 } 844 }
@@ -935,6 +936,8 @@ static void check_process_timers(struct task_struct *tsk,
935 * At the hard limit, we just die. 936 * At the hard limit, we just die.
936 * No need to calculate anything else now. 937 * No need to calculate anything else now.
937 */ 938 */
939 pr_info("RT Watchdog Timeout (hard): %s[%d]\n",
940 tsk->comm, task_pid_nr(tsk));
938 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 941 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
939 return; 942 return;
940 } 943 }
@@ -942,6 +945,8 @@ static void check_process_timers(struct task_struct *tsk,
942 /* 945 /*
943 * At the soft limit, send a SIGXCPU every second. 946 * At the soft limit, send a SIGXCPU every second.
944 */ 947 */
948 pr_info("CPU Watchdog Timeout (soft): %s[%d]\n",
949 tsk->comm, task_pid_nr(tsk));
945 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 950 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
946 if (soft < hard) { 951 if (soft < hard) {
947 soft++; 952 soft++;
@@ -1214,7 +1219,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1214} 1219}
1215 1220
1216static int do_cpu_nanosleep(const clockid_t which_clock, int flags, 1221static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1217 struct timespec *rqtp, struct itimerspec *it) 1222 struct timespec64 *rqtp, struct itimerspec64 *it)
1218{ 1223{
1219 struct k_itimer timer; 1224 struct k_itimer timer;
1220 int error; 1225 int error;
@@ -1229,7 +1234,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1229 error = posix_cpu_timer_create(&timer); 1234 error = posix_cpu_timer_create(&timer);
1230 timer.it_process = current; 1235 timer.it_process = current;
1231 if (!error) { 1236 if (!error) {
1232 static struct itimerspec zero_it; 1237 static struct itimerspec64 zero_it;
1233 1238
1234 memset(it, 0, sizeof *it); 1239 memset(it, 0, sizeof *it);
1235 it->it_value = *rqtp; 1240 it->it_value = *rqtp;
@@ -1264,7 +1269,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1264 /* 1269 /*
1265 * We were interrupted by a signal. 1270 * We were interrupted by a signal.
1266 */ 1271 */
1267 *rqtp = ns_to_timespec(timer.it.cpu.expires); 1272 *rqtp = ns_to_timespec64(timer.it.cpu.expires);
1268 error = posix_cpu_timer_set(&timer, 0, &zero_it, it); 1273 error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
1269 if (!error) { 1274 if (!error) {
1270 /* 1275 /*
@@ -1301,10 +1306,11 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1301static long posix_cpu_nsleep_restart(struct restart_block *restart_block); 1306static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1302 1307
1303static int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1308static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1304 struct timespec *rqtp, struct timespec __user *rmtp) 1309 struct timespec64 *rqtp, struct timespec __user *rmtp)
1305{ 1310{
1306 struct restart_block *restart_block = &current->restart_block; 1311 struct restart_block *restart_block = &current->restart_block;
1307 struct itimerspec it; 1312 struct itimerspec64 it;
1313 struct timespec ts;
1308 int error; 1314 int error;
1309 1315
1310 /* 1316 /*
@@ -1312,7 +1318,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1312 */ 1318 */
1313 if (CPUCLOCK_PERTHREAD(which_clock) && 1319 if (CPUCLOCK_PERTHREAD(which_clock) &&
1314 (CPUCLOCK_PID(which_clock) == 0 || 1320 (CPUCLOCK_PID(which_clock) == 0 ||
1315 CPUCLOCK_PID(which_clock) == current->pid)) 1321 CPUCLOCK_PID(which_clock) == task_pid_vnr(current)))
1316 return -EINVAL; 1322 return -EINVAL;
1317 1323
1318 error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); 1324 error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
@@ -1324,13 +1330,14 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1324 /* 1330 /*
1325 * Report back to the user the time still remaining. 1331 * Report back to the user the time still remaining.
1326 */ 1332 */
1327 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1333 ts = timespec64_to_timespec(it.it_value);
1334 if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp)))
1328 return -EFAULT; 1335 return -EFAULT;
1329 1336
1330 restart_block->fn = posix_cpu_nsleep_restart; 1337 restart_block->fn = posix_cpu_nsleep_restart;
1331 restart_block->nanosleep.clockid = which_clock; 1338 restart_block->nanosleep.clockid = which_clock;
1332 restart_block->nanosleep.rmtp = rmtp; 1339 restart_block->nanosleep.rmtp = rmtp;
1333 restart_block->nanosleep.expires = timespec_to_ns(rqtp); 1340 restart_block->nanosleep.expires = timespec64_to_ns(rqtp);
1334 } 1341 }
1335 return error; 1342 return error;
1336} 1343}
@@ -1338,11 +1345,12 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1338static long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1345static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1339{ 1346{
1340 clockid_t which_clock = restart_block->nanosleep.clockid; 1347 clockid_t which_clock = restart_block->nanosleep.clockid;
1341 struct timespec t; 1348 struct itimerspec64 it;
1342 struct itimerspec it; 1349 struct timespec64 t;
1350 struct timespec tmp;
1343 int error; 1351 int error;
1344 1352
1345 t = ns_to_timespec(restart_block->nanosleep.expires); 1353 t = ns_to_timespec64(restart_block->nanosleep.expires);
1346 1354
1347 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); 1355 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1348 1356
@@ -1351,10 +1359,11 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1351 /* 1359 /*
1352 * Report back to the user the time still remaining. 1360 * Report back to the user the time still remaining.
1353 */ 1361 */
1354 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1362 tmp = timespec64_to_timespec(it.it_value);
1363 if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp)))
1355 return -EFAULT; 1364 return -EFAULT;
1356 1365
1357 restart_block->nanosleep.expires = timespec_to_ns(&t); 1366 restart_block->nanosleep.expires = timespec64_to_ns(&t);
1358 } 1367 }
1359 return error; 1368 return error;
1360 1369
@@ -1364,12 +1373,12 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1364#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) 1373#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1365 1374
1366static int process_cpu_clock_getres(const clockid_t which_clock, 1375static int process_cpu_clock_getres(const clockid_t which_clock,
1367 struct timespec *tp) 1376 struct timespec64 *tp)
1368{ 1377{
1369 return posix_cpu_clock_getres(PROCESS_CLOCK, tp); 1378 return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
1370} 1379}
1371static int process_cpu_clock_get(const clockid_t which_clock, 1380static int process_cpu_clock_get(const clockid_t which_clock,
1372 struct timespec *tp) 1381 struct timespec64 *tp)
1373{ 1382{
1374 return posix_cpu_clock_get(PROCESS_CLOCK, tp); 1383 return posix_cpu_clock_get(PROCESS_CLOCK, tp);
1375} 1384}
@@ -1379,7 +1388,7 @@ static int process_cpu_timer_create(struct k_itimer *timer)
1379 return posix_cpu_timer_create(timer); 1388 return posix_cpu_timer_create(timer);
1380} 1389}
1381static int process_cpu_nsleep(const clockid_t which_clock, int flags, 1390static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1382 struct timespec *rqtp, 1391 struct timespec64 *rqtp,
1383 struct timespec __user *rmtp) 1392 struct timespec __user *rmtp)
1384{ 1393{
1385 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); 1394 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
@@ -1389,12 +1398,12 @@ static long process_cpu_nsleep_restart(struct restart_block *restart_block)
1389 return -EINVAL; 1398 return -EINVAL;
1390} 1399}
1391static int thread_cpu_clock_getres(const clockid_t which_clock, 1400static int thread_cpu_clock_getres(const clockid_t which_clock,
1392 struct timespec *tp) 1401 struct timespec64 *tp)
1393{ 1402{
1394 return posix_cpu_clock_getres(THREAD_CLOCK, tp); 1403 return posix_cpu_clock_getres(THREAD_CLOCK, tp);
1395} 1404}
1396static int thread_cpu_clock_get(const clockid_t which_clock, 1405static int thread_cpu_clock_get(const clockid_t which_clock,
1397 struct timespec *tp) 1406 struct timespec64 *tp)
1398{ 1407{
1399 return posix_cpu_clock_get(THREAD_CLOCK, tp); 1408 return posix_cpu_clock_get(THREAD_CLOCK, tp);
1400} 1409}
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index cd6716e115e8..c0cd53eb018a 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -49,26 +49,32 @@ SYS_NI(alarm);
49SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 49SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
50 const struct timespec __user *, tp) 50 const struct timespec __user *, tp)
51{ 51{
52 struct timespec64 new_tp64;
52 struct timespec new_tp; 53 struct timespec new_tp;
53 54
54 if (which_clock != CLOCK_REALTIME) 55 if (which_clock != CLOCK_REALTIME)
55 return -EINVAL; 56 return -EINVAL;
56 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 57 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
57 return -EFAULT; 58 return -EFAULT;
58 return do_sys_settimeofday(&new_tp, NULL); 59
60 new_tp64 = timespec_to_timespec64(new_tp);
61 return do_sys_settimeofday64(&new_tp64, NULL);
59} 62}
60 63
61SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 64SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
62 struct timespec __user *,tp) 65 struct timespec __user *,tp)
63{ 66{
67 struct timespec64 kernel_tp64;
64 struct timespec kernel_tp; 68 struct timespec kernel_tp;
65 69
66 switch (which_clock) { 70 switch (which_clock) {
67 case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break; 71 case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
68 case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break; 72 case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
69 case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break; 73 case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
70 default: return -EINVAL; 74 default: return -EINVAL;
71 } 75 }
76
77 kernel_tp = timespec64_to_timespec(kernel_tp64);
72 if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 78 if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
73 return -EFAULT; 79 return -EFAULT;
74 return 0; 80 return 0;
@@ -97,6 +103,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
97 const struct timespec __user *, rqtp, 103 const struct timespec __user *, rqtp,
98 struct timespec __user *, rmtp) 104 struct timespec __user *, rmtp)
99{ 105{
106 struct timespec64 t64;
100 struct timespec t; 107 struct timespec t;
101 108
102 switch (which_clock) { 109 switch (which_clock) {
@@ -105,9 +112,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
105 case CLOCK_BOOTTIME: 112 case CLOCK_BOOTTIME:
106 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 113 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
107 return -EFAULT; 114 return -EFAULT;
108 if (!timespec_valid(&t)) 115 t64 = timespec_to_timespec64(t);
116 if (!timespec64_valid(&t64))
109 return -EINVAL; 117 return -EINVAL;
110 return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ? 118 return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ?
111 HRTIMER_MODE_ABS : HRTIMER_MODE_REL, 119 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
112 which_clock); 120 which_clock);
113 default: 121 default:
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 50a6a47020de..4d7b2ce09c27 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -130,12 +130,12 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
130/* 130/*
131 * These ones are defined below. 131 * These ones are defined below.
132 */ 132 */
133static int common_nsleep(const clockid_t, int flags, struct timespec *t, 133static int common_nsleep(const clockid_t, int flags, struct timespec64 *t,
134 struct timespec __user *rmtp); 134 struct timespec __user *rmtp);
135static int common_timer_create(struct k_itimer *new_timer); 135static int common_timer_create(struct k_itimer *new_timer);
136static void common_timer_get(struct k_itimer *, struct itimerspec *); 136static void common_timer_get(struct k_itimer *, struct itimerspec64 *);
137static int common_timer_set(struct k_itimer *, int, 137static int common_timer_set(struct k_itimer *, int,
138 struct itimerspec *, struct itimerspec *); 138 struct itimerspec64 *, struct itimerspec64 *);
139static int common_timer_del(struct k_itimer *timer); 139static int common_timer_del(struct k_itimer *timer);
140 140
141static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); 141static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
@@ -204,17 +204,17 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
204} 204}
205 205
206/* Get clock_realtime */ 206/* Get clock_realtime */
207static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) 207static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp)
208{ 208{
209 ktime_get_real_ts(tp); 209 ktime_get_real_ts64(tp);
210 return 0; 210 return 0;
211} 211}
212 212
213/* Set clock_realtime */ 213/* Set clock_realtime */
214static int posix_clock_realtime_set(const clockid_t which_clock, 214static int posix_clock_realtime_set(const clockid_t which_clock,
215 const struct timespec *tp) 215 const struct timespec64 *tp)
216{ 216{
217 return do_sys_settimeofday(tp, NULL); 217 return do_sys_settimeofday64(tp, NULL);
218} 218}
219 219
220static int posix_clock_realtime_adj(const clockid_t which_clock, 220static int posix_clock_realtime_adj(const clockid_t which_clock,
@@ -226,54 +226,54 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
226/* 226/*
227 * Get monotonic time for posix timers 227 * Get monotonic time for posix timers
228 */ 228 */
229static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) 229static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp)
230{ 230{
231 ktime_get_ts(tp); 231 ktime_get_ts64(tp);
232 return 0; 232 return 0;
233} 233}
234 234
235/* 235/*
236 * Get monotonic-raw time for posix timers 236 * Get monotonic-raw time for posix timers
237 */ 237 */
238static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) 238static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
239{ 239{
240 getrawmonotonic(tp); 240 getrawmonotonic64(tp);
241 return 0; 241 return 0;
242} 242}
243 243
244 244
245static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) 245static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp)
246{ 246{
247 *tp = current_kernel_time(); 247 *tp = current_kernel_time64();
248 return 0; 248 return 0;
249} 249}
250 250
251static int posix_get_monotonic_coarse(clockid_t which_clock, 251static int posix_get_monotonic_coarse(clockid_t which_clock,
252 struct timespec *tp) 252 struct timespec64 *tp)
253{ 253{
254 *tp = get_monotonic_coarse(); 254 *tp = get_monotonic_coarse64();
255 return 0; 255 return 0;
256} 256}
257 257
258static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) 258static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp)
259{ 259{
260 *tp = ktime_to_timespec(KTIME_LOW_RES); 260 *tp = ktime_to_timespec64(KTIME_LOW_RES);
261 return 0; 261 return 0;
262} 262}
263 263
264static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) 264static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp)
265{ 265{
266 get_monotonic_boottime(tp); 266 get_monotonic_boottime64(tp);
267 return 0; 267 return 0;
268} 268}
269 269
270static int posix_get_tai(clockid_t which_clock, struct timespec *tp) 270static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
271{ 271{
272 timekeeping_clocktai(tp); 272 timekeeping_clocktai64(tp);
273 return 0; 273 return 0;
274} 274}
275 275
276static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp) 276static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
277{ 277{
278 tp->tv_sec = 0; 278 tp->tv_sec = 0;
279 tp->tv_nsec = hrtimer_resolution; 279 tp->tv_nsec = hrtimer_resolution;
@@ -734,18 +734,18 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
734 * report. 734 * report.
735 */ 735 */
736static void 736static void
737common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) 737common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
738{ 738{
739 ktime_t now, remaining, iv; 739 ktime_t now, remaining, iv;
740 struct hrtimer *timer = &timr->it.real.timer; 740 struct hrtimer *timer = &timr->it.real.timer;
741 741
742 memset(cur_setting, 0, sizeof(struct itimerspec)); 742 memset(cur_setting, 0, sizeof(*cur_setting));
743 743
744 iv = timr->it.real.interval; 744 iv = timr->it.real.interval;
745 745
746 /* interval timer ? */ 746 /* interval timer ? */
747 if (iv) 747 if (iv)
748 cur_setting->it_interval = ktime_to_timespec(iv); 748 cur_setting->it_interval = ktime_to_timespec64(iv);
749 else if (!hrtimer_active(timer) && 749 else if (!hrtimer_active(timer) &&
750 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) 750 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
751 return; 751 return;
@@ -771,13 +771,14 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
771 if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) 771 if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
772 cur_setting->it_value.tv_nsec = 1; 772 cur_setting->it_value.tv_nsec = 1;
773 } else 773 } else
774 cur_setting->it_value = ktime_to_timespec(remaining); 774 cur_setting->it_value = ktime_to_timespec64(remaining);
775} 775}
776 776
777/* Get the time remaining on a POSIX.1b interval timer. */ 777/* Get the time remaining on a POSIX.1b interval timer. */
778SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, 778SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
779 struct itimerspec __user *, setting) 779 struct itimerspec __user *, setting)
780{ 780{
781 struct itimerspec64 cur_setting64;
781 struct itimerspec cur_setting; 782 struct itimerspec cur_setting;
782 struct k_itimer *timr; 783 struct k_itimer *timr;
783 struct k_clock *kc; 784 struct k_clock *kc;
@@ -792,10 +793,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
792 if (WARN_ON_ONCE(!kc || !kc->timer_get)) 793 if (WARN_ON_ONCE(!kc || !kc->timer_get))
793 ret = -EINVAL; 794 ret = -EINVAL;
794 else 795 else
795 kc->timer_get(timr, &cur_setting); 796 kc->timer_get(timr, &cur_setting64);
796 797
797 unlock_timer(timr, flags); 798 unlock_timer(timr, flags);
798 799
800 cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
799 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) 801 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
800 return -EFAULT; 802 return -EFAULT;
801 803
@@ -831,7 +833,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
831/* timr->it_lock is taken. */ 833/* timr->it_lock is taken. */
832static int 834static int
833common_timer_set(struct k_itimer *timr, int flags, 835common_timer_set(struct k_itimer *timr, int flags,
834 struct itimerspec *new_setting, struct itimerspec *old_setting) 836 struct itimerspec64 *new_setting, struct itimerspec64 *old_setting)
835{ 837{
836 struct hrtimer *timer = &timr->it.real.timer; 838 struct hrtimer *timer = &timr->it.real.timer;
837 enum hrtimer_mode mode; 839 enum hrtimer_mode mode;
@@ -860,10 +862,10 @@ common_timer_set(struct k_itimer *timr, int flags,
860 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 862 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
861 timr->it.real.timer.function = posix_timer_fn; 863 timr->it.real.timer.function = posix_timer_fn;
862 864
863 hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value)); 865 hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value));
864 866
865 /* Convert interval */ 867 /* Convert interval */
866 timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); 868 timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval);
867 869
868 /* SIGEV_NONE timers are not queued ! See common_timer_get */ 870 /* SIGEV_NONE timers are not queued ! See common_timer_get */
869 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { 871 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
@@ -883,21 +885,23 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
883 const struct itimerspec __user *, new_setting, 885 const struct itimerspec __user *, new_setting,
884 struct itimerspec __user *, old_setting) 886 struct itimerspec __user *, old_setting)
885{ 887{
886 struct k_itimer *timr; 888 struct itimerspec64 new_spec64, old_spec64;
889 struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL;
887 struct itimerspec new_spec, old_spec; 890 struct itimerspec new_spec, old_spec;
888 int error = 0; 891 struct k_itimer *timr;
889 unsigned long flag; 892 unsigned long flag;
890 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
891 struct k_clock *kc; 893 struct k_clock *kc;
894 int error = 0;
892 895
893 if (!new_setting) 896 if (!new_setting)
894 return -EINVAL; 897 return -EINVAL;
895 898
896 if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) 899 if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
897 return -EFAULT; 900 return -EFAULT;
901 new_spec64 = itimerspec_to_itimerspec64(&new_spec);
898 902
899 if (!timespec_valid(&new_spec.it_interval) || 903 if (!timespec64_valid(&new_spec64.it_interval) ||
900 !timespec_valid(&new_spec.it_value)) 904 !timespec64_valid(&new_spec64.it_value))
901 return -EINVAL; 905 return -EINVAL;
902retry: 906retry:
903 timr = lock_timer(timer_id, &flag); 907 timr = lock_timer(timer_id, &flag);
@@ -908,7 +912,7 @@ retry:
908 if (WARN_ON_ONCE(!kc || !kc->timer_set)) 912 if (WARN_ON_ONCE(!kc || !kc->timer_set))
909 error = -EINVAL; 913 error = -EINVAL;
910 else 914 else
911 error = kc->timer_set(timr, flags, &new_spec, rtn); 915 error = kc->timer_set(timr, flags, &new_spec64, rtn);
912 916
913 unlock_timer(timr, flag); 917 unlock_timer(timr, flag);
914 if (error == TIMER_RETRY) { 918 if (error == TIMER_RETRY) {
@@ -916,6 +920,7 @@ retry:
916 goto retry; 920 goto retry;
917 } 921 }
918 922
923 old_spec = itimerspec64_to_itimerspec(&old_spec64);
919 if (old_setting && !error && 924 if (old_setting && !error &&
920 copy_to_user(old_setting, &old_spec, sizeof (old_spec))) 925 copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
921 error = -EFAULT; 926 error = -EFAULT;
@@ -1014,6 +1019,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
1014 const struct timespec __user *, tp) 1019 const struct timespec __user *, tp)
1015{ 1020{
1016 struct k_clock *kc = clockid_to_kclock(which_clock); 1021 struct k_clock *kc = clockid_to_kclock(which_clock);
1022 struct timespec64 new_tp64;
1017 struct timespec new_tp; 1023 struct timespec new_tp;
1018 1024
1019 if (!kc || !kc->clock_set) 1025 if (!kc || !kc->clock_set)
@@ -1021,21 +1027,24 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
1021 1027
1022 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 1028 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
1023 return -EFAULT; 1029 return -EFAULT;
1030 new_tp64 = timespec_to_timespec64(new_tp);
1024 1031
1025 return kc->clock_set(which_clock, &new_tp); 1032 return kc->clock_set(which_clock, &new_tp64);
1026} 1033}
1027 1034
1028SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 1035SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
1029 struct timespec __user *,tp) 1036 struct timespec __user *,tp)
1030{ 1037{
1031 struct k_clock *kc = clockid_to_kclock(which_clock); 1038 struct k_clock *kc = clockid_to_kclock(which_clock);
1039 struct timespec64 kernel_tp64;
1032 struct timespec kernel_tp; 1040 struct timespec kernel_tp;
1033 int error; 1041 int error;
1034 1042
1035 if (!kc) 1043 if (!kc)
1036 return -EINVAL; 1044 return -EINVAL;
1037 1045
1038 error = kc->clock_get(which_clock, &kernel_tp); 1046 error = kc->clock_get(which_clock, &kernel_tp64);
1047 kernel_tp = timespec64_to_timespec(kernel_tp64);
1039 1048
1040 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 1049 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
1041 error = -EFAULT; 1050 error = -EFAULT;
@@ -1070,13 +1079,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
1070 struct timespec __user *, tp) 1079 struct timespec __user *, tp)
1071{ 1080{
1072 struct k_clock *kc = clockid_to_kclock(which_clock); 1081 struct k_clock *kc = clockid_to_kclock(which_clock);
1082 struct timespec64 rtn_tp64;
1073 struct timespec rtn_tp; 1083 struct timespec rtn_tp;
1074 int error; 1084 int error;
1075 1085
1076 if (!kc) 1086 if (!kc)
1077 return -EINVAL; 1087 return -EINVAL;
1078 1088
1079 error = kc->clock_getres(which_clock, &rtn_tp); 1089 error = kc->clock_getres(which_clock, &rtn_tp64);
1090 rtn_tp = timespec64_to_timespec(rtn_tp64);
1080 1091
1081 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) 1092 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
1082 error = -EFAULT; 1093 error = -EFAULT;
@@ -1088,7 +1099,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
1088 * nanosleep for monotonic and realtime clocks 1099 * nanosleep for monotonic and realtime clocks
1089 */ 1100 */
1090static int common_nsleep(const clockid_t which_clock, int flags, 1101static int common_nsleep(const clockid_t which_clock, int flags,
1091 struct timespec *tsave, struct timespec __user *rmtp) 1102 struct timespec64 *tsave, struct timespec __user *rmtp)
1092{ 1103{
1093 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? 1104 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
1094 HRTIMER_MODE_ABS : HRTIMER_MODE_REL, 1105 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
@@ -1100,6 +1111,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1100 struct timespec __user *, rmtp) 1111 struct timespec __user *, rmtp)
1101{ 1112{
1102 struct k_clock *kc = clockid_to_kclock(which_clock); 1113 struct k_clock *kc = clockid_to_kclock(which_clock);
1114 struct timespec64 t64;
1103 struct timespec t; 1115 struct timespec t;
1104 1116
1105 if (!kc) 1117 if (!kc)
@@ -1110,10 +1122,11 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1110 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 1122 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1111 return -EFAULT; 1123 return -EFAULT;
1112 1124
1113 if (!timespec_valid(&t)) 1125 t64 = timespec_to_timespec64(t);
1126 if (!timespec64_valid(&t64))
1114 return -EINVAL; 1127 return -EINVAL;
1115 1128
1116 return kc->nsleep(which_clock, flags, &t, rmtp); 1129 return kc->nsleep(which_clock, flags, &t64, rmtp);
1117} 1130}
1118 1131
1119/* 1132/*
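
The posix-timers hunks above all follow the same y2038 boundary pattern: the legacy struct timespec/itimerspec is kept only at the copy_from_user()/copy_to_user() boundary, while the k_clock callbacks now take the 64-bit types. A minimal sketch of that pattern, condensed from the hunks above (example_get_time() and its callback pointer are illustrative, not part of the patch):

	static int example_get_time(struct timespec __user *utp,
				    int (*get)(struct timespec64 *))
	{
		struct timespec64 ts64;
		struct timespec ts;
		int err;

		err = get(&ts64);                  /* internal code works in timespec64 */
		ts = timespec64_to_timespec(ts64); /* convert only for the legacy ABI */

		if (!err && copy_to_user(utp, &ts, sizeof(ts)))
			err = -EFAULT;
		return err;
	}
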
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index ea6b610c4c57..2d8f05aad442 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -206,6 +206,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
206 206
207 update_clock_read_data(&rd); 207 update_clock_read_data(&rd);
208 208
209 if (sched_clock_timer.function != NULL) {
210 /* update timeout for clock wrap */
211 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
212 }
213
209 r = rate; 214 r = rate;
210 if (r >= 4000000) { 215 if (r >= 4000000) {
211 r /= 1000000; 216 r /= 1000000;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7fe53be86077..64c97fc130c4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -993,6 +993,18 @@ ktime_t tick_nohz_get_sleep_length(void)
993 return ts->sleep_length; 993 return ts->sleep_length;
994} 994}
995 995
996/**
997 * tick_nohz_get_idle_calls - return the current idle calls counter value
998 *
999 * Called from the schedutil frequency scaling governor in scheduler context.
1000 */
1001unsigned long tick_nohz_get_idle_calls(void)
1002{
1003 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1004
1005 return ts->idle_calls;
1006}
1007
996static void tick_nohz_account_idle_ticks(struct tick_sched *ts) 1008static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
997{ 1009{
998#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 1010#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
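
tick_nohz_get_idle_calls() exposes the per-CPU idle_calls counter so a cpufreq governor can tell whether the CPU has passed through the nohz idle path since its last update. A hedged sketch of a possible caller; struct my_governor_state and saved_idle_calls are invented for illustration and the real schedutil usage is not shown in this diff:

	struct my_governor_state {
		unsigned long saved_idle_calls;
	};

	/* Returns true if the CPU never entered the nohz idle path since
	 * the previous call, i.e. the counter did not advance.
	 */
	static bool cpu_stayed_busy(struct my_governor_state *st)
	{
		unsigned long idle_calls = tick_nohz_get_idle_calls();
		bool busy = (idle_calls == st->saved_idle_calls);

		st->saved_idle_calls = idle_calls;
		return busy;
	}
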
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 25bdd2504571..49c73c6ed648 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -193,8 +193,8 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
193SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, 193SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
194 struct timezone __user *, tz) 194 struct timezone __user *, tz)
195{ 195{
196 struct timespec64 new_ts;
196 struct timeval user_tv; 197 struct timeval user_tv;
197 struct timespec new_ts;
198 struct timezone new_tz; 198 struct timezone new_tz;
199 199
200 if (tv) { 200 if (tv) {
@@ -212,7 +212,7 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
212 return -EFAULT; 212 return -EFAULT;
213 } 213 }
214 214
215 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); 215 return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
216} 216}
217 217
218SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) 218SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
@@ -230,20 +230,6 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
230 return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; 230 return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
231} 231}
232 232
233/**
234 * current_fs_time - Return FS time
235 * @sb: Superblock.
236 *
237 * Return the current time truncated to the time granularity supported by
238 * the fs.
239 */
240struct timespec current_fs_time(struct super_block *sb)
241{
242 struct timespec now = current_kernel_time();
243 return timespec_trunc(now, sb->s_time_gran);
244}
245EXPORT_SYMBOL(current_fs_time);
246
247/* 233/*
248 * Convert jiffies to milliseconds and back. 234 * Convert jiffies to milliseconds and back.
249 * 235 *
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5b63a2102c29..9652bc57fd09 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -996,8 +996,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history,
996 return 0; 996 return 0;
997 997
998 /* Interpolate shortest distance from beginning or end of history */ 998 /* Interpolate shortest distance from beginning or end of history */
999 interp_forward = partial_history_cycles > total_history_cycles/2 ? 999 interp_forward = partial_history_cycles > total_history_cycles / 2;
1000 true : false;
1001 partial_history_cycles = interp_forward ? 1000 partial_history_cycles = interp_forward ?
1002 total_history_cycles - partial_history_cycles : 1001 total_history_cycles - partial_history_cycles :
1003 partial_history_cycles; 1002 partial_history_cycles;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 1dc0256bfb6e..152a706ef8b8 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -241,7 +241,7 @@ int timer_migration_handler(struct ctl_table *table, int write,
241 int ret; 241 int ret;
242 242
243 mutex_lock(&mutex); 243 mutex_lock(&mutex);
244 ret = proc_dointvec(table, write, buffer, lenp, ppos); 244 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
245 if (!ret && write) 245 if (!ret && write)
246 timers_update_migration(false); 246 timers_update_migration(false);
247 mutex_unlock(&mutex); 247 mutex_unlock(&mutex);
@@ -1120,7 +1120,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
1120EXPORT_SYMBOL_GPL(add_timer_on); 1120EXPORT_SYMBOL_GPL(add_timer_on);
1121 1121
1122/** 1122/**
1123 * del_timer - deactive a timer. 1123 * del_timer - deactivate a timer.
1124 * @timer: the timer to be deactivated 1124 * @timer: the timer to be deactivated
1125 * 1125 *
1126 * del_timer() deactivates a timer - this works on both active and inactive 1126 * del_timer() deactivates a timer - this works on both active and inactive
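
The switch to proc_dointvec_minmax() in timer_migration_handler() only clamps anything if the matching ctl_table entry supplies bounds in extra1/extra2. A sketch of what such an entry looks like; the table name and the zero/one limit variables are assumptions, not shown in this hunk:

	static int zero;
	static int one = 1;

	static struct ctl_table timer_sysctl_example[] = {
		{
			.procname	= "timer_migration",
			.data		= &sysctl_timer_migration,
			.maxlen		= sizeof(unsigned int),
			.mode		= 0644,
			.proc_handler	= timer_migration_handler,
			.extra1		= &zero,	/* minimum accepted value */
			.extra2		= &one,		/* maximum accepted value */
		},
		{ }
	};
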
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ff8d5c13d04b..0e7f5428a148 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -16,6 +16,7 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/nmi.h>
19 20
20#include <linux/uaccess.h> 21#include <linux/uaccess.h>
21 22
@@ -86,6 +87,9 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
86 87
87next_one: 88next_one:
88 i = 0; 89 i = 0;
90
91 touch_nmi_watchdog();
92
89 raw_spin_lock_irqsave(&base->cpu_base->lock, flags); 93 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
90 94
91 curr = timerqueue_getnext(&base->active); 95 curr = timerqueue_getnext(&base->active);
@@ -197,6 +201,8 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
197{ 201{
198 struct clock_event_device *dev = td->evtdev; 202 struct clock_event_device *dev = td->evtdev;
199 203
204 touch_nmi_watchdog();
205
200 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); 206 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
201 if (cpu < 0) 207 if (cpu < 0)
202 SEQ_printf(m, "Broadcast device\n"); 208 SEQ_printf(m, "Broadcast device\n");
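
The touch_nmi_watchdog() calls keep long /proc/timer_list dumps from tripping the lockup detectors on machines with many CPUs and timers, often made worse by a slow console. The pattern is simply to pet the watchdog once per outer iteration of any long printing loop; a small illustrative sketch (struct item and the loop body are hypothetical):

	#include <linux/nmi.h>

	static void dump_many_items(struct seq_file *m,
				    const struct item *items, int nr)
	{
		int i;

		for (i = 0; i < nr; i++) {
			/* long, preemption-hostile dump: keep the watchdog quiet */
			touch_nmi_watchdog();
			seq_printf(m, "item %d: %lu\n", i, items[i].value);
		}
	}
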
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d4a06e714645..7e06f04e98fe 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -134,7 +134,8 @@ config FUNCTION_TRACER
134 select KALLSYMS 134 select KALLSYMS
135 select GENERIC_TRACER 135 select GENERIC_TRACER
136 select CONTEXT_SWITCH_TRACER 136 select CONTEXT_SWITCH_TRACER
137 select GLOB 137 select GLOB
138 select TASKS_RCU if PREEMPT
138 help 139 help
139 Enable the kernel to trace every kernel function. This is done 140 Enable the kernel to trace every kernel function. This is done
140 by using a compiler feature to insert a small, 5-byte No-Operation 141 by using a compiler feature to insert a small, 5-byte No-Operation
@@ -455,7 +456,7 @@ config UPROBE_EVENTS
455 select UPROBES 456 select UPROBES
456 select PROBE_EVENTS 457 select PROBE_EVENTS
457 select TRACING 458 select TRACING
458 default n 459 default y
459 help 460 help
460 This allows the user to add tracing events on top of userspace 461 This allows the user to add tracing events on top of userspace
461 dynamic events (similar to tracepoints) on the fly via the trace 462 dynamic events (similar to tracepoints) on the fly via the trace
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b2058a7f94bd..193c5f5e3f79 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -690,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q)
690 690
691/** 691/**
692 * blk_add_trace_rq - Add a trace for a request oriented action 692 * blk_add_trace_rq - Add a trace for a request oriented action
693 * @q: queue the io is for
694 * @rq: the source request 693 * @rq: the source request
694 * @error: return status to log
695 * @nr_bytes: number of completed bytes 695 * @nr_bytes: number of completed bytes
696 * @what: the action 696 * @what: the action
697 * 697 *
@@ -699,10 +699,10 @@ void blk_trace_shutdown(struct request_queue *q)
699 * Records an action against a request. Will log the bio offset + size. 699 * Records an action against a request. Will log the bio offset + size.
700 * 700 *
701 **/ 701 **/
702static void blk_add_trace_rq(struct request_queue *q, struct request *rq, 702static void blk_add_trace_rq(struct request *rq, int error,
703 unsigned int nr_bytes, u32 what) 703 unsigned int nr_bytes, u32 what)
704{ 704{
705 struct blk_trace *bt = q->blk_trace; 705 struct blk_trace *bt = rq->q->blk_trace;
706 706
707 if (likely(!bt)) 707 if (likely(!bt))
708 return; 708 return;
@@ -713,40 +713,32 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
713 what |= BLK_TC_ACT(BLK_TC_FS); 713 what |= BLK_TC_ACT(BLK_TC_FS);
714 714
715 __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), 715 __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
716 rq->cmd_flags, what, rq->errors, 0, NULL); 716 rq->cmd_flags, what, error, 0, NULL);
717}
718
719static void blk_add_trace_rq_abort(void *ignore,
720 struct request_queue *q, struct request *rq)
721{
722 blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT);
723} 717}
724 718
725static void blk_add_trace_rq_insert(void *ignore, 719static void blk_add_trace_rq_insert(void *ignore,
726 struct request_queue *q, struct request *rq) 720 struct request_queue *q, struct request *rq)
727{ 721{
728 blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT); 722 blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
729} 723}
730 724
731static void blk_add_trace_rq_issue(void *ignore, 725static void blk_add_trace_rq_issue(void *ignore,
732 struct request_queue *q, struct request *rq) 726 struct request_queue *q, struct request *rq)
733{ 727{
734 blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE); 728 blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
735} 729}
736 730
737static void blk_add_trace_rq_requeue(void *ignore, 731static void blk_add_trace_rq_requeue(void *ignore,
738 struct request_queue *q, 732 struct request_queue *q,
739 struct request *rq) 733 struct request *rq)
740{ 734{
741 blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE); 735 blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
742} 736}
743 737
744static void blk_add_trace_rq_complete(void *ignore, 738static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
745 struct request_queue *q, 739 int error, unsigned int nr_bytes)
746 struct request *rq,
747 unsigned int nr_bytes)
748{ 740{
749 blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE); 741 blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
750} 742}
751 743
752/** 744/**
@@ -941,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore,
941 r.sector_from = cpu_to_be64(from); 933 r.sector_from = cpu_to_be64(from);
942 934
943 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 935 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
944 rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors, 936 rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
945 sizeof(r), &r); 937 sizeof(r), &r);
946} 938}
947 939
@@ -966,7 +958,7 @@ void blk_add_driver_data(struct request_queue *q,
966 return; 958 return;
967 959
968 __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, 960 __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
969 BLK_TA_DRV_DATA, rq->errors, len, data); 961 BLK_TA_DRV_DATA, 0, len, data);
970} 962}
971EXPORT_SYMBOL_GPL(blk_add_driver_data); 963EXPORT_SYMBOL_GPL(blk_add_driver_data);
972 964
@@ -974,8 +966,6 @@ static void blk_register_tracepoints(void)
974{ 966{
975 int ret; 967 int ret;
976 968
977 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
978 WARN_ON(ret);
979 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); 969 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
980 WARN_ON(ret); 970 WARN_ON(ret);
981 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); 971 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
@@ -1028,7 +1018,6 @@ static void blk_unregister_tracepoints(void)
1028 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); 1018 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
1029 unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); 1019 unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
1030 unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); 1020 unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
1031 unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
1032 1021
1033 tracepoint_synchronize_unregister(); 1022 tracepoint_synchronize_unregister();
1034} 1023}
@@ -1673,14 +1662,14 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1673 goto out; 1662 goto out;
1674 1663
1675 if (attr == &dev_attr_act_mask) { 1664 if (attr == &dev_attr_act_mask) {
1676 if (sscanf(buf, "%llx", &value) != 1) { 1665 if (kstrtoull(buf, 0, &value)) {
1677 /* Assume it is a list of trace category names */ 1666 /* Assume it is a list of trace category names */
1678 ret = blk_trace_str2mask(buf); 1667 ret = blk_trace_str2mask(buf);
1679 if (ret < 0) 1668 if (ret < 0)
1680 goto out; 1669 goto out;
1681 value = ret; 1670 value = ret;
1682 } 1671 }
1683 } else if (sscanf(buf, "%llu", &value) != 1) 1672 } else if (kstrtoull(buf, 0, &value))
1684 goto out; 1673 goto out;
1685 1674
1686 ret = -ENXIO; 1675 ret = -ENXIO;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cee9802cf3e0..460a031c77e5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -96,7 +96,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
96 if (unlikely(in_interrupt() || 96 if (unlikely(in_interrupt() ||
97 current->flags & (PF_KTHREAD | PF_EXITING))) 97 current->flags & (PF_KTHREAD | PF_EXITING)))
98 return -EPERM; 98 return -EPERM;
99 if (unlikely(segment_eq(get_fs(), KERNEL_DS))) 99 if (unlikely(uaccess_kernel()))
100 return -EPERM; 100 return -EPERM;
101 if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) 101 if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
102 return -EPERM; 102 return -EPERM;
@@ -501,16 +501,11 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
501 return true; 501 return true;
502} 502}
503 503
504static const struct bpf_verifier_ops kprobe_prog_ops = { 504const struct bpf_verifier_ops kprobe_prog_ops = {
505 .get_func_proto = kprobe_prog_func_proto, 505 .get_func_proto = kprobe_prog_func_proto,
506 .is_valid_access = kprobe_prog_is_valid_access, 506 .is_valid_access = kprobe_prog_is_valid_access,
507}; 507};
508 508
509static struct bpf_prog_type_list kprobe_tl __ro_after_init = {
510 .ops = &kprobe_prog_ops,
511 .type = BPF_PROG_TYPE_KPROBE,
512};
513
514BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, 509BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
515 u64, flags, void *, data, u64, size) 510 u64, flags, void *, data, u64, size)
516{ 511{
@@ -584,16 +579,11 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type
584 return true; 579 return true;
585} 580}
586 581
587static const struct bpf_verifier_ops tracepoint_prog_ops = { 582const struct bpf_verifier_ops tracepoint_prog_ops = {
588 .get_func_proto = tp_prog_func_proto, 583 .get_func_proto = tp_prog_func_proto,
589 .is_valid_access = tp_prog_is_valid_access, 584 .is_valid_access = tp_prog_is_valid_access,
590}; 585};
591 586
592static struct bpf_prog_type_list tracepoint_tl __ro_after_init = {
593 .ops = &tracepoint_prog_ops,
594 .type = BPF_PROG_TYPE_TRACEPOINT,
595};
596
597static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, 587static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
598 enum bpf_reg_type *reg_type) 588 enum bpf_reg_type *reg_type)
599{ 589{
@@ -642,22 +632,8 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
642 return insn - insn_buf; 632 return insn - insn_buf;
643} 633}
644 634
645static const struct bpf_verifier_ops perf_event_prog_ops = { 635const struct bpf_verifier_ops perf_event_prog_ops = {
646 .get_func_proto = tp_prog_func_proto, 636 .get_func_proto = tp_prog_func_proto,
647 .is_valid_access = pe_prog_is_valid_access, 637 .is_valid_access = pe_prog_is_valid_access,
648 .convert_ctx_access = pe_prog_convert_ctx_access, 638 .convert_ctx_access = pe_prog_convert_ctx_access,
649}; 639};
650
651static struct bpf_prog_type_list perf_event_tl __ro_after_init = {
652 .ops = &perf_event_prog_ops,
653 .type = BPF_PROG_TYPE_PERF_EVENT,
654};
655
656static int __init register_kprobe_prog_ops(void)
657{
658 bpf_register_prog_type(&kprobe_tl);
659 bpf_register_prog_type(&tracepoint_tl);
660 bpf_register_prog_type(&perf_event_tl);
661 return 0;
662}
663late_initcall(register_kprobe_prog_ops);
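
With the bpf_prog_type_list structures and the late_initcall gone, kprobe_prog_ops, tracepoint_prog_ops and perf_event_prog_ops are exported so the BPF core can pick them up from a central table instead of run-time registration. A hedged sketch of what such a table plausibly looks like; the header name, macro shape and array name are assumptions, not shown in this diff:

	/* In a shared list header (assumed): one line per program type. */
	BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE,     kprobe_prog_ops)
	BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint_prog_ops)
	BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event_prog_ops)

	/* In the BPF core (assumed): expand the list into a lookup array. */
	static const struct bpf_verifier_ops * const bpf_prog_types[] = {
	#define BPF_PROG_TYPE(_id, _ops) [_id] = &_ops,
	#include <linux/bpf_types.h>
	#undef BPF_PROG_TYPE
	};
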
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index dd3e91d68dc7..74fdfe9ed3db 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -36,6 +36,7 @@
36 36
37#include <trace/events/sched.h> 37#include <trace/events/sched.h>
38 38
39#include <asm/sections.h>
39#include <asm/setup.h> 40#include <asm/setup.h>
40 41
41#include "trace_output.h" 42#include "trace_output.h"
@@ -1095,22 +1096,20 @@ static bool update_all_ops;
1095# error Dynamic ftrace depends on MCOUNT_RECORD 1096# error Dynamic ftrace depends on MCOUNT_RECORD
1096#endif 1097#endif
1097 1098
1098static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
1099
1100struct ftrace_func_probe {
1101 struct hlist_node node;
1102 struct ftrace_probe_ops *ops;
1103 unsigned long flags;
1104 unsigned long ip;
1105 void *data;
1106 struct list_head free_list;
1107};
1108
1109struct ftrace_func_entry { 1099struct ftrace_func_entry {
1110 struct hlist_node hlist; 1100 struct hlist_node hlist;
1111 unsigned long ip; 1101 unsigned long ip;
1112}; 1102};
1113 1103
1104struct ftrace_func_probe {
1105 struct ftrace_probe_ops *probe_ops;
1106 struct ftrace_ops ops;
1107 struct trace_array *tr;
1108 struct list_head list;
1109 void *data;
1110 int ref;
1111};
1112
1114/* 1113/*
1115 * We make these constant because no one should touch them, 1114 * We make these constant because no one should touch them,
1116 * but they are used as the default "empty hash", to avoid allocating 1115 * but they are used as the default "empty hash", to avoid allocating
@@ -1271,7 +1270,7 @@ static void
1271remove_hash_entry(struct ftrace_hash *hash, 1270remove_hash_entry(struct ftrace_hash *hash,
1272 struct ftrace_func_entry *entry) 1271 struct ftrace_func_entry *entry)
1273{ 1272{
1274 hlist_del(&entry->hlist); 1273 hlist_del_rcu(&entry->hlist);
1275 hash->count--; 1274 hash->count--;
1276} 1275}
1277 1276
@@ -2807,18 +2806,28 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2807 * callers are done before leaving this function. 2806 * callers are done before leaving this function.
2808 * The same goes for freeing the per_cpu data of the per_cpu 2807 * The same goes for freeing the per_cpu data of the per_cpu
2809 * ops. 2808 * ops.
2810 *
2811 * Again, normal synchronize_sched() is not good enough.
2812 * We need to do a hard force of sched synchronization.
2813 * This is because we use preempt_disable() to do RCU, but
2814 * the function tracers can be called where RCU is not watching
2815 * (like before user_exit()). We can not rely on the RCU
2816 * infrastructure to do the synchronization, thus we must do it
2817 * ourselves.
2818 */ 2809 */
2819 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { 2810 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
2811 /*
2812 * We need to do a hard force of sched synchronization.
2813 * This is because we use preempt_disable() to do RCU, but
2814 * the function tracers can be called where RCU is not watching
2815 * (like before user_exit()). We can not rely on the RCU
2816 * infrastructure to do the synchronization, thus we must do it
2817 * ourselves.
2818 */
2820 schedule_on_each_cpu(ftrace_sync); 2819 schedule_on_each_cpu(ftrace_sync);
2821 2820
2821 /*
 2822 * When the kernel is preemptive, tasks can be preempted
2823 * while on a ftrace trampoline. Just scheduling a task on
2824 * a CPU is not good enough to flush them. Calling
 2825 * synchronize_rcu_tasks() will wait for those tasks to
2826 * execute and either schedule voluntarily or enter user space.
2827 */
2828 if (IS_ENABLED(CONFIG_PREEMPT))
2829 synchronize_rcu_tasks();
2830
2822 arch_ftrace_trampoline_free(ops); 2831 arch_ftrace_trampoline_free(ops);
2823 2832
2824 if (ops->flags & FTRACE_OPS_FL_PER_CPU) 2833 if (ops->flags & FTRACE_OPS_FL_PER_CPU)
@@ -3055,34 +3064,63 @@ struct ftrace_iterator {
3055 struct ftrace_page *pg; 3064 struct ftrace_page *pg;
3056 struct dyn_ftrace *func; 3065 struct dyn_ftrace *func;
3057 struct ftrace_func_probe *probe; 3066 struct ftrace_func_probe *probe;
3067 struct ftrace_func_entry *probe_entry;
3058 struct trace_parser parser; 3068 struct trace_parser parser;
3059 struct ftrace_hash *hash; 3069 struct ftrace_hash *hash;
3060 struct ftrace_ops *ops; 3070 struct ftrace_ops *ops;
3061 int hidx; 3071 int pidx;
3062 int idx; 3072 int idx;
3063 unsigned flags; 3073 unsigned flags;
3064}; 3074};
3065 3075
3066static void * 3076static void *
3067t_hash_next(struct seq_file *m, loff_t *pos) 3077t_probe_next(struct seq_file *m, loff_t *pos)
3068{ 3078{
3069 struct ftrace_iterator *iter = m->private; 3079 struct ftrace_iterator *iter = m->private;
3080 struct trace_array *tr = iter->ops->private;
3081 struct list_head *func_probes;
3082 struct ftrace_hash *hash;
3083 struct list_head *next;
3070 struct hlist_node *hnd = NULL; 3084 struct hlist_node *hnd = NULL;
3071 struct hlist_head *hhd; 3085 struct hlist_head *hhd;
3086 int size;
3072 3087
3073 (*pos)++; 3088 (*pos)++;
3074 iter->pos = *pos; 3089 iter->pos = *pos;
3075 3090
3076 if (iter->probe) 3091 if (!tr)
3077 hnd = &iter->probe->node;
3078 retry:
3079 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
3080 return NULL; 3092 return NULL;
3081 3093
3082 hhd = &ftrace_func_hash[iter->hidx]; 3094 func_probes = &tr->func_probes;
3095 if (list_empty(func_probes))
3096 return NULL;
3097
3098 if (!iter->probe) {
3099 next = func_probes->next;
3100 iter->probe = list_entry(next, struct ftrace_func_probe, list);
3101 }
3102
3103 if (iter->probe_entry)
3104 hnd = &iter->probe_entry->hlist;
3105
3106 hash = iter->probe->ops.func_hash->filter_hash;
3107 size = 1 << hash->size_bits;
3108
3109 retry:
3110 if (iter->pidx >= size) {
3111 if (iter->probe->list.next == func_probes)
3112 return NULL;
3113 next = iter->probe->list.next;
3114 iter->probe = list_entry(next, struct ftrace_func_probe, list);
3115 hash = iter->probe->ops.func_hash->filter_hash;
3116 size = 1 << hash->size_bits;
3117 iter->pidx = 0;
3118 }
3119
3120 hhd = &hash->buckets[iter->pidx];
3083 3121
3084 if (hlist_empty(hhd)) { 3122 if (hlist_empty(hhd)) {
3085 iter->hidx++; 3123 iter->pidx++;
3086 hnd = NULL; 3124 hnd = NULL;
3087 goto retry; 3125 goto retry;
3088 } 3126 }
@@ -3092,7 +3130,7 @@ t_hash_next(struct seq_file *m, loff_t *pos)
3092 else { 3130 else {
3093 hnd = hnd->next; 3131 hnd = hnd->next;
3094 if (!hnd) { 3132 if (!hnd) {
3095 iter->hidx++; 3133 iter->pidx++;
3096 goto retry; 3134 goto retry;
3097 } 3135 }
3098 } 3136 }
@@ -3100,26 +3138,28 @@ t_hash_next(struct seq_file *m, loff_t *pos)
3100 if (WARN_ON_ONCE(!hnd)) 3138 if (WARN_ON_ONCE(!hnd))
3101 return NULL; 3139 return NULL;
3102 3140
3103 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); 3141 iter->probe_entry = hlist_entry(hnd, struct ftrace_func_entry, hlist);
3104 3142
3105 return iter; 3143 return iter;
3106} 3144}
3107 3145
3108static void *t_hash_start(struct seq_file *m, loff_t *pos) 3146static void *t_probe_start(struct seq_file *m, loff_t *pos)
3109{ 3147{
3110 struct ftrace_iterator *iter = m->private; 3148 struct ftrace_iterator *iter = m->private;
3111 void *p = NULL; 3149 void *p = NULL;
3112 loff_t l; 3150 loff_t l;
3113 3151
3114 if (!(iter->flags & FTRACE_ITER_DO_HASH)) 3152 if (!(iter->flags & FTRACE_ITER_DO_PROBES))
3115 return NULL; 3153 return NULL;
3116 3154
3117 if (iter->func_pos > *pos) 3155 if (iter->func_pos > *pos)
3118 return NULL; 3156 return NULL;
3119 3157
3120 iter->hidx = 0; 3158 iter->probe = NULL;
3159 iter->probe_entry = NULL;
3160 iter->pidx = 0;
3121 for (l = 0; l <= (*pos - iter->func_pos); ) { 3161 for (l = 0; l <= (*pos - iter->func_pos); ) {
3122 p = t_hash_next(m, &l); 3162 p = t_probe_next(m, &l);
3123 if (!p) 3163 if (!p)
3124 break; 3164 break;
3125 } 3165 }
@@ -3127,50 +3167,42 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
3127 return NULL; 3167 return NULL;
3128 3168
3129 /* Only set this if we have an item */ 3169 /* Only set this if we have an item */
3130 iter->flags |= FTRACE_ITER_HASH; 3170 iter->flags |= FTRACE_ITER_PROBE;
3131 3171
3132 return iter; 3172 return iter;
3133} 3173}
3134 3174
3135static int 3175static int
3136t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) 3176t_probe_show(struct seq_file *m, struct ftrace_iterator *iter)
3137{ 3177{
3138 struct ftrace_func_probe *rec; 3178 struct ftrace_func_entry *probe_entry;
3179 struct ftrace_probe_ops *probe_ops;
3180 struct ftrace_func_probe *probe;
3181
3182 probe = iter->probe;
3183 probe_entry = iter->probe_entry;
3139 3184
3140 rec = iter->probe; 3185 if (WARN_ON_ONCE(!probe || !probe_entry))
3141 if (WARN_ON_ONCE(!rec))
3142 return -EIO; 3186 return -EIO;
3143 3187
3144 if (rec->ops->print) 3188 probe_ops = probe->probe_ops;
3145 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
3146 3189
3147 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); 3190 if (probe_ops->print)
3191 return probe_ops->print(m, probe_entry->ip, probe_ops, probe->data);
3148 3192
3149 if (rec->data) 3193 seq_printf(m, "%ps:%ps\n", (void *)probe_entry->ip,
3150 seq_printf(m, ":%p", rec->data); 3194 (void *)probe_ops->func);
3151 seq_putc(m, '\n');
3152 3195
3153 return 0; 3196 return 0;
3154} 3197}
3155 3198
3156static void * 3199static void *
3157t_next(struct seq_file *m, void *v, loff_t *pos) 3200t_func_next(struct seq_file *m, loff_t *pos)
3158{ 3201{
3159 struct ftrace_iterator *iter = m->private; 3202 struct ftrace_iterator *iter = m->private;
3160 struct ftrace_ops *ops = iter->ops;
3161 struct dyn_ftrace *rec = NULL; 3203 struct dyn_ftrace *rec = NULL;
3162 3204
3163 if (unlikely(ftrace_disabled))
3164 return NULL;
3165
3166 if (iter->flags & FTRACE_ITER_HASH)
3167 return t_hash_next(m, pos);
3168
3169 (*pos)++; 3205 (*pos)++;
3170 iter->pos = iter->func_pos = *pos;
3171
3172 if (iter->flags & FTRACE_ITER_PRINTALL)
3173 return t_hash_start(m, pos);
3174 3206
3175 retry: 3207 retry:
3176 if (iter->idx >= iter->pg->index) { 3208 if (iter->idx >= iter->pg->index) {
@@ -3181,11 +3213,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
3181 } 3213 }
3182 } else { 3214 } else {
3183 rec = &iter->pg->records[iter->idx++]; 3215 rec = &iter->pg->records[iter->idx++];
3184 if (((iter->flags & FTRACE_ITER_FILTER) && 3216 if (((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) &&
3185 !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) || 3217 !ftrace_lookup_ip(iter->hash, rec->ip)) ||
3186
3187 ((iter->flags & FTRACE_ITER_NOTRACE) &&
3188 !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) ||
3189 3218
3190 ((iter->flags & FTRACE_ITER_ENABLED) && 3219 ((iter->flags & FTRACE_ITER_ENABLED) &&
3191 !(rec->flags & FTRACE_FL_ENABLED))) { 3220 !(rec->flags & FTRACE_FL_ENABLED))) {
@@ -3196,24 +3225,51 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
3196 } 3225 }
3197 3226
3198 if (!rec) 3227 if (!rec)
3199 return t_hash_start(m, pos); 3228 return NULL;
3200 3229
3230 iter->pos = iter->func_pos = *pos;
3201 iter->func = rec; 3231 iter->func = rec;
3202 3232
3203 return iter; 3233 return iter;
3204} 3234}
3205 3235
3236static void *
3237t_next(struct seq_file *m, void *v, loff_t *pos)
3238{
3239 struct ftrace_iterator *iter = m->private;
 3240 loff_t l = *pos; /* t_probe_start() must use original pos */
3241 void *ret;
3242
3243 if (unlikely(ftrace_disabled))
3244 return NULL;
3245
3246 if (iter->flags & FTRACE_ITER_PROBE)
3247 return t_probe_next(m, pos);
3248
3249 if (iter->flags & FTRACE_ITER_PRINTALL) {
3250 /* next must increment pos, and t_probe_start does not */
3251 (*pos)++;
3252 return t_probe_start(m, &l);
3253 }
3254
3255 ret = t_func_next(m, pos);
3256
3257 if (!ret)
3258 return t_probe_start(m, &l);
3259
3260 return ret;
3261}
3262
3206static void reset_iter_read(struct ftrace_iterator *iter) 3263static void reset_iter_read(struct ftrace_iterator *iter)
3207{ 3264{
3208 iter->pos = 0; 3265 iter->pos = 0;
3209 iter->func_pos = 0; 3266 iter->func_pos = 0;
3210 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); 3267 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE);
3211} 3268}
3212 3269
3213static void *t_start(struct seq_file *m, loff_t *pos) 3270static void *t_start(struct seq_file *m, loff_t *pos)
3214{ 3271{
3215 struct ftrace_iterator *iter = m->private; 3272 struct ftrace_iterator *iter = m->private;
3216 struct ftrace_ops *ops = iter->ops;
3217 void *p = NULL; 3273 void *p = NULL;
3218 loff_t l; 3274 loff_t l;
3219 3275
@@ -3233,20 +3289,19 @@ static void *t_start(struct seq_file *m, loff_t *pos)
3233 * off, we can short cut and just print out that all 3289 * off, we can short cut and just print out that all
3234 * functions are enabled. 3290 * functions are enabled.
3235 */ 3291 */
3236 if ((iter->flags & FTRACE_ITER_FILTER && 3292 if ((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) &&
3237 ftrace_hash_empty(ops->func_hash->filter_hash)) || 3293 ftrace_hash_empty(iter->hash)) {
3238 (iter->flags & FTRACE_ITER_NOTRACE && 3294 iter->func_pos = 1; /* Account for the message */
3239 ftrace_hash_empty(ops->func_hash->notrace_hash))) {
3240 if (*pos > 0) 3295 if (*pos > 0)
3241 return t_hash_start(m, pos); 3296 return t_probe_start(m, pos);
3242 iter->flags |= FTRACE_ITER_PRINTALL; 3297 iter->flags |= FTRACE_ITER_PRINTALL;
3243 /* reset in case of seek/pread */ 3298 /* reset in case of seek/pread */
3244 iter->flags &= ~FTRACE_ITER_HASH; 3299 iter->flags &= ~FTRACE_ITER_PROBE;
3245 return iter; 3300 return iter;
3246 } 3301 }
3247 3302
3248 if (iter->flags & FTRACE_ITER_HASH) 3303 if (iter->flags & FTRACE_ITER_PROBE)
3249 return t_hash_start(m, pos); 3304 return t_probe_start(m, pos);
3250 3305
3251 /* 3306 /*
3252 * Unfortunately, we need to restart at ftrace_pages_start 3307 * Unfortunately, we need to restart at ftrace_pages_start
@@ -3256,13 +3311,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
3256 iter->pg = ftrace_pages_start; 3311 iter->pg = ftrace_pages_start;
3257 iter->idx = 0; 3312 iter->idx = 0;
3258 for (l = 0; l <= *pos; ) { 3313 for (l = 0; l <= *pos; ) {
3259 p = t_next(m, p, &l); 3314 p = t_func_next(m, &l);
3260 if (!p) 3315 if (!p)
3261 break; 3316 break;
3262 } 3317 }
3263 3318
3264 if (!p) 3319 if (!p)
3265 return t_hash_start(m, pos); 3320 return t_probe_start(m, pos);
3266 3321
3267 return iter; 3322 return iter;
3268} 3323}
@@ -3293,8 +3348,8 @@ static int t_show(struct seq_file *m, void *v)
3293 struct ftrace_iterator *iter = m->private; 3348 struct ftrace_iterator *iter = m->private;
3294 struct dyn_ftrace *rec; 3349 struct dyn_ftrace *rec;
3295 3350
3296 if (iter->flags & FTRACE_ITER_HASH) 3351 if (iter->flags & FTRACE_ITER_PROBE)
3297 return t_hash_show(m, iter); 3352 return t_probe_show(m, iter);
3298 3353
3299 if (iter->flags & FTRACE_ITER_PRINTALL) { 3354 if (iter->flags & FTRACE_ITER_PRINTALL) {
3300 if (iter->flags & FTRACE_ITER_NOTRACE) 3355 if (iter->flags & FTRACE_ITER_NOTRACE)
@@ -3355,12 +3410,13 @@ ftrace_avail_open(struct inode *inode, struct file *file)
3355 return -ENODEV; 3410 return -ENODEV;
3356 3411
3357 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 3412 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
3358 if (iter) { 3413 if (!iter)
3359 iter->pg = ftrace_pages_start; 3414 return -ENOMEM;
3360 iter->ops = &global_ops; 3415
3361 } 3416 iter->pg = ftrace_pages_start;
3417 iter->ops = &global_ops;
3362 3418
3363 return iter ? 0 : -ENOMEM; 3419 return 0;
3364} 3420}
3365 3421
3366static int 3422static int
@@ -3369,13 +3425,14 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
3369 struct ftrace_iterator *iter; 3425 struct ftrace_iterator *iter;
3370 3426
3371 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 3427 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
3372 if (iter) { 3428 if (!iter)
3373 iter->pg = ftrace_pages_start; 3429 return -ENOMEM;
3374 iter->flags = FTRACE_ITER_ENABLED; 3430
3375 iter->ops = &global_ops; 3431 iter->pg = ftrace_pages_start;
3376 } 3432 iter->flags = FTRACE_ITER_ENABLED;
3433 iter->ops = &global_ops;
3377 3434
3378 return iter ? 0 : -ENOMEM; 3435 return 0;
3379} 3436}
3380 3437
3381/** 3438/**
@@ -3440,7 +3497,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
3440 ret = -ENOMEM; 3497 ret = -ENOMEM;
3441 goto out_unlock; 3498 goto out_unlock;
3442 } 3499 }
3443 } 3500 } else
3501 iter->hash = hash;
3444 3502
3445 if (file->f_mode & FMODE_READ) { 3503 if (file->f_mode & FMODE_READ) {
3446 iter->pg = ftrace_pages_start; 3504 iter->pg = ftrace_pages_start;
@@ -3470,7 +3528,7 @@ ftrace_filter_open(struct inode *inode, struct file *file)
3470 struct ftrace_ops *ops = inode->i_private; 3528 struct ftrace_ops *ops = inode->i_private;
3471 3529
3472 return ftrace_regex_open(ops, 3530 return ftrace_regex_open(ops,
3473 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, 3531 FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES,
3474 inode, file); 3532 inode, file);
3475} 3533}
3476 3534
@@ -3573,22 +3631,20 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
3573 /* blank module name to match all modules */ 3631 /* blank module name to match all modules */
3574 if (!mod_g->len) { 3632 if (!mod_g->len) {
3575 /* blank module globbing: modname xor exclude_mod */ 3633 /* blank module globbing: modname xor exclude_mod */
3576 if ((!exclude_mod) != (!modname)) 3634 if (!exclude_mod != !modname)
3577 goto func_match; 3635 goto func_match;
3578 return 0; 3636 return 0;
3579 } 3637 }
3580 3638
3581 /* not matching the module */ 3639 /*
3582 if (!modname || !mod_matches) { 3640 * exclude_mod is set to trace everything but the given
3583 if (exclude_mod) 3641 * module. If it is set and the module matches, then
 3584 goto func_match; 3642 * return 0. If it is not set and the module doesn't match,
3585 else 3643 * also return 0. Otherwise, check the function to see if
3586 return 0; 3644 * that matches.
3587 } 3645 */
3588 3646 if (!mod_matches == !exclude_mod)
3589 if (mod_matches && exclude_mod)
3590 return 0; 3647 return 0;
3591
3592func_match: 3648func_match:
3593 /* blank search means to match all funcs in the mod */ 3649 /* blank search means to match all funcs in the mod */
3594 if (!func_g->len) 3650 if (!func_g->len)
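
The collapsed module check can be read as an XOR: the record is skipped (return 0) exactly when the truth values of mod_matches and exclude_mod agree. Spelled out:

	exclude_mod  mod_matches  result
	0            0            return 0  (record lives in some other module)
	0            1            fall through to func_match
	1            0            fall through to func_match
	1            1            return 0  (record lives in the excluded module)
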
@@ -3654,6 +3710,56 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
3654 return match_records(hash, buff, len, NULL); 3710 return match_records(hash, buff, len, NULL);
3655} 3711}
3656 3712
3713static void ftrace_ops_update_code(struct ftrace_ops *ops,
3714 struct ftrace_ops_hash *old_hash)
3715{
3716 struct ftrace_ops *op;
3717
3718 if (!ftrace_enabled)
3719 return;
3720
3721 if (ops->flags & FTRACE_OPS_FL_ENABLED) {
3722 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
3723 return;
3724 }
3725
3726 /*
3727 * If this is the shared global_ops filter, then we need to
 3728 * check if there is another ops that shares it and is enabled.
3729 * If so, we still need to run the modify code.
3730 */
3731 if (ops->func_hash != &global_ops.local_hash)
3732 return;
3733
3734 do_for_each_ftrace_op(op, ftrace_ops_list) {
3735 if (op->func_hash == &global_ops.local_hash &&
3736 op->flags & FTRACE_OPS_FL_ENABLED) {
3737 ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash);
3738 /* Only need to do this once */
3739 return;
3740 }
3741 } while_for_each_ftrace_op(op);
3742}
3743
3744static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
3745 struct ftrace_hash **orig_hash,
3746 struct ftrace_hash *hash,
3747 int enable)
3748{
3749 struct ftrace_ops_hash old_hash_ops;
3750 struct ftrace_hash *old_hash;
3751 int ret;
3752
3753 old_hash = *orig_hash;
3754 old_hash_ops.filter_hash = ops->func_hash->filter_hash;
3755 old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
3756 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3757 if (!ret) {
3758 ftrace_ops_update_code(ops, &old_hash_ops);
3759 free_ftrace_hash_rcu(old_hash);
3760 }
3761 return ret;
3762}
3657 3763
3658/* 3764/*
3659 * We register the module command as a template to show others how 3765 * We register the module command as a template to show others how
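
ftrace_hash_move_and_update_ops() bundles the old-hash snapshot, the hash move, the FTRACE_UPDATE_CALLS pass and the RCU free into one helper, so callers later in this patch can presumably shrink to a single call under ftrace_lock. A sketch of such a caller; the wrapper name is illustrative:

	static int example_apply_filter(struct ftrace_ops *ops,
					struct ftrace_hash *hash, int enable)
	{
		struct ftrace_hash **orig_hash = enable ?
			&ops->func_hash->filter_hash :
			&ops->func_hash->notrace_hash;
		int ret;

		mutex_lock(&ftrace_lock);
		ret = ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable);
		mutex_unlock(&ftrace_lock);

		return ret;
	}
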
@@ -3661,7 +3767,7 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
3661 */ 3767 */
3662 3768
3663static int 3769static int
3664ftrace_mod_callback(struct ftrace_hash *hash, 3770ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
3665 char *func, char *cmd, char *module, int enable) 3771 char *func, char *cmd, char *module, int enable)
3666{ 3772{
3667 int ret; 3773 int ret;
@@ -3695,16 +3801,11 @@ core_initcall(ftrace_mod_cmd_init);
3695static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, 3801static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
3696 struct ftrace_ops *op, struct pt_regs *pt_regs) 3802 struct ftrace_ops *op, struct pt_regs *pt_regs)
3697{ 3803{
3698 struct ftrace_func_probe *entry; 3804 struct ftrace_probe_ops *probe_ops;
3699 struct hlist_head *hhd; 3805 struct ftrace_func_probe *probe;
3700 unsigned long key;
3701 3806
3702 key = hash_long(ip, FTRACE_HASH_BITS); 3807 probe = container_of(op, struct ftrace_func_probe, ops);
3703 3808 probe_ops = probe->probe_ops;
3704 hhd = &ftrace_func_hash[key];
3705
3706 if (hlist_empty(hhd))
3707 return;
3708 3809
3709 /* 3810 /*
3710 * Disable preemption for these calls to prevent a RCU grace 3811 * Disable preemption for these calls to prevent a RCU grace
@@ -3712,213 +3813,340 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
3712 * on the hash. rcu_read_lock is too dangerous here. 3813 * on the hash. rcu_read_lock is too dangerous here.
3713 */ 3814 */
3714 preempt_disable_notrace(); 3815 preempt_disable_notrace();
3715 hlist_for_each_entry_rcu_notrace(entry, hhd, node) { 3816 probe_ops->func(ip, parent_ip, probe->tr, probe_ops, probe->data);
3716 if (entry->ip == ip)
3717 entry->ops->func(ip, parent_ip, &entry->data);
3718 }
3719 preempt_enable_notrace(); 3817 preempt_enable_notrace();
3720} 3818}
3721 3819
3722static struct ftrace_ops trace_probe_ops __read_mostly = 3820struct ftrace_func_map {
3723{ 3821 struct ftrace_func_entry entry;
3724 .func = function_trace_probe_call, 3822 void *data;
3725 .flags = FTRACE_OPS_FL_INITIALIZED,
3726 INIT_OPS_HASH(trace_probe_ops)
3727}; 3823};
3728 3824
3729static int ftrace_probe_registered; 3825struct ftrace_func_mapper {
3826 struct ftrace_hash hash;
3827};
3730 3828
3731static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) 3829/**
3830 * allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper
3831 *
3832 * Returns a ftrace_func_mapper descriptor that can be used to map ips to data.
3833 */
3834struct ftrace_func_mapper *allocate_ftrace_func_mapper(void)
3732{ 3835{
3733 int ret; 3836 struct ftrace_hash *hash;
3734 int i;
3735 3837
3736 if (ftrace_probe_registered) { 3838 /*
3737 /* still need to update the function call sites */ 3839 * The mapper is simply a ftrace_hash, but since the entries
3738 if (ftrace_enabled) 3840 * in the hash are not ftrace_func_entry type, we define it
3739 ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, 3841 * as a separate structure.
3740 old_hash); 3842 */
3741 return; 3843 hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
3742 } 3844 return (struct ftrace_func_mapper *)hash;
3845}
3743 3846
3744 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3847/**
3745 struct hlist_head *hhd = &ftrace_func_hash[i]; 3848 * ftrace_func_mapper_find_ip - Find some data mapped to an ip
3746 if (hhd->first) 3849 * @mapper: The mapper that has the ip maps
3747 break; 3850 * @ip: the instruction pointer to find the data for
3748 } 3851 *
 3749 /* Nothing registered? */ 3852 * Returns the data mapped to @ip if found, otherwise NULL. The return
3750 if (i == FTRACE_FUNC_HASHSIZE) 3853 * is actually the address of the mapper data pointer. The address is
3751 return; 3854 * returned for use cases where the data is no bigger than a long, and
3855 * the user can use the data pointer as its data instead of having to
3856 * allocate more memory for the reference.
3857 */
3858void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper,
3859 unsigned long ip)
3860{
3861 struct ftrace_func_entry *entry;
3862 struct ftrace_func_map *map;
3752 3863
3753 ret = ftrace_startup(&trace_probe_ops, 0); 3864 entry = ftrace_lookup_ip(&mapper->hash, ip);
3865 if (!entry)
3866 return NULL;
3754 3867
3755 ftrace_probe_registered = 1; 3868 map = (struct ftrace_func_map *)entry;
3869 return &map->data;
3756} 3870}
3757 3871
3758static bool __disable_ftrace_function_probe(void) 3872/**
3873 * ftrace_func_mapper_add_ip - Map some data to an ip
3874 * @mapper: The mapper that has the ip maps
3875 * @ip: The instruction pointer address to map @data to
3876 * @data: The data to map to @ip
3877 *
 3878 * Returns 0 on success, otherwise an error.
3879 */
3880int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
3881 unsigned long ip, void *data)
3759{ 3882{
3760 int i; 3883 struct ftrace_func_entry *entry;
3884 struct ftrace_func_map *map;
3761 3885
3762 if (!ftrace_probe_registered) 3886 entry = ftrace_lookup_ip(&mapper->hash, ip);
3763 return false; 3887 if (entry)
3888 return -EBUSY;
3764 3889
3765 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3890 map = kmalloc(sizeof(*map), GFP_KERNEL);
3766 struct hlist_head *hhd = &ftrace_func_hash[i]; 3891 if (!map)
3767 if (hhd->first) 3892 return -ENOMEM;
3768 return false;
3769 }
3770 3893
3771 /* no more funcs left */ 3894 map->entry.ip = ip;
3772 ftrace_shutdown(&trace_probe_ops, 0); 3895 map->data = data;
3773 3896
3774 ftrace_probe_registered = 0; 3897 __add_hash_entry(&mapper->hash, &map->entry);
3775 return true;
3776}
3777 3898
3899 return 0;
3900}
3778 3901
3779static void ftrace_free_entry(struct ftrace_func_probe *entry) 3902/**
3903 * ftrace_func_mapper_remove_ip - Remove an ip from the mapping
3904 * @mapper: The mapper that has the ip maps
3905 * @ip: The instruction pointer address to remove the data from
3906 *
3907 * Returns the data if it is found, otherwise NULL.
 3908 * Note, if the data pointer is used as the data itself (see
 3909 * ftrace_func_mapper_find_ip()), then the return value may be meaningless
 3910 * if the data pointer was set to zero.
3911 */
3912void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper,
3913 unsigned long ip)
3780{ 3914{
3781 if (entry->ops->free) 3915 struct ftrace_func_entry *entry;
3782 entry->ops->free(entry->ops, entry->ip, &entry->data); 3916 struct ftrace_func_map *map;
3917 void *data;
3918
3919 entry = ftrace_lookup_ip(&mapper->hash, ip);
3920 if (!entry)
3921 return NULL;
3922
3923 map = (struct ftrace_func_map *)entry;
3924 data = map->data;
3925
3926 remove_hash_entry(&mapper->hash, entry);
3783 kfree(entry); 3927 kfree(entry);
3928
3929 return data;
3930}
3931
3932/**
3933 * free_ftrace_func_mapper - free a mapping of ips and data
3934 * @mapper: The mapper that has the ip maps
3935 * @free_func: A function to be called on each data item.
3936 *
3937 * This is used to free the function mapper. The @free_func is optional
3938 * and can be used if the data needs to be freed as well.
3939 */
3940void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
3941 ftrace_mapper_func free_func)
3942{
3943 struct ftrace_func_entry *entry;
3944 struct ftrace_func_map *map;
3945 struct hlist_head *hhd;
3946 int size = 1 << mapper->hash.size_bits;
3947 int i;
3948
3949 if (free_func && mapper->hash.count) {
3950 for (i = 0; i < size; i++) {
3951 hhd = &mapper->hash.buckets[i];
3952 hlist_for_each_entry(entry, hhd, hlist) {
3953 map = (struct ftrace_func_map *)entry;
3954 free_func(map);
3955 }
3956 }
3957 }
3958 free_ftrace_hash(&mapper->hash);
3959}
3960
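
The kernel-doc above documents the mapper calls individually; a hedged sketch of how they combine, keeping a per-ip hit count directly in the mapped pointer slot (small enough to fit in a long, as the ftrace_func_mapper_find_ip() comment suggests). The example_* names and the calling context are invented for illustration:

	static struct ftrace_func_mapper *example_mapper;

	static void example_count_hit(unsigned long ip)
	{
		void **data;

		data = ftrace_func_mapper_find_ip(example_mapper, ip);
		if (!data) {
			/* first hit on this ip: map a count of 1 to it */
			ftrace_func_mapper_add_ip(example_mapper, ip, (void *)1UL);
			return;
		}
		*data = (void *)((unsigned long)*data + 1);
	}

	static int example_init(void)
	{
		example_mapper = allocate_ftrace_func_mapper();
		return example_mapper ? 0 : -ENOMEM;
	}

	static void example_exit(void)
	{
		/* counts live in the pointer slots themselves, nothing to free */
		free_ftrace_func_mapper(example_mapper, NULL);
	}
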
3961static void release_probe(struct ftrace_func_probe *probe)
3962{
3963 struct ftrace_probe_ops *probe_ops;
3964
3965 mutex_lock(&ftrace_lock);
3966
3967 WARN_ON(probe->ref <= 0);
3968
3969 /* Subtract the ref that was used to protect this instance */
3970 probe->ref--;
3971
3972 if (!probe->ref) {
3973 probe_ops = probe->probe_ops;
3974 /*
3975 * Sending zero as ip tells probe_ops to free
3976 * the probe->data itself
3977 */
3978 if (probe_ops->free)
3979 probe_ops->free(probe_ops, probe->tr, 0, probe->data);
3980 list_del(&probe->list);
3981 kfree(probe);
3982 }
3983 mutex_unlock(&ftrace_lock);
3984}
3985
3986static void acquire_probe_locked(struct ftrace_func_probe *probe)
3987{
3988 /*
3989 * Add one ref to keep it from being freed when releasing the
3990 * ftrace_lock mutex.
3991 */
3992 probe->ref++;
3784} 3993}
3785 3994
3786int 3995int
3787register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3996register_ftrace_function_probe(char *glob, struct trace_array *tr,
3788 void *data) 3997 struct ftrace_probe_ops *probe_ops,
3998 void *data)
3789{ 3999{
3790 struct ftrace_ops_hash old_hash_ops; 4000 struct ftrace_func_entry *entry;
3791 struct ftrace_func_probe *entry; 4001 struct ftrace_func_probe *probe;
3792 struct ftrace_glob func_g; 4002 struct ftrace_hash **orig_hash;
3793 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; 4003 struct ftrace_hash *old_hash;
3794 struct ftrace_hash *old_hash = *orig_hash;
3795 struct ftrace_hash *hash; 4004 struct ftrace_hash *hash;
3796 struct ftrace_page *pg;
3797 struct dyn_ftrace *rec;
3798 int not;
3799 unsigned long key;
3800 int count = 0; 4005 int count = 0;
4006 int size;
3801 int ret; 4007 int ret;
4008 int i;
3802 4009
3803 func_g.type = filter_parse_regex(glob, strlen(glob), 4010 if (WARN_ON(!tr))
3804 &func_g.search, &not);
3805 func_g.len = strlen(func_g.search);
3806
3807 /* we do not support '!' for function probes */
3808 if (WARN_ON(not))
3809 return -EINVAL; 4011 return -EINVAL;
3810 4012
3811 mutex_lock(&trace_probe_ops.func_hash->regex_lock); 4013 /* We do not support '!' for function probes */
4014 if (WARN_ON(glob[0] == '!'))
4015 return -EINVAL;
3812 4016
3813 old_hash_ops.filter_hash = old_hash;
3814 /* Probes only have filters */
3815 old_hash_ops.notrace_hash = NULL;
3816 4017
3817 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); 4018 mutex_lock(&ftrace_lock);
3818 if (!hash) { 4019 /* Check if the probe_ops is already registered */
3819 count = -ENOMEM; 4020 list_for_each_entry(probe, &tr->func_probes, list) {
3820 goto out; 4021 if (probe->probe_ops == probe_ops)
4022 break;
3821 } 4023 }
3822 4024 if (&probe->list == &tr->func_probes) {
3823 if (unlikely(ftrace_disabled)) { 4025 probe = kzalloc(sizeof(*probe), GFP_KERNEL);
3824 count = -ENODEV; 4026 if (!probe) {
3825 goto out; 4027 mutex_unlock(&ftrace_lock);
4028 return -ENOMEM;
4029 }
4030 probe->probe_ops = probe_ops;
4031 probe->ops.func = function_trace_probe_call;
4032 probe->tr = tr;
4033 ftrace_ops_init(&probe->ops);
4034 list_add(&probe->list, &tr->func_probes);
3826 } 4035 }
3827 4036
3828 mutex_lock(&ftrace_lock); 4037 acquire_probe_locked(probe);
3829 4038
3830 do_for_each_ftrace_rec(pg, rec) { 4039 mutex_unlock(&ftrace_lock);
3831 4040
3832 if (rec->flags & FTRACE_FL_DISABLED) 4041 mutex_lock(&probe->ops.func_hash->regex_lock);
3833 continue;
3834 4042
3835 if (!ftrace_match_record(rec, &func_g, NULL, 0)) 4043 orig_hash = &probe->ops.func_hash->filter_hash;
3836 continue; 4044 old_hash = *orig_hash;
4045 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
3837 4046
3838 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 4047 ret = ftrace_match_records(hash, glob, strlen(glob));
3839 if (!entry) {
3840 /* If we did not process any, then return error */
3841 if (!count)
3842 count = -ENOMEM;
3843 goto out_unlock;
3844 }
3845 4048
3846 count++; 4049 /* Nothing found? */
4050 if (!ret)
4051 ret = -EINVAL;
3847 4052
3848 entry->data = data; 4053 if (ret < 0)
4054 goto out;
3849 4055
3850 /* 4056 size = 1 << hash->size_bits;
3851 * The caller might want to do something special 4057 for (i = 0; i < size; i++) {
3852 * for each function we find. We call the callback 4058 hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
3853 * to give the caller an opportunity to do so. 4059 if (ftrace_lookup_ip(old_hash, entry->ip))
3854 */
3855 if (ops->init) {
3856 if (ops->init(ops, rec->ip, &entry->data) < 0) {
3857 /* caller does not like this func */
3858 kfree(entry);
3859 continue; 4060 continue;
4061 /*
4062 * The caller might want to do something special
4063 * for each function we find. We call the callback
4064 * to give the caller an opportunity to do so.
4065 */
4066 if (probe_ops->init) {
4067 ret = probe_ops->init(probe_ops, tr,
4068 entry->ip, data,
4069 &probe->data);
4070 if (ret < 0) {
4071 if (probe_ops->free && count)
4072 probe_ops->free(probe_ops, tr,
4073 0, probe->data);
4074 probe->data = NULL;
4075 goto out;
4076 }
3860 } 4077 }
4078 count++;
3861 } 4079 }
4080 }
3862 4081
3863 ret = enter_record(hash, rec, 0); 4082 mutex_lock(&ftrace_lock);
3864 if (ret < 0) {
3865 kfree(entry);
3866 count = ret;
3867 goto out_unlock;
3868 }
3869
3870 entry->ops = ops;
3871 entry->ip = rec->ip;
3872
3873 key = hash_long(entry->ip, FTRACE_HASH_BITS);
3874 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
3875 4083
3876 } while_for_each_ftrace_rec(); 4084 if (!count) {
4085 /* Nothing was added? */
4086 ret = -EINVAL;
4087 goto out_unlock;
4088 }
3877 4089
3878 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 4090 ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash,
4091 hash, 1);
4092 if (ret < 0)
4093 goto err_unlock;
3879 4094
3880 __enable_ftrace_function_probe(&old_hash_ops); 4095 /* One ref for each new function traced */
4096 probe->ref += count;
3881 4097
3882 if (!ret) 4098 if (!(probe->ops.flags & FTRACE_OPS_FL_ENABLED))
3883 free_ftrace_hash_rcu(old_hash); 4099 ret = ftrace_startup(&probe->ops, 0);
3884 else
3885 count = ret;
3886 4100
3887 out_unlock: 4101 out_unlock:
3888 mutex_unlock(&ftrace_lock); 4102 mutex_unlock(&ftrace_lock);
4103
4104 if (!ret)
4105 ret = count;
3889 out: 4106 out:
3890 mutex_unlock(&trace_probe_ops.func_hash->regex_lock); 4107 mutex_unlock(&probe->ops.func_hash->regex_lock);
3891 free_ftrace_hash(hash); 4108 free_ftrace_hash(hash);
3892 4109
3893 return count; 4110 release_probe(probe);
3894}
3895 4111
3896enum { 4112 return ret;
3897 PROBE_TEST_FUNC = 1,
3898 PROBE_TEST_DATA = 2
3899};
3900 4113
3901static void 4114 err_unlock:
3902__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 4115 if (!probe_ops->free || !count)
3903 void *data, int flags) 4116 goto out_unlock;
4117
4118 /* Failed to do the move, need to call the free functions */
4119 for (i = 0; i < size; i++) {
4120 hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
4121 if (ftrace_lookup_ip(old_hash, entry->ip))
4122 continue;
4123 probe_ops->free(probe_ops, tr, entry->ip, probe->data);
4124 }
4125 }
4126 goto out_unlock;
4127}
4128
4129int
4130unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
4131 struct ftrace_probe_ops *probe_ops)
3904{ 4132{
3905 struct ftrace_ops_hash old_hash_ops; 4133 struct ftrace_ops_hash old_hash_ops;
3906 struct ftrace_func_entry *rec_entry; 4134 struct ftrace_func_entry *entry;
3907 struct ftrace_func_probe *entry; 4135 struct ftrace_func_probe *probe;
3908 struct ftrace_func_probe *p;
3909 struct ftrace_glob func_g; 4136 struct ftrace_glob func_g;
3910 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; 4137 struct ftrace_hash **orig_hash;
3911 struct ftrace_hash *old_hash = *orig_hash; 4138 struct ftrace_hash *old_hash;
3912 struct list_head free_list; 4139 struct ftrace_hash *hash = NULL;
3913 struct ftrace_hash *hash;
3914 struct hlist_node *tmp; 4140 struct hlist_node *tmp;
4141 struct hlist_head hhd;
3915 char str[KSYM_SYMBOL_LEN]; 4142 char str[KSYM_SYMBOL_LEN];
3916 int i, ret; 4143 int count = 0;
3917 bool disabled; 4144 int i, ret = -ENODEV;
4145 int size;
3918 4146
3919 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) 4147 if (!glob || !strlen(glob) || !strcmp(glob, "*"))
3920 func_g.search = NULL; 4148 func_g.search = NULL;
3921 else if (glob) { 4149 else {
3922 int not; 4150 int not;
3923 4151
3924 func_g.type = filter_parse_regex(glob, strlen(glob), 4152 func_g.type = filter_parse_regex(glob, strlen(glob),
@@ -3928,95 +4156,112 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3928 4156
3929 /* we do not support '!' for function probes */ 4157 /* we do not support '!' for function probes */
3930 if (WARN_ON(not)) 4158 if (WARN_ON(not))
3931 return; 4159 return -EINVAL;
3932 } 4160 }
3933 4161
3934 mutex_lock(&trace_probe_ops.func_hash->regex_lock); 4162 mutex_lock(&ftrace_lock);
4163 /* Check if the probe_ops is already registered */
4164 list_for_each_entry(probe, &tr->func_probes, list) {
4165 if (probe->probe_ops == probe_ops)
4166 break;
4167 }
4168 if (&probe->list == &tr->func_probes)
4169 goto err_unlock_ftrace;
4170
4171 ret = -EINVAL;
4172 if (!(probe->ops.flags & FTRACE_OPS_FL_INITIALIZED))
4173 goto err_unlock_ftrace;
4174
4175 acquire_probe_locked(probe);
4176
4177 mutex_unlock(&ftrace_lock);
4178
4179 mutex_lock(&probe->ops.func_hash->regex_lock);
4180
4181 orig_hash = &probe->ops.func_hash->filter_hash;
4182 old_hash = *orig_hash;
4183
4184 if (ftrace_hash_empty(old_hash))
4185 goto out_unlock;
3935 4186
3936 old_hash_ops.filter_hash = old_hash; 4187 old_hash_ops.filter_hash = old_hash;
3937 /* Probes only have filters */ 4188 /* Probes only have filters */
3938 old_hash_ops.notrace_hash = NULL; 4189 old_hash_ops.notrace_hash = NULL;
3939 4190
3940 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); 4191 ret = -ENOMEM;
4192 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
3941 if (!hash) 4193 if (!hash)
3942 /* Hmm, should report this somehow */
3943 goto out_unlock; 4194 goto out_unlock;
3944 4195
3945 INIT_LIST_HEAD(&free_list); 4196 INIT_HLIST_HEAD(&hhd);
3946
3947 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
3948 struct hlist_head *hhd = &ftrace_func_hash[i];
3949 4197
3950 hlist_for_each_entry_safe(entry, tmp, hhd, node) { 4198 size = 1 << hash->size_bits;
3951 4199 for (i = 0; i < size; i++) {
3952 /* break up if statements for readability */ 4200 hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) {
3953 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
3954 continue;
3955
3956 if ((flags & PROBE_TEST_DATA) && entry->data != data)
3957 continue;
3958 4201
3959 /* do this last, since it is the most expensive */
3960 if (func_g.search) { 4202 if (func_g.search) {
3961 kallsyms_lookup(entry->ip, NULL, NULL, 4203 kallsyms_lookup(entry->ip, NULL, NULL,
3962 NULL, str); 4204 NULL, str);
3963 if (!ftrace_match(str, &func_g)) 4205 if (!ftrace_match(str, &func_g))
3964 continue; 4206 continue;
3965 } 4207 }
3966 4208 count++;
3967 rec_entry = ftrace_lookup_ip(hash, entry->ip); 4209 remove_hash_entry(hash, entry);
3968 /* It is possible more than one entry had this ip */ 4210 hlist_add_head(&entry->hlist, &hhd);
3969 if (rec_entry)
3970 free_hash_entry(hash, rec_entry);
3971
3972 hlist_del_rcu(&entry->node);
3973 list_add(&entry->free_list, &free_list);
3974 } 4211 }
3975 } 4212 }
4213
4214 /* Nothing found? */
4215 if (!count) {
4216 ret = -EINVAL;
4217 goto out_unlock;
4218 }
4219
3976 mutex_lock(&ftrace_lock); 4220 mutex_lock(&ftrace_lock);
3977 disabled = __disable_ftrace_function_probe(); 4221
3978 /* 4222 WARN_ON(probe->ref < count);
3979 * Remove after the disable is called. Otherwise, if the last 4223
3980 * probe is removed, a null hash means *all enabled*. 4224 probe->ref -= count;
3981 */ 4225
3982 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 4226 if (ftrace_hash_empty(hash))
4227 ftrace_shutdown(&probe->ops, 0);
4228
4229 ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash,
4230 hash, 1);
3983 4231
3984 /* still need to update the function call sites */ 4232 /* still need to update the function call sites */
3985 if (ftrace_enabled && !disabled) 4233 if (ftrace_enabled && !ftrace_hash_empty(hash))
3986 ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, 4234 ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS,
3987 &old_hash_ops); 4235 &old_hash_ops);
3988 synchronize_sched(); 4236 synchronize_sched();
3989 if (!ret)
3990 free_ftrace_hash_rcu(old_hash);
3991 4237
3992 list_for_each_entry_safe(entry, p, &free_list, free_list) { 4238 hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) {
3993 list_del(&entry->free_list); 4239 hlist_del(&entry->hlist);
3994 ftrace_free_entry(entry); 4240 if (probe_ops->free)
4241 probe_ops->free(probe_ops, tr, entry->ip, probe->data);
4242 kfree(entry);
3995 } 4243 }
3996 mutex_unlock(&ftrace_lock); 4244 mutex_unlock(&ftrace_lock);
3997 4245
3998 out_unlock: 4246 out_unlock:
3999 mutex_unlock(&trace_probe_ops.func_hash->regex_lock); 4247 mutex_unlock(&probe->ops.func_hash->regex_lock);
4000 free_ftrace_hash(hash); 4248 free_ftrace_hash(hash);
4001}
4002 4249
4003void 4250 release_probe(probe);
4004unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
4005 void *data)
4006{
4007 __unregister_ftrace_function_probe(glob, ops, data,
4008 PROBE_TEST_FUNC | PROBE_TEST_DATA);
4009}
4010 4251
4011void 4252 return ret;
4012unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) 4253
4013{ 4254 err_unlock_ftrace:
4014 __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); 4255 mutex_unlock(&ftrace_lock);
4256 return ret;
4015} 4257}
4016 4258
4017void unregister_ftrace_function_probe_all(char *glob) 4259void clear_ftrace_function_probes(struct trace_array *tr)
4018{ 4260{
4019 __unregister_ftrace_function_probe(glob, NULL, NULL, 0); 4261 struct ftrace_func_probe *probe, *n;
4262
4263 list_for_each_entry_safe(probe, n, &tr->func_probes, list)
4264 unregister_ftrace_function_probe_func(NULL, tr, probe->probe_ops);
4020} 4265}
4021 4266
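[Editor's note, not part of the patch] The hunks above move probe registration from the single global trace_probe_ops onto a per-instance list hanging off the trace_array. As orientation, a hypothetical caller of the reworked API (declared later in trace.h in this patch) would look roughly like the sketch below; the names my_probe_func/my_probe_ops/my_attach are illustrative only, and the usual kernel/trace headers are assumed to be in scope.

/* Hypothetical user of the per-instance probe API (illustrative sketch) */
static void my_probe_func(unsigned long ip, unsigned long parent_ip,
			  struct trace_array *tr,
			  struct ftrace_probe_ops *ops, void *data)
{
	/* runs on every hit of a matched function in this trace instance */
}

static struct ftrace_probe_ops my_probe_ops = {
	.func	= my_probe_func,
};

static int my_attach(struct trace_array *tr)
{
	/* returns the number of functions attached, or a negative error */
	return register_ftrace_function_probe("vfs_*", tr, &my_probe_ops, NULL);
}

static void my_detach(struct trace_array *tr)
{
	/* a NULL or "*" glob would remove every location of this probe */
	unregister_ftrace_function_probe_func("vfs_*", tr, &my_probe_ops);
}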
4022static LIST_HEAD(ftrace_commands); 4267static LIST_HEAD(ftrace_commands);
@@ -4068,9 +4313,11 @@ __init int unregister_ftrace_command(struct ftrace_func_command *cmd)
4068 return ret; 4313 return ret;
4069} 4314}
4070 4315
4071static int ftrace_process_regex(struct ftrace_hash *hash, 4316static int ftrace_process_regex(struct ftrace_iterator *iter,
4072 char *buff, int len, int enable) 4317 char *buff, int len, int enable)
4073{ 4318{
4319 struct ftrace_hash *hash = iter->hash;
4320 struct trace_array *tr = iter->ops->private;
4074 char *func, *command, *next = buff; 4321 char *func, *command, *next = buff;
4075 struct ftrace_func_command *p; 4322 struct ftrace_func_command *p;
4076 int ret = -EINVAL; 4323 int ret = -EINVAL;
@@ -4090,10 +4337,13 @@ static int ftrace_process_regex(struct ftrace_hash *hash,
4090 4337
4091 command = strsep(&next, ":"); 4338 command = strsep(&next, ":");
4092 4339
4340 if (WARN_ON_ONCE(!tr))
4341 return -EINVAL;
4342
4093 mutex_lock(&ftrace_cmd_mutex); 4343 mutex_lock(&ftrace_cmd_mutex);
4094 list_for_each_entry(p, &ftrace_commands, list) { 4344 list_for_each_entry(p, &ftrace_commands, list) {
4095 if (strcmp(p->name, command) == 0) { 4345 if (strcmp(p->name, command) == 0) {
4096 ret = p->func(hash, func, command, next, enable); 4346 ret = p->func(tr, hash, func, command, next, enable);
4097 goto out_unlock; 4347 goto out_unlock;
4098 } 4348 }
4099 } 4349 }
@@ -4130,7 +4380,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
4130 4380
4131 if (read >= 0 && trace_parser_loaded(parser) && 4381 if (read >= 0 && trace_parser_loaded(parser) &&
4132 !trace_parser_cont(parser)) { 4382 !trace_parser_cont(parser)) {
4133 ret = ftrace_process_regex(iter->hash, parser->buffer, 4383 ret = ftrace_process_regex(iter, parser->buffer,
4134 parser->idx, enable); 4384 parser->idx, enable);
4135 trace_parser_clear(parser); 4385 trace_parser_clear(parser);
4136 if (ret < 0) 4386 if (ret < 0)
@@ -4175,44 +4425,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
4175 return add_hash_entry(hash, ip); 4425 return add_hash_entry(hash, ip);
4176} 4426}
4177 4427
4178static void ftrace_ops_update_code(struct ftrace_ops *ops,
4179 struct ftrace_ops_hash *old_hash)
4180{
4181 struct ftrace_ops *op;
4182
4183 if (!ftrace_enabled)
4184 return;
4185
4186 if (ops->flags & FTRACE_OPS_FL_ENABLED) {
4187 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
4188 return;
4189 }
4190
4191 /*
4192 * If this is the shared global_ops filter, then we need to
4193 * check if there is another ops that shares it, is enabled.
4194 * If so, we still need to run the modify code.
4195 */
4196 if (ops->func_hash != &global_ops.local_hash)
4197 return;
4198
4199 do_for_each_ftrace_op(op, ftrace_ops_list) {
4200 if (op->func_hash == &global_ops.local_hash &&
4201 op->flags & FTRACE_OPS_FL_ENABLED) {
4202 ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash);
4203 /* Only need to do this once */
4204 return;
4205 }
4206 } while_for_each_ftrace_op(op);
4207}
4208
4209static int 4428static int
4210ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, 4429ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
4211 unsigned long ip, int remove, int reset, int enable) 4430 unsigned long ip, int remove, int reset, int enable)
4212{ 4431{
4213 struct ftrace_hash **orig_hash; 4432 struct ftrace_hash **orig_hash;
4214 struct ftrace_ops_hash old_hash_ops;
4215 struct ftrace_hash *old_hash;
4216 struct ftrace_hash *hash; 4433 struct ftrace_hash *hash;
4217 int ret; 4434 int ret;
4218 4435
@@ -4247,14 +4464,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
4247 } 4464 }
4248 4465
4249 mutex_lock(&ftrace_lock); 4466 mutex_lock(&ftrace_lock);
4250 old_hash = *orig_hash; 4467 ret = ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable);
4251 old_hash_ops.filter_hash = ops->func_hash->filter_hash;
4252 old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
4253 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
4254 if (!ret) {
4255 ftrace_ops_update_code(ops, &old_hash_ops);
4256 free_ftrace_hash_rcu(old_hash);
4257 }
4258 mutex_unlock(&ftrace_lock); 4468 mutex_unlock(&ftrace_lock);
4259 4469
4260 out_regex_unlock: 4470 out_regex_unlock:
@@ -4493,10 +4703,8 @@ static void __init set_ftrace_early_filters(void)
4493int ftrace_regex_release(struct inode *inode, struct file *file) 4703int ftrace_regex_release(struct inode *inode, struct file *file)
4494{ 4704{
4495 struct seq_file *m = (struct seq_file *)file->private_data; 4705 struct seq_file *m = (struct seq_file *)file->private_data;
4496 struct ftrace_ops_hash old_hash_ops;
4497 struct ftrace_iterator *iter; 4706 struct ftrace_iterator *iter;
4498 struct ftrace_hash **orig_hash; 4707 struct ftrace_hash **orig_hash;
4499 struct ftrace_hash *old_hash;
4500 struct trace_parser *parser; 4708 struct trace_parser *parser;
4501 int filter_hash; 4709 int filter_hash;
4502 int ret; 4710 int ret;
@@ -4526,16 +4734,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
4526 orig_hash = &iter->ops->func_hash->notrace_hash; 4734 orig_hash = &iter->ops->func_hash->notrace_hash;
4527 4735
4528 mutex_lock(&ftrace_lock); 4736 mutex_lock(&ftrace_lock);
4529 old_hash = *orig_hash; 4737 ret = ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
4530 old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash; 4738 iter->hash, filter_hash);
4531 old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash;
4532 ret = ftrace_hash_move(iter->ops, filter_hash,
4533 orig_hash, iter->hash);
4534 if (!ret) {
4535 ftrace_ops_update_code(iter->ops, &old_hash_ops);
4536 free_ftrace_hash_rcu(old_hash);
4537 }
4538 mutex_unlock(&ftrace_lock); 4739 mutex_unlock(&ftrace_lock);
4740 } else {
4741 /* For read only, the hash is the ops hash */
4742 iter->hash = NULL;
4539 } 4743 }
4540 4744
4541 mutex_unlock(&iter->ops->func_hash->regex_lock); 4745 mutex_unlock(&iter->ops->func_hash->regex_lock);
@@ -5274,6 +5478,50 @@ void ftrace_module_init(struct module *mod)
5274} 5478}
5275#endif /* CONFIG_MODULES */ 5479#endif /* CONFIG_MODULES */
5276 5480
5481void __init ftrace_free_init_mem(void)
5482{
5483 unsigned long start = (unsigned long)(&__init_begin);
5484 unsigned long end = (unsigned long)(&__init_end);
5485 struct ftrace_page **last_pg = &ftrace_pages_start;
5486 struct ftrace_page *pg;
5487 struct dyn_ftrace *rec;
5488 struct dyn_ftrace key;
5489 int order;
5490
5491 key.ip = start;
5492 key.flags = end; /* overload flags, as it is unsigned long */
5493
5494 mutex_lock(&ftrace_lock);
5495
5496 for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) {
5497 if (end < pg->records[0].ip ||
5498 start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
5499 continue;
5500 again:
5501 rec = bsearch(&key, pg->records, pg->index,
5502 sizeof(struct dyn_ftrace),
5503 ftrace_cmp_recs);
5504 if (!rec)
5505 continue;
5506 pg->index--;
5507 if (!pg->index) {
5508 *last_pg = pg->next;
5509 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
5510 free_pages((unsigned long)pg->records, order);
5511 kfree(pg);
5512 pg = container_of(last_pg, struct ftrace_page, next);
5513 if (!(*last_pg))
5514 ftrace_pages = pg;
5515 continue;
5516 }
5517 memmove(rec, rec + 1,
5518 (pg->index - (rec - pg->records)) * sizeof(*rec));
5519 /* More than one function may be in this block */
5520 goto again;
5521 }
5522 mutex_unlock(&ftrace_lock);
5523}
5524
5277void __init ftrace_init(void) 5525void __init ftrace_init(void)
5278{ 5526{
5279 extern unsigned long __start_mcount_loc[]; 5527 extern unsigned long __start_mcount_loc[];
@@ -5316,25 +5564,13 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
5316 5564
5317static void ftrace_update_trampoline(struct ftrace_ops *ops) 5565static void ftrace_update_trampoline(struct ftrace_ops *ops)
5318{ 5566{
5319
5320/*
5321 * Currently there's no safe way to free a trampoline when the kernel
5322 * is configured with PREEMPT. That is because a task could be preempted
5323 * when it jumped to the trampoline, it may be preempted for a long time
5324 * depending on the system load, and currently there's no way to know
5325 * when it will be off the trampoline. If the trampoline is freed
5326 * too early, when the task runs again, it will be executing on freed
5327 * memory and crash.
5328 */
5329#ifdef CONFIG_PREEMPT
5330 /* Currently, only non dynamic ops can have a trampoline */
5331 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
5332 return;
5333#endif
5334
5335 arch_ftrace_update_trampoline(ops); 5567 arch_ftrace_update_trampoline(ops);
5336} 5568}
5337 5569
5570void ftrace_init_trace_array(struct trace_array *tr)
5571{
5572 INIT_LIST_HEAD(&tr->func_probes);
5573}
5338#else 5574#else
5339 5575
5340static struct ftrace_ops global_ops = { 5576static struct ftrace_ops global_ops = {
@@ -5389,6 +5625,7 @@ __init void ftrace_init_global_array_ops(struct trace_array *tr)
5389{ 5625{
5390 tr->ops = &global_ops; 5626 tr->ops = &global_ops;
5391 tr->ops->private = tr; 5627 tr->ops->private = tr;
5628 ftrace_init_trace_array(tr);
5392} 5629}
5393 5630
5394void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) 5631void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
@@ -5543,6 +5780,43 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
5543 trace_ignore_this_task(pid_list, next)); 5780 trace_ignore_this_task(pid_list, next));
5544} 5781}
5545 5782
5783static void
5784ftrace_pid_follow_sched_process_fork(void *data,
5785 struct task_struct *self,
5786 struct task_struct *task)
5787{
5788 struct trace_pid_list *pid_list;
5789 struct trace_array *tr = data;
5790
5791 pid_list = rcu_dereference_sched(tr->function_pids);
5792 trace_filter_add_remove_task(pid_list, self, task);
5793}
5794
5795static void
5796ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task)
5797{
5798 struct trace_pid_list *pid_list;
5799 struct trace_array *tr = data;
5800
5801 pid_list = rcu_dereference_sched(tr->function_pids);
5802 trace_filter_add_remove_task(pid_list, NULL, task);
5803}
5804
5805void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
5806{
5807 if (enable) {
5808 register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
5809 tr);
5810 register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
5811 tr);
5812 } else {
5813 unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
5814 tr);
5815 unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
5816 tr);
5817 }
5818}
5819
5546static void clear_ftrace_pids(struct trace_array *tr) 5820static void clear_ftrace_pids(struct trace_array *tr)
5547{ 5821{
5548 struct trace_pid_list *pid_list; 5822 struct trace_pid_list *pid_list;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ca47a4fa2986..4ae268e687fe 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -438,6 +438,7 @@ struct ring_buffer_per_cpu {
438 raw_spinlock_t reader_lock; /* serialize readers */ 438 raw_spinlock_t reader_lock; /* serialize readers */
439 arch_spinlock_t lock; 439 arch_spinlock_t lock;
440 struct lock_class_key lock_key; 440 struct lock_class_key lock_key;
441 struct buffer_data_page *free_page;
441 unsigned long nr_pages; 442 unsigned long nr_pages;
442 unsigned int current_context; 443 unsigned int current_context;
443 struct list_head *pages; 444 struct list_head *pages;
@@ -4389,9 +4390,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
4389 */ 4390 */
4390void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) 4391void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
4391{ 4392{
4392 struct buffer_data_page *bpage; 4393 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
4394 struct buffer_data_page *bpage = NULL;
4395 unsigned long flags;
4393 struct page *page; 4396 struct page *page;
4394 4397
4398 local_irq_save(flags);
4399 arch_spin_lock(&cpu_buffer->lock);
4400
4401 if (cpu_buffer->free_page) {
4402 bpage = cpu_buffer->free_page;
4403 cpu_buffer->free_page = NULL;
4404 }
4405
4406 arch_spin_unlock(&cpu_buffer->lock);
4407 local_irq_restore(flags);
4408
4409 if (bpage)
4410 goto out;
4411
4395 page = alloc_pages_node(cpu_to_node(cpu), 4412 page = alloc_pages_node(cpu_to_node(cpu),
4396 GFP_KERNEL | __GFP_NORETRY, 0); 4413 GFP_KERNEL | __GFP_NORETRY, 0);
4397 if (!page) 4414 if (!page)
@@ -4399,6 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
4399 4416
4400 bpage = page_address(page); 4417 bpage = page_address(page);
4401 4418
4419 out:
4402 rb_init_page(bpage); 4420 rb_init_page(bpage);
4403 4421
4404 return bpage; 4422 return bpage;
@@ -4408,13 +4426,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
4408/** 4426/**
4409 * ring_buffer_free_read_page - free an allocated read page 4427 * ring_buffer_free_read_page - free an allocated read page
4410 * @buffer: the buffer the page was allocated for 4428 * @buffer: the buffer the page was allocated for
4429 * @cpu: the cpu buffer the page came from
4411 * @data: the page to free 4430 * @data: the page to free
4412 * 4431 *
4413 * Free a page allocated from ring_buffer_alloc_read_page. 4432 * Free a page allocated from ring_buffer_alloc_read_page.
4414 */ 4433 */
4415void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) 4434void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
4416{ 4435{
4417 free_page((unsigned long)data); 4436 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
4437 struct buffer_data_page *bpage = data;
4438 unsigned long flags;
4439
4440 local_irq_save(flags);
4441 arch_spin_lock(&cpu_buffer->lock);
4442
4443 if (!cpu_buffer->free_page) {
4444 cpu_buffer->free_page = bpage;
4445 bpage = NULL;
4446 }
4447
4448 arch_spin_unlock(&cpu_buffer->lock);
4449 local_irq_restore(flags);
4450
4451 free_page((unsigned long)bpage);
4418} 4452}
4419EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); 4453EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
4420 4454
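[Editor's note, not part of the patch] With the cpu argument added to the free path and the new per-CPU free_page cache, a ring-buffer consumer now follows a cycle along these lines. The wrapper below is a hypothetical sketch built only from the functions touched in this hunk.

/* Hypothetical consumer illustrating the updated read-page cycle */
static void read_one_page(struct ring_buffer *buffer, int cpu)
{
	void *page;
	int ret;

	/* may hand back the per-CPU cached free_page instead of allocating */
	page = ring_buffer_alloc_read_page(buffer, cpu);
	if (!page)
		return;

	ret = ring_buffer_read_page(buffer, &page, PAGE_SIZE, cpu, 0);
	if (ret >= 0) {
		/* ... consume the events copied into 'page' ... */
	}

	/* cpu is now required so the page can be parked on that CPU's free_page */
	ring_buffer_free_read_page(buffer, cpu, page);
}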
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index c190a4d5013c..9fbcaf567886 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -171,7 +171,7 @@ static enum event_status read_page(int cpu)
171 } 171 }
172 } 172 }
173 } 173 }
174 ring_buffer_free_read_page(buffer, bpage); 174 ring_buffer_free_read_page(buffer, cpu, bpage);
175 175
176 if (ret < 0) 176 if (ret < 0)
177 return EVENT_DROPPED; 177 return EVENT_DROPPED;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0ad75e9698f6..1122f151466f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -257,7 +257,7 @@ unsigned long long ns2usecs(u64 nsec)
257 257
258/* trace_flags that are default zero for instances */ 258/* trace_flags that are default zero for instances */
259#define ZEROED_TRACE_FLAGS \ 259#define ZEROED_TRACE_FLAGS \
260 TRACE_ITER_EVENT_FORK 260 (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK)
261 261
262/* 262/*
263 * The global_trace is the descriptor that holds the top-level tracing 263 * The global_trace is the descriptor that holds the top-level tracing
@@ -757,7 +757,7 @@ __trace_buffer_lock_reserve(struct ring_buffer *buffer,
757 return event; 757 return event;
758} 758}
759 759
760static void tracer_tracing_on(struct trace_array *tr) 760void tracer_tracing_on(struct trace_array *tr)
761{ 761{
762 if (tr->trace_buffer.buffer) 762 if (tr->trace_buffer.buffer)
763 ring_buffer_record_on(tr->trace_buffer.buffer); 763 ring_buffer_record_on(tr->trace_buffer.buffer);
@@ -894,23 +894,8 @@ int __trace_bputs(unsigned long ip, const char *str)
894EXPORT_SYMBOL_GPL(__trace_bputs); 894EXPORT_SYMBOL_GPL(__trace_bputs);
895 895
896#ifdef CONFIG_TRACER_SNAPSHOT 896#ifdef CONFIG_TRACER_SNAPSHOT
897/** 897static void tracing_snapshot_instance(struct trace_array *tr)
898 * trace_snapshot - take a snapshot of the current buffer.
899 *
900 * This causes a swap between the snapshot buffer and the current live
901 * tracing buffer. You can use this to take snapshots of the live
902 * trace when some condition is triggered, but continue to trace.
903 *
904 * Note, make sure to allocate the snapshot with either
905 * a tracing_snapshot_alloc(), or by doing it manually
906 * with: echo 1 > /sys/kernel/debug/tracing/snapshot
907 *
908 * If the snapshot buffer is not allocated, it will stop tracing.
909 * Basically making a permanent snapshot.
910 */
911void tracing_snapshot(void)
912{ 898{
913 struct trace_array *tr = &global_trace;
914 struct tracer *tracer = tr->current_trace; 899 struct tracer *tracer = tr->current_trace;
915 unsigned long flags; 900 unsigned long flags;
916 901
@@ -938,6 +923,27 @@ void tracing_snapshot(void)
938 update_max_tr(tr, current, smp_processor_id()); 923 update_max_tr(tr, current, smp_processor_id());
939 local_irq_restore(flags); 924 local_irq_restore(flags);
940} 925}
926
927/**
928 * trace_snapshot - take a snapshot of the current buffer.
929 *
930 * This causes a swap between the snapshot buffer and the current live
931 * tracing buffer. You can use this to take snapshots of the live
932 * trace when some condition is triggered, but continue to trace.
933 *
934 * Note, make sure to allocate the snapshot with either
935 * a tracing_snapshot_alloc(), or by doing it manually
936 * with: echo 1 > /sys/kernel/debug/tracing/snapshot
937 *
938 * If the snapshot buffer is not allocated, it will stop tracing.
939 * Basically making a permanent snapshot.
940 */
941void tracing_snapshot(void)
942{
943 struct trace_array *tr = &global_trace;
944
945 tracing_snapshot_instance(tr);
946}
941EXPORT_SYMBOL_GPL(tracing_snapshot); 947EXPORT_SYMBOL_GPL(tracing_snapshot);
942 948
943static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, 949static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
@@ -1039,7 +1045,7 @@ void tracing_snapshot_alloc(void)
1039EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); 1045EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
1040#endif /* CONFIG_TRACER_SNAPSHOT */ 1046#endif /* CONFIG_TRACER_SNAPSHOT */
1041 1047
1042static void tracer_tracing_off(struct trace_array *tr) 1048void tracer_tracing_off(struct trace_array *tr)
1043{ 1049{
1044 if (tr->trace_buffer.buffer) 1050 if (tr->trace_buffer.buffer)
1045 ring_buffer_record_off(tr->trace_buffer.buffer); 1051 ring_buffer_record_off(tr->trace_buffer.buffer);
@@ -1424,6 +1430,28 @@ static int wait_on_pipe(struct trace_iterator *iter, bool full)
1424} 1430}
1425 1431
1426#ifdef CONFIG_FTRACE_STARTUP_TEST 1432#ifdef CONFIG_FTRACE_STARTUP_TEST
1433static bool selftests_can_run;
1434
1435struct trace_selftests {
1436 struct list_head list;
1437 struct tracer *type;
1438};
1439
1440static LIST_HEAD(postponed_selftests);
1441
1442static int save_selftest(struct tracer *type)
1443{
1444 struct trace_selftests *selftest;
1445
1446 selftest = kmalloc(sizeof(*selftest), GFP_KERNEL);
1447 if (!selftest)
1448 return -ENOMEM;
1449
1450 selftest->type = type;
1451 list_add(&selftest->list, &postponed_selftests);
1452 return 0;
1453}
1454
1427static int run_tracer_selftest(struct tracer *type) 1455static int run_tracer_selftest(struct tracer *type)
1428{ 1456{
1429 struct trace_array *tr = &global_trace; 1457 struct trace_array *tr = &global_trace;
@@ -1434,6 +1462,14 @@ static int run_tracer_selftest(struct tracer *type)
1434 return 0; 1462 return 0;
1435 1463
1436 /* 1464 /*
1465 * If a tracer registers early in boot up (before scheduling is
1466 * initialized and such), then do not run its selftests yet.
1467 * Instead, run it a little later in the boot process.
1468 */
1469 if (!selftests_can_run)
1470 return save_selftest(type);
1471
1472 /*
1437 * Run a selftest on this tracer. 1473 * Run a selftest on this tracer.
1438 * Here we reset the trace buffer, and set the current 1474 * Here we reset the trace buffer, and set the current
1439 * tracer to be this tracer. The tracer can then run some 1475 * tracer to be this tracer. The tracer can then run some
@@ -1482,6 +1518,47 @@ static int run_tracer_selftest(struct tracer *type)
1482 printk(KERN_CONT "PASSED\n"); 1518 printk(KERN_CONT "PASSED\n");
1483 return 0; 1519 return 0;
1484} 1520}
1521
1522static __init int init_trace_selftests(void)
1523{
1524 struct trace_selftests *p, *n;
1525 struct tracer *t, **last;
1526 int ret;
1527
1528 selftests_can_run = true;
1529
1530 mutex_lock(&trace_types_lock);
1531
1532 if (list_empty(&postponed_selftests))
1533 goto out;
1534
1535 pr_info("Running postponed tracer tests:\n");
1536
1537 list_for_each_entry_safe(p, n, &postponed_selftests, list) {
1538 ret = run_tracer_selftest(p->type);
1539 /* If the test fails, then warn and remove from available_tracers */
1540 if (ret < 0) {
1541 WARN(1, "tracer: %s failed selftest, disabling\n",
1542 p->type->name);
1543 last = &trace_types;
1544 for (t = trace_types; t; t = t->next) {
1545 if (t == p->type) {
1546 *last = t->next;
1547 break;
1548 }
1549 last = &t->next;
1550 }
1551 }
1552 list_del(&p->list);
1553 kfree(p);
1554 }
1555
1556 out:
1557 mutex_unlock(&trace_types_lock);
1558
1559 return 0;
1560}
1561core_initcall(init_trace_selftests);
1485#else 1562#else
1486static inline int run_tracer_selftest(struct tracer *type) 1563static inline int run_tracer_selftest(struct tracer *type)
1487{ 1564{
@@ -1899,7 +1976,7 @@ static void __trace_find_cmdline(int pid, char comm[])
1899 1976
1900 map = savedcmd->map_pid_to_cmdline[pid]; 1977 map = savedcmd->map_pid_to_cmdline[pid];
1901 if (map != NO_CMDLINE_MAP) 1978 if (map != NO_CMDLINE_MAP)
1902 strcpy(comm, get_saved_cmdlines(map)); 1979 strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
1903 else 1980 else
1904 strcpy(comm, "<...>"); 1981 strcpy(comm, "<...>");
1905} 1982}
@@ -1927,6 +2004,18 @@ void tracing_record_cmdline(struct task_struct *tsk)
1927 __this_cpu_write(trace_cmdline_save, false); 2004 __this_cpu_write(trace_cmdline_save, false);
1928} 2005}
1929 2006
2007/*
2008 * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq
2009 * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function
2010 * simplifies those functions and keeps them in sync.
2011 */
2012enum print_line_t trace_handle_return(struct trace_seq *s)
2013{
2014 return trace_seq_has_overflowed(s) ?
2015 TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
2016}
2017EXPORT_SYMBOL_GPL(trace_handle_return);
2018
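[Editor's note, not part of the patch] The helper above centralizes the overflow check that event output callbacks used to open-code. A typical trace_event print callback would use it roughly as follows; my_event_output is a hypothetical name and the callback signature is the standard trace_event one, not something introduced by this patch.

/* Hypothetical trace_event output callback using trace_handle_return() */
static enum print_line_t my_event_output(struct trace_iterator *iter,
					 int flags, struct trace_event *event)
{
	trace_seq_printf(&iter->seq, "my_event fired\n");
	/* PARTIAL_LINE if the seq buffer overflowed, HANDLED otherwise */
	return trace_handle_return(&iter->seq);
}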
1930void 2019void
1931tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, 2020tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1932 int pc) 2021 int pc)
@@ -2479,7 +2568,36 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
2479void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 2568void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
2480 int pc) 2569 int pc)
2481{ 2570{
2482 __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); 2571 struct ring_buffer *buffer = tr->trace_buffer.buffer;
2572
2573 if (rcu_is_watching()) {
2574 __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
2575 return;
2576 }
2577
2578 /*
2579 * When an NMI triggers, RCU is enabled via rcu_nmi_enter(),
2580 * but if the above rcu_is_watching() failed, then the NMI
2581 * triggered someplace critical, and rcu_irq_enter() should
2582 * not be called from NMI.
2583 */
2584 if (unlikely(in_nmi()))
2585 return;
2586
2587 /*
2588 * It is possible that a function is being traced in a
2589 * location that RCU is not watching. A call to
2590 * rcu_irq_enter() will make sure that it is, but there are
2591 * a few internal rcu functions that could be traced
2592 * where that won't work either. In those cases, we just
2593 * do nothing.
2594 */
2595 if (unlikely(rcu_irq_enter_disabled()))
2596 return;
2597
2598 rcu_irq_enter_irqson();
2599 __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
2600 rcu_irq_exit_irqson();
2483} 2601}
2484 2602
2485/** 2603/**
@@ -3222,13 +3340,14 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
3222 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) 3340 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
3223 return; 3341 return;
3224 3342
3225 if (iter->started && cpumask_test_cpu(iter->cpu, iter->started)) 3343 if (cpumask_available(iter->started) &&
3344 cpumask_test_cpu(iter->cpu, iter->started))
3226 return; 3345 return;
3227 3346
3228 if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) 3347 if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
3229 return; 3348 return;
3230 3349
3231 if (iter->started) 3350 if (cpumask_available(iter->started))
3232 cpumask_set_cpu(iter->cpu, iter->started); 3351 cpumask_set_cpu(iter->cpu, iter->started);
3233 3352
3234 /* Don't print started cpu buffer for the first entry of the trace */ 3353 /* Don't print started cpu buffer for the first entry of the trace */
@@ -4122,6 +4241,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
4122 if (mask == TRACE_ITER_EVENT_FORK) 4241 if (mask == TRACE_ITER_EVENT_FORK)
4123 trace_event_follow_fork(tr, enabled); 4242 trace_event_follow_fork(tr, enabled);
4124 4243
4244 if (mask == TRACE_ITER_FUNC_FORK)
4245 ftrace_pid_follow_fork(tr, enabled);
4246
4125 if (mask == TRACE_ITER_OVERWRITE) { 4247 if (mask == TRACE_ITER_OVERWRITE) {
4126 ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); 4248 ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
4127#ifdef CONFIG_TRACER_MAX_TRACE 4249#ifdef CONFIG_TRACER_MAX_TRACE
@@ -4355,6 +4477,7 @@ static const char readme_msg[] =
4355 "\t -:[<group>/]<event>\n" 4477 "\t -:[<group>/]<event>\n"
4356#ifdef CONFIG_KPROBE_EVENTS 4478#ifdef CONFIG_KPROBE_EVENTS
4357 "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" 4479 "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
4480 "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n"
4358#endif 4481#endif
4359#ifdef CONFIG_UPROBE_EVENTS 4482#ifdef CONFIG_UPROBE_EVENTS
4360 "\t place: <path>:<offset>\n" 4483 "\t place: <path>:<offset>\n"
@@ -5529,7 +5652,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
5529 .partial = partial_def, 5652 .partial = partial_def,
5530 .nr_pages = 0, /* This gets updated below. */ 5653 .nr_pages = 0, /* This gets updated below. */
5531 .nr_pages_max = PIPE_DEF_BUFFERS, 5654 .nr_pages_max = PIPE_DEF_BUFFERS,
5532 .flags = flags,
5533 .ops = &tracing_pipe_buf_ops, 5655 .ops = &tracing_pipe_buf_ops,
5534 .spd_release = tracing_spd_release_pipe, 5656 .spd_release = tracing_spd_release_pipe,
5535 }; 5657 };
@@ -5962,6 +6084,7 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
5962struct ftrace_buffer_info { 6084struct ftrace_buffer_info {
5963 struct trace_iterator iter; 6085 struct trace_iterator iter;
5964 void *spare; 6086 void *spare;
6087 unsigned int spare_cpu;
5965 unsigned int read; 6088 unsigned int read;
5966}; 6089};
5967 6090
@@ -6291,9 +6414,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
6291 return -EBUSY; 6414 return -EBUSY;
6292#endif 6415#endif
6293 6416
6294 if (!info->spare) 6417 if (!info->spare) {
6295 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, 6418 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
6296 iter->cpu_file); 6419 iter->cpu_file);
6420 info->spare_cpu = iter->cpu_file;
6421 }
6297 if (!info->spare) 6422 if (!info->spare)
6298 return -ENOMEM; 6423 return -ENOMEM;
6299 6424
@@ -6353,7 +6478,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
6353 __trace_array_put(iter->tr); 6478 __trace_array_put(iter->tr);
6354 6479
6355 if (info->spare) 6480 if (info->spare)
6356 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); 6481 ring_buffer_free_read_page(iter->trace_buffer->buffer,
6482 info->spare_cpu, info->spare);
6357 kfree(info); 6483 kfree(info);
6358 6484
6359 mutex_unlock(&trace_types_lock); 6485 mutex_unlock(&trace_types_lock);
@@ -6364,6 +6490,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
6364struct buffer_ref { 6490struct buffer_ref {
6365 struct ring_buffer *buffer; 6491 struct ring_buffer *buffer;
6366 void *page; 6492 void *page;
6493 int cpu;
6367 int ref; 6494 int ref;
6368}; 6495};
6369 6496
@@ -6375,7 +6502,7 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
6375 if (--ref->ref) 6502 if (--ref->ref)
6376 return; 6503 return;
6377 6504
6378 ring_buffer_free_read_page(ref->buffer, ref->page); 6505 ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
6379 kfree(ref); 6506 kfree(ref);
6380 buf->private = 0; 6507 buf->private = 0;
6381} 6508}
@@ -6409,7 +6536,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
6409 if (--ref->ref) 6536 if (--ref->ref)
6410 return; 6537 return;
6411 6538
6412 ring_buffer_free_read_page(ref->buffer, ref->page); 6539 ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
6413 kfree(ref); 6540 kfree(ref);
6414 spd->partial[i].private = 0; 6541 spd->partial[i].private = 0;
6415} 6542}
@@ -6427,7 +6554,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
6427 .pages = pages_def, 6554 .pages = pages_def,
6428 .partial = partial_def, 6555 .partial = partial_def,
6429 .nr_pages_max = PIPE_DEF_BUFFERS, 6556 .nr_pages_max = PIPE_DEF_BUFFERS,
6430 .flags = flags,
6431 .ops = &buffer_pipe_buf_ops, 6557 .ops = &buffer_pipe_buf_ops,
6432 .spd_release = buffer_spd_release, 6558 .spd_release = buffer_spd_release,
6433 }; 6559 };
@@ -6474,11 +6600,13 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
6474 kfree(ref); 6600 kfree(ref);
6475 break; 6601 break;
6476 } 6602 }
6603 ref->cpu = iter->cpu_file;
6477 6604
6478 r = ring_buffer_read_page(ref->buffer, &ref->page, 6605 r = ring_buffer_read_page(ref->buffer, &ref->page,
6479 len, iter->cpu_file, 1); 6606 len, iter->cpu_file, 1);
6480 if (r < 0) { 6607 if (r < 0) {
6481 ring_buffer_free_read_page(ref->buffer, ref->page); 6608 ring_buffer_free_read_page(ref->buffer, ref->cpu,
6609 ref->page);
6482 kfree(ref); 6610 kfree(ref);
6483 break; 6611 break;
6484 } 6612 }
@@ -6649,43 +6777,89 @@ static const struct file_operations tracing_dyn_info_fops = {
6649 6777
6650#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) 6778#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
6651static void 6779static void
6652ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) 6780ftrace_snapshot(unsigned long ip, unsigned long parent_ip,
6781 struct trace_array *tr, struct ftrace_probe_ops *ops,
6782 void *data)
6653{ 6783{
6654 tracing_snapshot(); 6784 tracing_snapshot_instance(tr);
6655} 6785}
6656 6786
6657static void 6787static void
6658ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) 6788ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip,
6789 struct trace_array *tr, struct ftrace_probe_ops *ops,
6790 void *data)
6659{ 6791{
6660 unsigned long *count = (long *)data; 6792 struct ftrace_func_mapper *mapper = data;
6793 long *count = NULL;
6661 6794
6662 if (!*count) 6795 if (mapper)
6663 return; 6796 count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
6797
6798 if (count) {
6799
6800 if (*count <= 0)
6801 return;
6664 6802
6665 if (*count != -1)
6666 (*count)--; 6803 (*count)--;
6804 }
6667 6805
6668 tracing_snapshot(); 6806 tracing_snapshot_instance(tr);
6669} 6807}
6670 6808
6671static int 6809static int
6672ftrace_snapshot_print(struct seq_file *m, unsigned long ip, 6810ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
6673 struct ftrace_probe_ops *ops, void *data) 6811 struct ftrace_probe_ops *ops, void *data)
6674{ 6812{
6675 long count = (long)data; 6813 struct ftrace_func_mapper *mapper = data;
6814 long *count = NULL;
6676 6815
6677 seq_printf(m, "%ps:", (void *)ip); 6816 seq_printf(m, "%ps:", (void *)ip);
6678 6817
6679 seq_puts(m, "snapshot"); 6818 seq_puts(m, "snapshot");
6680 6819
6681 if (count == -1) 6820 if (mapper)
6682 seq_puts(m, ":unlimited\n"); 6821 count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
6822
6823 if (count)
6824 seq_printf(m, ":count=%ld\n", *count);
6683 else 6825 else
6684 seq_printf(m, ":count=%ld\n", count); 6826 seq_puts(m, ":unlimited\n");
6685 6827
6686 return 0; 6828 return 0;
6687} 6829}
6688 6830
6831static int
6832ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
6833 unsigned long ip, void *init_data, void **data)
6834{
6835 struct ftrace_func_mapper *mapper = *data;
6836
6837 if (!mapper) {
6838 mapper = allocate_ftrace_func_mapper();
6839 if (!mapper)
6840 return -ENOMEM;
6841 *data = mapper;
6842 }
6843
6844 return ftrace_func_mapper_add_ip(mapper, ip, init_data);
6845}
6846
6847static void
6848ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
6849 unsigned long ip, void *data)
6850{
6851 struct ftrace_func_mapper *mapper = data;
6852
6853 if (!ip) {
6854 if (!mapper)
6855 return;
6856 free_ftrace_func_mapper(mapper, NULL);
6857 return;
6858 }
6859
6860 ftrace_func_mapper_remove_ip(mapper, ip);
6861}
6862
6689static struct ftrace_probe_ops snapshot_probe_ops = { 6863static struct ftrace_probe_ops snapshot_probe_ops = {
6690 .func = ftrace_snapshot, 6864 .func = ftrace_snapshot,
6691 .print = ftrace_snapshot_print, 6865 .print = ftrace_snapshot_print,
@@ -6694,10 +6868,12 @@ static struct ftrace_probe_ops snapshot_probe_ops = {
6694static struct ftrace_probe_ops snapshot_count_probe_ops = { 6868static struct ftrace_probe_ops snapshot_count_probe_ops = {
6695 .func = ftrace_count_snapshot, 6869 .func = ftrace_count_snapshot,
6696 .print = ftrace_snapshot_print, 6870 .print = ftrace_snapshot_print,
6871 .init = ftrace_snapshot_init,
6872 .free = ftrace_snapshot_free,
6697}; 6873};
6698 6874
6699static int 6875static int
6700ftrace_trace_snapshot_callback(struct ftrace_hash *hash, 6876ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
6701 char *glob, char *cmd, char *param, int enable) 6877 char *glob, char *cmd, char *param, int enable)
6702{ 6878{
6703 struct ftrace_probe_ops *ops; 6879 struct ftrace_probe_ops *ops;
@@ -6711,10 +6887,8 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
6711 6887
6712 ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; 6888 ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
6713 6889
6714 if (glob[0] == '!') { 6890 if (glob[0] == '!')
6715 unregister_ftrace_function_probe_func(glob+1, ops); 6891 return unregister_ftrace_function_probe_func(glob+1, tr, ops);
6716 return 0;
6717 }
6718 6892
6719 if (!param) 6893 if (!param)
6720 goto out_reg; 6894 goto out_reg;
@@ -6733,11 +6907,11 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
6733 return ret; 6907 return ret;
6734 6908
6735 out_reg: 6909 out_reg:
6736 ret = alloc_snapshot(&global_trace); 6910 ret = alloc_snapshot(tr);
6737 if (ret < 0) 6911 if (ret < 0)
6738 goto out; 6912 goto out;
6739 6913
6740 ret = register_ftrace_function_probe(glob, ops, count); 6914 ret = register_ftrace_function_probe(glob, tr, ops, count);
6741 6915
6742 out: 6916 out:
6743 return ret < 0 ? ret : 0; 6917 return ret < 0 ? ret : 0;
@@ -7348,6 +7522,8 @@ static int instance_mkdir(const char *name)
7348 goto out_free_tr; 7522 goto out_free_tr;
7349 } 7523 }
7350 7524
7525 ftrace_init_trace_array(tr);
7526
7351 init_tracer_tracefs(tr, tr->dir); 7527 init_tracer_tracefs(tr, tr->dir);
7352 init_trace_flags_index(tr); 7528 init_trace_flags_index(tr);
7353 __update_tracer_options(tr); 7529 __update_tracer_options(tr);
@@ -7403,6 +7579,7 @@ static int instance_rmdir(const char *name)
7403 } 7579 }
7404 7580
7405 tracing_set_nop(tr); 7581 tracing_set_nop(tr);
7582 clear_ftrace_function_probes(tr);
7406 event_trace_del_tracer(tr); 7583 event_trace_del_tracer(tr);
7407 ftrace_clear_pids(tr); 7584 ftrace_clear_pids(tr);
7408 ftrace_destroy_function_files(tr); 7585 ftrace_destroy_function_files(tr);
@@ -7968,6 +8145,9 @@ __init static int tracer_alloc_buffers(void)
7968 8145
7969 register_tracer(&nop_trace); 8146 register_tracer(&nop_trace);
7970 8147
8148 /* Function tracing may start here (via kernel command line) */
8149 init_function_trace();
8150
7971 /* All seems OK, enable tracing */ 8151 /* All seems OK, enable tracing */
7972 tracing_disabled = 0; 8152 tracing_disabled = 0;
7973 8153
@@ -8002,7 +8182,7 @@ out:
8002 return ret; 8182 return ret;
8003} 8183}
8004 8184
8005void __init trace_init(void) 8185void __init early_trace_init(void)
8006{ 8186{
8007 if (tracepoint_printk) { 8187 if (tracepoint_printk) {
8008 tracepoint_print_iter = 8188 tracepoint_print_iter =
@@ -8013,6 +8193,10 @@ void __init trace_init(void)
8013 static_key_enable(&tracepoint_printk_key.key); 8193 static_key_enable(&tracepoint_printk_key.key);
8014 } 8194 }
8015 tracer_alloc_buffers(); 8195 tracer_alloc_buffers();
8196}
8197
8198void __init trace_init(void)
8199{
8016 trace_event_init(); 8200 trace_event_init();
8017} 8201}
8018 8202
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d19d52d600d6..39fd77330aab 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -262,6 +262,9 @@ struct trace_array {
262#ifdef CONFIG_FUNCTION_TRACER 262#ifdef CONFIG_FUNCTION_TRACER
263 struct ftrace_ops *ops; 263 struct ftrace_ops *ops;
264 struct trace_pid_list __rcu *function_pids; 264 struct trace_pid_list __rcu *function_pids;
265#ifdef CONFIG_DYNAMIC_FTRACE
266 struct list_head func_probes;
267#endif
265 /* function tracing enabled */ 268 /* function tracing enabled */
266 int function_enabled; 269 int function_enabled;
267#endif 270#endif
@@ -579,6 +582,8 @@ void tracing_reset_all_online_cpus(void);
579int tracing_open_generic(struct inode *inode, struct file *filp); 582int tracing_open_generic(struct inode *inode, struct file *filp);
580bool tracing_is_disabled(void); 583bool tracing_is_disabled(void);
581int tracer_tracing_is_on(struct trace_array *tr); 584int tracer_tracing_is_on(struct trace_array *tr);
585void tracer_tracing_on(struct trace_array *tr);
586void tracer_tracing_off(struct trace_array *tr);
582struct dentry *trace_create_file(const char *name, 587struct dentry *trace_create_file(const char *name,
583 umode_t mode, 588 umode_t mode,
584 struct dentry *parent, 589 struct dentry *parent,
@@ -696,6 +701,9 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
696 701
697#ifdef CONFIG_DYNAMIC_FTRACE 702#ifdef CONFIG_DYNAMIC_FTRACE
698extern unsigned long ftrace_update_tot_cnt; 703extern unsigned long ftrace_update_tot_cnt;
704void ftrace_init_trace_array(struct trace_array *tr);
705#else
706static inline void ftrace_init_trace_array(struct trace_array *tr) { }
699#endif 707#endif
700#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 708#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
701extern int DYN_FTRACE_TEST_NAME(void); 709extern int DYN_FTRACE_TEST_NAME(void);
@@ -880,6 +888,14 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
880extern struct list_head ftrace_pids; 888extern struct list_head ftrace_pids;
881 889
882#ifdef CONFIG_FUNCTION_TRACER 890#ifdef CONFIG_FUNCTION_TRACER
891struct ftrace_func_command {
892 struct list_head list;
893 char *name;
894 int (*func)(struct trace_array *tr,
895 struct ftrace_hash *hash,
896 char *func, char *cmd,
897 char *params, int enable);
898};
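[Editor's note, not part of the patch] The command callback now receives the trace_array it was invoked on. Purely as an illustration of the new shape (the "mycmd" name and my_cmd_func are hypothetical), a command registered against this interface would look like:

/* Hypothetical set_ftrace_filter command using the tr-aware callback */
static int my_cmd_func(struct trace_array *tr, struct ftrace_hash *hash,
		       char *func, char *cmd, char *param, int enable)
{
	/* e.g. register a probe against 'tr' for functions matching 'func' */
	return 0;
}

static struct ftrace_func_command my_cmd = {
	.name	= "mycmd",
	.func	= my_cmd_func,
};

/* register_ftrace_command(&my_cmd) then makes "<func>:mycmd[:<param>]"
 * available in set_ftrace_filter, mirroring how the snapshot command
 * is wired up elsewhere in this patch. */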
883extern bool ftrace_filter_param __initdata; 899extern bool ftrace_filter_param __initdata;
884static inline int ftrace_trace_task(struct trace_array *tr) 900static inline int ftrace_trace_task(struct trace_array *tr)
885{ 901{
@@ -897,6 +913,8 @@ void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
897void ftrace_init_tracefs_toplevel(struct trace_array *tr, 913void ftrace_init_tracefs_toplevel(struct trace_array *tr,
898 struct dentry *d_tracer); 914 struct dentry *d_tracer);
899void ftrace_clear_pids(struct trace_array *tr); 915void ftrace_clear_pids(struct trace_array *tr);
916int init_function_trace(void);
917void ftrace_pid_follow_fork(struct trace_array *tr, bool enable);
900#else 918#else
901static inline int ftrace_trace_task(struct trace_array *tr) 919static inline int ftrace_trace_task(struct trace_array *tr)
902{ 920{
@@ -916,15 +934,75 @@ static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
916static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } 934static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
917static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } 935static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
918static inline void ftrace_clear_pids(struct trace_array *tr) { } 936static inline void ftrace_clear_pids(struct trace_array *tr) { }
937static inline int init_function_trace(void) { return 0; }
938static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { }
919/* ftrace_func_t type is not defined, use macro instead of static inline */ 939
920#define ftrace_init_array_ops(tr, func) do { } while (0) 940#define ftrace_init_array_ops(tr, func) do { } while (0)
921#endif /* CONFIG_FUNCTION_TRACER */ 941#endif /* CONFIG_FUNCTION_TRACER */
922 942
923#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) 943#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
944
945struct ftrace_probe_ops {
946 void (*func)(unsigned long ip,
947 unsigned long parent_ip,
948 struct trace_array *tr,
949 struct ftrace_probe_ops *ops,
950 void *data);
951 int (*init)(struct ftrace_probe_ops *ops,
952 struct trace_array *tr,
953 unsigned long ip, void *init_data,
954 void **data);
955 void (*free)(struct ftrace_probe_ops *ops,
956 struct trace_array *tr,
957 unsigned long ip, void *data);
958 int (*print)(struct seq_file *m,
959 unsigned long ip,
960 struct ftrace_probe_ops *ops,
961 void *data);
962};
963
964struct ftrace_func_mapper;
965typedef int (*ftrace_mapper_func)(void *data);
966
967struct ftrace_func_mapper *allocate_ftrace_func_mapper(void);
968void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper,
969 unsigned long ip);
970int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
971 unsigned long ip, void *data);
972void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper,
973 unsigned long ip);
974void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
975 ftrace_mapper_func free_func);
976
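[Editor's note, not part of the patch] The mapper helpers declared above carry the per-ip data that probes used to receive directly through void **data. A condensed sketch of a probe's .init callback using them follows; it mirrors ftrace_snapshot_init() elsewhere in this patch, with my_init as an illustrative name.

/* Sketch: stash per-ip data in a func_mapper from a probe's .init callback */
static int my_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
		   unsigned long ip, void *init_data, void **data)
{
	struct ftrace_func_mapper *mapper = *data;

	if (!mapper) {
		mapper = allocate_ftrace_func_mapper();
		if (!mapper)
			return -ENOMEM;
		*data = mapper;		/* shared by all ips of this probe */
	}
	/* remember init_data for this specific ip; looked up later in .func */
	return ftrace_func_mapper_add_ip(mapper, ip, init_data);
}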
977extern int
978register_ftrace_function_probe(char *glob, struct trace_array *tr,
979 struct ftrace_probe_ops *ops, void *data);
980extern int
981unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
982 struct ftrace_probe_ops *ops);
983extern void clear_ftrace_function_probes(struct trace_array *tr);
984
985int register_ftrace_command(struct ftrace_func_command *cmd);
986int unregister_ftrace_command(struct ftrace_func_command *cmd);
987
924void ftrace_create_filter_files(struct ftrace_ops *ops, 988void ftrace_create_filter_files(struct ftrace_ops *ops,
925 struct dentry *parent); 989 struct dentry *parent);
926void ftrace_destroy_filter_files(struct ftrace_ops *ops); 990void ftrace_destroy_filter_files(struct ftrace_ops *ops);
927#else 991#else
992struct ftrace_func_command;
993
994static inline __init int register_ftrace_command(struct ftrace_func_command *cmd)
995{
996 return -EINVAL;
997}
998static inline __init int unregister_ftrace_command(char *cmd_name)
999{
1000 return -EINVAL;
1001}
1002static inline void clear_ftrace_function_probes(struct trace_array *tr)
1003{
1004}
1005
928/* 1006/*
929 * The ops parameter passed in is usually undefined. 1007 * The ops parameter passed in is usually undefined.
930 * This must be a macro. 1008 * This must be a macro.
@@ -989,11 +1067,13 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
989 1067
990#ifdef CONFIG_FUNCTION_TRACER 1068#ifdef CONFIG_FUNCTION_TRACER
991# define FUNCTION_FLAGS \ 1069# define FUNCTION_FLAGS \
992 C(FUNCTION, "function-trace"), 1070 C(FUNCTION, "function-trace"), \
1071 C(FUNC_FORK, "function-fork"),
993# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION 1072# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION
994#else 1073#else
995# define FUNCTION_FLAGS 1074# define FUNCTION_FLAGS
996# define FUNCTION_DEFAULT_FLAGS 0UL 1075# define FUNCTION_DEFAULT_FLAGS 0UL
1076# define TRACE_ITER_FUNC_FORK 0UL
997#endif 1077#endif
998 1078
999#ifdef CONFIG_STACKTRACE 1079#ifdef CONFIG_STACKTRACE
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index e49fbe901cfc..16a8cf02eee9 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -153,10 +153,18 @@ static int benchmark_event_kthread(void *arg)
153 trace_do_benchmark(); 153 trace_do_benchmark();
154 154
155 /* 155 /*
156 * We don't go to sleep, but let others 156 * We don't go to sleep, but let others run as well.
157 * run as well. 157 * This is basically a "yield()" to let any task that
158 * wants to run, schedule in, but if the CPU is idle,
159 * we'll keep burning cycles.
160 *
161 * Note the _rcu_qs() version of cond_resched() will
162 * notify synchronize_rcu_tasks() that this thread has
163 * passed a quiescent state for rcu_tasks. Otherwise
164 * this thread will never voluntarily schedule which would
165 * block synchronize_rcu_tasks() indefinitely.
158 */ 166 */
159 cond_resched(); 167 cond_resched_rcu_qs();
160 } 168 }
161 169
162 return 0; 170 return 0;
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c203ac4df791..adcdbbeae010 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -348,14 +348,14 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
348 __field( u64, duration ) 348 __field( u64, duration )
349 __field( u64, outer_duration ) 349 __field( u64, outer_duration )
350 __field( u64, nmi_total_ts ) 350 __field( u64, nmi_total_ts )
351 __field_struct( struct timespec, timestamp ) 351 __field_struct( struct timespec64, timestamp )
352 __field_desc( long, timestamp, tv_sec ) 352 __field_desc( s64, timestamp, tv_sec )
353 __field_desc( long, timestamp, tv_nsec ) 353 __field_desc( long, timestamp, tv_nsec )
354 __field( unsigned int, nmi_count ) 354 __field( unsigned int, nmi_count )
355 __field( unsigned int, seqnum ) 355 __field( unsigned int, seqnum )
356 ), 356 ),
357 357
358 F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", 358 F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n",
359 __entry->seqnum, 359 __entry->seqnum,
360 __entry->tv_sec, 360 __entry->tv_sec,
361 __entry->tv_nsec, 361 __entry->tv_nsec,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 93116549a284..e7973e10398c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2460,15 +2460,8 @@ struct event_probe_data {
2460 bool enable; 2460 bool enable;
2461}; 2461};
2462 2462
2463static void 2463static void update_event_probe(struct event_probe_data *data)
2464event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
2465{ 2464{
2466 struct event_probe_data **pdata = (struct event_probe_data **)_data;
2467 struct event_probe_data *data = *pdata;
2468
2469 if (!data)
2470 return;
2471
2472 if (data->enable) 2465 if (data->enable)
2473 clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); 2466 clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags);
2474 else 2467 else
@@ -2476,77 +2469,141 @@ event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
2476} 2469}
2477 2470
2478static void 2471static void
2479event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) 2472event_enable_probe(unsigned long ip, unsigned long parent_ip,
2473 struct trace_array *tr, struct ftrace_probe_ops *ops,
2474 void *data)
2480{ 2475{
2481 struct event_probe_data **pdata = (struct event_probe_data **)_data; 2476 struct ftrace_func_mapper *mapper = data;
2482 struct event_probe_data *data = *pdata; 2477 struct event_probe_data *edata;
2478 void **pdata;
2483 2479
2484 if (!data) 2480 pdata = ftrace_func_mapper_find_ip(mapper, ip);
2481 if (!pdata || !*pdata)
2482 return;
2483
2484 edata = *pdata;
2485 update_event_probe(edata);
2486}
2487
2488static void
2489event_enable_count_probe(unsigned long ip, unsigned long parent_ip,
2490 struct trace_array *tr, struct ftrace_probe_ops *ops,
2491 void *data)
2492{
2493 struct ftrace_func_mapper *mapper = data;
2494 struct event_probe_data *edata;
2495 void **pdata;
2496
2497 pdata = ftrace_func_mapper_find_ip(mapper, ip);
2498 if (!pdata || !*pdata)
2485 return; 2499 return;
2486 2500
2487 if (!data->count) 2501 edata = *pdata;
2502
2503 if (!edata->count)
2488 return; 2504 return;
2489 2505
2490 /* Skip if the event is in a state we want to switch to */ 2506 /* Skip if the event is in a state we want to switch to */
2491 if (data->enable == !(data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) 2507 if (edata->enable == !(edata->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
2492 return; 2508 return;
2493 2509
2494 if (data->count != -1) 2510 if (edata->count != -1)
2495 (data->count)--; 2511 (edata->count)--;
2496 2512
2497 event_enable_probe(ip, parent_ip, _data); 2513 update_event_probe(edata);
2498} 2514}
2499 2515
2500static int 2516static int
2501event_enable_print(struct seq_file *m, unsigned long ip, 2517event_enable_print(struct seq_file *m, unsigned long ip,
2502 struct ftrace_probe_ops *ops, void *_data) 2518 struct ftrace_probe_ops *ops, void *data)
2503{ 2519{
2504 struct event_probe_data *data = _data; 2520 struct ftrace_func_mapper *mapper = data;
2521 struct event_probe_data *edata;
2522 void **pdata;
2523
2524 pdata = ftrace_func_mapper_find_ip(mapper, ip);
2525
2526 if (WARN_ON_ONCE(!pdata || !*pdata))
2527 return 0;
2528
2529 edata = *pdata;
2505 2530
2506 seq_printf(m, "%ps:", (void *)ip); 2531 seq_printf(m, "%ps:", (void *)ip);
2507 2532
2508 seq_printf(m, "%s:%s:%s", 2533 seq_printf(m, "%s:%s:%s",
2509 data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, 2534 edata->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
2510 data->file->event_call->class->system, 2535 edata->file->event_call->class->system,
2511 trace_event_name(data->file->event_call)); 2536 trace_event_name(edata->file->event_call));
2512 2537
2513 if (data->count == -1) 2538 if (edata->count == -1)
2514 seq_puts(m, ":unlimited\n"); 2539 seq_puts(m, ":unlimited\n");
2515 else 2540 else
2516 seq_printf(m, ":count=%ld\n", data->count); 2541 seq_printf(m, ":count=%ld\n", edata->count);
2517 2542
2518 return 0; 2543 return 0;
2519} 2544}
2520 2545
2521static int 2546static int
2522event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, 2547event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
2523 void **_data) 2548 unsigned long ip, void *init_data, void **data)
2524{ 2549{
2525 struct event_probe_data **pdata = (struct event_probe_data **)_data; 2550 struct ftrace_func_mapper *mapper = *data;
2526 struct event_probe_data *data = *pdata; 2551 struct event_probe_data *edata = init_data;
2552 int ret;
2553
2554 if (!mapper) {
2555 mapper = allocate_ftrace_func_mapper();
2556 if (!mapper)
2557 return -ENODEV;
2558 *data = mapper;
2559 }
2560
2561 ret = ftrace_func_mapper_add_ip(mapper, ip, edata);
2562 if (ret < 0)
2563 return ret;
2564
2565 edata->ref++;
2527 2566
2528 data->ref++; 2567 return 0;
2568}
2569
2570static int free_probe_data(void *data)
2571{
2572 struct event_probe_data *edata = data;
2573
2574 edata->ref--;
2575 if (!edata->ref) {
2576 /* Remove the SOFT_MODE flag */
2577 __ftrace_event_enable_disable(edata->file, 0, 1);
2578 module_put(edata->file->event_call->mod);
2579 kfree(edata);
2580 }
2529 return 0; 2581 return 0;
2530} 2582}
2531 2583
2532static void 2584static void
2533event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, 2585event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
2534 void **_data) 2586 unsigned long ip, void *data)
2535{ 2587{
2536 struct event_probe_data **pdata = (struct event_probe_data **)_data; 2588 struct ftrace_func_mapper *mapper = data;
2537 struct event_probe_data *data = *pdata; 2589 struct event_probe_data *edata;
2538 2590
2539 if (WARN_ON_ONCE(data->ref <= 0)) 2591 if (!ip) {
2592 if (!mapper)
2593 return;
2594 free_ftrace_func_mapper(mapper, free_probe_data);
2540 return; 2595 return;
2541
2542 data->ref--;
2543 if (!data->ref) {
2544 /* Remove the SOFT_MODE flag */
2545 __ftrace_event_enable_disable(data->file, 0, 1);
2546 module_put(data->file->event_call->mod);
2547 kfree(data);
2548 } 2596 }
2549 *pdata = NULL; 2597
2598 edata = ftrace_func_mapper_remove_ip(mapper, ip);
2599
2600 if (WARN_ON_ONCE(!edata))
2601 return;
2602
2603 if (WARN_ON_ONCE(edata->ref <= 0))
2604 return;
2605
2606 free_probe_data(edata);
2550} 2607}
2551 2608
2552static struct ftrace_probe_ops event_enable_probe_ops = { 2609static struct ftrace_probe_ops event_enable_probe_ops = {
@@ -2578,10 +2635,9 @@ static struct ftrace_probe_ops event_disable_count_probe_ops = {
2578}; 2635};
2579 2636
2580static int 2637static int
2581event_enable_func(struct ftrace_hash *hash, 2638event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
2582 char *glob, char *cmd, char *param, int enabled) 2639 char *glob, char *cmd, char *param, int enabled)
2583{ 2640{
2584 struct trace_array *tr = top_trace_array();
2585 struct trace_event_file *file; 2641 struct trace_event_file *file;
2586 struct ftrace_probe_ops *ops; 2642 struct ftrace_probe_ops *ops;
2587 struct event_probe_data *data; 2643 struct event_probe_data *data;
@@ -2619,12 +2675,12 @@ event_enable_func(struct ftrace_hash *hash,
2619 ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; 2675 ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
2620 2676
2621 if (glob[0] == '!') { 2677 if (glob[0] == '!') {
2622 unregister_ftrace_function_probe_func(glob+1, ops); 2678 ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
2623 ret = 0;
2624 goto out; 2679 goto out;
2625 } 2680 }
2626 2681
2627 ret = -ENOMEM; 2682 ret = -ENOMEM;
2683
2628 data = kzalloc(sizeof(*data), GFP_KERNEL); 2684 data = kzalloc(sizeof(*data), GFP_KERNEL);
2629 if (!data) 2685 if (!data)
2630 goto out; 2686 goto out;
@@ -2661,7 +2717,8 @@ event_enable_func(struct ftrace_hash *hash,
2661 ret = __ftrace_event_enable_disable(file, 1, 1); 2717 ret = __ftrace_event_enable_disable(file, 1, 1);
2662 if (ret < 0) 2718 if (ret < 0)
2663 goto out_put; 2719 goto out_put;
2664 ret = register_ftrace_function_probe(glob, ops, data); 2720
2721 ret = register_ftrace_function_probe(glob, tr, ops, data);
2665 /* 2722 /*
2666 * The above returns on success the # of functions enabled, 2723 * The above returns on success the # of functions enabled,
2667 * but if it didn't find any functions it returns zero. 2724 * but if it didn't find any functions it returns zero.
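
Editor's note: the rework above moves per-probe state out of the old void **data callback argument and into an ftrace_func_mapper keyed by instruction pointer. A hedged sketch of the resulting lifecycle, using only the mapper calls visible in this diff (the probe names and do_something() are hypothetical):

	static int my_probe_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
				 unsigned long ip, void *init_data, void **data)
	{
		struct ftrace_func_mapper *mapper = *data;

		if (!mapper) {			/* first ip for this probe */
			mapper = allocate_ftrace_func_mapper();
			if (!mapper)
				return -ENOMEM;
			*data = mapper;
		}
		/* remember init_data for this ip */
		return ftrace_func_mapper_add_ip(mapper, ip, init_data);
	}

	static void my_probe_func(unsigned long ip, unsigned long parent_ip,
				  struct trace_array *tr, struct ftrace_probe_ops *ops,
				  void *data)
	{
		struct ftrace_func_mapper *mapper = data;
		void **slot = ftrace_func_mapper_find_ip(mapper, ip);

		if (slot && *slot)
			do_something(*slot);	/* hypothetical per-ip action */
	}

	static void my_probe_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
				  unsigned long ip, void *data)
	{
		struct ftrace_func_mapper *mapper = data;

		if (!mapper)
			return;
		if (!ip)			/* whole probe going away */
			free_ftrace_func_mapper(mapper, NULL);
		else
			ftrace_func_mapper_remove_ip(mapper, ip);
	}
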
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 0efa00d80623..a3bddbfd0874 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -267,10 +267,14 @@ static struct tracer function_trace __tracer_data =
267}; 267};
268 268
269#ifdef CONFIG_DYNAMIC_FTRACE 269#ifdef CONFIG_DYNAMIC_FTRACE
270static void update_traceon_count(void **data, bool on) 270static void update_traceon_count(struct ftrace_probe_ops *ops,
271 unsigned long ip,
272 struct trace_array *tr, bool on,
273 void *data)
271{ 274{
272 long *count = (long *)data; 275 struct ftrace_func_mapper *mapper = data;
273 long old_count = *count; 276 long *count;
277 long old_count;
274 278
275 /* 279 /*
276 * Tracing gets disabled (or enabled) once per count. 280 * Tracing gets disabled (or enabled) once per count.
@@ -301,23 +305,22 @@ static void update_traceon_count(void **data, bool on)
301 * setting the tracing_on file. But we currently don't care 305 * setting the tracing_on file. But we currently don't care
302 * about that. 306 * about that.
303 */ 307 */
304 if (!old_count) 308 count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
309 old_count = *count;
310
311 if (old_count <= 0)
305 return; 312 return;
306 313
307 /* Make sure we see count before checking tracing state */ 314 /* Make sure we see count before checking tracing state */
308 smp_rmb(); 315 smp_rmb();
309 316
310 if (on == !!tracing_is_on()) 317 if (on == !!tracer_tracing_is_on(tr))
311 return; 318 return;
312 319
313 if (on) 320 if (on)
314 tracing_on(); 321 tracer_tracing_on(tr);
315 else 322 else
316 tracing_off(); 323 tracer_tracing_off(tr);
317
318 /* unlimited? */
319 if (old_count == -1)
320 return;
321 324
322 /* Make sure tracing state is visible before updating count */ 325 /* Make sure tracing state is visible before updating count */
323 smp_wmb(); 326 smp_wmb();
@@ -326,33 +329,41 @@ static void update_traceon_count(void **data, bool on)
326} 329}
327 330
328static void 331static void
329ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) 332ftrace_traceon_count(unsigned long ip, unsigned long parent_ip,
333 struct trace_array *tr, struct ftrace_probe_ops *ops,
334 void *data)
330{ 335{
331 update_traceon_count(data, 1); 336 update_traceon_count(ops, ip, tr, 1, data);
332} 337}
333 338
334static void 339static void
335ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) 340ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip,
341 struct trace_array *tr, struct ftrace_probe_ops *ops,
342 void *data)
336{ 343{
337 update_traceon_count(data, 0); 344 update_traceon_count(ops, ip, tr, 0, data);
338} 345}
339 346
340static void 347static void
341ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) 348ftrace_traceon(unsigned long ip, unsigned long parent_ip,
349 struct trace_array *tr, struct ftrace_probe_ops *ops,
350 void *data)
342{ 351{
343 if (tracing_is_on()) 352 if (tracer_tracing_is_on(tr))
344 return; 353 return;
345 354
346 tracing_on(); 355 tracer_tracing_on(tr);
347} 356}
348 357
349static void 358static void
350ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) 359ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
360 struct trace_array *tr, struct ftrace_probe_ops *ops,
361 void *data)
351{ 362{
352 if (!tracing_is_on()) 363 if (!tracer_tracing_is_on(tr))
353 return; 364 return;
354 365
355 tracing_off(); 366 tracer_tracing_off(tr);
356} 367}
357 368
358/* 369/*
@@ -364,144 +375,218 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
364 */ 375 */
365#define STACK_SKIP 4 376#define STACK_SKIP 4
366 377
378static __always_inline void trace_stack(struct trace_array *tr)
379{
380 unsigned long flags;
381 int pc;
382
383 local_save_flags(flags);
384 pc = preempt_count();
385
386 __trace_stack(tr, flags, STACK_SKIP, pc);
387}
388
367static void 389static void
368ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) 390ftrace_stacktrace(unsigned long ip, unsigned long parent_ip,
391 struct trace_array *tr, struct ftrace_probe_ops *ops,
392 void *data)
369{ 393{
370 trace_dump_stack(STACK_SKIP); 394 trace_stack(tr);
371} 395}
372 396
373static void 397static void
374ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) 398ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip,
399 struct trace_array *tr, struct ftrace_probe_ops *ops,
400 void *data)
375{ 401{
376 long *count = (long *)data; 402 struct ftrace_func_mapper *mapper = data;
403 long *count;
377 long old_count; 404 long old_count;
378 long new_count; 405 long new_count;
379 406
407 if (!tracing_is_on())
408 return;
409
410 /* unlimited? */
411 if (!mapper) {
412 trace_stack(tr);
413 return;
414 }
415
416 count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
417
380 /* 418 /*
381 * Stack traces should only execute the number of times the 419 * Stack traces should only execute the number of times the
382 * user specified in the counter. 420 * user specified in the counter.
383 */ 421 */
384 do { 422 do {
385
386 if (!tracing_is_on())
387 return;
388
389 old_count = *count; 423 old_count = *count;
390 424
391 if (!old_count) 425 if (!old_count)
392 return; 426 return;
393 427
394 /* unlimited? */
395 if (old_count == -1) {
396 trace_dump_stack(STACK_SKIP);
397 return;
398 }
399
400 new_count = old_count - 1; 428 new_count = old_count - 1;
401 new_count = cmpxchg(count, old_count, new_count); 429 new_count = cmpxchg(count, old_count, new_count);
402 if (new_count == old_count) 430 if (new_count == old_count)
403 trace_dump_stack(STACK_SKIP); 431 trace_stack(tr);
432
433 if (!tracing_is_on())
434 return;
404 435
405 } while (new_count != old_count); 436 } while (new_count != old_count);
406} 437}
407 438
408static int update_count(void **data) 439static int update_count(struct ftrace_probe_ops *ops, unsigned long ip,
440 void *data)
409{ 441{
410 unsigned long *count = (long *)data; 442 struct ftrace_func_mapper *mapper = data;
443 long *count = NULL;
411 444
412 if (!*count) 445 if (mapper)
413 return 0; 446 count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
414 447
415 if (*count != -1) 448 if (count) {
449 if (*count <= 0)
450 return 0;
416 (*count)--; 451 (*count)--;
452 }
417 453
418 return 1; 454 return 1;
419} 455}
420 456
421static void 457static void
422ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) 458ftrace_dump_probe(unsigned long ip, unsigned long parent_ip,
459 struct trace_array *tr, struct ftrace_probe_ops *ops,
460 void *data)
423{ 461{
424 if (update_count(data)) 462 if (update_count(ops, ip, data))
425 ftrace_dump(DUMP_ALL); 463 ftrace_dump(DUMP_ALL);
426} 464}
427 465
428/* Only dump the current CPU buffer. */ 466/* Only dump the current CPU buffer. */
429static void 467static void
430ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) 468ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip,
469 struct trace_array *tr, struct ftrace_probe_ops *ops,
470 void *data)
431{ 471{
432 if (update_count(data)) 472 if (update_count(ops, ip, data))
433 ftrace_dump(DUMP_ORIG); 473 ftrace_dump(DUMP_ORIG);
434} 474}
435 475
436static int 476static int
437ftrace_probe_print(const char *name, struct seq_file *m, 477ftrace_probe_print(const char *name, struct seq_file *m,
438 unsigned long ip, void *data) 478 unsigned long ip, struct ftrace_probe_ops *ops,
479 void *data)
439{ 480{
440 long count = (long)data; 481 struct ftrace_func_mapper *mapper = data;
482 long *count = NULL;
441 483
442 seq_printf(m, "%ps:%s", (void *)ip, name); 484 seq_printf(m, "%ps:%s", (void *)ip, name);
443 485
444 if (count == -1) 486 if (mapper)
445 seq_puts(m, ":unlimited\n"); 487 count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
488
489 if (count)
490 seq_printf(m, ":count=%ld\n", *count);
446 else 491 else
447 seq_printf(m, ":count=%ld\n", count); 492 seq_puts(m, ":unlimited\n");
448 493
449 return 0; 494 return 0;
450} 495}
451 496
452static int 497static int
453ftrace_traceon_print(struct seq_file *m, unsigned long ip, 498ftrace_traceon_print(struct seq_file *m, unsigned long ip,
454 struct ftrace_probe_ops *ops, void *data) 499 struct ftrace_probe_ops *ops,
500 void *data)
455{ 501{
456 return ftrace_probe_print("traceon", m, ip, data); 502 return ftrace_probe_print("traceon", m, ip, ops, data);
457} 503}
458 504
459static int 505static int
460ftrace_traceoff_print(struct seq_file *m, unsigned long ip, 506ftrace_traceoff_print(struct seq_file *m, unsigned long ip,
461 struct ftrace_probe_ops *ops, void *data) 507 struct ftrace_probe_ops *ops, void *data)
462{ 508{
463 return ftrace_probe_print("traceoff", m, ip, data); 509 return ftrace_probe_print("traceoff", m, ip, ops, data);
464} 510}
465 511
466static int 512static int
467ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, 513ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
468 struct ftrace_probe_ops *ops, void *data) 514 struct ftrace_probe_ops *ops, void *data)
469{ 515{
470 return ftrace_probe_print("stacktrace", m, ip, data); 516 return ftrace_probe_print("stacktrace", m, ip, ops, data);
471} 517}
472 518
473static int 519static int
474ftrace_dump_print(struct seq_file *m, unsigned long ip, 520ftrace_dump_print(struct seq_file *m, unsigned long ip,
475 struct ftrace_probe_ops *ops, void *data) 521 struct ftrace_probe_ops *ops, void *data)
476{ 522{
477 return ftrace_probe_print("dump", m, ip, data); 523 return ftrace_probe_print("dump", m, ip, ops, data);
478} 524}
479 525
480static int 526static int
481ftrace_cpudump_print(struct seq_file *m, unsigned long ip, 527ftrace_cpudump_print(struct seq_file *m, unsigned long ip,
482 struct ftrace_probe_ops *ops, void *data) 528 struct ftrace_probe_ops *ops, void *data)
483{ 529{
484 return ftrace_probe_print("cpudump", m, ip, data); 530 return ftrace_probe_print("cpudump", m, ip, ops, data);
531}
532
533
534static int
535ftrace_count_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
536 unsigned long ip, void *init_data, void **data)
537{
538 struct ftrace_func_mapper *mapper = *data;
539
540 if (!mapper) {
541 mapper = allocate_ftrace_func_mapper();
542 if (!mapper)
543 return -ENOMEM;
544 *data = mapper;
545 }
546
547 return ftrace_func_mapper_add_ip(mapper, ip, init_data);
548}
549
550static void
551ftrace_count_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
552 unsigned long ip, void *data)
553{
554 struct ftrace_func_mapper *mapper = data;
555
556 if (!ip) {
557 free_ftrace_func_mapper(mapper, NULL);
558 return;
559 }
560
561 ftrace_func_mapper_remove_ip(mapper, ip);
485} 562}
486 563
487static struct ftrace_probe_ops traceon_count_probe_ops = { 564static struct ftrace_probe_ops traceon_count_probe_ops = {
488 .func = ftrace_traceon_count, 565 .func = ftrace_traceon_count,
489 .print = ftrace_traceon_print, 566 .print = ftrace_traceon_print,
567 .init = ftrace_count_init,
568 .free = ftrace_count_free,
490}; 569};
491 570
492static struct ftrace_probe_ops traceoff_count_probe_ops = { 571static struct ftrace_probe_ops traceoff_count_probe_ops = {
493 .func = ftrace_traceoff_count, 572 .func = ftrace_traceoff_count,
494 .print = ftrace_traceoff_print, 573 .print = ftrace_traceoff_print,
574 .init = ftrace_count_init,
575 .free = ftrace_count_free,
495}; 576};
496 577
497static struct ftrace_probe_ops stacktrace_count_probe_ops = { 578static struct ftrace_probe_ops stacktrace_count_probe_ops = {
498 .func = ftrace_stacktrace_count, 579 .func = ftrace_stacktrace_count,
499 .print = ftrace_stacktrace_print, 580 .print = ftrace_stacktrace_print,
581 .init = ftrace_count_init,
582 .free = ftrace_count_free,
500}; 583};
501 584
502static struct ftrace_probe_ops dump_probe_ops = { 585static struct ftrace_probe_ops dump_probe_ops = {
503 .func = ftrace_dump_probe, 586 .func = ftrace_dump_probe,
504 .print = ftrace_dump_print, 587 .print = ftrace_dump_print,
588 .init = ftrace_count_init,
589 .free = ftrace_count_free,
505}; 590};
506 591
507static struct ftrace_probe_ops cpudump_probe_ops = { 592static struct ftrace_probe_ops cpudump_probe_ops = {
@@ -525,7 +610,8 @@ static struct ftrace_probe_ops stacktrace_probe_ops = {
525}; 610};
526 611
527static int 612static int
528ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, 613ftrace_trace_probe_callback(struct trace_array *tr,
614 struct ftrace_probe_ops *ops,
529 struct ftrace_hash *hash, char *glob, 615 struct ftrace_hash *hash, char *glob,
530 char *cmd, char *param, int enable) 616 char *cmd, char *param, int enable)
531{ 617{
@@ -537,10 +623,8 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
537 if (!enable) 623 if (!enable)
538 return -EINVAL; 624 return -EINVAL;
539 625
540 if (glob[0] == '!') { 626 if (glob[0] == '!')
541 unregister_ftrace_function_probe_func(glob+1, ops); 627 return unregister_ftrace_function_probe_func(glob+1, tr, ops);
542 return 0;
543 }
544 628
545 if (!param) 629 if (!param)
546 goto out_reg; 630 goto out_reg;
@@ -559,13 +643,13 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
559 return ret; 643 return ret;
560 644
561 out_reg: 645 out_reg:
562 ret = register_ftrace_function_probe(glob, ops, count); 646 ret = register_ftrace_function_probe(glob, tr, ops, count);
563 647
564 return ret < 0 ? ret : 0; 648 return ret < 0 ? ret : 0;
565} 649}
566 650
567static int 651static int
568ftrace_trace_onoff_callback(struct ftrace_hash *hash, 652ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash,
569 char *glob, char *cmd, char *param, int enable) 653 char *glob, char *cmd, char *param, int enable)
570{ 654{
571 struct ftrace_probe_ops *ops; 655 struct ftrace_probe_ops *ops;
@@ -576,24 +660,24 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
576 else 660 else
577 ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; 661 ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
578 662
579 return ftrace_trace_probe_callback(ops, hash, glob, cmd, 663 return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
580 param, enable); 664 param, enable);
581} 665}
582 666
583static int 667static int
584ftrace_stacktrace_callback(struct ftrace_hash *hash, 668ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash,
585 char *glob, char *cmd, char *param, int enable) 669 char *glob, char *cmd, char *param, int enable)
586{ 670{
587 struct ftrace_probe_ops *ops; 671 struct ftrace_probe_ops *ops;
588 672
589 ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; 673 ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
590 674
591 return ftrace_trace_probe_callback(ops, hash, glob, cmd, 675 return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
592 param, enable); 676 param, enable);
593} 677}
594 678
595static int 679static int
596ftrace_dump_callback(struct ftrace_hash *hash, 680ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash,
597 char *glob, char *cmd, char *param, int enable) 681 char *glob, char *cmd, char *param, int enable)
598{ 682{
599 struct ftrace_probe_ops *ops; 683 struct ftrace_probe_ops *ops;
@@ -601,12 +685,12 @@ ftrace_dump_callback(struct ftrace_hash *hash,
601 ops = &dump_probe_ops; 685 ops = &dump_probe_ops;
602 686
603 /* Only dump once. */ 687 /* Only dump once. */
604 return ftrace_trace_probe_callback(ops, hash, glob, cmd, 688 return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
605 "1", enable); 689 "1", enable);
606} 690}
607 691
608static int 692static int
609ftrace_cpudump_callback(struct ftrace_hash *hash, 693ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash,
610 char *glob, char *cmd, char *param, int enable) 694 char *glob, char *cmd, char *param, int enable)
611{ 695{
612 struct ftrace_probe_ops *ops; 696 struct ftrace_probe_ops *ops;
@@ -614,7 +698,7 @@ ftrace_cpudump_callback(struct ftrace_hash *hash,
614 ops = &cpudump_probe_ops; 698 ops = &cpudump_probe_ops;
615 699
616 /* Only dump once. */ 700 /* Only dump once. */
617 return ftrace_trace_probe_callback(ops, hash, glob, cmd, 701 return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
618 "1", enable); 702 "1", enable);
619} 703}
620 704
@@ -687,9 +771,8 @@ static inline int init_func_cmd_traceon(void)
687} 771}
688#endif /* CONFIG_DYNAMIC_FTRACE */ 772#endif /* CONFIG_DYNAMIC_FTRACE */
689 773
690static __init int init_function_trace(void) 774__init int init_function_trace(void)
691{ 775{
692 init_func_cmd_traceon(); 776 init_func_cmd_traceon();
693 return register_tracer(&function_trace); 777 return register_tracer(&function_trace);
694} 778}
695core_initcall(init_function_trace);
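
Editor's note: one subtlety in the counted probes above is that the parsed count is stored as the void * value itself (ftrace_func_mapper_add_ip(mapper, ip, init_data)), and the callbacks then treat the slot returned by ftrace_func_mapper_find_ip() as a long so it can be decremented in place. A small standalone illustration of that trick, assuming sizeof(long) == sizeof(void *) as on the kernel's supported 64-bit/32-bit ABIs:

	#include <stdio.h>

	int main(void)
	{
		void *slot = (void *)5L;	/* count stored as the pointer value */
		long *count = (long *)&slot;	/* view the slot as a long */

		while (*count > 0)
			(*count)--;

		printf("remaining: %ld\n", *count);	/* prints 0 */
		return 0;
	}
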
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 21ea6ae77d93..d7c8e4ec3d9d 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -79,12 +79,12 @@ static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC;
79 79
80/* Individual latency samples are stored here when detected. */ 80/* Individual latency samples are stored here when detected. */
81struct hwlat_sample { 81struct hwlat_sample {
82 u64 seqnum; /* unique sequence */ 82 u64 seqnum; /* unique sequence */
83 u64 duration; /* delta */ 83 u64 duration; /* delta */
84 u64 outer_duration; /* delta (outer loop) */ 84 u64 outer_duration; /* delta (outer loop) */
85 u64 nmi_total_ts; /* Total time spent in NMIs */ 85 u64 nmi_total_ts; /* Total time spent in NMIs */
86 struct timespec timestamp; /* wall time */ 86 struct timespec64 timestamp; /* wall time */
87 int nmi_count; /* # NMIs during this sample */ 87 int nmi_count; /* # NMIs during this sample */
88}; 88};
89 89
90/* keep the global state somewhere. */ 90/* keep the global state somewhere. */
@@ -250,7 +250,7 @@ static int get_sample(void)
250 s.seqnum = hwlat_data.count; 250 s.seqnum = hwlat_data.count;
251 s.duration = sample; 251 s.duration = sample;
252 s.outer_duration = outer_sample; 252 s.outer_duration = outer_sample;
253 s.timestamp = CURRENT_TIME; 253 ktime_get_real_ts64(&s.timestamp);
254 s.nmi_total_ts = nmi_total_ts; 254 s.nmi_total_ts = nmi_total_ts;
255 s.nmi_count = nmi_count; 255 s.nmi_count = nmi_count;
256 trace_hwlat_sample(&s); 256 trace_hwlat_sample(&s);
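
Editor's note: replacing CURRENT_TIME with a timespec64 filled by ktime_get_real_ts64() makes the hwlat timestamp y2038-safe; tv_sec becomes 64-bit, which is why the trace_output.c hunks further down cast it to long long when printing. A minimal sketch of the pattern, assuming kernel context:

	struct timespec64 ts;

	ktime_get_real_ts64(&ts);	/* wall time, 64-bit seconds */
	pr_info("ts:%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
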
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5f688cc724f0..c129fca6ec99 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -25,6 +25,7 @@
25#include "trace_probe.h" 25#include "trace_probe.h"
26 26
27#define KPROBE_EVENT_SYSTEM "kprobes" 27#define KPROBE_EVENT_SYSTEM "kprobes"
28#define KRETPROBE_MAXACTIVE_MAX 4096
28 29
29/** 30/**
30 * Kprobe event core functions 31 * Kprobe event core functions
@@ -282,6 +283,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
282 void *addr, 283 void *addr,
283 const char *symbol, 284 const char *symbol,
284 unsigned long offs, 285 unsigned long offs,
286 int maxactive,
285 int nargs, bool is_return) 287 int nargs, bool is_return)
286{ 288{
287 struct trace_kprobe *tk; 289 struct trace_kprobe *tk;
@@ -309,6 +311,8 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
309 else 311 else
310 tk->rp.kp.pre_handler = kprobe_dispatcher; 312 tk->rp.kp.pre_handler = kprobe_dispatcher;
311 313
314 tk->rp.maxactive = maxactive;
315
312 if (!event || !is_good_name(event)) { 316 if (!event || !is_good_name(event)) {
313 ret = -EINVAL; 317 ret = -EINVAL;
314 goto error; 318 goto error;
@@ -598,8 +602,10 @@ static int create_trace_kprobe(int argc, char **argv)
598{ 602{
599 /* 603 /*
600 * Argument syntax: 604 * Argument syntax:
601 * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] 605 * - Add kprobe:
602 * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] 606 * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
607 * - Add kretprobe:
608 * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]
603 * Fetch args: 609 * Fetch args:
604 * $retval : fetch return value 610 * $retval : fetch return value
605 * $stack : fetch stack address 611 * $stack : fetch stack address
@@ -619,6 +625,7 @@ static int create_trace_kprobe(int argc, char **argv)
619 int i, ret = 0; 625 int i, ret = 0;
620 bool is_return = false, is_delete = false; 626 bool is_return = false, is_delete = false;
621 char *symbol = NULL, *event = NULL, *group = NULL; 627 char *symbol = NULL, *event = NULL, *group = NULL;
628 int maxactive = 0;
622 char *arg; 629 char *arg;
623 unsigned long offset = 0; 630 unsigned long offset = 0;
624 void *addr = NULL; 631 void *addr = NULL;
@@ -637,8 +644,28 @@ static int create_trace_kprobe(int argc, char **argv)
637 return -EINVAL; 644 return -EINVAL;
638 } 645 }
639 646
640 if (argv[0][1] == ':') { 647 event = strchr(&argv[0][1], ':');
641 event = &argv[0][2]; 648 if (event) {
649 event[0] = '\0';
650 event++;
651 }
652 if (is_return && isdigit(argv[0][1])) {
653 ret = kstrtouint(&argv[0][1], 0, &maxactive);
654 if (ret) {
655 pr_info("Failed to parse maxactive.\n");
656 return ret;
657 }
658 /* kretprobes instances are iterated over via a list. The
659 * maximum should stay reasonable.
660 */
661 if (maxactive > KRETPROBE_MAXACTIVE_MAX) {
662 pr_info("Maxactive is too big (%d > %d).\n",
663 maxactive, KRETPROBE_MAXACTIVE_MAX);
664 return -E2BIG;
665 }
666 }
667
668 if (event) {
642 if (strchr(event, '/')) { 669 if (strchr(event, '/')) {
643 group = event; 670 group = event;
644 event = strchr(group, '/') + 1; 671 event = strchr(group, '/') + 1;
@@ -681,10 +708,6 @@ static int create_trace_kprobe(int argc, char **argv)
681 return -EINVAL; 708 return -EINVAL;
682 } 709 }
683 if (isdigit(argv[1][0])) { 710 if (isdigit(argv[1][0])) {
684 if (is_return) {
685 pr_info("Return probe point must be a symbol.\n");
686 return -EINVAL;
687 }
688 /* an address specified */ 711 /* an address specified */
689 ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); 712 ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
690 if (ret) { 713 if (ret) {
@@ -700,8 +723,9 @@ static int create_trace_kprobe(int argc, char **argv)
700 pr_info("Failed to parse symbol.\n"); 723 pr_info("Failed to parse symbol.\n");
701 return ret; 724 return ret;
702 } 725 }
703 if (offset && is_return) { 726 if (offset && is_return &&
704 pr_info("Return probe must be used without offset.\n"); 727 !function_offset_within_entry(NULL, symbol, offset)) {
728 pr_info("Given offset is not valid for return probe.\n");
705 return -EINVAL; 729 return -EINVAL;
706 } 730 }
707 } 731 }
@@ -718,8 +742,8 @@ static int create_trace_kprobe(int argc, char **argv)
718 is_return ? 'r' : 'p', addr); 742 is_return ? 'r' : 'p', addr);
719 event = buf; 743 event = buf;
720 } 744 }
721 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc, 745 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
722 is_return); 746 argc, is_return);
723 if (IS_ERR(tk)) { 747 if (IS_ERR(tk)) {
724 pr_info("Failed to allocate trace_probe.(%d)\n", 748 pr_info("Failed to allocate trace_probe.(%d)\n",
725 (int)PTR_ERR(tk)); 749 (int)PTR_ERR(tk));
@@ -1511,6 +1535,11 @@ static __init int kprobe_trace_self_tests_init(void)
1511 1535
1512end: 1536end:
1513 release_all_trace_kprobes(); 1537 release_all_trace_kprobes();
1538 /*
1539 * Wait for the optimizer work to finish. Otherwise it might fiddle
1540 * with probes in already freed __init text.
1541 */
1542 wait_for_kprobe_optimizer();
1514 if (warn) 1543 if (warn)
1515 pr_cont("NG: Some tests are failed. Please check them.\n"); 1544 pr_cont("NG: Some tests are failed. Please check them.\n");
1516 else 1545 else
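
Editor's note: with the syntax change above, a kretprobe definition can carry an explicit maxactive, e.g. "r16:myretprobe do_sys_open" tracks up to 16 concurrent return instances. A hedged userspace sketch of writing such a definition; the tracefs path, event name and probed symbol are illustrative assumptions:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* tracefs is commonly mounted here; adjust if needed */
		const char *path = "/sys/kernel/debug/tracing/kprobe_events";
		const char *cmd = "r16:myretprobe do_sys_open\n";
		int fd = open(path, O_WRONLY | O_APPEND);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, cmd, strlen(cmd)) < 0)
			perror("write");
		close(fd);
		return 0;
	}
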
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02a4aeb22c47..08f9bab8089e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -4,7 +4,6 @@
4 * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
5 * 5 *
6 */ 6 */
7
8#include <linux/module.h> 7#include <linux/module.h>
9#include <linux/mutex.h> 8#include <linux/mutex.h>
10#include <linux/ftrace.h> 9#include <linux/ftrace.h>
@@ -1161,11 +1160,11 @@ trace_hwlat_print(struct trace_iterator *iter, int flags,
1161 1160
1162 trace_assign_type(field, entry); 1161 trace_assign_type(field, entry);
1163 1162
1164 trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld", 1163 trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld",
1165 field->seqnum, 1164 field->seqnum,
1166 field->duration, 1165 field->duration,
1167 field->outer_duration, 1166 field->outer_duration,
1168 field->timestamp.tv_sec, 1167 (long long)field->timestamp.tv_sec,
1169 field->timestamp.tv_nsec); 1168 field->timestamp.tv_nsec);
1170 1169
1171 if (field->nmi_count) { 1170 if (field->nmi_count) {
@@ -1195,10 +1194,10 @@ trace_hwlat_raw(struct trace_iterator *iter, int flags,
1195 1194
1196 trace_assign_type(field, iter->ent); 1195 trace_assign_type(field, iter->ent);
1197 1196
1198 trace_seq_printf(s, "%llu %lld %ld %09ld %u\n", 1197 trace_seq_printf(s, "%llu %lld %lld %09ld %u\n",
1199 field->duration, 1198 field->duration,
1200 field->outer_duration, 1199 field->outer_duration,
1201 field->timestamp.tv_sec, 1200 (long long)field->timestamp.tv_sec,
1202 field->timestamp.tv_nsec, 1201 field->timestamp.tv_nsec,
1203 field->seqnum); 1202 field->seqnum);
1204 1203
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 5fb1f2c87e6b..76aa04d4c925 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -35,7 +35,7 @@ unsigned long stack_trace_max_size;
35arch_spinlock_t stack_trace_max_lock = 35arch_spinlock_t stack_trace_max_lock =
36 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 36 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
37 37
38static DEFINE_PER_CPU(int, trace_active); 38DEFINE_PER_CPU(int, disable_stack_tracer);
39static DEFINE_MUTEX(stack_sysctl_mutex); 39static DEFINE_MUTEX(stack_sysctl_mutex);
40 40
41int stack_tracer_enabled; 41int stack_tracer_enabled;
@@ -96,6 +96,14 @@ check_stack(unsigned long ip, unsigned long *stack)
96 if (in_nmi()) 96 if (in_nmi())
97 return; 97 return;
98 98
99 /*
100 * There's a slight chance that we are tracing inside the
101 * RCU infrastructure, and rcu_irq_enter() will not work
102 * as expected.
103 */
104 if (unlikely(rcu_irq_enter_disabled()))
105 return;
106
99 local_irq_save(flags); 107 local_irq_save(flags);
100 arch_spin_lock(&stack_trace_max_lock); 108 arch_spin_lock(&stack_trace_max_lock);
101 109
@@ -207,13 +215,12 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
207 struct ftrace_ops *op, struct pt_regs *pt_regs) 215 struct ftrace_ops *op, struct pt_regs *pt_regs)
208{ 216{
209 unsigned long stack; 217 unsigned long stack;
210 int cpu;
211 218
212 preempt_disable_notrace(); 219 preempt_disable_notrace();
213 220
214 cpu = raw_smp_processor_id();
215 /* no atomic needed, we only modify this variable by this cpu */ 221 /* no atomic needed, we only modify this variable by this cpu */
216 if (per_cpu(trace_active, cpu)++ != 0) 222 __this_cpu_inc(disable_stack_tracer);
223 if (__this_cpu_read(disable_stack_tracer) != 1)
217 goto out; 224 goto out;
218 225
219 ip += MCOUNT_INSN_SIZE; 226 ip += MCOUNT_INSN_SIZE;
@@ -221,7 +228,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
221 check_stack(ip, &stack); 228 check_stack(ip, &stack);
222 229
223 out: 230 out:
224 per_cpu(trace_active, cpu)--; 231 __this_cpu_dec(disable_stack_tracer);
225 /* prevent recursion in schedule */ 232 /* prevent recursion in schedule */
226 preempt_enable_notrace(); 233 preempt_enable_notrace();
227} 234}
@@ -253,7 +260,6 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
253 long *ptr = filp->private_data; 260 long *ptr = filp->private_data;
254 unsigned long val, flags; 261 unsigned long val, flags;
255 int ret; 262 int ret;
256 int cpu;
257 263
258 ret = kstrtoul_from_user(ubuf, count, 10, &val); 264 ret = kstrtoul_from_user(ubuf, count, 10, &val);
259 if (ret) 265 if (ret)
@@ -264,16 +270,15 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
264 /* 270 /*
265 * In case we trace inside arch_spin_lock() or after (NMI), 271 * In case we trace inside arch_spin_lock() or after (NMI),
266 * we will cause circular lock, so we also need to increase 272 * we will cause circular lock, so we also need to increase
267 * the percpu trace_active here. 273 * the percpu disable_stack_tracer here.
268 */ 274 */
269 cpu = smp_processor_id(); 275 __this_cpu_inc(disable_stack_tracer);
270 per_cpu(trace_active, cpu)++;
271 276
272 arch_spin_lock(&stack_trace_max_lock); 277 arch_spin_lock(&stack_trace_max_lock);
273 *ptr = val; 278 *ptr = val;
274 arch_spin_unlock(&stack_trace_max_lock); 279 arch_spin_unlock(&stack_trace_max_lock);
275 280
276 per_cpu(trace_active, cpu)--; 281 __this_cpu_dec(disable_stack_tracer);
277 local_irq_restore(flags); 282 local_irq_restore(flags);
278 283
279 return count; 284 return count;
@@ -307,12 +312,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
307 312
308static void *t_start(struct seq_file *m, loff_t *pos) 313static void *t_start(struct seq_file *m, loff_t *pos)
309{ 314{
310 int cpu;
311
312 local_irq_disable(); 315 local_irq_disable();
313 316
314 cpu = smp_processor_id(); 317 __this_cpu_inc(disable_stack_tracer);
315 per_cpu(trace_active, cpu)++;
316 318
317 arch_spin_lock(&stack_trace_max_lock); 319 arch_spin_lock(&stack_trace_max_lock);
318 320
@@ -324,12 +326,9 @@ static void *t_start(struct seq_file *m, loff_t *pos)
324 326
325static void t_stop(struct seq_file *m, void *p) 327static void t_stop(struct seq_file *m, void *p)
326{ 328{
327 int cpu;
328
329 arch_spin_unlock(&stack_trace_max_lock); 329 arch_spin_unlock(&stack_trace_max_lock);
330 330
331 cpu = smp_processor_id(); 331 __this_cpu_dec(disable_stack_tracer);
332 per_cpu(trace_active, cpu)--;
333 332
334 local_irq_enable(); 333 local_irq_enable();
335} 334}
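
Editor's note: the stack tracer now guards against recursion with a per-cpu counter manipulated through __this_cpu_inc()/__this_cpu_dec() instead of per_cpu() indexed by smp_processor_id(). The shape of that guard, as a hedged sketch with hypothetical names:

	static DEFINE_PER_CPU(int, my_tracer_disabled);

	static void my_trace_callback(void)
	{
		preempt_disable_notrace();

		/* Only the first (non-nested) entry on this CPU does real work */
		__this_cpu_inc(my_tracer_disabled);
		if (__this_cpu_read(my_tracer_disabled) != 1)
			goto out;

		do_trace_work();	/* hypothetical */
	out:
		__this_cpu_dec(my_tracer_disabled);
		preempt_enable_notrace();
	}
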
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c0168b7da1ea..c74bf39ef764 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3209,9 +3209,8 @@ static int init_worker_pool(struct worker_pool *pool)
3209 INIT_LIST_HEAD(&pool->idle_list); 3209 INIT_LIST_HEAD(&pool->idle_list);
3210 hash_init(pool->busy_hash); 3210 hash_init(pool->busy_hash);
3211 3211
3212 init_timer_deferrable(&pool->idle_timer); 3212 setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout,
3213 pool->idle_timer.function = idle_worker_timeout; 3213 (unsigned long)pool);
3214 pool->idle_timer.data = (unsigned long)pool;
3215 3214
3216 setup_timer(&pool->mayday_timer, pool_mayday_timeout, 3215 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3217 (unsigned long)pool); 3216 (unsigned long)pool);
@@ -4735,6 +4734,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4735 return wfc.ret; 4734 return wfc.ret;
4736} 4735}
4737EXPORT_SYMBOL_GPL(work_on_cpu); 4736EXPORT_SYMBOL_GPL(work_on_cpu);
4737
4738/**
4739 * work_on_cpu_safe - run a function in thread context on a particular cpu
4740 * @cpu: the cpu to run on
4741 * @fn: the function to run
4742 * @arg: the function argument
4743 *
4744 * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
4745 * any locks which would prevent @fn from completing.
4746 *
4747 * Return: The value @fn returns.
4748 */
4749long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
4750{
4751 long ret = -ENODEV;
4752
4753 get_online_cpus();
4754 if (cpu_online(cpu))
4755 ret = work_on_cpu(cpu, fn, arg);
4756 put_online_cpus();
4757 return ret;
4758}
4759EXPORT_SYMBOL_GPL(work_on_cpu_safe);
4738#endif /* CONFIG_SMP */ 4760#endif /* CONFIG_SMP */
4739 4761
4740#ifdef CONFIG_FREEZER 4762#ifdef CONFIG_FREEZER
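
Editor's note: work_on_cpu_safe() wraps work_on_cpu() in get_online_cpus()/put_online_cpus() so the target CPU cannot be unplugged while the work runs, and returns -ENODEV if that CPU is already offline. A hedged usage sketch; the callback and CPU number are illustrative:

	static long read_some_state(void *arg)	/* hypothetical per-cpu work */
	{
		/* runs in thread context, pinned to the requested CPU */
		return 0;
	}

	static int example(void)
	{
		long ret = work_on_cpu_safe(3, read_some_state, NULL);

		if (ret == -ENODEV)
			pr_warn("CPU 3 is offline\n");
		return ret;
	}
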