aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorJames Morris <james.l.morris@oracle.com>2017-07-24 20:44:18 -0400
committerJames Morris <james.l.morris@oracle.com>2017-07-24 20:44:18 -0400
commit53a2ebaaabc1eb8458796fec3bc1e0e80746b642 (patch)
tree9d1f9227b49392cdd2edcc01057517da4f4b09c2 /kernel
parent3cf29931453215536916d0c4da953fce1911ced3 (diff)
parent520eccdfe187591a51ea9ab4c1a024ae4d0f68d9 (diff)
sync to Linus v4.13-rc2 for subsystem developers to work against
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/async.c8
-rw-r--r--kernel/audit.c62
-rw-r--r--kernel/audit.h29
-rw-r--r--kernel/auditsc.c12
-rw-r--r--kernel/bpf/arraymap.c56
-rw-r--r--kernel/bpf/cgroup.c37
-rw-r--r--kernel/bpf/core.c49
-rw-r--r--kernel/bpf/hashtab.c21
-rw-r--r--kernel/bpf/inode.c16
-rw-r--r--kernel/bpf/lpm_trie.c1
-rw-r--r--kernel/bpf/map_in_map.c5
-rw-r--r--kernel/bpf/map_in_map.h1
-rw-r--r--kernel/bpf/stackmap.c1
-rw-r--r--kernel/bpf/syscall.c510
-rw-r--r--kernel/bpf/verifier.c370
-rw-r--r--kernel/cgroup/Makefile1
-rw-r--r--kernel/cgroup/cgroup-internal.h2
-rw-r--r--kernel/cgroup/cgroup-v1.c155
-rw-r--r--kernel/cgroup/cgroup.c160
-rw-r--r--kernel/cgroup/cpuset.c37
-rw-r--r--kernel/cgroup/debug.c357
-rw-r--r--kernel/compat.c904
-rw-r--r--kernel/configs/android-base.config11
-rw-r--r--kernel/configs/android-recommended.config5
-rw-r--r--kernel/cpu.c286
-rw-r--r--kernel/crash_core.c44
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/events/core.c195
-rw-r--r--kernel/events/ring_buffer.c2
-rw-r--r--kernel/exit.c340
-rw-r--r--kernel/extable.c7
-rw-r--r--kernel/fork.c53
-rw-r--r--kernel/futex.c46
-rw-r--r--kernel/groups.c35
-rw-r--r--kernel/irq/Kconfig18
-rw-r--r--kernel/irq/Makefile2
-rw-r--r--kernel/irq/affinity.c89
-rw-r--r--kernel/irq/autoprobe.c4
-rw-r--r--kernel/irq/chip.c199
-rw-r--r--kernel/irq/cpuhotplug.c150
-rw-r--r--kernel/irq/debugfs.c213
-rw-r--r--kernel/irq/devres.c86
-rw-r--r--kernel/irq/generic-chip.c7
-rw-r--r--kernel/irq/handle.c4
-rw-r--r--kernel/irq/internals.h237
-rw-r--r--kernel/irq/irqdesc.c39
-rw-r--r--kernel/irq/irqdomain.c374
-rw-r--r--kernel/irq/manage.c222
-rw-r--r--kernel/irq/migration.c30
-rw-r--r--kernel/irq/msi.c13
-rw-r--r--kernel/irq/pm.c2
-rw-r--r--kernel/irq/proc.c110
-rw-r--r--kernel/irq/timings.c369
-rw-r--r--kernel/jump_label.c20
-rw-r--r--kernel/kallsyms.c10
-rw-r--r--kernel/kcmp.c57
-rw-r--r--kernel/kexec.c8
-rw-r--r--kernel/kexec_core.c43
-rw-r--r--kernel/kexec_file.c29
-rw-r--r--kernel/kexec_internal.h2
-rw-r--r--kernel/kmod.c56
-rw-r--r--kernel/kprobes.c103
-rw-r--r--kernel/ksysfs.c4
-rw-r--r--kernel/livepatch/Kconfig1
-rw-r--r--kernel/livepatch/patch.c8
-rw-r--r--kernel/livepatch/transition.c36
-rw-r--r--kernel/locking/lockdep.c176
-rw-r--r--kernel/locking/mutex.c6
-rw-r--r--kernel/locking/qrwlock.c1
-rw-r--r--kernel/locking/qspinlock.c1
-rw-r--r--kernel/locking/qspinlock_paravirt.h3
-rw-r--r--kernel/locking/rtmutex-debug.c6
-rw-r--r--kernel/locking/rtmutex-debug.h2
-rw-r--r--kernel/locking/rtmutex.c62
-rw-r--r--kernel/locking/rtmutex.h2
-rw-r--r--kernel/locking/rwsem-spinlock.c4
-rw-r--r--kernel/memremap.c6
-rw-r--r--kernel/module.c102
-rw-r--r--kernel/padata.c43
-rw-r--r--kernel/pid.c7
-rw-r--r--kernel/power/hibernate.c2
-rw-r--r--kernel/power/main.c2
-rw-r--r--kernel/power/snapshot.c15
-rw-r--r--kernel/power/suspend.c6
-rw-r--r--kernel/power/swap.c14
-rw-r--r--kernel/printk/internal.h6
-rw-r--r--kernel/printk/printk.c67
-rw-r--r--kernel/printk/printk_safe.c36
-rw-r--r--kernel/ptrace.c20
-rw-r--r--kernel/rcu/Kconfig242
-rw-r--r--kernel/rcu/Kconfig.debug82
-rw-r--r--kernel/rcu/Makefile2
-rw-r--r--kernel/rcu/rcu.h277
-rw-r--r--kernel/rcu/rcuperf.c129
-rw-r--r--kernel/rcu/rcutorture.c21
-rw-r--r--kernel/rcu/srcu.c662
-rw-r--r--kernel/rcu/srcutiny.c87
-rw-r--r--kernel/rcu/srcutree.c192
-rw-r--r--kernel/rcu/tiny.c54
-rw-r--r--kernel/rcu/tiny_plugin.h123
-rw-r--r--kernel/rcu/tree.c195
-rw-r--r--kernel/rcu/tree.h109
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_plugin.h573
-rw-r--r--kernel/rcu/tree_trace.c494
-rw-r--r--kernel/rcu/update.c77
-rw-r--r--kernel/sched/Makefile6
-rw-r--r--kernel/sched/clock.c128
-rw-r--r--kernel/sched/completion.c2
-rw-r--r--kernel/sched/core.c782
-rw-r--r--kernel/sched/cpufreq_schedutil.c15
-rw-r--r--kernel/sched/cputime.c164
-rw-r--r--kernel/sched/deadline.c908
-rw-r--r--kernel/sched/debug.c17
-rw-r--r--kernel/sched/fair.c485
-rw-r--r--kernel/sched/features.h2
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/loadavg.c51
-rw-r--r--kernel/sched/rt.c323
-rw-r--r--kernel/sched/sched.h113
-rw-r--r--kernel/sched/topology.c430
-rw-r--r--kernel/sched/wait.c441
-rw-r--r--kernel/sched/wait_bit.c286
-rw-r--r--kernel/signal.c183
-rw-r--r--kernel/smp.c16
-rw-r--r--kernel/stop_machine.c11
-rw-r--r--kernel/sys.c122
-rw-r--r--kernel/sysctl.c335
-rw-r--r--kernel/sysctl_binary.c6
-rw-r--r--kernel/time/Kconfig50
-rw-r--r--kernel/time/alarmtimer.c377
-rw-r--r--kernel/time/clocksource.c3
-rw-r--r--kernel/time/hrtimer.c112
-rw-r--r--kernel/time/itimer.c46
-rw-r--r--kernel/time/posix-clock.c117
-rw-r--r--kernel/time/posix-cpu-timers.c167
-rw-r--r--kernel/time/posix-stubs.c156
-rw-r--r--kernel/time/posix-timers.c770
-rw-r--r--kernel/time/posix-timers.h40
-rw-r--r--kernel/time/tick-broadcast.c4
-rw-r--r--kernel/time/tick-internal.h2
-rw-r--r--kernel/time/tick-sched.c74
-rw-r--r--kernel/time/tick-sched.h2
-rw-r--r--kernel/time/time.c164
-rw-r--r--kernel/time/timekeeping.c109
-rw-r--r--kernel/time/timer.c50
-rw-r--r--kernel/trace/Kconfig22
-rw-r--r--kernel/trace/blktrace.c4
-rw-r--r--kernel/trace/bpf_trace.c66
-rw-r--r--kernel/trace/ftrace.c416
-rw-r--r--kernel/trace/ring_buffer.c10
-rw-r--r--kernel/trace/trace.c473
-rw-r--r--kernel/trace/trace.h36
-rw-r--r--kernel/trace/trace_events.c66
-rw-r--r--kernel/trace/trace_functions.c12
-rw-r--r--kernel/trace/trace_kprobe.c25
-rw-r--r--kernel/trace/trace_output.c27
-rw-r--r--kernel/trace/trace_sched_switch.c72
-rw-r--r--kernel/trace/trace_stack.c12
-rw-r--r--kernel/watchdog.c289
-rw-r--r--kernel/watchdog_hld.c37
-rw-r--r--kernel/workqueue.c4
163 files changed, 11323 insertions, 7595 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 72aa080f91f0..4cb8e8b23c6e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -82,7 +82,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
82obj-$(CONFIG_KGDB) += debug/ 82obj-$(CONFIG_KGDB) += debug/
83obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 83obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
84obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o 84obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
85obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o 85obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o
86obj-$(CONFIG_SECCOMP) += seccomp.o 86obj-$(CONFIG_SECCOMP) += seccomp.o
87obj-$(CONFIG_RELAY) += relay.o 87obj-$(CONFIG_RELAY) += relay.o
88obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 88obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/async.c b/kernel/async.c
index d2edd6efec56..2cbd3dd5940d 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -114,14 +114,14 @@ static void async_run_entry_fn(struct work_struct *work)
114 ktime_t uninitialized_var(calltime), delta, rettime; 114 ktime_t uninitialized_var(calltime), delta, rettime;
115 115
116 /* 1) run (and print duration) */ 116 /* 1) run (and print duration) */
117 if (initcall_debug && system_state == SYSTEM_BOOTING) { 117 if (initcall_debug && system_state < SYSTEM_RUNNING) {
118 pr_debug("calling %lli_%pF @ %i\n", 118 pr_debug("calling %lli_%pF @ %i\n",
119 (long long)entry->cookie, 119 (long long)entry->cookie,
120 entry->func, task_pid_nr(current)); 120 entry->func, task_pid_nr(current));
121 calltime = ktime_get(); 121 calltime = ktime_get();
122 } 122 }
123 entry->func(entry->data, entry->cookie); 123 entry->func(entry->data, entry->cookie);
124 if (initcall_debug && system_state == SYSTEM_BOOTING) { 124 if (initcall_debug && system_state < SYSTEM_RUNNING) {
125 rettime = ktime_get(); 125 rettime = ktime_get();
126 delta = ktime_sub(rettime, calltime); 126 delta = ktime_sub(rettime, calltime);
127 pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", 127 pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
@@ -284,14 +284,14 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
284{ 284{
285 ktime_t uninitialized_var(starttime), delta, endtime; 285 ktime_t uninitialized_var(starttime), delta, endtime;
286 286
287 if (initcall_debug && system_state == SYSTEM_BOOTING) { 287 if (initcall_debug && system_state < SYSTEM_RUNNING) {
288 pr_debug("async_waiting @ %i\n", task_pid_nr(current)); 288 pr_debug("async_waiting @ %i\n", task_pid_nr(current));
289 starttime = ktime_get(); 289 starttime = ktime_get();
290 } 290 }
291 291
292 wait_event(async_done, lowest_in_progress(domain) >= cookie); 292 wait_event(async_done, lowest_in_progress(domain) >= cookie);
293 293
294 if (initcall_debug && system_state == SYSTEM_BOOTING) { 294 if (initcall_debug && system_state < SYSTEM_RUNNING) {
295 endtime = ktime_get(); 295 endtime = ktime_get();
296 delta = ktime_sub(endtime, starttime); 296 delta = ktime_sub(endtime, starttime);
297 297
diff --git a/kernel/audit.c b/kernel/audit.c
index 4b7d49868ce1..6dd556931739 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -575,12 +575,16 @@ static void kauditd_retry_skb(struct sk_buff *skb)
575 575
576/** 576/**
577 * auditd_reset - Disconnect the auditd connection 577 * auditd_reset - Disconnect the auditd connection
578 * @ac: auditd connection state
578 * 579 *
579 * Description: 580 * Description:
580 * Break the auditd/kauditd connection and move all the queued records into the 581 * Break the auditd/kauditd connection and move all the queued records into the
581 * hold queue in case auditd reconnects. 582 * hold queue in case auditd reconnects. It is important to note that the @ac
583 * pointer should never be dereferenced inside this function as it may be NULL
584 * or invalid, you can only compare the memory address! If @ac is NULL then
585 * the connection will always be reset.
582 */ 586 */
583static void auditd_reset(void) 587static void auditd_reset(const struct auditd_connection *ac)
584{ 588{
585 unsigned long flags; 589 unsigned long flags;
586 struct sk_buff *skb; 590 struct sk_buff *skb;
@@ -590,17 +594,21 @@ static void auditd_reset(void)
590 spin_lock_irqsave(&auditd_conn_lock, flags); 594 spin_lock_irqsave(&auditd_conn_lock, flags);
591 ac_old = rcu_dereference_protected(auditd_conn, 595 ac_old = rcu_dereference_protected(auditd_conn,
592 lockdep_is_held(&auditd_conn_lock)); 596 lockdep_is_held(&auditd_conn_lock));
597 if (ac && ac != ac_old) {
598 /* someone already registered a new auditd connection */
599 spin_unlock_irqrestore(&auditd_conn_lock, flags);
600 return;
601 }
593 rcu_assign_pointer(auditd_conn, NULL); 602 rcu_assign_pointer(auditd_conn, NULL);
594 spin_unlock_irqrestore(&auditd_conn_lock, flags); 603 spin_unlock_irqrestore(&auditd_conn_lock, flags);
595 604
596 if (ac_old) 605 if (ac_old)
597 call_rcu(&ac_old->rcu, auditd_conn_free); 606 call_rcu(&ac_old->rcu, auditd_conn_free);
598 607
599 /* flush all of the main and retry queues to the hold queue */ 608 /* flush the retry queue to the hold queue, but don't touch the main
609 * queue since we need to process that normally for multicast */
600 while ((skb = skb_dequeue(&audit_retry_queue))) 610 while ((skb = skb_dequeue(&audit_retry_queue)))
601 kauditd_hold_skb(skb); 611 kauditd_hold_skb(skb);
602 while ((skb = skb_dequeue(&audit_queue)))
603 kauditd_hold_skb(skb);
604} 612}
605 613
606/** 614/**
@@ -633,6 +641,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
633 ac = rcu_dereference(auditd_conn); 641 ac = rcu_dereference(auditd_conn);
634 if (!ac) { 642 if (!ac) {
635 rcu_read_unlock(); 643 rcu_read_unlock();
644 kfree_skb(skb);
636 rc = -ECONNREFUSED; 645 rc = -ECONNREFUSED;
637 goto err; 646 goto err;
638 } 647 }
@@ -649,8 +658,8 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
649 return rc; 658 return rc;
650 659
651err: 660err:
652 if (rc == -ECONNREFUSED) 661 if (ac && rc == -ECONNREFUSED)
653 auditd_reset(); 662 auditd_reset(ac);
654 return rc; 663 return rc;
655} 664}
656 665
@@ -795,9 +804,9 @@ static int kauditd_thread(void *dummy)
795 rc = kauditd_send_queue(sk, portid, 804 rc = kauditd_send_queue(sk, portid,
796 &audit_hold_queue, UNICAST_RETRIES, 805 &audit_hold_queue, UNICAST_RETRIES,
797 NULL, kauditd_rehold_skb); 806 NULL, kauditd_rehold_skb);
798 if (rc < 0) { 807 if (ac && rc < 0) {
799 sk = NULL; 808 sk = NULL;
800 auditd_reset(); 809 auditd_reset(ac);
801 goto main_queue; 810 goto main_queue;
802 } 811 }
803 812
@@ -805,9 +814,9 @@ static int kauditd_thread(void *dummy)
805 rc = kauditd_send_queue(sk, portid, 814 rc = kauditd_send_queue(sk, portid,
806 &audit_retry_queue, UNICAST_RETRIES, 815 &audit_retry_queue, UNICAST_RETRIES,
807 NULL, kauditd_hold_skb); 816 NULL, kauditd_hold_skb);
808 if (rc < 0) { 817 if (ac && rc < 0) {
809 sk = NULL; 818 sk = NULL;
810 auditd_reset(); 819 auditd_reset(ac);
811 goto main_queue; 820 goto main_queue;
812 } 821 }
813 822
@@ -815,12 +824,13 @@ main_queue:
815 /* process the main queue - do the multicast send and attempt 824 /* process the main queue - do the multicast send and attempt
816 * unicast, dump failed record sends to the retry queue; if 825 * unicast, dump failed record sends to the retry queue; if
817 * sk == NULL due to previous failures we will just do the 826 * sk == NULL due to previous failures we will just do the
818 * multicast send and move the record to the retry queue */ 827 * multicast send and move the record to the hold queue */
819 rc = kauditd_send_queue(sk, portid, &audit_queue, 1, 828 rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
820 kauditd_send_multicast_skb, 829 kauditd_send_multicast_skb,
821 kauditd_retry_skb); 830 (sk ?
822 if (sk == NULL || rc < 0) 831 kauditd_retry_skb : kauditd_hold_skb));
823 auditd_reset(); 832 if (ac && rc < 0)
833 auditd_reset(ac);
824 sk = NULL; 834 sk = NULL;
825 835
826 /* drop our netns reference, no auditd sends past this line */ 836 /* drop our netns reference, no auditd sends past this line */
@@ -1230,7 +1240,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
1230 auditd_pid, 1); 1240 auditd_pid, 1);
1231 1241
1232 /* unregister the auditd connection */ 1242 /* unregister the auditd connection */
1233 auditd_reset(); 1243 auditd_reset(NULL);
1234 } 1244 }
1235 } 1245 }
1236 if (s.mask & AUDIT_STATUS_RATE_LIMIT) { 1246 if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
@@ -1999,22 +2009,10 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
1999 2009
2000static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) 2010static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
2001{ 2011{
2002 kernel_cap_t *perm = &name->fcap.permitted; 2012 audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
2003 kernel_cap_t *inh = &name->fcap.inheritable; 2013 audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
2004 int log = 0; 2014 audit_log_format(ab, " cap_fe=%d cap_fver=%x",
2005 2015 name->fcap.fE, name->fcap_ver);
2006 if (!cap_isclear(*perm)) {
2007 audit_log_cap(ab, "cap_fp", perm);
2008 log = 1;
2009 }
2010 if (!cap_isclear(*inh)) {
2011 audit_log_cap(ab, "cap_fi", inh);
2012 log = 1;
2013 }
2014
2015 if (log)
2016 audit_log_format(ab, " cap_fe=%d cap_fver=%x",
2017 name->fcap.fE, name->fcap_ver);
2018} 2016}
2019 2017
2020static inline int audit_copy_fcaps(struct audit_names *name, 2018static inline int audit_copy_fcaps(struct audit_names *name,
diff --git a/kernel/audit.h b/kernel/audit.h
index ddfce2ea4891..b331d9b83f63 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -68,6 +68,7 @@ struct audit_cap_data {
68 unsigned int fE; /* effective bit of file cap */ 68 unsigned int fE; /* effective bit of file cap */
69 kernel_cap_t effective; /* effective set of process */ 69 kernel_cap_t effective; /* effective set of process */
70 }; 70 };
71 kernel_cap_t ambient;
71}; 72};
72 73
73/* When fs/namei.c:getname() is called, we store the pointer in name and bump 74/* When fs/namei.c:getname() is called, we store the pointer in name and bump
@@ -247,13 +248,13 @@ struct audit_netlink_list {
247 struct sk_buff_head q; 248 struct sk_buff_head q;
248}; 249};
249 250
250int audit_send_list(void *); 251int audit_send_list(void *_dest);
251 252
252extern int selinux_audit_rule_update(void); 253extern int selinux_audit_rule_update(void);
253 254
254extern struct mutex audit_filter_mutex; 255extern struct mutex audit_filter_mutex;
255extern int audit_del_rule(struct audit_entry *); 256extern int audit_del_rule(struct audit_entry *entry);
256extern void audit_free_rule_rcu(struct rcu_head *); 257extern void audit_free_rule_rcu(struct rcu_head *head);
257extern struct list_head audit_filter_list[]; 258extern struct list_head audit_filter_list[];
258 259
259extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); 260extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
@@ -301,17 +302,17 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
301#endif /* CONFIG_AUDIT_WATCH */ 302#endif /* CONFIG_AUDIT_WATCH */
302 303
303#ifdef CONFIG_AUDIT_TREE 304#ifdef CONFIG_AUDIT_TREE
304extern struct audit_chunk *audit_tree_lookup(const struct inode *); 305extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
305extern void audit_put_chunk(struct audit_chunk *); 306extern void audit_put_chunk(struct audit_chunk *chunk);
306extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *); 307extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree);
307extern int audit_make_tree(struct audit_krule *, char *, u32); 308extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
308extern int audit_add_tree_rule(struct audit_krule *); 309extern int audit_add_tree_rule(struct audit_krule *rule);
309extern int audit_remove_tree_rule(struct audit_krule *); 310extern int audit_remove_tree_rule(struct audit_krule *rule);
310extern void audit_trim_trees(void); 311extern void audit_trim_trees(void);
311extern int audit_tag_tree(char *old, char *new); 312extern int audit_tag_tree(char *old, char *new);
312extern const char *audit_tree_path(struct audit_tree *); 313extern const char *audit_tree_path(struct audit_tree *tree);
313extern void audit_put_tree(struct audit_tree *); 314extern void audit_put_tree(struct audit_tree *tree);
314extern void audit_kill_trees(struct list_head *); 315extern void audit_kill_trees(struct list_head *list);
315#else 316#else
316#define audit_remove_tree_rule(rule) BUG() 317#define audit_remove_tree_rule(rule) BUG()
317#define audit_add_tree_rule(rule) -EINVAL 318#define audit_add_tree_rule(rule) -EINVAL
@@ -323,7 +324,7 @@ extern void audit_kill_trees(struct list_head *);
323#define audit_kill_trees(list) BUG() 324#define audit_kill_trees(list) BUG()
324#endif 325#endif
325 326
326extern char *audit_unpack_string(void **, size_t *, size_t); 327extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
327 328
328extern pid_t audit_sig_pid; 329extern pid_t audit_sig_pid;
329extern kuid_t audit_sig_uid; 330extern kuid_t audit_sig_uid;
@@ -333,7 +334,7 @@ extern int audit_filter(int msgtype, unsigned int listtype);
333 334
334#ifdef CONFIG_AUDITSYSCALL 335#ifdef CONFIG_AUDITSYSCALL
335extern int audit_signal_info(int sig, struct task_struct *t); 336extern int audit_signal_info(int sig, struct task_struct *t);
336extern void audit_filter_inodes(struct task_struct *, struct audit_context *); 337extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx);
337extern struct list_head *audit_killed_trees(void); 338extern struct list_head *audit_killed_trees(void);
338#else 339#else
339#define audit_signal_info(s,t) AUDIT_DISABLED 340#define audit_signal_info(s,t) AUDIT_DISABLED
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index bb724baa7ac9..3260ba2312a9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1261,6 +1261,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1261 audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); 1261 audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
1262 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); 1262 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
1263 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); 1263 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
1264 audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient);
1264 break; 1265 break;
1265 case AUDIT_MMAP: 1266 case AUDIT_MMAP:
1266 audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, 1267 audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
@@ -1382,9 +1383,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1382 audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); 1383 audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
1383 audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); 1384 audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
1384 audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); 1385 audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
1385 audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted); 1386 audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient);
1386 audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable); 1387 audit_log_cap(ab, "pp", &axs->new_pcap.permitted);
1387 audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); 1388 audit_log_cap(ab, "pi", &axs->new_pcap.inheritable);
1389 audit_log_cap(ab, "pe", &axs->new_pcap.effective);
1390 audit_log_cap(ab, "pa", &axs->new_pcap.ambient);
1388 break; } 1391 break; }
1389 1392
1390 } 1393 }
@@ -2342,10 +2345,12 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2342 ax->old_pcap.permitted = old->cap_permitted; 2345 ax->old_pcap.permitted = old->cap_permitted;
2343 ax->old_pcap.inheritable = old->cap_inheritable; 2346 ax->old_pcap.inheritable = old->cap_inheritable;
2344 ax->old_pcap.effective = old->cap_effective; 2347 ax->old_pcap.effective = old->cap_effective;
2348 ax->old_pcap.ambient = old->cap_ambient;
2345 2349
2346 ax->new_pcap.permitted = new->cap_permitted; 2350 ax->new_pcap.permitted = new->cap_permitted;
2347 ax->new_pcap.inheritable = new->cap_inheritable; 2351 ax->new_pcap.inheritable = new->cap_inheritable;
2348 ax->new_pcap.effective = new->cap_effective; 2352 ax->new_pcap.effective = new->cap_effective;
2353 ax->new_pcap.ambient = new->cap_ambient;
2349 return 0; 2354 return 0;
2350} 2355}
2351 2356
@@ -2364,6 +2369,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old)
2364 context->capset.cap.effective = new->cap_effective; 2369 context->capset.cap.effective = new->cap_effective;
2365 context->capset.cap.inheritable = new->cap_effective; 2370 context->capset.cap.inheritable = new->cap_effective;
2366 context->capset.cap.permitted = new->cap_permitted; 2371 context->capset.cap.permitted = new->cap_permitted;
2372 context->capset.cap.ambient = new->cap_ambient;
2367 context->type = AUDIT_CAPSET; 2373 context->type = AUDIT_CAPSET;
2368} 2374}
2369 2375
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 5e00b2333c26..d771a3872500 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -86,6 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
86 array->map.key_size = attr->key_size; 86 array->map.key_size = attr->key_size;
87 array->map.value_size = attr->value_size; 87 array->map.value_size = attr->value_size;
88 array->map.max_entries = attr->max_entries; 88 array->map.max_entries = attr->max_entries;
89 array->map.map_flags = attr->map_flags;
89 array->elem_size = elem_size; 90 array->elem_size = elem_size;
90 91
91 if (!percpu) 92 if (!percpu)
@@ -334,6 +335,26 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
334} 335}
335 336
336/* only called from syscall */ 337/* only called from syscall */
338int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
339{
340 void **elem, *ptr;
341 int ret = 0;
342
343 if (!map->ops->map_fd_sys_lookup_elem)
344 return -ENOTSUPP;
345
346 rcu_read_lock();
347 elem = array_map_lookup_elem(map, key);
348 if (elem && (ptr = READ_ONCE(*elem)))
349 *value = map->ops->map_fd_sys_lookup_elem(ptr);
350 else
351 ret = -ENOENT;
352 rcu_read_unlock();
353
354 return ret;
355}
356
357/* only called from syscall */
337int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, 358int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
338 void *key, void *value, u64 map_flags) 359 void *key, void *value, u64 map_flags)
339{ 360{
@@ -399,6 +420,11 @@ static void prog_fd_array_put_ptr(void *ptr)
399 bpf_prog_put(ptr); 420 bpf_prog_put(ptr);
400} 421}
401 422
423static u32 prog_fd_array_sys_lookup_elem(void *ptr)
424{
425 return ((struct bpf_prog *)ptr)->aux->id;
426}
427
402/* decrement refcnt of all bpf_progs that are stored in this map */ 428/* decrement refcnt of all bpf_progs that are stored in this map */
403void bpf_fd_array_map_clear(struct bpf_map *map) 429void bpf_fd_array_map_clear(struct bpf_map *map)
404{ 430{
@@ -417,6 +443,7 @@ const struct bpf_map_ops prog_array_map_ops = {
417 .map_delete_elem = fd_array_map_delete_elem, 443 .map_delete_elem = fd_array_map_delete_elem,
418 .map_fd_get_ptr = prog_fd_array_get_ptr, 444 .map_fd_get_ptr = prog_fd_array_get_ptr,
419 .map_fd_put_ptr = prog_fd_array_put_ptr, 445 .map_fd_put_ptr = prog_fd_array_put_ptr,
446 .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
420}; 447};
421 448
422static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, 449static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -451,38 +478,24 @@ static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
451static void *perf_event_fd_array_get_ptr(struct bpf_map *map, 478static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
452 struct file *map_file, int fd) 479 struct file *map_file, int fd)
453{ 480{
454 const struct perf_event_attr *attr;
455 struct bpf_event_entry *ee; 481 struct bpf_event_entry *ee;
456 struct perf_event *event; 482 struct perf_event *event;
457 struct file *perf_file; 483 struct file *perf_file;
484 u64 value;
458 485
459 perf_file = perf_event_get(fd); 486 perf_file = perf_event_get(fd);
460 if (IS_ERR(perf_file)) 487 if (IS_ERR(perf_file))
461 return perf_file; 488 return perf_file;
462 489
490 ee = ERR_PTR(-EOPNOTSUPP);
463 event = perf_file->private_data; 491 event = perf_file->private_data;
464 ee = ERR_PTR(-EINVAL); 492 if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
465
466 attr = perf_event_attrs(event);
467 if (IS_ERR(attr) || attr->inherit)
468 goto err_out; 493 goto err_out;
469 494
470 switch (attr->type) { 495 ee = bpf_event_entry_gen(perf_file, map_file);
471 case PERF_TYPE_SOFTWARE: 496 if (ee)
472 if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) 497 return ee;
473 goto err_out; 498 ee = ERR_PTR(-ENOMEM);
474 /* fall-through */
475 case PERF_TYPE_RAW:
476 case PERF_TYPE_HARDWARE:
477 ee = bpf_event_entry_gen(perf_file, map_file);
478 if (ee)
479 return ee;
480 ee = ERR_PTR(-ENOMEM);
481 /* fall-through */
482 default:
483 break;
484 }
485
486err_out: 499err_out:
487 fput(perf_file); 500 fput(perf_file);
488 return ee; 501 return ee;
@@ -598,4 +611,5 @@ const struct bpf_map_ops array_of_maps_map_ops = {
598 .map_delete_elem = fd_array_map_delete_elem, 611 .map_delete_elem = fd_array_map_delete_elem,
599 .map_fd_get_ptr = bpf_map_fd_get_ptr, 612 .map_fd_get_ptr = bpf_map_fd_get_ptr,
600 .map_fd_put_ptr = bpf_map_fd_put_ptr, 613 .map_fd_put_ptr = bpf_map_fd_put_ptr,
614 .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
601}; 615};
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ea6033cba947..546113430049 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
236 return ret; 236 return ret;
237} 237}
238EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); 238EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
239
240/**
241 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
242 * @sk: socket to get cgroup from
243 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
244 * sk with connection information (IP addresses, etc.) May not contain
245 * cgroup info if it is a req sock.
246 * @type: The type of program to be exectuted
247 *
248 * socket passed is expected to be of type INET or INET6.
249 *
250 * The program type passed in via @type must be suitable for sock_ops
251 * filtering. No further check is performed to assert that.
252 *
253 * This function will return %-EPERM if any if an attached program was found
254 * and if it returned != 1 during execution. In all other cases, 0 is returned.
255 */
256int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
257 struct bpf_sock_ops_kern *sock_ops,
258 enum bpf_attach_type type)
259{
260 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
261 struct bpf_prog *prog;
262 int ret = 0;
263
264
265 rcu_read_lock();
266
267 prog = rcu_dereference(cgrp->bpf.effective[type]);
268 if (prog)
269 ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
270
271 rcu_read_unlock();
272
273 return ret;
274}
275EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index dedf367f59bb..ad5f55922a13 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -763,10 +763,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
763 * 763 *
764 * Decode and execute eBPF instructions. 764 * Decode and execute eBPF instructions.
765 */ 765 */
766static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) 766static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
767 u64 *stack)
767{ 768{
768 u64 stack[MAX_BPF_STACK / sizeof(u64)]; 769 u64 tmp;
769 u64 regs[MAX_BPF_REG], tmp;
770 static const void *jumptable[256] = { 770 static const void *jumptable[256] = {
771 [0 ... 255] = &&default_label, 771 [0 ... 255] = &&default_label,
772 /* Now overwrite non-defaults ... */ 772 /* Now overwrite non-defaults ... */
@@ -824,7 +824,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
824 [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, 824 [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
825 /* Call instruction */ 825 /* Call instruction */
826 [BPF_JMP | BPF_CALL] = &&JMP_CALL, 826 [BPF_JMP | BPF_CALL] = &&JMP_CALL,
827 [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL, 827 [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
828 /* Jumps */ 828 /* Jumps */
829 [BPF_JMP | BPF_JA] = &&JMP_JA, 829 [BPF_JMP | BPF_JA] = &&JMP_JA,
830 [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, 830 [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -874,9 +874,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
874#define CONT ({ insn++; goto select_insn; }) 874#define CONT ({ insn++; goto select_insn; })
875#define CONT_JMP ({ insn++; goto select_insn; }) 875#define CONT_JMP ({ insn++; goto select_insn; })
876 876
877 FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
878 ARG1 = (u64) (unsigned long) ctx;
879
880select_insn: 877select_insn:
881 goto *jumptable[insn->code]; 878 goto *jumptable[insn->code];
882 879
@@ -1219,7 +1216,39 @@ load_byte:
1219 WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); 1216 WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
1220 return 0; 1217 return 0;
1221} 1218}
1222STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ 1219STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
1220
1221#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
1222#define DEFINE_BPF_PROG_RUN(stack_size) \
1223static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
1224{ \
1225 u64 stack[stack_size / sizeof(u64)]; \
1226 u64 regs[MAX_BPF_REG]; \
1227\
1228 FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
1229 ARG1 = (u64) (unsigned long) ctx; \
1230 return ___bpf_prog_run(regs, insn, stack); \
1231}
1232
1233#define EVAL1(FN, X) FN(X)
1234#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
1235#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
1236#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
1237#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
1238#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)
1239
1240EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
1241EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
1242EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
1243
1244#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
1245
1246static unsigned int (*interpreters[])(const void *ctx,
1247 const struct bpf_insn *insn) = {
1248EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
1249EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
1250EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
1251};
1223 1252
1224bool bpf_prog_array_compatible(struct bpf_array *array, 1253bool bpf_prog_array_compatible(struct bpf_array *array,
1225 const struct bpf_prog *fp) 1254 const struct bpf_prog *fp)
@@ -1268,7 +1297,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
1268 */ 1297 */
1269struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) 1298struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
1270{ 1299{
1271 fp->bpf_func = (void *) __bpf_prog_run; 1300 u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
1301
1302 fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
1272 1303
1273 /* eBPF JITs can rewrite the program in case constant 1304 /* eBPF JITs can rewrite the program in case constant
1274 * blinding is active. However, in case of error during 1305 * blinding is active. However, in case of error during
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 004334ea13ba..4fb463172aa8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1244,6 +1244,26 @@ static void fd_htab_map_free(struct bpf_map *map)
1244} 1244}
1245 1245
1246/* only called from syscall */ 1246/* only called from syscall */
1247int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
1248{
1249 void **ptr;
1250 int ret = 0;
1251
1252 if (!map->ops->map_fd_sys_lookup_elem)
1253 return -ENOTSUPP;
1254
1255 rcu_read_lock();
1256 ptr = htab_map_lookup_elem(map, key);
1257 if (ptr)
1258 *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
1259 else
1260 ret = -ENOENT;
1261 rcu_read_unlock();
1262
1263 return ret;
1264}
1265
1266/* only called from syscall */
1247int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, 1267int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
1248 void *key, void *value, u64 map_flags) 1268 void *key, void *value, u64 map_flags)
1249{ 1269{
@@ -1305,4 +1325,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
1305 .map_delete_elem = htab_map_delete_elem, 1325 .map_delete_elem = htab_map_delete_elem,
1306 .map_fd_get_ptr = bpf_map_fd_get_ptr, 1326 .map_fd_get_ptr = bpf_map_fd_get_ptr,
1307 .map_fd_put_ptr = bpf_map_fd_put_ptr, 1327 .map_fd_put_ptr = bpf_map_fd_put_ptr,
1328 .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
1308}; 1329};
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 9bbd33497d3d..e833ed914358 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -377,10 +377,22 @@ static void bpf_evict_inode(struct inode *inode)
377 bpf_any_put(inode->i_private, type); 377 bpf_any_put(inode->i_private, type);
378} 378}
379 379
380/*
381 * Display the mount options in /proc/mounts.
382 */
383static int bpf_show_options(struct seq_file *m, struct dentry *root)
384{
385 umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
386
387 if (mode != S_IRWXUGO)
388 seq_printf(m, ",mode=%o", mode);
389 return 0;
390}
391
380static const struct super_operations bpf_super_ops = { 392static const struct super_operations bpf_super_ops = {
381 .statfs = simple_statfs, 393 .statfs = simple_statfs,
382 .drop_inode = generic_delete_inode, 394 .drop_inode = generic_delete_inode,
383 .show_options = generic_show_options, 395 .show_options = bpf_show_options,
384 .evict_inode = bpf_evict_inode, 396 .evict_inode = bpf_evict_inode,
385}; 397};
386 398
@@ -434,8 +446,6 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
434 struct inode *inode; 446 struct inode *inode;
435 int ret; 447 int ret;
436 448
437 save_mount_options(sb, data);
438
439 ret = bpf_parse_options(data, &opts); 449 ret = bpf_parse_options(data, &opts);
440 if (ret) 450 if (ret)
441 return ret; 451 return ret;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 39cfafd895b8..b09185f0f17d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -432,6 +432,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
432 trie->map.key_size = attr->key_size; 432 trie->map.key_size = attr->key_size;
433 trie->map.value_size = attr->value_size; 433 trie->map.value_size = attr->value_size;
434 trie->map.max_entries = attr->max_entries; 434 trie->map.max_entries = attr->max_entries;
435 trie->map.map_flags = attr->map_flags;
435 trie->data_size = attr->key_size - 436 trie->data_size = attr->key_size -
436 offsetof(struct bpf_lpm_trie_key, data); 437 offsetof(struct bpf_lpm_trie_key, data);
437 trie->max_prefixlen = trie->data_size * 8; 438 trie->max_prefixlen = trie->data_size * 8;
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 59bcdf821ae4..1da574612bea 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -95,3 +95,8 @@ void bpf_map_fd_put_ptr(void *ptr)
95 */ 95 */
96 bpf_map_put(ptr); 96 bpf_map_put(ptr);
97} 97}
98
99u32 bpf_map_fd_sys_lookup_elem(void *ptr)
100{
101 return ((struct bpf_map *)ptr)->id;
102}
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 177fadb689dc..6183db9ec08c 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -19,5 +19,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
19void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, 19void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
20 int ufd); 20 int ufd);
21void bpf_map_fd_put_ptr(void *ptr); 21void bpf_map_fd_put_ptr(void *ptr);
22u32 bpf_map_fd_sys_lookup_elem(void *ptr);
22 23
23#endif 24#endif
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 4dfd6f2ec2f9..31147d730abf 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -88,6 +88,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
88 smap->map.key_size = attr->key_size; 88 smap->map.key_size = attr->key_size;
89 smap->map.value_size = value_size; 89 smap->map.value_size = value_size;
90 smap->map.max_entries = attr->max_entries; 90 smap->map.max_entries = attr->max_entries;
91 smap->map.map_flags = attr->map_flags;
91 smap->n_buckets = n_buckets; 92 smap->n_buckets = n_buckets;
92 smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 93 smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
93 94
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 265a0d854e33..045646da97cc 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -22,8 +22,20 @@
22#include <linux/filter.h> 22#include <linux/filter.h>
23#include <linux/version.h> 23#include <linux/version.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/idr.h>
26
27#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
28 (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
29 (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
30 (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
31#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
32#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
25 33
26DEFINE_PER_CPU(int, bpf_prog_active); 34DEFINE_PER_CPU(int, bpf_prog_active);
35static DEFINE_IDR(prog_idr);
36static DEFINE_SPINLOCK(prog_idr_lock);
37static DEFINE_IDR(map_idr);
38static DEFINE_SPINLOCK(map_idr_lock);
27 39
28int sysctl_unprivileged_bpf_disabled __read_mostly; 40int sysctl_unprivileged_bpf_disabled __read_mostly;
29 41
@@ -114,6 +126,37 @@ static void bpf_map_uncharge_memlock(struct bpf_map *map)
114 free_uid(user); 126 free_uid(user);
115} 127}
116 128
129static int bpf_map_alloc_id(struct bpf_map *map)
130{
131 int id;
132
133 spin_lock_bh(&map_idr_lock);
134 id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
135 if (id > 0)
136 map->id = id;
137 spin_unlock_bh(&map_idr_lock);
138
139 if (WARN_ON_ONCE(!id))
140 return -ENOSPC;
141
142 return id > 0 ? 0 : id;
143}
144
145static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
146{
147 if (do_idr_lock)
148 spin_lock_bh(&map_idr_lock);
149 else
150 __acquire(&map_idr_lock);
151
152 idr_remove(&map_idr, map->id);
153
154 if (do_idr_lock)
155 spin_unlock_bh(&map_idr_lock);
156 else
157 __release(&map_idr_lock);
158}
159
117/* called from workqueue */ 160/* called from workqueue */
118static void bpf_map_free_deferred(struct work_struct *work) 161static void bpf_map_free_deferred(struct work_struct *work)
119{ 162{
@@ -135,14 +178,21 @@ static void bpf_map_put_uref(struct bpf_map *map)
135/* decrement map refcnt and schedule it for freeing via workqueue 178/* decrement map refcnt and schedule it for freeing via workqueue
136 * (unrelying map implementation ops->map_free() might sleep) 179 * (unrelying map implementation ops->map_free() might sleep)
137 */ 180 */
138void bpf_map_put(struct bpf_map *map) 181static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
139{ 182{
140 if (atomic_dec_and_test(&map->refcnt)) { 183 if (atomic_dec_and_test(&map->refcnt)) {
184 /* bpf_map_free_id() must be called first */
185 bpf_map_free_id(map, do_idr_lock);
141 INIT_WORK(&map->work, bpf_map_free_deferred); 186 INIT_WORK(&map->work, bpf_map_free_deferred);
142 schedule_work(&map->work); 187 schedule_work(&map->work);
143 } 188 }
144} 189}
145 190
191void bpf_map_put(struct bpf_map *map)
192{
193 __bpf_map_put(map, true);
194}
195
146void bpf_map_put_with_uref(struct bpf_map *map) 196void bpf_map_put_with_uref(struct bpf_map *map)
147{ 197{
148 bpf_map_put_uref(map); 198 bpf_map_put_uref(map);
@@ -166,10 +216,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
166 const struct bpf_map *map = filp->private_data; 216 const struct bpf_map *map = filp->private_data;
167 const struct bpf_array *array; 217 const struct bpf_array *array;
168 u32 owner_prog_type = 0; 218 u32 owner_prog_type = 0;
219 u32 owner_jited = 0;
169 220
170 if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { 221 if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
171 array = container_of(map, struct bpf_array, map); 222 array = container_of(map, struct bpf_array, map);
172 owner_prog_type = array->owner_prog_type; 223 owner_prog_type = array->owner_prog_type;
224 owner_jited = array->owner_jited;
173 } 225 }
174 226
175 seq_printf(m, 227 seq_printf(m,
@@ -186,9 +238,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
186 map->map_flags, 238 map->map_flags,
187 map->pages * 1ULL << PAGE_SHIFT); 239 map->pages * 1ULL << PAGE_SHIFT);
188 240
189 if (owner_prog_type) 241 if (owner_prog_type) {
190 seq_printf(m, "owner_prog_type:\t%u\n", 242 seq_printf(m, "owner_prog_type:\t%u\n",
191 owner_prog_type); 243 owner_prog_type);
244 seq_printf(m, "owner_jited:\t%u\n",
245 owner_jited);
246 }
192} 247}
193#endif 248#endif
194 249
@@ -236,11 +291,22 @@ static int map_create(union bpf_attr *attr)
236 if (err) 291 if (err)
237 goto free_map_nouncharge; 292 goto free_map_nouncharge;
238 293
239 err = bpf_map_new_fd(map); 294 err = bpf_map_alloc_id(map);
240 if (err < 0) 295 if (err)
241 /* failed to allocate fd */
242 goto free_map; 296 goto free_map;
243 297
298 err = bpf_map_new_fd(map);
299 if (err < 0) {
300 /* failed to allocate fd.
301 * bpf_map_put() is needed because the above
302 * bpf_map_alloc_id() has published the map
303 * to the userspace and the userspace may
304 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
305 */
306 bpf_map_put(map);
307 return err;
308 }
309
244 trace_bpf_map_create(map, err); 310 trace_bpf_map_create(map, err);
245 return err; 311 return err;
246 312
@@ -295,6 +361,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
295 return map; 361 return map;
296} 362}
297 363
364/* map_idr_lock should have been held */
365static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
366 bool uref)
367{
368 int refold;
369
370 refold = __atomic_add_unless(&map->refcnt, 1, 0);
371
372 if (refold >= BPF_MAX_REFCNT) {
373 __bpf_map_put(map, false);
374 return ERR_PTR(-EBUSY);
375 }
376
377 if (!refold)
378 return ERR_PTR(-ENOENT);
379
380 if (uref)
381 atomic_inc(&map->usercnt);
382
383 return map;
384}
385
298int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) 386int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
299{ 387{
300 return -ENOTSUPP; 388 return -ENOTSUPP;
@@ -322,19 +410,18 @@ static int map_lookup_elem(union bpf_attr *attr)
322 if (IS_ERR(map)) 410 if (IS_ERR(map))
323 return PTR_ERR(map); 411 return PTR_ERR(map);
324 412
325 err = -ENOMEM; 413 key = memdup_user(ukey, map->key_size);
326 key = kmalloc(map->key_size, GFP_USER); 414 if (IS_ERR(key)) {
327 if (!key) 415 err = PTR_ERR(key);
328 goto err_put; 416 goto err_put;
329 417 }
330 err = -EFAULT;
331 if (copy_from_user(key, ukey, map->key_size) != 0)
332 goto free_key;
333 418
334 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 419 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
335 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || 420 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
336 map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) 421 map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
337 value_size = round_up(map->value_size, 8) * num_possible_cpus(); 422 value_size = round_up(map->value_size, 8) * num_possible_cpus();
423 else if (IS_FD_MAP(map))
424 value_size = sizeof(u32);
338 else 425 else
339 value_size = map->value_size; 426 value_size = map->value_size;
340 427
@@ -350,9 +437,10 @@ static int map_lookup_elem(union bpf_attr *attr)
350 err = bpf_percpu_array_copy(map, key, value); 437 err = bpf_percpu_array_copy(map, key, value);
351 } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { 438 } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
352 err = bpf_stackmap_copy(map, key, value); 439 err = bpf_stackmap_copy(map, key, value);
353 } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || 440 } else if (IS_FD_ARRAY(map)) {
354 map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { 441 err = bpf_fd_array_map_lookup_elem(map, key, value);
355 err = -ENOTSUPP; 442 } else if (IS_FD_HASH(map)) {
443 err = bpf_fd_htab_map_lookup_elem(map, key, value);
356 } else { 444 } else {
357 rcu_read_lock(); 445 rcu_read_lock();
358 ptr = map->ops->map_lookup_elem(map, key); 446 ptr = map->ops->map_lookup_elem(map, key);
@@ -402,14 +490,11 @@ static int map_update_elem(union bpf_attr *attr)
402 if (IS_ERR(map)) 490 if (IS_ERR(map))
403 return PTR_ERR(map); 491 return PTR_ERR(map);
404 492
405 err = -ENOMEM; 493 key = memdup_user(ukey, map->key_size);
406 key = kmalloc(map->key_size, GFP_USER); 494 if (IS_ERR(key)) {
407 if (!key) 495 err = PTR_ERR(key);
408 goto err_put; 496 goto err_put;
409 497 }
410 err = -EFAULT;
411 if (copy_from_user(key, ukey, map->key_size) != 0)
412 goto free_key;
413 498
414 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 499 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
415 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || 500 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -488,14 +573,11 @@ static int map_delete_elem(union bpf_attr *attr)
488 if (IS_ERR(map)) 573 if (IS_ERR(map))
489 return PTR_ERR(map); 574 return PTR_ERR(map);
490 575
491 err = -ENOMEM; 576 key = memdup_user(ukey, map->key_size);
492 key = kmalloc(map->key_size, GFP_USER); 577 if (IS_ERR(key)) {
493 if (!key) 578 err = PTR_ERR(key);
494 goto err_put; 579 goto err_put;
495 580 }
496 err = -EFAULT;
497 if (copy_from_user(key, ukey, map->key_size) != 0)
498 goto free_key;
499 581
500 preempt_disable(); 582 preempt_disable();
501 __this_cpu_inc(bpf_prog_active); 583 __this_cpu_inc(bpf_prog_active);
@@ -507,7 +589,6 @@ static int map_delete_elem(union bpf_attr *attr)
507 589
508 if (!err) 590 if (!err)
509 trace_bpf_map_delete_elem(map, ufd, key); 591 trace_bpf_map_delete_elem(map, ufd, key);
510free_key:
511 kfree(key); 592 kfree(key);
512err_put: 593err_put:
513 fdput(f); 594 fdput(f);
@@ -536,14 +617,11 @@ static int map_get_next_key(union bpf_attr *attr)
536 return PTR_ERR(map); 617 return PTR_ERR(map);
537 618
538 if (ukey) { 619 if (ukey) {
539 err = -ENOMEM; 620 key = memdup_user(ukey, map->key_size);
540 key = kmalloc(map->key_size, GFP_USER); 621 if (IS_ERR(key)) {
541 if (!key) 622 err = PTR_ERR(key);
542 goto err_put; 623 goto err_put;
543 624 }
544 err = -EFAULT;
545 if (copy_from_user(key, ukey, map->key_size) != 0)
546 goto free_key;
547 } else { 625 } else {
548 key = NULL; 626 key = NULL;
549 } 627 }
@@ -650,6 +728,42 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
650 free_uid(user); 728 free_uid(user);
651} 729}
652 730
731static int bpf_prog_alloc_id(struct bpf_prog *prog)
732{
733 int id;
734
735 spin_lock_bh(&prog_idr_lock);
736 id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
737 if (id > 0)
738 prog->aux->id = id;
739 spin_unlock_bh(&prog_idr_lock);
740
741 /* id is in [1, INT_MAX) */
742 if (WARN_ON_ONCE(!id))
743 return -ENOSPC;
744
745 return id > 0 ? 0 : id;
746}
747
748static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
749{
750 /* cBPF to eBPF migrations are currently not in the idr store. */
751 if (!prog->aux->id)
752 return;
753
754 if (do_idr_lock)
755 spin_lock_bh(&prog_idr_lock);
756 else
757 __acquire(&prog_idr_lock);
758
759 idr_remove(&prog_idr, prog->aux->id);
760
761 if (do_idr_lock)
762 spin_unlock_bh(&prog_idr_lock);
763 else
764 __release(&prog_idr_lock);
765}
766
653static void __bpf_prog_put_rcu(struct rcu_head *rcu) 767static void __bpf_prog_put_rcu(struct rcu_head *rcu)
654{ 768{
655 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); 769 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
@@ -659,14 +773,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
659 bpf_prog_free(aux->prog); 773 bpf_prog_free(aux->prog);
660} 774}
661 775
662void bpf_prog_put(struct bpf_prog *prog) 776static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
663{ 777{
664 if (atomic_dec_and_test(&prog->aux->refcnt)) { 778 if (atomic_dec_and_test(&prog->aux->refcnt)) {
665 trace_bpf_prog_put_rcu(prog); 779 trace_bpf_prog_put_rcu(prog);
780 /* bpf_prog_free_id() must be called first */
781 bpf_prog_free_id(prog, do_idr_lock);
666 bpf_prog_kallsyms_del(prog); 782 bpf_prog_kallsyms_del(prog);
667 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); 783 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
668 } 784 }
669} 785}
786
787void bpf_prog_put(struct bpf_prog *prog)
788{
789 __bpf_prog_put(prog, true);
790}
670EXPORT_SYMBOL_GPL(bpf_prog_put); 791EXPORT_SYMBOL_GPL(bpf_prog_put);
671 792
672static int bpf_prog_release(struct inode *inode, struct file *filp) 793static int bpf_prog_release(struct inode *inode, struct file *filp)
@@ -748,6 +869,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
748} 869}
749EXPORT_SYMBOL_GPL(bpf_prog_inc); 870EXPORT_SYMBOL_GPL(bpf_prog_inc);
750 871
872/* prog_idr_lock should have been held */
873static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
874{
875 int refold;
876
877 refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0);
878
879 if (refold >= BPF_MAX_REFCNT) {
880 __bpf_prog_put(prog, false);
881 return ERR_PTR(-EBUSY);
882 }
883
884 if (!refold)
885 return ERR_PTR(-ENOENT);
886
887 return prog;
888}
889
751static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) 890static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
752{ 891{
753 struct fd f = fdget(ufd); 892 struct fd f = fdget(ufd);
@@ -815,7 +954,9 @@ static int bpf_prog_load(union bpf_attr *attr)
815 attr->kern_version != LINUX_VERSION_CODE) 954 attr->kern_version != LINUX_VERSION_CODE)
816 return -EINVAL; 955 return -EINVAL;
817 956
818 if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) 957 if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
958 type != BPF_PROG_TYPE_CGROUP_SKB &&
959 !capable(CAP_SYS_ADMIN))
819 return -EPERM; 960 return -EPERM;
820 961
821 /* plain bpf_prog allocation */ 962 /* plain bpf_prog allocation */
@@ -855,11 +996,22 @@ static int bpf_prog_load(union bpf_attr *attr)
855 if (err < 0) 996 if (err < 0)
856 goto free_used_maps; 997 goto free_used_maps;
857 998
858 err = bpf_prog_new_fd(prog); 999 err = bpf_prog_alloc_id(prog);
859 if (err < 0) 1000 if (err)
860 /* failed to allocate fd */
861 goto free_used_maps; 1001 goto free_used_maps;
862 1002
1003 err = bpf_prog_new_fd(prog);
1004 if (err < 0) {
1005 /* failed to allocate fd.
1006 * bpf_prog_put() is needed because the above
1007 * bpf_prog_alloc_id() has published the prog
1008 * to the userspace and the userspace may
1009 * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
1010 */
1011 bpf_prog_put(prog);
1012 return err;
1013 }
1014
863 bpf_prog_kallsyms_add(prog); 1015 bpf_prog_kallsyms_add(prog);
864 trace_bpf_prog_load(prog, err); 1016 trace_bpf_prog_load(prog, err);
865 return err; 1017 return err;
@@ -919,6 +1071,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
919 case BPF_CGROUP_INET_SOCK_CREATE: 1071 case BPF_CGROUP_INET_SOCK_CREATE:
920 ptype = BPF_PROG_TYPE_CGROUP_SOCK; 1072 ptype = BPF_PROG_TYPE_CGROUP_SOCK;
921 break; 1073 break;
1074 case BPF_CGROUP_SOCK_OPS:
1075 ptype = BPF_PROG_TYPE_SOCK_OPS;
1076 break;
922 default: 1077 default:
923 return -EINVAL; 1078 return -EINVAL;
924 } 1079 }
@@ -959,6 +1114,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
959 case BPF_CGROUP_INET_INGRESS: 1114 case BPF_CGROUP_INET_INGRESS:
960 case BPF_CGROUP_INET_EGRESS: 1115 case BPF_CGROUP_INET_EGRESS:
961 case BPF_CGROUP_INET_SOCK_CREATE: 1116 case BPF_CGROUP_INET_SOCK_CREATE:
1117 case BPF_CGROUP_SOCK_OPS:
962 cgrp = cgroup_get_from_fd(attr->target_fd); 1118 cgrp = cgroup_get_from_fd(attr->target_fd);
963 if (IS_ERR(cgrp)) 1119 if (IS_ERR(cgrp))
964 return PTR_ERR(cgrp); 1120 return PTR_ERR(cgrp);
@@ -973,6 +1129,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
973 1129
974 return ret; 1130 return ret;
975} 1131}
1132
976#endif /* CONFIG_CGROUP_BPF */ 1133#endif /* CONFIG_CGROUP_BPF */
977 1134
978#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration 1135#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -997,6 +1154,237 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
997 return ret; 1154 return ret;
998} 1155}
999 1156
1157#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
1158
1159static int bpf_obj_get_next_id(const union bpf_attr *attr,
1160 union bpf_attr __user *uattr,
1161 struct idr *idr,
1162 spinlock_t *lock)
1163{
1164 u32 next_id = attr->start_id;
1165 int err = 0;
1166
1167 if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
1168 return -EINVAL;
1169
1170 if (!capable(CAP_SYS_ADMIN))
1171 return -EPERM;
1172
1173 next_id++;
1174 spin_lock_bh(lock);
1175 if (!idr_get_next(idr, &next_id))
1176 err = -ENOENT;
1177 spin_unlock_bh(lock);
1178
1179 if (!err)
1180 err = put_user(next_id, &uattr->next_id);
1181
1182 return err;
1183}
1184
1185#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
1186
1187static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
1188{
1189 struct bpf_prog *prog;
1190 u32 id = attr->prog_id;
1191 int fd;
1192
1193 if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
1194 return -EINVAL;
1195
1196 if (!capable(CAP_SYS_ADMIN))
1197 return -EPERM;
1198
1199 spin_lock_bh(&prog_idr_lock);
1200 prog = idr_find(&prog_idr, id);
1201 if (prog)
1202 prog = bpf_prog_inc_not_zero(prog);
1203 else
1204 prog = ERR_PTR(-ENOENT);
1205 spin_unlock_bh(&prog_idr_lock);
1206
1207 if (IS_ERR(prog))
1208 return PTR_ERR(prog);
1209
1210 fd = bpf_prog_new_fd(prog);
1211 if (fd < 0)
1212 bpf_prog_put(prog);
1213
1214 return fd;
1215}
1216
1217#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
1218
1219static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
1220{
1221 struct bpf_map *map;
1222 u32 id = attr->map_id;
1223 int fd;
1224
1225 if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
1226 return -EINVAL;
1227
1228 if (!capable(CAP_SYS_ADMIN))
1229 return -EPERM;
1230
1231 spin_lock_bh(&map_idr_lock);
1232 map = idr_find(&map_idr, id);
1233 if (map)
1234 map = bpf_map_inc_not_zero(map, true);
1235 else
1236 map = ERR_PTR(-ENOENT);
1237 spin_unlock_bh(&map_idr_lock);
1238
1239 if (IS_ERR(map))
1240 return PTR_ERR(map);
1241
1242 fd = bpf_map_new_fd(map);
1243 if (fd < 0)
1244 bpf_map_put(map);
1245
1246 return fd;
1247}
1248
1249static int check_uarg_tail_zero(void __user *uaddr,
1250 size_t expected_size,
1251 size_t actual_size)
1252{
1253 unsigned char __user *addr;
1254 unsigned char __user *end;
1255 unsigned char val;
1256 int err;
1257
1258 if (actual_size <= expected_size)
1259 return 0;
1260
1261 addr = uaddr + expected_size;
1262 end = uaddr + actual_size;
1263
1264 for (; addr < end; addr++) {
1265 err = get_user(val, addr);
1266 if (err)
1267 return err;
1268 if (val)
1269 return -E2BIG;
1270 }
1271
1272 return 0;
1273}
1274
1275static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
1276 const union bpf_attr *attr,
1277 union bpf_attr __user *uattr)
1278{
1279 struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
1280 struct bpf_prog_info info = {};
1281 u32 info_len = attr->info.info_len;
1282 char __user *uinsns;
1283 u32 ulen;
1284 int err;
1285
1286 err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
1287 if (err)
1288 return err;
1289 info_len = min_t(u32, sizeof(info), info_len);
1290
1291 if (copy_from_user(&info, uinfo, info_len))
1292 return err;
1293
1294 info.type = prog->type;
1295 info.id = prog->aux->id;
1296
1297 memcpy(info.tag, prog->tag, sizeof(prog->tag));
1298
1299 if (!capable(CAP_SYS_ADMIN)) {
1300 info.jited_prog_len = 0;
1301 info.xlated_prog_len = 0;
1302 goto done;
1303 }
1304
1305 ulen = info.jited_prog_len;
1306 info.jited_prog_len = prog->jited_len;
1307 if (info.jited_prog_len && ulen) {
1308 uinsns = u64_to_user_ptr(info.jited_prog_insns);
1309 ulen = min_t(u32, info.jited_prog_len, ulen);
1310 if (copy_to_user(uinsns, prog->bpf_func, ulen))
1311 return -EFAULT;
1312 }
1313
1314 ulen = info.xlated_prog_len;
1315 info.xlated_prog_len = bpf_prog_size(prog->len);
1316 if (info.xlated_prog_len && ulen) {
1317 uinsns = u64_to_user_ptr(info.xlated_prog_insns);
1318 ulen = min_t(u32, info.xlated_prog_len, ulen);
1319 if (copy_to_user(uinsns, prog->insnsi, ulen))
1320 return -EFAULT;
1321 }
1322
1323done:
1324 if (copy_to_user(uinfo, &info, info_len) ||
1325 put_user(info_len, &uattr->info.info_len))
1326 return -EFAULT;
1327
1328 return 0;
1329}
1330
1331static int bpf_map_get_info_by_fd(struct bpf_map *map,
1332 const union bpf_attr *attr,
1333 union bpf_attr __user *uattr)
1334{
1335 struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
1336 struct bpf_map_info info = {};
1337 u32 info_len = attr->info.info_len;
1338 int err;
1339
1340 err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
1341 if (err)
1342 return err;
1343 info_len = min_t(u32, sizeof(info), info_len);
1344
1345 info.type = map->map_type;
1346 info.id = map->id;
1347 info.key_size = map->key_size;
1348 info.value_size = map->value_size;
1349 info.max_entries = map->max_entries;
1350 info.map_flags = map->map_flags;
1351
1352 if (copy_to_user(uinfo, &info, info_len) ||
1353 put_user(info_len, &uattr->info.info_len))
1354 return -EFAULT;
1355
1356 return 0;
1357}
1358
1359#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
1360
1361static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
1362 union bpf_attr __user *uattr)
1363{
1364 int ufd = attr->info.bpf_fd;
1365 struct fd f;
1366 int err;
1367
1368 if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
1369 return -EINVAL;
1370
1371 f = fdget(ufd);
1372 if (!f.file)
1373 return -EBADFD;
1374
1375 if (f.file->f_op == &bpf_prog_fops)
1376 err = bpf_prog_get_info_by_fd(f.file->private_data, attr,
1377 uattr);
1378 else if (f.file->f_op == &bpf_map_fops)
1379 err = bpf_map_get_info_by_fd(f.file->private_data, attr,
1380 uattr);
1381 else
1382 err = -EINVAL;
1383
1384 fdput(f);
1385 return err;
1386}
1387
1000SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) 1388SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
1001{ 1389{
1002 union bpf_attr attr = {}; 1390 union bpf_attr attr = {};
@@ -1016,23 +1404,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
1016 * user-space does not rely on any kernel feature 1404 * user-space does not rely on any kernel feature
1017 * extensions we dont know about yet. 1405 * extensions we dont know about yet.
1018 */ 1406 */
1019 if (size > sizeof(attr)) { 1407 err = check_uarg_tail_zero(uattr, sizeof(attr), size);
1020 unsigned char __user *addr; 1408 if (err)
1021 unsigned char __user *end; 1409 return err;
1022 unsigned char val; 1410 size = min_t(u32, size, sizeof(attr));
1023
1024 addr = (void __user *)uattr + sizeof(attr);
1025 end = (void __user *)uattr + size;
1026
1027 for (; addr < end; addr++) {
1028 err = get_user(val, addr);
1029 if (err)
1030 return err;
1031 if (val)
1032 return -E2BIG;
1033 }
1034 size = sizeof(attr);
1035 }
1036 1411
1037 /* copy attributes from user space, may be less than sizeof(bpf_attr) */ 1412 /* copy attributes from user space, may be less than sizeof(bpf_attr) */
1038 if (copy_from_user(&attr, uattr, size) != 0) 1413 if (copy_from_user(&attr, uattr, size) != 0)
@@ -1074,6 +1449,23 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
1074 case BPF_PROG_TEST_RUN: 1449 case BPF_PROG_TEST_RUN:
1075 err = bpf_prog_test_run(&attr, uattr); 1450 err = bpf_prog_test_run(&attr, uattr);
1076 break; 1451 break;
1452 case BPF_PROG_GET_NEXT_ID:
1453 err = bpf_obj_get_next_id(&attr, uattr,
1454 &prog_idr, &prog_idr_lock);
1455 break;
1456 case BPF_MAP_GET_NEXT_ID:
1457 err = bpf_obj_get_next_id(&attr, uattr,
1458 &map_idr, &map_idr_lock);
1459 break;
1460 case BPF_PROG_GET_FD_BY_ID:
1461 err = bpf_prog_get_fd_by_id(&attr);
1462 break;
1463 case BPF_MAP_GET_FD_BY_ID:
1464 err = bpf_map_get_fd_by_id(&attr);
1465 break;
1466 case BPF_OBJ_GET_INFO_BY_FD:
1467 err = bpf_obj_get_info_by_fd(&attr, uattr);
1468 break;
1077 default: 1469 default:
1078 err = -EINVAL; 1470 err = -EINVAL;
1079 break; 1471 break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1eddb713b815..af9e84a4944e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -463,19 +463,22 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
463 BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 463 BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
464}; 464};
465 465
466static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
467{
468 BUG_ON(regno >= MAX_BPF_REG);
469
470 memset(&regs[regno], 0, sizeof(regs[regno]));
471 regs[regno].type = NOT_INIT;
472 regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
473 regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
474}
475
466static void init_reg_state(struct bpf_reg_state *regs) 476static void init_reg_state(struct bpf_reg_state *regs)
467{ 477{
468 int i; 478 int i;
469 479
470 for (i = 0; i < MAX_BPF_REG; i++) { 480 for (i = 0; i < MAX_BPF_REG; i++)
471 regs[i].type = NOT_INIT; 481 mark_reg_not_init(regs, i);
472 regs[i].imm = 0;
473 regs[i].min_value = BPF_REGISTER_MIN_RANGE;
474 regs[i].max_value = BPF_REGISTER_MAX_RANGE;
475 regs[i].min_align = 0;
476 regs[i].aux_off = 0;
477 regs[i].aux_off_align = 0;
478 }
479 482
480 /* frame pointer */ 483 /* frame pointer */
481 regs[BPF_REG_FP].type = FRAME_PTR; 484 regs[BPF_REG_FP].type = FRAME_PTR;
@@ -501,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
501{ 504{
502 regs[regno].min_value = BPF_REGISTER_MIN_RANGE; 505 regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
503 regs[regno].max_value = BPF_REGISTER_MAX_RANGE; 506 regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
507 regs[regno].value_from_signed = false;
504 regs[regno].min_align = 0; 508 regs[regno].min_align = 0;
505} 509}
506 510
@@ -543,20 +547,6 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
543 return 0; 547 return 0;
544} 548}
545 549
546static int bpf_size_to_bytes(int bpf_size)
547{
548 if (bpf_size == BPF_W)
549 return 4;
550 else if (bpf_size == BPF_H)
551 return 2;
552 else if (bpf_size == BPF_B)
553 return 1;
554 else if (bpf_size == BPF_DW)
555 return 8;
556 else
557 return -EINVAL;
558}
559
560static bool is_spillable_regtype(enum bpf_reg_type type) 550static bool is_spillable_regtype(enum bpf_reg_type type)
561{ 551{
562 switch (type) { 552 switch (type) {
@@ -755,15 +745,29 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
755} 745}
756 746
757/* check access to 'struct bpf_context' fields */ 747/* check access to 'struct bpf_context' fields */
758static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, 748static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
759 enum bpf_access_type t, enum bpf_reg_type *reg_type) 749 enum bpf_access_type t, enum bpf_reg_type *reg_type)
760{ 750{
751 struct bpf_insn_access_aux info = {
752 .reg_type = *reg_type,
753 };
754
761 /* for analyzer ctx accesses are already validated and converted */ 755 /* for analyzer ctx accesses are already validated and converted */
762 if (env->analyzer_ops) 756 if (env->analyzer_ops)
763 return 0; 757 return 0;
764 758
765 if (env->prog->aux->ops->is_valid_access && 759 if (env->prog->aux->ops->is_valid_access &&
766 env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { 760 env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
761 /* A non zero info.ctx_field_size indicates that this field is a
762 * candidate for later verifier transformation to load the whole
763 * field and then apply a mask when accessed with a narrower
764 * access than actual ctx access size. A zero info.ctx_field_size
765 * will only allow for whole field access and rejects any other
766 * type of narrower access.
767 */
768 env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
769 *reg_type = info.reg_type;
770
767 /* remember the offset of last byte accessed in ctx */ 771 /* remember the offset of last byte accessed in ctx */
768 if (env->prog->aux->max_ctx_offset < off + size) 772 if (env->prog->aux->max_ctx_offset < off + size)
769 env->prog->aux->max_ctx_offset = off + size; 773 env->prog->aux->max_ctx_offset = off + size;
@@ -774,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
774 return -EACCES; 778 return -EACCES;
775} 779}
776 780
777static bool is_pointer_value(struct bpf_verifier_env *env, int regno) 781static bool __is_pointer_value(bool allow_ptr_leaks,
782 const struct bpf_reg_state *reg)
778{ 783{
779 if (env->allow_ptr_leaks) 784 if (allow_ptr_leaks)
780 return false; 785 return false;
781 786
782 switch (env->cur_state.regs[regno].type) { 787 switch (reg->type) {
783 case UNKNOWN_VALUE: 788 case UNKNOWN_VALUE:
784 case CONST_IMM: 789 case CONST_IMM:
785 return false; 790 return false;
@@ -788,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
788 } 793 }
789} 794}
790 795
796static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
797{
798 return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
799}
800
791static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, 801static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
792 int off, int size, bool strict) 802 int off, int size, bool strict)
793{ 803{
@@ -808,11 +818,15 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
808 reg_off += reg->aux_off; 818 reg_off += reg->aux_off;
809 } 819 }
810 820
811 /* skb->data is NET_IP_ALIGN-ed, but for strict alignment checking 821 /* For platforms that do not have a Kconfig enabling
812 * we force this to 2 which is universally what architectures use 822 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of
813 * when they don't set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. 823 * NET_IP_ALIGN is universally set to '2'. And on platforms
824 * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get
825 * to this code only in strict mode where we want to emulate
826 * the NET_IP_ALIGN==2 checking. Therefore use an
827 * unconditional IP align value of '2'.
814 */ 828 */
815 ip_align = strict ? 2 : NET_IP_ALIGN; 829 ip_align = 2;
816 if ((ip_align + reg_off + off) % size != 0) { 830 if ((ip_align + reg_off + off) % size != 0) {
817 verbose("misaligned packet access off %d+%d+%d size %d\n", 831 verbose("misaligned packet access off %d+%d+%d size %d\n",
818 ip_align, reg_off, off, size); 832 ip_align, reg_off, off, size);
@@ -839,9 +853,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
839{ 853{
840 bool strict = env->strict_alignment; 854 bool strict = env->strict_alignment;
841 855
842 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
843 strict = true;
844
845 switch (reg->type) { 856 switch (reg->type) {
846 case PTR_TO_PACKET: 857 case PTR_TO_PACKET:
847 return check_pkt_ptr_alignment(reg, off, size, strict); 858 return check_pkt_ptr_alignment(reg, off, size, strict);
@@ -864,7 +875,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
864 * if t==write && value_regno==-1, some unknown value is stored into memory 875 * if t==write && value_regno==-1, some unknown value is stored into memory
865 * if t==read && value_regno==-1, don't care what we read from memory 876 * if t==read && value_regno==-1, don't care what we read from memory
866 */ 877 */
867static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, 878static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off,
868 int bpf_size, enum bpf_access_type t, 879 int bpf_size, enum bpf_access_type t,
869 int value_regno) 880 int value_regno)
870{ 881{
@@ -907,7 +918,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
907 verbose("R%d leaks addr into ctx\n", value_regno); 918 verbose("R%d leaks addr into ctx\n", value_regno);
908 return -EACCES; 919 return -EACCES;
909 } 920 }
910 err = check_ctx_access(env, off, size, t, &reg_type); 921 err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
911 if (!err && t == BPF_READ && value_regno >= 0) { 922 if (!err && t == BPF_READ && value_regno >= 0) {
912 mark_reg_unknown_value_and_range(state->regs, 923 mark_reg_unknown_value_and_range(state->regs,
913 value_regno); 924 value_regno);
@@ -922,6 +933,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
922 verbose("invalid stack off=%d size=%d\n", off, size); 933 verbose("invalid stack off=%d size=%d\n", off, size);
923 return -EACCES; 934 return -EACCES;
924 } 935 }
936
937 if (env->prog->aux->stack_depth < -off)
938 env->prog->aux->stack_depth = -off;
939
925 if (t == BPF_WRITE) { 940 if (t == BPF_WRITE) {
926 if (!env->allow_ptr_leaks && 941 if (!env->allow_ptr_leaks &&
927 state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && 942 state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
@@ -964,7 +979,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
964 return err; 979 return err;
965} 980}
966 981
967static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) 982static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
968{ 983{
969 struct bpf_reg_state *regs = env->cur_state.regs; 984 struct bpf_reg_state *regs = env->cur_state.regs;
970 int err; 985 int err;
@@ -985,14 +1000,19 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
985 if (err) 1000 if (err)
986 return err; 1001 return err;
987 1002
1003 if (is_pointer_value(env, insn->src_reg)) {
1004 verbose("R%d leaks addr into mem\n", insn->src_reg);
1005 return -EACCES;
1006 }
1007
988 /* check whether atomic_add can read the memory */ 1008 /* check whether atomic_add can read the memory */
989 err = check_mem_access(env, insn->dst_reg, insn->off, 1009 err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
990 BPF_SIZE(insn->code), BPF_READ, -1); 1010 BPF_SIZE(insn->code), BPF_READ, -1);
991 if (err) 1011 if (err)
992 return err; 1012 return err;
993 1013
994 /* check whether atomic_add can write into the same memory */ 1014 /* check whether atomic_add can write into the same memory */
995 return check_mem_access(env, insn->dst_reg, insn->off, 1015 return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
996 BPF_SIZE(insn->code), BPF_WRITE, -1); 1016 BPF_SIZE(insn->code), BPF_WRITE, -1);
997} 1017}
998 1018
@@ -1028,6 +1048,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
1028 return -EACCES; 1048 return -EACCES;
1029 } 1049 }
1030 1050
1051 if (env->prog->aux->stack_depth < -off)
1052 env->prog->aux->stack_depth = -off;
1053
1031 if (meta && meta->raw_mode) { 1054 if (meta && meta->raw_mode) {
1032 meta->access_size = access_size; 1055 meta->access_size = access_size;
1033 meta->regno = regno; 1056 meta->regno = regno;
@@ -1335,8 +1358,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
1335 if (reg->type != PTR_TO_PACKET && 1358 if (reg->type != PTR_TO_PACKET &&
1336 reg->type != PTR_TO_PACKET_END) 1359 reg->type != PTR_TO_PACKET_END)
1337 continue; 1360 continue;
1338 reg->type = UNKNOWN_VALUE; 1361 __mark_reg_unknown_value(state->spilled_regs,
1339 reg->imm = 0; 1362 i / BPF_REG_SIZE);
1340 } 1363 }
1341} 1364}
1342 1365
@@ -1345,7 +1368,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1345 struct bpf_verifier_state *state = &env->cur_state; 1368 struct bpf_verifier_state *state = &env->cur_state;
1346 const struct bpf_func_proto *fn = NULL; 1369 const struct bpf_func_proto *fn = NULL;
1347 struct bpf_reg_state *regs = state->regs; 1370 struct bpf_reg_state *regs = state->regs;
1348 struct bpf_reg_state *reg;
1349 struct bpf_call_arg_meta meta; 1371 struct bpf_call_arg_meta meta;
1350 bool changes_data; 1372 bool changes_data;
1351 int i, err; 1373 int i, err;
@@ -1406,17 +1428,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1406 * is inferred from register state. 1428 * is inferred from register state.
1407 */ 1429 */
1408 for (i = 0; i < meta.access_size; i++) { 1430 for (i = 0; i < meta.access_size; i++) {
1409 err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1); 1431 err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1);
1410 if (err) 1432 if (err)
1411 return err; 1433 return err;
1412 } 1434 }
1413 1435
1414 /* reset caller saved regs */ 1436 /* reset caller saved regs */
1415 for (i = 0; i < CALLER_SAVED_REGS; i++) { 1437 for (i = 0; i < CALLER_SAVED_REGS; i++)
1416 reg = regs + caller_saved[i]; 1438 mark_reg_not_init(regs, caller_saved[i]);
1417 reg->type = NOT_INIT;
1418 reg->imm = 0;
1419 }
1420 1439
1421 /* update return register */ 1440 /* update return register */
1422 if (fn->ret_type == RET_INTEGER) { 1441 if (fn->ret_type == RET_INTEGER) {
@@ -1645,6 +1664,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
1645 return 0; 1664 return 0;
1646} 1665}
1647 1666
1667static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
1668 struct bpf_insn *insn)
1669{
1670 struct bpf_reg_state *regs = env->cur_state.regs;
1671 struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
1672 struct bpf_reg_state *src_reg = &regs[insn->src_reg];
1673 u8 opcode = BPF_OP(insn->code);
1674 s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
1675
1676 /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
1677 if (src_reg->imm > 0 && dst_reg->imm) {
1678 switch (opcode) {
1679 case BPF_ADD:
1680 /* dreg += sreg
1681 * where both have zero upper bits. Adding them
1682 * can only result making one more bit non-zero
1683 * in the larger value.
1684 * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
1685 * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
1686 */
1687 dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
1688 dst_reg->imm--;
1689 break;
1690 case BPF_AND:
1691 /* dreg &= sreg
1692 * AND can not extend zero bits only shrink
1693 * Ex. 0x00..00ffffff
1694 * & 0x0f..ffffffff
1695 * ----------------
1696 * 0x00..00ffffff
1697 */
1698 dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
1699 break;
1700 case BPF_OR:
1701 /* dreg |= sreg
1702 * OR can only extend zero bits
1703 * Ex. 0x00..00ffffff
1704 * | 0x0f..ffffffff
1705 * ----------------
1706 * 0x0f..00ffffff
1707 */
1708 dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
1709 break;
1710 case BPF_SUB:
1711 case BPF_MUL:
1712 case BPF_RSH:
1713 case BPF_LSH:
1714 /* These may be flushed out later */
1715 default:
1716 mark_reg_unknown_value(regs, insn->dst_reg);
1717 }
1718 } else {
1719 mark_reg_unknown_value(regs, insn->dst_reg);
1720 }
1721
1722 dst_reg->type = UNKNOWN_VALUE;
1723 return 0;
1724}
1725
1648static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, 1726static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
1649 struct bpf_insn *insn) 1727 struct bpf_insn *insn)
1650{ 1728{
@@ -1654,6 +1732,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
1654 u8 opcode = BPF_OP(insn->code); 1732 u8 opcode = BPF_OP(insn->code);
1655 u64 dst_imm = dst_reg->imm; 1733 u64 dst_imm = dst_reg->imm;
1656 1734
1735 if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
1736 return evaluate_reg_imm_alu_unknown(env, insn);
1737
1657 /* dst_reg->type == CONST_IMM here. Simulate execution of insns 1738 /* dst_reg->type == CONST_IMM here. Simulate execution of insns
1658 * containing ALU ops. Don't care about overflow or negative 1739 * containing ALU ops. Don't care about overflow or negative
1659 * values, just add/sub/... them; registers are in u64. 1740 * values, just add/sub/... them; registers are in u64.
@@ -1758,10 +1839,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1758 dst_align = dst_reg->min_align; 1839 dst_align = dst_reg->min_align;
1759 1840
1760 /* We don't know anything about what was done to this register, mark it 1841 /* We don't know anything about what was done to this register, mark it
1761 * as unknown. 1842 * as unknown. Also, if both derived bounds came from signed/unsigned
1843 * mixed compares and one side is unbounded, we cannot really do anything
1844 * with them as boundaries cannot be trusted. Thus, arithmetic of two
1845 * regs of such kind will get invalidated bounds on the dst side.
1762 */ 1846 */
1763 if (min_val == BPF_REGISTER_MIN_RANGE && 1847 if ((min_val == BPF_REGISTER_MIN_RANGE &&
1764 max_val == BPF_REGISTER_MAX_RANGE) { 1848 max_val == BPF_REGISTER_MAX_RANGE) ||
1849 (BPF_SRC(insn->code) == BPF_X &&
1850 ((min_val != BPF_REGISTER_MIN_RANGE &&
1851 max_val == BPF_REGISTER_MAX_RANGE) ||
1852 (min_val == BPF_REGISTER_MIN_RANGE &&
1853 max_val != BPF_REGISTER_MAX_RANGE) ||
1854 (dst_reg->min_value != BPF_REGISTER_MIN_RANGE &&
1855 dst_reg->max_value == BPF_REGISTER_MAX_RANGE) ||
1856 (dst_reg->min_value == BPF_REGISTER_MIN_RANGE &&
1857 dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) &&
1858 regs[insn->dst_reg].value_from_signed !=
1859 regs[insn->src_reg].value_from_signed)) {
1765 reset_reg_range_values(regs, insn->dst_reg); 1860 reset_reg_range_values(regs, insn->dst_reg);
1766 return; 1861 return;
1767 } 1862 }
@@ -1945,9 +2040,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
1945 */ 2040 */
1946 regs[insn->dst_reg].type = CONST_IMM; 2041 regs[insn->dst_reg].type = CONST_IMM;
1947 regs[insn->dst_reg].imm = insn->imm; 2042 regs[insn->dst_reg].imm = insn->imm;
2043 regs[insn->dst_reg].id = 0;
1948 regs[insn->dst_reg].max_value = insn->imm; 2044 regs[insn->dst_reg].max_value = insn->imm;
1949 regs[insn->dst_reg].min_value = insn->imm; 2045 regs[insn->dst_reg].min_value = insn->imm;
1950 regs[insn->dst_reg].min_align = calc_align(insn->imm); 2046 regs[insn->dst_reg].min_align = calc_align(insn->imm);
2047 regs[insn->dst_reg].value_from_signed = false;
1951 } 2048 }
1952 2049
1953 } else if (opcode > BPF_END) { 2050 } else if (opcode > BPF_END) {
@@ -2123,40 +2220,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
2123 struct bpf_reg_state *false_reg, u64 val, 2220 struct bpf_reg_state *false_reg, u64 val,
2124 u8 opcode) 2221 u8 opcode)
2125{ 2222{
2223 bool value_from_signed = true;
2224 bool is_range = true;
2225
2126 switch (opcode) { 2226 switch (opcode) {
2127 case BPF_JEQ: 2227 case BPF_JEQ:
2128 /* If this is false then we know nothing Jon Snow, but if it is 2228 /* If this is false then we know nothing Jon Snow, but if it is
2129 * true then we know for sure. 2229 * true then we know for sure.
2130 */ 2230 */
2131 true_reg->max_value = true_reg->min_value = val; 2231 true_reg->max_value = true_reg->min_value = val;
2232 is_range = false;
2132 break; 2233 break;
2133 case BPF_JNE: 2234 case BPF_JNE:
2134 /* If this is true we know nothing Jon Snow, but if it is false 2235 /* If this is true we know nothing Jon Snow, but if it is false
2135 * we know the value for sure; 2236 * we know the value for sure;
2136 */ 2237 */
2137 false_reg->max_value = false_reg->min_value = val; 2238 false_reg->max_value = false_reg->min_value = val;
2239 is_range = false;
2138 break; 2240 break;
2139 case BPF_JGT: 2241 case BPF_JGT:
2140 /* Unsigned comparison, the minimum value is 0. */ 2242 value_from_signed = false;
2141 false_reg->min_value = 0;
2142 /* fallthrough */ 2243 /* fallthrough */
2143 case BPF_JSGT: 2244 case BPF_JSGT:
2245 if (true_reg->value_from_signed != value_from_signed)
2246 reset_reg_range_values(true_reg, 0);
2247 if (false_reg->value_from_signed != value_from_signed)
2248 reset_reg_range_values(false_reg, 0);
2249 if (opcode == BPF_JGT) {
2250 /* Unsigned comparison, the minimum value is 0. */
2251 false_reg->min_value = 0;
2252 }
2144 /* If this is false then we know the maximum val is val, 2253 /* If this is false then we know the maximum val is val,
2145 * otherwise we know the min val is val+1. 2254 * otherwise we know the min val is val+1.
2146 */ 2255 */
2147 false_reg->max_value = val; 2256 false_reg->max_value = val;
2257 false_reg->value_from_signed = value_from_signed;
2148 true_reg->min_value = val + 1; 2258 true_reg->min_value = val + 1;
2259 true_reg->value_from_signed = value_from_signed;
2149 break; 2260 break;
2150 case BPF_JGE: 2261 case BPF_JGE:
2151 /* Unsigned comparison, the minimum value is 0. */ 2262 value_from_signed = false;
2152 false_reg->min_value = 0;
2153 /* fallthrough */ 2263 /* fallthrough */
2154 case BPF_JSGE: 2264 case BPF_JSGE:
2265 if (true_reg->value_from_signed != value_from_signed)
2266 reset_reg_range_values(true_reg, 0);
2267 if (false_reg->value_from_signed != value_from_signed)
2268 reset_reg_range_values(false_reg, 0);
2269 if (opcode == BPF_JGE) {
2270 /* Unsigned comparison, the minimum value is 0. */
2271 false_reg->min_value = 0;
2272 }
2155 /* If this is false then we know the maximum value is val - 1, 2273 /* If this is false then we know the maximum value is val - 1,
2156 * otherwise we know the mimimum value is val. 2274 * otherwise we know the mimimum value is val.
2157 */ 2275 */
2158 false_reg->max_value = val - 1; 2276 false_reg->max_value = val - 1;
2277 false_reg->value_from_signed = value_from_signed;
2159 true_reg->min_value = val; 2278 true_reg->min_value = val;
2279 true_reg->value_from_signed = value_from_signed;
2160 break; 2280 break;
2161 default: 2281 default:
2162 break; 2282 break;
@@ -2164,6 +2284,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
2164 2284
2165 check_reg_overflow(false_reg); 2285 check_reg_overflow(false_reg);
2166 check_reg_overflow(true_reg); 2286 check_reg_overflow(true_reg);
2287 if (is_range) {
2288 if (__is_pointer_value(false, false_reg))
2289 reset_reg_range_values(false_reg, 0);
2290 if (__is_pointer_value(false, true_reg))
2291 reset_reg_range_values(true_reg, 0);
2292 }
2167} 2293}
2168 2294
2169/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg 2295/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
@@ -2173,41 +2299,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
2173 struct bpf_reg_state *false_reg, u64 val, 2299 struct bpf_reg_state *false_reg, u64 val,
2174 u8 opcode) 2300 u8 opcode)
2175{ 2301{
2302 bool value_from_signed = true;
2303 bool is_range = true;
2304
2176 switch (opcode) { 2305 switch (opcode) {
2177 case BPF_JEQ: 2306 case BPF_JEQ:
2178 /* If this is false then we know nothing Jon Snow, but if it is 2307 /* If this is false then we know nothing Jon Snow, but if it is
2179 * true then we know for sure. 2308 * true then we know for sure.
2180 */ 2309 */
2181 true_reg->max_value = true_reg->min_value = val; 2310 true_reg->max_value = true_reg->min_value = val;
2311 is_range = false;
2182 break; 2312 break;
2183 case BPF_JNE: 2313 case BPF_JNE:
2184 /* If this is true we know nothing Jon Snow, but if it is false 2314 /* If this is true we know nothing Jon Snow, but if it is false
2185 * we know the value for sure; 2315 * we know the value for sure;
2186 */ 2316 */
2187 false_reg->max_value = false_reg->min_value = val; 2317 false_reg->max_value = false_reg->min_value = val;
2318 is_range = false;
2188 break; 2319 break;
2189 case BPF_JGT: 2320 case BPF_JGT:
2190 /* Unsigned comparison, the minimum value is 0. */ 2321 value_from_signed = false;
2191 true_reg->min_value = 0;
2192 /* fallthrough */ 2322 /* fallthrough */
2193 case BPF_JSGT: 2323 case BPF_JSGT:
2324 if (true_reg->value_from_signed != value_from_signed)
2325 reset_reg_range_values(true_reg, 0);
2326 if (false_reg->value_from_signed != value_from_signed)
2327 reset_reg_range_values(false_reg, 0);
2328 if (opcode == BPF_JGT) {
2329 /* Unsigned comparison, the minimum value is 0. */
2330 true_reg->min_value = 0;
2331 }
2194 /* 2332 /*
2195 * If this is false, then the val is <= the register, if it is 2333 * If this is false, then the val is <= the register, if it is
2196 * true the register <= to the val. 2334 * true the register <= to the val.
2197 */ 2335 */
2198 false_reg->min_value = val; 2336 false_reg->min_value = val;
2337 false_reg->value_from_signed = value_from_signed;
2199 true_reg->max_value = val - 1; 2338 true_reg->max_value = val - 1;
2339 true_reg->value_from_signed = value_from_signed;
2200 break; 2340 break;
2201 case BPF_JGE: 2341 case BPF_JGE:
2202 /* Unsigned comparison, the minimum value is 0. */ 2342 value_from_signed = false;
2203 true_reg->min_value = 0;
2204 /* fallthrough */ 2343 /* fallthrough */
2205 case BPF_JSGE: 2344 case BPF_JSGE:
2345 if (true_reg->value_from_signed != value_from_signed)
2346 reset_reg_range_values(true_reg, 0);
2347 if (false_reg->value_from_signed != value_from_signed)
2348 reset_reg_range_values(false_reg, 0);
2349 if (opcode == BPF_JGE) {
2350 /* Unsigned comparison, the minimum value is 0. */
2351 true_reg->min_value = 0;
2352 }
2206 /* If this is false then constant < register, if it is true then 2353 /* If this is false then constant < register, if it is true then
2207 * the register < constant. 2354 * the register < constant.
2208 */ 2355 */
2209 false_reg->min_value = val + 1; 2356 false_reg->min_value = val + 1;
2357 false_reg->value_from_signed = value_from_signed;
2210 true_reg->max_value = val; 2358 true_reg->max_value = val;
2359 true_reg->value_from_signed = value_from_signed;
2211 break; 2360 break;
2212 default: 2361 default:
2213 break; 2362 break;
@@ -2215,6 +2364,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
2215 2364
2216 check_reg_overflow(false_reg); 2365 check_reg_overflow(false_reg);
2217 check_reg_overflow(true_reg); 2366 check_reg_overflow(true_reg);
2367 if (is_range) {
2368 if (__is_pointer_value(false, false_reg))
2369 reset_reg_range_values(false_reg, 0);
2370 if (__is_pointer_value(false, true_reg))
2371 reset_reg_range_values(true_reg, 0);
2372 }
2218} 2373}
2219 2374
2220static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, 2375static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
@@ -2402,6 +2557,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
2402 2557
2403 regs[insn->dst_reg].type = CONST_IMM; 2558 regs[insn->dst_reg].type = CONST_IMM;
2404 regs[insn->dst_reg].imm = imm; 2559 regs[insn->dst_reg].imm = imm;
2560 regs[insn->dst_reg].id = 0;
2405 return 0; 2561 return 0;
2406 } 2562 }
2407 2563
@@ -2444,7 +2600,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
2444{ 2600{
2445 struct bpf_reg_state *regs = env->cur_state.regs; 2601 struct bpf_reg_state *regs = env->cur_state.regs;
2446 u8 mode = BPF_MODE(insn->code); 2602 u8 mode = BPF_MODE(insn->code);
2447 struct bpf_reg_state *reg;
2448 int i, err; 2603 int i, err;
2449 2604
2450 if (!may_access_skb(env->prog->type)) { 2605 if (!may_access_skb(env->prog->type)) {
@@ -2477,11 +2632,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
2477 } 2632 }
2478 2633
2479 /* reset caller saved regs to unreadable */ 2634 /* reset caller saved regs to unreadable */
2480 for (i = 0; i < CALLER_SAVED_REGS; i++) { 2635 for (i = 0; i < CALLER_SAVED_REGS; i++)
2481 reg = regs + caller_saved[i]; 2636 mark_reg_not_init(regs, caller_saved[i]);
2482 reg->type = NOT_INIT;
2483 reg->imm = 0;
2484 }
2485 2637
2486 /* mark destination R0 register as readable, since it contains 2638 /* mark destination R0 register as readable, since it contains
2487 * the value fetched from the packet 2639 * the value fetched from the packet
@@ -2692,7 +2844,8 @@ err_free:
2692/* the following conditions reduce the number of explored insns 2844/* the following conditions reduce the number of explored insns
2693 * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet 2845 * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
2694 */ 2846 */
2695static bool compare_ptrs_to_packet(struct bpf_reg_state *old, 2847static bool compare_ptrs_to_packet(struct bpf_verifier_env *env,
2848 struct bpf_reg_state *old,
2696 struct bpf_reg_state *cur) 2849 struct bpf_reg_state *cur)
2697{ 2850{
2698 if (old->id != cur->id) 2851 if (old->id != cur->id)
@@ -2735,7 +2888,7 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old,
2735 * 'if (R4 > data_end)' and all further insn were already good with r=20, 2888 * 'if (R4 > data_end)' and all further insn were already good with r=20,
2736 * so they will be good with r=30 and we can prune the search. 2889 * so they will be good with r=30 and we can prune the search.
2737 */ 2890 */
2738 if (old->off <= cur->off && 2891 if (!env->strict_alignment && old->off <= cur->off &&
2739 old->off >= old->range && cur->off >= cur->range) 2892 old->off >= old->range && cur->off >= cur->range)
2740 return true; 2893 return true;
2741 2894
@@ -2806,7 +2959,7 @@ static bool states_equal(struct bpf_verifier_env *env,
2806 continue; 2959 continue;
2807 2960
2808 if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && 2961 if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
2809 compare_ptrs_to_packet(rold, rcur)) 2962 compare_ptrs_to_packet(env, rold, rcur))
2810 continue; 2963 continue;
2811 2964
2812 return false; 2965 return false;
@@ -2824,6 +2977,8 @@ static bool states_equal(struct bpf_verifier_env *env,
2824 return false; 2977 return false;
2825 if (i % BPF_REG_SIZE) 2978 if (i % BPF_REG_SIZE)
2826 continue; 2979 continue;
2980 if (old->stack_slot_type[i] != STACK_SPILL)
2981 continue;
2827 if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], 2982 if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
2828 &cur->spilled_regs[i / BPF_REG_SIZE], 2983 &cur->spilled_regs[i / BPF_REG_SIZE],
2829 sizeof(old->spilled_regs[0]))) 2984 sizeof(old->spilled_regs[0])))
@@ -2985,18 +3140,12 @@ static int do_check(struct bpf_verifier_env *env)
2985 /* check that memory (src_reg + off) is readable, 3140 /* check that memory (src_reg + off) is readable,
2986 * the state of dst_reg will be updated by this func 3141 * the state of dst_reg will be updated by this func
2987 */ 3142 */
2988 err = check_mem_access(env, insn->src_reg, insn->off, 3143 err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,
2989 BPF_SIZE(insn->code), BPF_READ, 3144 BPF_SIZE(insn->code), BPF_READ,
2990 insn->dst_reg); 3145 insn->dst_reg);
2991 if (err) 3146 if (err)
2992 return err; 3147 return err;
2993 3148
2994 if (BPF_SIZE(insn->code) != BPF_W &&
2995 BPF_SIZE(insn->code) != BPF_DW) {
2996 insn_idx++;
2997 continue;
2998 }
2999
3000 prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; 3149 prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
3001 3150
3002 if (*prev_src_type == NOT_INIT) { 3151 if (*prev_src_type == NOT_INIT) {
@@ -3024,7 +3173,7 @@ static int do_check(struct bpf_verifier_env *env)
3024 enum bpf_reg_type *prev_dst_type, dst_reg_type; 3173 enum bpf_reg_type *prev_dst_type, dst_reg_type;
3025 3174
3026 if (BPF_MODE(insn->code) == BPF_XADD) { 3175 if (BPF_MODE(insn->code) == BPF_XADD) {
3027 err = check_xadd(env, insn); 3176 err = check_xadd(env, insn_idx, insn);
3028 if (err) 3177 if (err)
3029 return err; 3178 return err;
3030 insn_idx++; 3179 insn_idx++;
@@ -3043,7 +3192,7 @@ static int do_check(struct bpf_verifier_env *env)
3043 dst_reg_type = regs[insn->dst_reg].type; 3192 dst_reg_type = regs[insn->dst_reg].type;
3044 3193
3045 /* check that memory (dst_reg + off) is writeable */ 3194 /* check that memory (dst_reg + off) is writeable */
3046 err = check_mem_access(env, insn->dst_reg, insn->off, 3195 err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
3047 BPF_SIZE(insn->code), BPF_WRITE, 3196 BPF_SIZE(insn->code), BPF_WRITE,
3048 insn->src_reg); 3197 insn->src_reg);
3049 if (err) 3198 if (err)
@@ -3072,7 +3221,7 @@ static int do_check(struct bpf_verifier_env *env)
3072 return err; 3221 return err;
3073 3222
3074 /* check that memory (dst_reg + off) is writeable */ 3223 /* check that memory (dst_reg + off) is writeable */
3075 err = check_mem_access(env, insn->dst_reg, insn->off, 3224 err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
3076 BPF_SIZE(insn->code), BPF_WRITE, 3225 BPF_SIZE(insn->code), BPF_WRITE,
3077 -1); 3226 -1);
3078 if (err) 3227 if (err)
@@ -3170,7 +3319,8 @@ process_bpf_exit:
3170 insn_idx++; 3319 insn_idx++;
3171 } 3320 }
3172 3321
3173 verbose("processed %d insns\n", insn_processed); 3322 verbose("processed %d insns, stack depth %d\n",
3323 insn_processed, env->prog->aux->stack_depth);
3174 return 0; 3324 return 0;
3175} 3325}
3176 3326
@@ -3370,11 +3520,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
3370static int convert_ctx_accesses(struct bpf_verifier_env *env) 3520static int convert_ctx_accesses(struct bpf_verifier_env *env)
3371{ 3521{
3372 const struct bpf_verifier_ops *ops = env->prog->aux->ops; 3522 const struct bpf_verifier_ops *ops = env->prog->aux->ops;
3523 int i, cnt, size, ctx_field_size, delta = 0;
3373 const int insn_cnt = env->prog->len; 3524 const int insn_cnt = env->prog->len;
3374 struct bpf_insn insn_buf[16], *insn; 3525 struct bpf_insn insn_buf[16], *insn;
3375 struct bpf_prog *new_prog; 3526 struct bpf_prog *new_prog;
3376 enum bpf_access_type type; 3527 enum bpf_access_type type;
3377 int i, cnt, delta = 0; 3528 bool is_narrower_load;
3529 u32 target_size;
3378 3530
3379 if (ops->gen_prologue) { 3531 if (ops->gen_prologue) {
3380 cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, 3532 cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
@@ -3414,12 +3566,52 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
3414 if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) 3566 if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
3415 continue; 3567 continue;
3416 3568
3417 cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); 3569 ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
3418 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { 3570 size = BPF_LDST_BYTES(insn);
3571
3572 /* If the read access is a narrower load of the field,
3573 * convert to a 4/8-byte load, to minimum program type specific
3574 * convert_ctx_access changes. If conversion is successful,
3575 * we will apply proper mask to the result.
3576 */
3577 is_narrower_load = size < ctx_field_size;
3578 if (is_narrower_load) {
3579 u32 off = insn->off;
3580 u8 size_code;
3581
3582 if (type == BPF_WRITE) {
3583 verbose("bpf verifier narrow ctx access misconfigured\n");
3584 return -EINVAL;
3585 }
3586
3587 size_code = BPF_H;
3588 if (ctx_field_size == 4)
3589 size_code = BPF_W;
3590 else if (ctx_field_size == 8)
3591 size_code = BPF_DW;
3592
3593 insn->off = off & ~(ctx_field_size - 1);
3594 insn->code = BPF_LDX | BPF_MEM | size_code;
3595 }
3596
3597 target_size = 0;
3598 cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
3599 &target_size);
3600 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
3601 (ctx_field_size && !target_size)) {
3419 verbose("bpf verifier is misconfigured\n"); 3602 verbose("bpf verifier is misconfigured\n");
3420 return -EINVAL; 3603 return -EINVAL;
3421 } 3604 }
3422 3605
3606 if (is_narrower_load && size < target_size) {
3607 if (ctx_field_size <= 4)
3608 insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
3609 (1 << size * 8) - 1);
3610 else
3611 insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
3612 (1 << size * 8) - 1);
3613 }
3614
3423 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); 3615 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
3424 if (!new_prog) 3616 if (!new_prog)
3425 return -ENOMEM; 3617 return -ENOMEM;
@@ -3465,6 +3657,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
3465 * the program array. 3657 * the program array.
3466 */ 3658 */
3467 prog->cb_access = 1; 3659 prog->cb_access = 1;
3660 env->prog->aux->stack_depth = MAX_BPF_STACK;
3468 3661
3469 /* mark bpf_tail_call as different opcode to avoid 3662 /* mark bpf_tail_call as different opcode to avoid
3470 * conditional branch in the interpeter for every normal 3663 * conditional branch in the interpeter for every normal
@@ -3472,7 +3665,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
3472 * that doesn't support bpf_tail_call yet 3665 * that doesn't support bpf_tail_call yet
3473 */ 3666 */
3474 insn->imm = 0; 3667 insn->imm = 0;
3475 insn->code |= BPF_X; 3668 insn->code = BPF_JMP | BPF_TAIL_CALL;
3476 continue; 3669 continue;
3477 } 3670 }
3478 3671
@@ -3584,10 +3777,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
3584 } else { 3777 } else {
3585 log_level = 0; 3778 log_level = 0;
3586 } 3779 }
3587 if (attr->prog_flags & BPF_F_STRICT_ALIGNMENT) 3780
3781 env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
3782 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
3588 env->strict_alignment = true; 3783 env->strict_alignment = true;
3589 else
3590 env->strict_alignment = false;
3591 3784
3592 ret = replace_map_fd_with_map_ptr(env); 3785 ret = replace_map_fd_with_map_ptr(env);
3593 if (ret < 0) 3786 if (ret < 0)
@@ -3693,7 +3886,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
3693 mutex_lock(&bpf_verifier_lock); 3886 mutex_lock(&bpf_verifier_lock);
3694 3887
3695 log_level = 0; 3888 log_level = 0;
3889
3696 env->strict_alignment = false; 3890 env->strict_alignment = false;
3891 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
3892 env->strict_alignment = true;
3697 3893
3698 env->explored_states = kcalloc(env->prog->len, 3894 env->explored_states = kcalloc(env->prog->len,
3699 sizeof(struct bpf_verifier_state_list *), 3895 sizeof(struct bpf_verifier_state_list *),
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 387348a40c64..ce693ccb8c58 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
4obj-$(CONFIG_CGROUP_PIDS) += pids.o 4obj-$(CONFIG_CGROUP_PIDS) += pids.o
5obj-$(CONFIG_CGROUP_RDMA) += rdma.o 5obj-$(CONFIG_CGROUP_RDMA) += rdma.o
6obj-$(CONFIG_CPUSETS) += cpuset.o 6obj-$(CONFIG_CPUSETS) += cpuset.o
7obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 00f4d6bf048f..793565c05742 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn);
192int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, 192int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
193 struct kernfs_root *kf_root); 193 struct kernfs_root *kf_root);
194 194
195int cgroup_task_count(const struct cgroup *cgrp);
196
195/* 197/*
196 * namespace.c 198 * namespace.c
197 */ 199 */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 85d75152402d..7bf4b1533f34 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -334,19 +334,15 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
334/** 334/**
335 * cgroup_task_count - count the number of tasks in a cgroup. 335 * cgroup_task_count - count the number of tasks in a cgroup.
336 * @cgrp: the cgroup in question 336 * @cgrp: the cgroup in question
337 *
338 * Return the number of tasks in the cgroup. The returned number can be
339 * higher than the actual number of tasks due to css_set references from
340 * namespace roots and temporary usages.
341 */ 337 */
342static int cgroup_task_count(const struct cgroup *cgrp) 338int cgroup_task_count(const struct cgroup *cgrp)
343{ 339{
344 int count = 0; 340 int count = 0;
345 struct cgrp_cset_link *link; 341 struct cgrp_cset_link *link;
346 342
347 spin_lock_irq(&css_set_lock); 343 spin_lock_irq(&css_set_lock);
348 list_for_each_entry(link, &cgrp->cset_links, cset_link) 344 list_for_each_entry(link, &cgrp->cset_links, cset_link)
349 count += refcount_read(&link->cset->refcount); 345 count += link->cset->nr_tasks;
350 spin_unlock_irq(&css_set_lock); 346 spin_unlock_irq(&css_set_lock);
351 return count; 347 return count;
352} 348}
@@ -1263,150 +1259,3 @@ static int __init cgroup_no_v1(char *str)
1263 return 1; 1259 return 1;
1264} 1260}
1265__setup("cgroup_no_v1=", cgroup_no_v1); 1261__setup("cgroup_no_v1=", cgroup_no_v1);
1266
1267
1268#ifdef CONFIG_CGROUP_DEBUG
1269static struct cgroup_subsys_state *
1270debug_css_alloc(struct cgroup_subsys_state *parent_css)
1271{
1272 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
1273
1274 if (!css)
1275 return ERR_PTR(-ENOMEM);
1276
1277 return css;
1278}
1279
1280static void debug_css_free(struct cgroup_subsys_state *css)
1281{
1282 kfree(css);
1283}
1284
1285static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
1286 struct cftype *cft)
1287{
1288 return cgroup_task_count(css->cgroup);
1289}
1290
1291static u64 current_css_set_read(struct cgroup_subsys_state *css,
1292 struct cftype *cft)
1293{
1294 return (u64)(unsigned long)current->cgroups;
1295}
1296
1297static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
1298 struct cftype *cft)
1299{
1300 u64 count;
1301
1302 rcu_read_lock();
1303 count = refcount_read(&task_css_set(current)->refcount);
1304 rcu_read_unlock();
1305 return count;
1306}
1307
1308static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
1309{
1310 struct cgrp_cset_link *link;
1311 struct css_set *cset;
1312 char *name_buf;
1313
1314 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
1315 if (!name_buf)
1316 return -ENOMEM;
1317
1318 spin_lock_irq(&css_set_lock);
1319 rcu_read_lock();
1320 cset = rcu_dereference(current->cgroups);
1321 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1322 struct cgroup *c = link->cgrp;
1323
1324 cgroup_name(c, name_buf, NAME_MAX + 1);
1325 seq_printf(seq, "Root %d group %s\n",
1326 c->root->hierarchy_id, name_buf);
1327 }
1328 rcu_read_unlock();
1329 spin_unlock_irq(&css_set_lock);
1330 kfree(name_buf);
1331 return 0;
1332}
1333
1334#define MAX_TASKS_SHOWN_PER_CSS 25
1335static int cgroup_css_links_read(struct seq_file *seq, void *v)
1336{
1337 struct cgroup_subsys_state *css = seq_css(seq);
1338 struct cgrp_cset_link *link;
1339
1340 spin_lock_irq(&css_set_lock);
1341 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
1342 struct css_set *cset = link->cset;
1343 struct task_struct *task;
1344 int count = 0;
1345
1346 seq_printf(seq, "css_set %pK\n", cset);
1347
1348 list_for_each_entry(task, &cset->tasks, cg_list) {
1349 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
1350 goto overflow;
1351 seq_printf(seq, " task %d\n", task_pid_vnr(task));
1352 }
1353
1354 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
1355 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
1356 goto overflow;
1357 seq_printf(seq, " task %d\n", task_pid_vnr(task));
1358 }
1359 continue;
1360 overflow:
1361 seq_puts(seq, " ...\n");
1362 }
1363 spin_unlock_irq(&css_set_lock);
1364 return 0;
1365}
1366
1367static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
1368{
1369 return (!cgroup_is_populated(css->cgroup) &&
1370 !css_has_online_children(&css->cgroup->self));
1371}
1372
1373static struct cftype debug_files[] = {
1374 {
1375 .name = "taskcount",
1376 .read_u64 = debug_taskcount_read,
1377 },
1378
1379 {
1380 .name = "current_css_set",
1381 .read_u64 = current_css_set_read,
1382 },
1383
1384 {
1385 .name = "current_css_set_refcount",
1386 .read_u64 = current_css_set_refcount_read,
1387 },
1388
1389 {
1390 .name = "current_css_set_cg_links",
1391 .seq_show = current_css_set_cg_links_read,
1392 },
1393
1394 {
1395 .name = "cgroup_css_links",
1396 .seq_show = cgroup_css_links_read,
1397 },
1398
1399 {
1400 .name = "releasable",
1401 .read_u64 = releasable_read,
1402 },
1403
1404 { } /* terminate */
1405};
1406
1407struct cgroup_subsys debug_cgrp_subsys = {
1408 .css_alloc = debug_css_alloc,
1409 .css_free = debug_css_free,
1410 .legacy_cftypes = debug_files,
1411};
1412#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c3c9a0e1b3c9..620794a20a33 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -573,6 +573,11 @@ static int css_set_count = 1; /* 1 for init_css_set */
573/** 573/**
574 * css_set_populated - does a css_set contain any tasks? 574 * css_set_populated - does a css_set contain any tasks?
575 * @cset: target css_set 575 * @cset: target css_set
576 *
577 * css_set_populated() should be the same as !!cset->nr_tasks at steady
578 * state. However, css_set_populated() can be called while a task is being
579 * added to or removed from the linked list before the nr_tasks is
580 * properly updated. Hence, we can't just look at ->nr_tasks here.
576 */ 581 */
577static bool css_set_populated(struct css_set *cset) 582static bool css_set_populated(struct css_set *cset)
578{ 583{
@@ -1542,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1542 return len; 1547 return len;
1543} 1548}
1544 1549
1550static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
1551{
1552 char *token;
1553
1554 *root_flags = 0;
1555
1556 if (!data)
1557 return 0;
1558
1559 while ((token = strsep(&data, ",")) != NULL) {
1560 if (!strcmp(token, "nsdelegate")) {
1561 *root_flags |= CGRP_ROOT_NS_DELEGATE;
1562 continue;
1563 }
1564
1565 pr_err("cgroup2: unknown option \"%s\"\n", token);
1566 return -EINVAL;
1567 }
1568
1569 return 0;
1570}
1571
1572static void apply_cgroup_root_flags(unsigned int root_flags)
1573{
1574 if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
1575 if (root_flags & CGRP_ROOT_NS_DELEGATE)
1576 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1577 else
1578 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1579 }
1580}
1581
1582static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
1583{
1584 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1585 seq_puts(seq, ",nsdelegate");
1586 return 0;
1587}
1588
1545static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) 1589static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1546{ 1590{
1547 pr_err("remount is not allowed\n"); 1591 unsigned int root_flags;
1548 return -EINVAL; 1592 int ret;
1593
1594 ret = parse_cgroup_root_flags(data, &root_flags);
1595 if (ret)
1596 return ret;
1597
1598 apply_cgroup_root_flags(root_flags);
1599 return 0;
1549} 1600}
1550 1601
1551/* 1602/*
@@ -1598,6 +1649,7 @@ static void cgroup_enable_task_cg_lists(void)
1598 css_set_update_populated(cset, true); 1649 css_set_update_populated(cset, true);
1599 list_add_tail(&p->cg_list, &cset->tasks); 1650 list_add_tail(&p->cg_list, &cset->tasks);
1600 get_css_set(cset); 1651 get_css_set(cset);
1652 cset->nr_tasks++;
1601 } 1653 }
1602 spin_unlock(&p->sighand->siglock); 1654 spin_unlock(&p->sighand->siglock);
1603 } while_each_thread(g, p); 1655 } while_each_thread(g, p);
@@ -1784,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1784{ 1836{
1785 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; 1837 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
1786 struct dentry *dentry; 1838 struct dentry *dentry;
1839 int ret;
1787 1840
1788 get_cgroup_ns(ns); 1841 get_cgroup_ns(ns);
1789 1842
@@ -1801,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1801 cgroup_enable_task_cg_lists(); 1854 cgroup_enable_task_cg_lists();
1802 1855
1803 if (fs_type == &cgroup2_fs_type) { 1856 if (fs_type == &cgroup2_fs_type) {
1804 if (data) { 1857 unsigned int root_flags;
1805 pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); 1858
1859 ret = parse_cgroup_root_flags(data, &root_flags);
1860 if (ret) {
1806 put_cgroup_ns(ns); 1861 put_cgroup_ns(ns);
1807 return ERR_PTR(-EINVAL); 1862 return ERR_PTR(ret);
1808 } 1863 }
1864
1809 cgrp_dfl_visible = true; 1865 cgrp_dfl_visible = true;
1810 cgroup_get_live(&cgrp_dfl_root.cgrp); 1866 cgroup_get_live(&cgrp_dfl_root.cgrp);
1811 1867
1812 dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, 1868 dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
1813 CGROUP2_SUPER_MAGIC, ns); 1869 CGROUP2_SUPER_MAGIC, ns);
1870 if (!IS_ERR(dentry))
1871 apply_cgroup_root_flags(root_flags);
1814 } else { 1872 } else {
1815 dentry = cgroup1_mount(&cgroup_fs_type, flags, data, 1873 dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
1816 CGROUP_SUPER_MAGIC, ns); 1874 CGROUP_SUPER_MAGIC, ns);
@@ -2064,8 +2122,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
2064 struct css_set *to_cset = cset->mg_dst_cset; 2122 struct css_set *to_cset = cset->mg_dst_cset;
2065 2123
2066 get_css_set(to_cset); 2124 get_css_set(to_cset);
2125 to_cset->nr_tasks++;
2067 css_set_move_task(task, from_cset, to_cset, true); 2126 css_set_move_task(task, from_cset, to_cset, true);
2068 put_css_set_locked(from_cset); 2127 put_css_set_locked(from_cset);
2128 from_cset->nr_tasks--;
2069 } 2129 }
2070 } 2130 }
2071 spin_unlock_irq(&css_set_lock); 2131 spin_unlock_irq(&css_set_lock);
@@ -2355,27 +2415,14 @@ static int cgroup_procs_write_permission(struct task_struct *task,
2355 struct cgroup *dst_cgrp, 2415 struct cgroup *dst_cgrp,
2356 struct kernfs_open_file *of) 2416 struct kernfs_open_file *of)
2357{ 2417{
2358 int ret = 0; 2418 struct super_block *sb = of->file->f_path.dentry->d_sb;
2359 2419 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2360 if (cgroup_on_dfl(dst_cgrp)) { 2420 struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
2361 struct super_block *sb = of->file->f_path.dentry->d_sb; 2421 struct cgroup *src_cgrp, *com_cgrp;
2362 struct cgroup *cgrp; 2422 struct inode *inode;
2363 struct inode *inode; 2423 int ret;
2364
2365 spin_lock_irq(&css_set_lock);
2366 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2367 spin_unlock_irq(&css_set_lock);
2368
2369 while (!cgroup_is_descendant(dst_cgrp, cgrp))
2370 cgrp = cgroup_parent(cgrp);
2371 2424
2372 ret = -ENOMEM; 2425 if (!cgroup_on_dfl(dst_cgrp)) {
2373 inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
2374 if (inode) {
2375 ret = inode_permission(inode, MAY_WRITE);
2376 iput(inode);
2377 }
2378 } else {
2379 const struct cred *cred = current_cred(); 2426 const struct cred *cred = current_cred();
2380 const struct cred *tcred = get_task_cred(task); 2427 const struct cred *tcred = get_task_cred(task);
2381 2428
@@ -2383,14 +2430,47 @@ static int cgroup_procs_write_permission(struct task_struct *task,
2383 * even if we're attaching all tasks in the thread group, 2430 * even if we're attaching all tasks in the thread group,
2384 * we only need to check permissions on one of them. 2431 * we only need to check permissions on one of them.
2385 */ 2432 */
2386 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 2433 if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
2387 !uid_eq(cred->euid, tcred->uid) && 2434 uid_eq(cred->euid, tcred->uid) ||
2388 !uid_eq(cred->euid, tcred->suid)) 2435 uid_eq(cred->euid, tcred->suid))
2436 ret = 0;
2437 else
2389 ret = -EACCES; 2438 ret = -EACCES;
2439
2390 put_cred(tcred); 2440 put_cred(tcred);
2441 return ret;
2391 } 2442 }
2392 2443
2393 return ret; 2444 /* find the source cgroup */
2445 spin_lock_irq(&css_set_lock);
2446 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2447 spin_unlock_irq(&css_set_lock);
2448
2449 /* and the common ancestor */
2450 com_cgrp = src_cgrp;
2451 while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
2452 com_cgrp = cgroup_parent(com_cgrp);
2453
2454 /* %current should be authorized to migrate to the common ancestor */
2455 inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
2456 if (!inode)
2457 return -ENOMEM;
2458
2459 ret = inode_permission(inode, MAY_WRITE);
2460 iput(inode);
2461 if (ret)
2462 return ret;
2463
2464 /*
2465 * If namespaces are delegation boundaries, %current must be able
2466 * to see both source and destination cgroups from its namespace.
2467 */
2468 if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
2469 (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
2470 !cgroup_is_descendant(dst_cgrp, root_cgrp)))
2471 return -ENOENT;
2472
2473 return 0;
2394} 2474}
2395 2475
2396/* 2476/*
@@ -2954,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of)
2954static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 3034static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2955 size_t nbytes, loff_t off) 3035 size_t nbytes, loff_t off)
2956{ 3036{
3037 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2957 struct cgroup *cgrp = of->kn->parent->priv; 3038 struct cgroup *cgrp = of->kn->parent->priv;
2958 struct cftype *cft = of->kn->priv; 3039 struct cftype *cft = of->kn->priv;
2959 struct cgroup_subsys_state *css; 3040 struct cgroup_subsys_state *css;
2960 int ret; 3041 int ret;
2961 3042
3043 /*
3044 * If namespaces are delegation boundaries, disallow writes to
3045 * files in an non-init namespace root from inside the namespace
3046 * except for the files explicitly marked delegatable -
3047 * cgroup.procs and cgroup.subtree_control.
3048 */
3049 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
3050 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3051 ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3052 return -EPERM;
3053
2962 if (cft->write) 3054 if (cft->write)
2963 return cft->write(of, buf, nbytes, off); 3055 return cft->write(of, buf, nbytes, off);
2964 3056
@@ -3792,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
3792static struct cftype cgroup_base_files[] = { 3884static struct cftype cgroup_base_files[] = {
3793 { 3885 {
3794 .name = "cgroup.procs", 3886 .name = "cgroup.procs",
3887 .flags = CFTYPE_NS_DELEGATABLE,
3795 .file_offset = offsetof(struct cgroup, procs_file), 3888 .file_offset = offsetof(struct cgroup, procs_file),
3796 .release = cgroup_procs_release, 3889 .release = cgroup_procs_release,
3797 .seq_start = cgroup_procs_start, 3890 .seq_start = cgroup_procs_start,
@@ -3805,6 +3898,7 @@ static struct cftype cgroup_base_files[] = {
3805 }, 3898 },
3806 { 3899 {
3807 .name = "cgroup.subtree_control", 3900 .name = "cgroup.subtree_control",
3901 .flags = CFTYPE_NS_DELEGATABLE,
3808 .seq_show = cgroup_subtree_control_show, 3902 .seq_show = cgroup_subtree_control_show,
3809 .write = cgroup_subtree_control_write, 3903 .write = cgroup_subtree_control_write,
3810 }, 3904 },
@@ -4265,6 +4359,11 @@ static void kill_css(struct cgroup_subsys_state *css)
4265{ 4359{
4266 lockdep_assert_held(&cgroup_mutex); 4360 lockdep_assert_held(&cgroup_mutex);
4267 4361
4362 if (css->flags & CSS_DYING)
4363 return;
4364
4365 css->flags |= CSS_DYING;
4366
4268 /* 4367 /*
4269 * This must happen before css is disassociated with its cgroup. 4368 * This must happen before css is disassociated with its cgroup.
4270 * See seq_css() for details. 4369 * See seq_css() for details.
@@ -4388,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn)
4388} 4487}
4389 4488
4390static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { 4489static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4490 .show_options = cgroup_show_options,
4391 .remount_fs = cgroup_remount, 4491 .remount_fs = cgroup_remount,
4392 .mkdir = cgroup_mkdir, 4492 .mkdir = cgroup_mkdir,
4393 .rmdir = cgroup_rmdir, 4493 .rmdir = cgroup_rmdir,
@@ -4784,6 +4884,7 @@ void cgroup_post_fork(struct task_struct *child)
4784 cset = task_css_set(current); 4884 cset = task_css_set(current);
4785 if (list_empty(&child->cg_list)) { 4885 if (list_empty(&child->cg_list)) {
4786 get_css_set(cset); 4886 get_css_set(cset);
4887 cset->nr_tasks++;
4787 css_set_move_task(child, NULL, cset, false); 4888 css_set_move_task(child, NULL, cset, false);
4788 } 4889 }
4789 spin_unlock_irq(&css_set_lock); 4890 spin_unlock_irq(&css_set_lock);
@@ -4833,6 +4934,7 @@ void cgroup_exit(struct task_struct *tsk)
4833 if (!list_empty(&tsk->cg_list)) { 4934 if (!list_empty(&tsk->cg_list)) {
4834 spin_lock_irq(&css_set_lock); 4935 spin_lock_irq(&css_set_lock);
4835 css_set_move_task(tsk, cset, NULL, false); 4936 css_set_move_task(tsk, cset, NULL, false);
4937 cset->nr_tasks--;
4836 spin_unlock_irq(&css_set_lock); 4938 spin_unlock_irq(&css_set_lock);
4837 } else { 4939 } else {
4838 get_css_set(cset); 4940 get_css_set(cset);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f6501f4f6040..ca8376e5008c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -176,9 +176,9 @@ typedef enum {
176} cpuset_flagbits_t; 176} cpuset_flagbits_t;
177 177
178/* convenient tests for these bits */ 178/* convenient tests for these bits */
179static inline bool is_cpuset_online(const struct cpuset *cs) 179static inline bool is_cpuset_online(struct cpuset *cs)
180{ 180{
181 return test_bit(CS_ONLINE, &cs->flags); 181 return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
182} 182}
183 183
184static inline int is_cpu_exclusive(const struct cpuset *cs) 184static inline int is_cpu_exclusive(const struct cpuset *cs)
@@ -1038,40 +1038,25 @@ static void cpuset_post_attach(void)
1038 * @tsk: the task to change 1038 * @tsk: the task to change
1039 * @newmems: new nodes that the task will be set 1039 * @newmems: new nodes that the task will be set
1040 * 1040 *
1041 * In order to avoid seeing no nodes if the old and new nodes are disjoint, 1041 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
1042 * we structure updates as setting all new allowed nodes, then clearing newly 1042 * and rebind an eventual tasks' mempolicy. If the task is allocating in
1043 * disallowed ones. 1043 * parallel, it might temporarily see an empty intersection, which results in
1044 * a seqlock check and retry before OOM or allocation failure.
1044 */ 1045 */
1045static void cpuset_change_task_nodemask(struct task_struct *tsk, 1046static void cpuset_change_task_nodemask(struct task_struct *tsk,
1046 nodemask_t *newmems) 1047 nodemask_t *newmems)
1047{ 1048{
1048 bool need_loop;
1049
1050 task_lock(tsk); 1049 task_lock(tsk);
1051 /*
1052 * Determine if a loop is necessary if another thread is doing
1053 * read_mems_allowed_begin(). If at least one node remains unchanged and
1054 * tsk does not have a mempolicy, then an empty nodemask will not be
1055 * possible when mems_allowed is larger than a word.
1056 */
1057 need_loop = task_has_mempolicy(tsk) ||
1058 !nodes_intersects(*newmems, tsk->mems_allowed);
1059 1050
1060 if (need_loop) { 1051 local_irq_disable();
1061 local_irq_disable(); 1052 write_seqcount_begin(&tsk->mems_allowed_seq);
1062 write_seqcount_begin(&tsk->mems_allowed_seq);
1063 }
1064 1053
1065 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 1054 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1066 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); 1055 mpol_rebind_task(tsk, newmems);
1067
1068 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
1069 tsk->mems_allowed = *newmems; 1056 tsk->mems_allowed = *newmems;
1070 1057
1071 if (need_loop) { 1058 write_seqcount_end(&tsk->mems_allowed_seq);
1072 write_seqcount_end(&tsk->mems_allowed_seq); 1059 local_irq_enable();
1073 local_irq_enable();
1074 }
1075 1060
1076 task_unlock(tsk); 1061 task_unlock(tsk);
1077} 1062}
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
new file mode 100644
index 000000000000..dac46af22782
--- /dev/null
+++ b/kernel/cgroup/debug.c
@@ -0,0 +1,357 @@
1/*
2 * Debug controller
3 *
4 * WARNING: This controller is for cgroup core debugging only.
5 * Its interfaces are unstable and subject to changes at any time.
6 */
7#include <linux/ctype.h>
8#include <linux/mm.h>
9#include <linux/slab.h>
10
11#include "cgroup-internal.h"
12
13static struct cgroup_subsys_state *
14debug_css_alloc(struct cgroup_subsys_state *parent_css)
15{
16 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
17
18 if (!css)
19 return ERR_PTR(-ENOMEM);
20
21 return css;
22}
23
24static void debug_css_free(struct cgroup_subsys_state *css)
25{
26 kfree(css);
27}
28
29/*
30 * debug_taskcount_read - return the number of tasks in a cgroup.
31 * @cgrp: the cgroup in question
32 */
33static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
34 struct cftype *cft)
35{
36 return cgroup_task_count(css->cgroup);
37}
38
39static int current_css_set_read(struct seq_file *seq, void *v)
40{
41 struct kernfs_open_file *of = seq->private;
42 struct css_set *cset;
43 struct cgroup_subsys *ss;
44 struct cgroup_subsys_state *css;
45 int i, refcnt;
46
47 if (!cgroup_kn_lock_live(of->kn, false))
48 return -ENODEV;
49
50 spin_lock_irq(&css_set_lock);
51 rcu_read_lock();
52 cset = rcu_dereference(current->cgroups);
53 refcnt = refcount_read(&cset->refcount);
54 seq_printf(seq, "css_set %pK %d", cset, refcnt);
55 if (refcnt > cset->nr_tasks)
56 seq_printf(seq, " +%d", refcnt - cset->nr_tasks);
57 seq_puts(seq, "\n");
58
59 /*
60 * Print the css'es stored in the current css_set.
61 */
62 for_each_subsys(ss, i) {
63 css = cset->subsys[ss->id];
64 if (!css)
65 continue;
66 seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name,
67 (unsigned long)css, css->id);
68 }
69 rcu_read_unlock();
70 spin_unlock_irq(&css_set_lock);
71 cgroup_kn_unlock(of->kn);
72 return 0;
73}
74
75static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
76 struct cftype *cft)
77{
78 u64 count;
79
80 rcu_read_lock();
81 count = refcount_read(&task_css_set(current)->refcount);
82 rcu_read_unlock();
83 return count;
84}
85
86static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
87{
88 struct cgrp_cset_link *link;
89 struct css_set *cset;
90 char *name_buf;
91
92 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
93 if (!name_buf)
94 return -ENOMEM;
95
96 spin_lock_irq(&css_set_lock);
97 rcu_read_lock();
98 cset = rcu_dereference(current->cgroups);
99 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
100 struct cgroup *c = link->cgrp;
101
102 cgroup_name(c, name_buf, NAME_MAX + 1);
103 seq_printf(seq, "Root %d group %s\n",
104 c->root->hierarchy_id, name_buf);
105 }
106 rcu_read_unlock();
107 spin_unlock_irq(&css_set_lock);
108 kfree(name_buf);
109 return 0;
110}
111
112#define MAX_TASKS_SHOWN_PER_CSS 25
113static int cgroup_css_links_read(struct seq_file *seq, void *v)
114{
115 struct cgroup_subsys_state *css = seq_css(seq);
116 struct cgrp_cset_link *link;
117 int dead_cnt = 0, extra_refs = 0;
118
119 spin_lock_irq(&css_set_lock);
120 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
121 struct css_set *cset = link->cset;
122 struct task_struct *task;
123 int count = 0;
124 int refcnt = refcount_read(&cset->refcount);
125
126 seq_printf(seq, " %d", refcnt);
127 if (refcnt - cset->nr_tasks > 0) {
128 int extra = refcnt - cset->nr_tasks;
129
130 seq_printf(seq, " +%d", extra);
131 /*
132 * Take out the one additional reference in
133 * init_css_set.
134 */
135 if (cset == &init_css_set)
136 extra--;
137 extra_refs += extra;
138 }
139 seq_puts(seq, "\n");
140
141 list_for_each_entry(task, &cset->tasks, cg_list) {
142 if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
143 seq_printf(seq, " task %d\n",
144 task_pid_vnr(task));
145 }
146
147 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
148 if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
149 seq_printf(seq, " task %d\n",
150 task_pid_vnr(task));
151 }
152 /* show # of overflowed tasks */
153 if (count > MAX_TASKS_SHOWN_PER_CSS)
154 seq_printf(seq, " ... (%d)\n",
155 count - MAX_TASKS_SHOWN_PER_CSS);
156
157 if (cset->dead) {
158 seq_puts(seq, " [dead]\n");
159 dead_cnt++;
160 }
161
162 WARN_ON(count != cset->nr_tasks);
163 }
164 spin_unlock_irq(&css_set_lock);
165
166 if (!dead_cnt && !extra_refs)
167 return 0;
168
169 seq_puts(seq, "\n");
170 if (extra_refs)
171 seq_printf(seq, "extra references = %d\n", extra_refs);
172 if (dead_cnt)
173 seq_printf(seq, "dead css_sets = %d\n", dead_cnt);
174
175 return 0;
176}
177
178static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
179{
180 struct kernfs_open_file *of = seq->private;
181 struct cgroup *cgrp;
182 struct cgroup_subsys *ss;
183 struct cgroup_subsys_state *css;
184 char pbuf[16];
185 int i;
186
187 cgrp = cgroup_kn_lock_live(of->kn, false);
188 if (!cgrp)
189 return -ENODEV;
190
191 for_each_subsys(ss, i) {
192 css = rcu_dereference_check(cgrp->subsys[ss->id], true);
193 if (!css)
194 continue;
195
196 pbuf[0] = '\0';
197
198 /* Show the parent CSS if applicable*/
199 if (css->parent)
200 snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
201 css->parent->id);
202 seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name,
203 (unsigned long)css, css->id,
204 atomic_read(&css->online_cnt), pbuf);
205 }
206
207 cgroup_kn_unlock(of->kn);
208 return 0;
209}
210
211static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
212 u16 mask)
213{
214 struct cgroup_subsys *ss;
215 int ssid;
216 bool first = true;
217
218 seq_printf(seq, "%-17s: ", name);
219 for_each_subsys(ss, ssid) {
220 if (!(mask & (1 << ssid)))
221 continue;
222 if (!first)
223 seq_puts(seq, ", ");
224 seq_puts(seq, ss->name);
225 first = false;
226 }
227 seq_putc(seq, '\n');
228}
229
230static int cgroup_masks_read(struct seq_file *seq, void *v)
231{
232 struct kernfs_open_file *of = seq->private;
233 struct cgroup *cgrp;
234
235 cgrp = cgroup_kn_lock_live(of->kn, false);
236 if (!cgrp)
237 return -ENODEV;
238
239 cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
240 cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
241
242 cgroup_kn_unlock(of->kn);
243 return 0;
244}
245
246static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
247{
248 return (!cgroup_is_populated(css->cgroup) &&
249 !css_has_online_children(&css->cgroup->self));
250}
251
/*
 * Debug files exposed on cgroup v1 (legacy) hierarchies.  The
 * CFTYPE_ONLY_ON_ROOT entries report on the *current* task's css_set
 * and are therefore only created on the hierarchy root.
 */
static struct cftype debug_legacy_files[] = {
	{
		.name = "taskcount",
		.read_u64 = debug_taskcount_read,
	},

	{
		.name = "current_css_set",
		.seq_show = current_css_set_read,
		.flags = CFTYPE_ONLY_ON_ROOT,
	},

	{
		.name = "current_css_set_refcount",
		.read_u64 = current_css_set_refcount_read,
		.flags = CFTYPE_ONLY_ON_ROOT,
	},

	{
		.name = "current_css_set_cg_links",
		.seq_show = current_css_set_cg_links_read,
		.flags = CFTYPE_ONLY_ON_ROOT,
	},

	{
		.name = "cgroup_css_links",
		.seq_show = cgroup_css_links_read,
	},

	{
		.name = "cgroup_subsys_states",
		.seq_show = cgroup_subsys_states_read,
	},

	{
		.name = "cgroup_masks",
		.seq_show = cgroup_masks_read,
	},

	{
		.name = "releasable",
		.read_u64 = releasable_read,
	},

	{ }	/* terminate */
};
298
/*
 * Debug files exposed on the cgroup v2 (default) hierarchy.  Mostly the
 * same handlers as the legacy set, but the per-cgroup files use shorter
 * names (css_links/csses/masks) and v2 has no "releasable" file.
 */
static struct cftype debug_files[] = {
	{
		.name = "taskcount",
		.read_u64 = debug_taskcount_read,
	},

	{
		.name = "current_css_set",
		.seq_show = current_css_set_read,
		.flags = CFTYPE_ONLY_ON_ROOT,
	},

	{
		.name = "current_css_set_refcount",
		.read_u64 = current_css_set_refcount_read,
		.flags = CFTYPE_ONLY_ON_ROOT,
	},

	{
		.name = "current_css_set_cg_links",
		.seq_show = current_css_set_cg_links_read,
		.flags = CFTYPE_ONLY_ON_ROOT,
	},

	{
		.name = "css_links",
		.seq_show = cgroup_css_links_read,
	},

	{
		.name = "csses",
		.seq_show = cgroup_subsys_states_read,
	},

	{
		.name = "masks",
		.seq_show = cgroup_masks_read,
	},

	{ }	/* terminate */
};
340
/*
 * The debug controller itself.  Only the legacy (v1) file set is
 * registered here; the v2 set is attached at boot by
 * enable_cgroup_debug() when "cgroup_debug" is passed on the command
 * line.
 */
struct cgroup_subsys debug_cgrp_subsys = {
	.css_alloc = debug_css_alloc,
	.css_free = debug_css_free,
	.legacy_cftypes = debug_legacy_files,
};
346
/*
 * On v2, debug is an implicit controller enabled by "cgroup_debug" boot
 * parameter.
 */
static int __init enable_cgroup_debug(char *str)
{
	/* attach the v2 file set and enable the controller implicitly */
	debug_cgrp_subsys.dfl_cftypes = debug_files;
	debug_cgrp_subsys.implicit_on_dfl = true;
	return 1;	/* parameter consumed */
}
__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/compat.c b/kernel/compat.c
index 933bcb31ae10..6f0a0e723a06 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -30,100 +30,66 @@
30 30
31#include <linux/uaccess.h> 31#include <linux/uaccess.h>
32 32
33static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) 33int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp)
34{ 34{
35 memset(txc, 0, sizeof(struct timex)); 35 struct compat_timex tx32;
36
37 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
38 __get_user(txc->modes, &utp->modes) ||
39 __get_user(txc->offset, &utp->offset) ||
40 __get_user(txc->freq, &utp->freq) ||
41 __get_user(txc->maxerror, &utp->maxerror) ||
42 __get_user(txc->esterror, &utp->esterror) ||
43 __get_user(txc->status, &utp->status) ||
44 __get_user(txc->constant, &utp->constant) ||
45 __get_user(txc->precision, &utp->precision) ||
46 __get_user(txc->tolerance, &utp->tolerance) ||
47 __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
48 __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
49 __get_user(txc->tick, &utp->tick) ||
50 __get_user(txc->ppsfreq, &utp->ppsfreq) ||
51 __get_user(txc->jitter, &utp->jitter) ||
52 __get_user(txc->shift, &utp->shift) ||
53 __get_user(txc->stabil, &utp->stabil) ||
54 __get_user(txc->jitcnt, &utp->jitcnt) ||
55 __get_user(txc->calcnt, &utp->calcnt) ||
56 __get_user(txc->errcnt, &utp->errcnt) ||
57 __get_user(txc->stbcnt, &utp->stbcnt))
58 return -EFAULT;
59 36
60 return 0; 37 if (copy_from_user(&tx32, utp, sizeof(struct compat_timex)))
61}
62
63static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
64{
65 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
66 __put_user(txc->modes, &utp->modes) ||
67 __put_user(txc->offset, &utp->offset) ||
68 __put_user(txc->freq, &utp->freq) ||
69 __put_user(txc->maxerror, &utp->maxerror) ||
70 __put_user(txc->esterror, &utp->esterror) ||
71 __put_user(txc->status, &utp->status) ||
72 __put_user(txc->constant, &utp->constant) ||
73 __put_user(txc->precision, &utp->precision) ||
74 __put_user(txc->tolerance, &utp->tolerance) ||
75 __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
76 __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
77 __put_user(txc->tick, &utp->tick) ||
78 __put_user(txc->ppsfreq, &utp->ppsfreq) ||
79 __put_user(txc->jitter, &utp->jitter) ||
80 __put_user(txc->shift, &utp->shift) ||
81 __put_user(txc->stabil, &utp->stabil) ||
82 __put_user(txc->jitcnt, &utp->jitcnt) ||
83 __put_user(txc->calcnt, &utp->calcnt) ||
84 __put_user(txc->errcnt, &utp->errcnt) ||
85 __put_user(txc->stbcnt, &utp->stbcnt) ||
86 __put_user(txc->tai, &utp->tai))
87 return -EFAULT; 38 return -EFAULT;
88 return 0;
89}
90 39
91COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, 40 txc->modes = tx32.modes;
92 struct timezone __user *, tz) 41 txc->offset = tx32.offset;
93{ 42 txc->freq = tx32.freq;
94 if (tv) { 43 txc->maxerror = tx32.maxerror;
95 struct timeval ktv; 44 txc->esterror = tx32.esterror;
96 do_gettimeofday(&ktv); 45 txc->status = tx32.status;
97 if (compat_put_timeval(&ktv, tv)) 46 txc->constant = tx32.constant;
98 return -EFAULT; 47 txc->precision = tx32.precision;
99 } 48 txc->tolerance = tx32.tolerance;
100 if (tz) { 49 txc->time.tv_sec = tx32.time.tv_sec;
101 if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) 50 txc->time.tv_usec = tx32.time.tv_usec;
102 return -EFAULT; 51 txc->tick = tx32.tick;
103 } 52 txc->ppsfreq = tx32.ppsfreq;
53 txc->jitter = tx32.jitter;
54 txc->shift = tx32.shift;
55 txc->stabil = tx32.stabil;
56 txc->jitcnt = tx32.jitcnt;
57 txc->calcnt = tx32.calcnt;
58 txc->errcnt = tx32.errcnt;
59 txc->stbcnt = tx32.stbcnt;
104 60
105 return 0; 61 return 0;
106} 62}
107 63
108COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, 64int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc)
109 struct timezone __user *, tz) 65{
110{ 66 struct compat_timex tx32;
111 struct timespec64 new_ts; 67
112 struct timeval user_tv; 68 memset(&tx32, 0, sizeof(struct compat_timex));
113 struct timezone new_tz; 69 tx32.modes = txc->modes;
114 70 tx32.offset = txc->offset;
115 if (tv) { 71 tx32.freq = txc->freq;
116 if (compat_get_timeval(&user_tv, tv)) 72 tx32.maxerror = txc->maxerror;
117 return -EFAULT; 73 tx32.esterror = txc->esterror;
118 new_ts.tv_sec = user_tv.tv_sec; 74 tx32.status = txc->status;
119 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; 75 tx32.constant = txc->constant;
120 } 76 tx32.precision = txc->precision;
121 if (tz) { 77 tx32.tolerance = txc->tolerance;
122 if (copy_from_user(&new_tz, tz, sizeof(*tz))) 78 tx32.time.tv_sec = txc->time.tv_sec;
123 return -EFAULT; 79 tx32.time.tv_usec = txc->time.tv_usec;
124 } 80 tx32.tick = txc->tick;
125 81 tx32.ppsfreq = txc->ppsfreq;
126 return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); 82 tx32.jitter = txc->jitter;
83 tx32.shift = txc->shift;
84 tx32.stabil = txc->stabil;
85 tx32.jitcnt = txc->jitcnt;
86 tx32.calcnt = txc->calcnt;
87 tx32.errcnt = txc->errcnt;
88 tx32.stbcnt = txc->stbcnt;
89 tx32.tai = txc->tai;
90 if (copy_to_user(utp, &tx32, sizeof(struct compat_timex)))
91 return -EFAULT;
92 return 0;
127} 93}
128 94
129static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) 95static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
@@ -154,6 +120,50 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp
154 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; 120 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
155} 121}
156 122
123static int __compat_get_timespec64(struct timespec64 *ts64,
124 const struct compat_timespec __user *cts)
125{
126 struct compat_timespec ts;
127 int ret;
128
129 ret = copy_from_user(&ts, cts, sizeof(ts));
130 if (ret)
131 return -EFAULT;
132
133 ts64->tv_sec = ts.tv_sec;
134 ts64->tv_nsec = ts.tv_nsec;
135
136 return 0;
137}
138
139static int __compat_put_timespec64(const struct timespec64 *ts64,
140 struct compat_timespec __user *cts)
141{
142 struct compat_timespec ts = {
143 .tv_sec = ts64->tv_sec,
144 .tv_nsec = ts64->tv_nsec
145 };
146 return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
147}
148
149int compat_get_timespec64(struct timespec64 *ts, const void __user *uts)
150{
151 if (COMPAT_USE_64BIT_TIME)
152 return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
153 else
154 return __compat_get_timespec64(ts, uts);
155}
156EXPORT_SYMBOL_GPL(compat_get_timespec64);
157
158int compat_put_timespec64(const struct timespec64 *ts, void __user *uts)
159{
160 if (COMPAT_USE_64BIT_TIME)
161 return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
162 else
163 return __compat_put_timespec64(ts, uts);
164}
165EXPORT_SYMBOL_GPL(compat_put_timespec64);
166
157int compat_get_timeval(struct timeval *tv, const void __user *utv) 167int compat_get_timeval(struct timeval *tv, const void __user *utv)
158{ 168{
159 if (COMPAT_USE_64BIT_TIME) 169 if (COMPAT_USE_64BIT_TIME)
@@ -213,190 +223,30 @@ int compat_convert_timespec(struct timespec __user **kts,
213 return 0; 223 return 0;
214} 224}
215 225
216static long compat_nanosleep_restart(struct restart_block *restart) 226int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i)
217{
218 struct compat_timespec __user *rmtp;
219 struct timespec rmt;
220 mm_segment_t oldfs;
221 long ret;
222
223 restart->nanosleep.rmtp = (struct timespec __user *) &rmt;
224 oldfs = get_fs();
225 set_fs(KERNEL_DS);
226 ret = hrtimer_nanosleep_restart(restart);
227 set_fs(oldfs);
228
229 if (ret == -ERESTART_RESTARTBLOCK) {
230 rmtp = restart->nanosleep.compat_rmtp;
231
232 if (rmtp && compat_put_timespec(&rmt, rmtp))
233 return -EFAULT;
234 }
235
236 return ret;
237}
238
239COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
240 struct compat_timespec __user *, rmtp)
241{
242 struct timespec tu, rmt;
243 struct timespec64 tu64;
244 mm_segment_t oldfs;
245 long ret;
246
247 if (compat_get_timespec(&tu, rqtp))
248 return -EFAULT;
249
250 tu64 = timespec_to_timespec64(tu);
251 if (!timespec64_valid(&tu64))
252 return -EINVAL;
253
254 oldfs = get_fs();
255 set_fs(KERNEL_DS);
256 ret = hrtimer_nanosleep(&tu64,
257 rmtp ? (struct timespec __user *)&rmt : NULL,
258 HRTIMER_MODE_REL, CLOCK_MONOTONIC);
259 set_fs(oldfs);
260
261 /*
262 * hrtimer_nanosleep() can only return 0 or
263 * -ERESTART_RESTARTBLOCK here because:
264 *
265 * - we call it with HRTIMER_MODE_REL and therefor exclude the
266 * -ERESTARTNOHAND return path.
267 *
268 * - we supply the rmtp argument from the task stack (due to
269 * the necessary compat conversion. So the update cannot
270 * fail, which excludes the -EFAULT return path as well. If
271 * it fails nevertheless we have a bigger problem and wont
272 * reach this place anymore.
273 *
274 * - if the return value is 0, we do not have to update rmtp
275 * because there is no remaining time.
276 *
277 * We check for -ERESTART_RESTARTBLOCK nevertheless if the
278 * core implementation decides to return random nonsense.
279 */
280 if (ret == -ERESTART_RESTARTBLOCK) {
281 struct restart_block *restart = &current->restart_block;
282
283 restart->fn = compat_nanosleep_restart;
284 restart->nanosleep.compat_rmtp = rmtp;
285
286 if (rmtp && compat_put_timespec(&rmt, rmtp))
287 return -EFAULT;
288 }
289 return ret;
290}
291
292static inline long get_compat_itimerval(struct itimerval *o,
293 struct compat_itimerval __user *i)
294{ 227{
295 return (!access_ok(VERIFY_READ, i, sizeof(*i)) || 228 struct compat_itimerval v32;
296 (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) |
297 __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) |
298 __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) |
299 __get_user(o->it_value.tv_usec, &i->it_value.tv_usec)));
300}
301
302static inline long put_compat_itimerval(struct compat_itimerval __user *o,
303 struct itimerval *i)
304{
305 return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) ||
306 (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) |
307 __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) |
308 __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) |
309 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
310}
311
312asmlinkage long sys_ni_posix_timers(void);
313 229
314COMPAT_SYSCALL_DEFINE2(getitimer, int, which, 230 if (copy_from_user(&v32, i, sizeof(struct compat_itimerval)))
315 struct compat_itimerval __user *, it)
316{
317 struct itimerval kit;
318 int error;
319
320 if (!IS_ENABLED(CONFIG_POSIX_TIMERS))
321 return sys_ni_posix_timers();
322
323 error = do_getitimer(which, &kit);
324 if (!error && put_compat_itimerval(it, &kit))
325 error = -EFAULT;
326 return error;
327}
328
329COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
330 struct compat_itimerval __user *, in,
331 struct compat_itimerval __user *, out)
332{
333 struct itimerval kin, kout;
334 int error;
335
336 if (!IS_ENABLED(CONFIG_POSIX_TIMERS))
337 return sys_ni_posix_timers();
338
339 if (in) {
340 if (get_compat_itimerval(&kin, in))
341 return -EFAULT;
342 } else
343 memset(&kin, 0, sizeof(kin));
344
345 error = do_setitimer(which, &kin, out ? &kout : NULL);
346 if (error || !out)
347 return error;
348 if (put_compat_itimerval(out, &kout))
349 return -EFAULT; 231 return -EFAULT;
232 o->it_interval.tv_sec = v32.it_interval.tv_sec;
233 o->it_interval.tv_usec = v32.it_interval.tv_usec;
234 o->it_value.tv_sec = v32.it_value.tv_sec;
235 o->it_value.tv_usec = v32.it_value.tv_usec;
350 return 0; 236 return 0;
351} 237}
352 238
353static compat_clock_t clock_t_to_compat_clock_t(clock_t x) 239int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerval *i)
354{
355 return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
356}
357
358COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
359{ 240{
360 if (tbuf) { 241 struct compat_itimerval v32;
361 struct tms tms;
362 struct compat_tms tmp;
363
364 do_sys_times(&tms);
365 /* Convert our struct tms to the compat version. */
366 tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
367 tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
368 tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
369 tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
370 if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
371 return -EFAULT;
372 }
373 force_successful_syscall_return();
374 return compat_jiffies_to_clock_t(jiffies);
375}
376
377#ifdef __ARCH_WANT_SYS_SIGPENDING
378 242
379/* 243 v32.it_interval.tv_sec = i->it_interval.tv_sec;
380 * Assumption: old_sigset_t and compat_old_sigset_t are both 244 v32.it_interval.tv_usec = i->it_interval.tv_usec;
381 * types that can be passed to put_user()/get_user(). 245 v32.it_value.tv_sec = i->it_value.tv_sec;
382 */ 246 v32.it_value.tv_usec = i->it_value.tv_usec;
383 247 return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0;
384COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
385{
386 old_sigset_t s;
387 long ret;
388 mm_segment_t old_fs = get_fs();
389
390 set_fs(KERNEL_DS);
391 ret = sys_sigpending((old_sigset_t __user *) &s);
392 set_fs(old_fs);
393 if (ret == 0)
394 ret = put_user(s, set);
395 return ret;
396} 248}
397 249
398#endif
399
400#ifdef __ARCH_WANT_SYS_SIGPROCMASK 250#ifdef __ARCH_WANT_SYS_SIGPROCMASK
401 251
402/* 252/*
@@ -451,164 +301,33 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
451 301
452#endif 302#endif
453 303
454COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
455 struct compat_rlimit __user *, rlim)
456{
457 struct rlimit r;
458
459 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
460 __get_user(r.rlim_cur, &rlim->rlim_cur) ||
461 __get_user(r.rlim_max, &rlim->rlim_max))
462 return -EFAULT;
463
464 if (r.rlim_cur == COMPAT_RLIM_INFINITY)
465 r.rlim_cur = RLIM_INFINITY;
466 if (r.rlim_max == COMPAT_RLIM_INFINITY)
467 r.rlim_max = RLIM_INFINITY;
468 return do_prlimit(current, resource, &r, NULL);
469}
470
471#ifdef COMPAT_RLIM_OLD_INFINITY
472
473COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
474 struct compat_rlimit __user *, rlim)
475{
476 struct rlimit r;
477 int ret;
478 mm_segment_t old_fs = get_fs();
479
480 set_fs(KERNEL_DS);
481 ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r);
482 set_fs(old_fs);
483
484 if (!ret) {
485 if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY)
486 r.rlim_cur = COMPAT_RLIM_INFINITY;
487 if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY)
488 r.rlim_max = COMPAT_RLIM_INFINITY;
489
490 if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
491 __put_user(r.rlim_cur, &rlim->rlim_cur) ||
492 __put_user(r.rlim_max, &rlim->rlim_max))
493 return -EFAULT;
494 }
495 return ret;
496}
497
498#endif
499
500COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
501 struct compat_rlimit __user *, rlim)
502{
503 struct rlimit r;
504 int ret;
505
506 ret = do_prlimit(current, resource, NULL, &r);
507 if (!ret) {
508 if (r.rlim_cur > COMPAT_RLIM_INFINITY)
509 r.rlim_cur = COMPAT_RLIM_INFINITY;
510 if (r.rlim_max > COMPAT_RLIM_INFINITY)
511 r.rlim_max = COMPAT_RLIM_INFINITY;
512
513 if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
514 __put_user(r.rlim_cur, &rlim->rlim_cur) ||
515 __put_user(r.rlim_max, &rlim->rlim_max))
516 return -EFAULT;
517 }
518 return ret;
519}
520
521int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) 304int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
522{ 305{
523 if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || 306 struct compat_rusage r32;
524 __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) || 307 memset(&r32, 0, sizeof(r32));
525 __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) || 308 r32.ru_utime.tv_sec = r->ru_utime.tv_sec;
526 __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) || 309 r32.ru_utime.tv_usec = r->ru_utime.tv_usec;
527 __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) || 310 r32.ru_stime.tv_sec = r->ru_stime.tv_sec;
528 __put_user(r->ru_maxrss, &ru->ru_maxrss) || 311 r32.ru_stime.tv_usec = r->ru_stime.tv_usec;
529 __put_user(r->ru_ixrss, &ru->ru_ixrss) || 312 r32.ru_maxrss = r->ru_maxrss;
530 __put_user(r->ru_idrss, &ru->ru_idrss) || 313 r32.ru_ixrss = r->ru_ixrss;
531 __put_user(r->ru_isrss, &ru->ru_isrss) || 314 r32.ru_idrss = r->ru_idrss;
532 __put_user(r->ru_minflt, &ru->ru_minflt) || 315 r32.ru_isrss = r->ru_isrss;
533 __put_user(r->ru_majflt, &ru->ru_majflt) || 316 r32.ru_minflt = r->ru_minflt;
534 __put_user(r->ru_nswap, &ru->ru_nswap) || 317 r32.ru_majflt = r->ru_majflt;
535 __put_user(r->ru_inblock, &ru->ru_inblock) || 318 r32.ru_nswap = r->ru_nswap;
536 __put_user(r->ru_oublock, &ru->ru_oublock) || 319 r32.ru_inblock = r->ru_inblock;
537 __put_user(r->ru_msgsnd, &ru->ru_msgsnd) || 320 r32.ru_oublock = r->ru_oublock;
538 __put_user(r->ru_msgrcv, &ru->ru_msgrcv) || 321 r32.ru_msgsnd = r->ru_msgsnd;
539 __put_user(r->ru_nsignals, &ru->ru_nsignals) || 322 r32.ru_msgrcv = r->ru_msgrcv;
540 __put_user(r->ru_nvcsw, &ru->ru_nvcsw) || 323 r32.ru_nsignals = r->ru_nsignals;
541 __put_user(r->ru_nivcsw, &ru->ru_nivcsw)) 324 r32.ru_nvcsw = r->ru_nvcsw;
325 r32.ru_nivcsw = r->ru_nivcsw;
326 if (copy_to_user(ru, &r32, sizeof(r32)))
542 return -EFAULT; 327 return -EFAULT;
543 return 0; 328 return 0;
544} 329}
545 330
546COMPAT_SYSCALL_DEFINE4(wait4,
547 compat_pid_t, pid,
548 compat_uint_t __user *, stat_addr,
549 int, options,
550 struct compat_rusage __user *, ru)
551{
552 if (!ru) {
553 return sys_wait4(pid, stat_addr, options, NULL);
554 } else {
555 struct rusage r;
556 int ret;
557 unsigned int status;
558 mm_segment_t old_fs = get_fs();
559
560 set_fs (KERNEL_DS);
561 ret = sys_wait4(pid,
562 (stat_addr ?
563 (unsigned int __user *) &status : NULL),
564 options, (struct rusage __user *) &r);
565 set_fs (old_fs);
566
567 if (ret > 0) {
568 if (put_compat_rusage(&r, ru))
569 return -EFAULT;
570 if (stat_addr && put_user(status, stat_addr))
571 return -EFAULT;
572 }
573 return ret;
574 }
575}
576
577COMPAT_SYSCALL_DEFINE5(waitid,
578 int, which, compat_pid_t, pid,
579 struct compat_siginfo __user *, uinfo, int, options,
580 struct compat_rusage __user *, uru)
581{
582 siginfo_t info;
583 struct rusage ru;
584 long ret;
585 mm_segment_t old_fs = get_fs();
586
587 memset(&info, 0, sizeof(info));
588
589 set_fs(KERNEL_DS);
590 ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
591 uru ? (struct rusage __user *)&ru : NULL);
592 set_fs(old_fs);
593
594 if ((ret < 0) || (info.si_signo == 0))
595 return ret;
596
597 if (uru) {
598 /* sys_waitid() overwrites everything in ru */
599 if (COMPAT_USE_64BIT_TIME)
600 ret = copy_to_user(uru, &ru, sizeof(ru));
601 else
602 ret = put_compat_rusage(&ru, uru);
603 if (ret)
604 return -EFAULT;
605 }
606
607 BUG_ON(info.si_code & __SI_MASK);
608 info.si_code |= __SI_CHLD;
609 return copy_siginfo_to_user32(uinfo, &info);
610}
611
612static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, 331static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
613 unsigned len, struct cpumask *new_mask) 332 unsigned len, struct cpumask *new_mask)
614{ 333{
@@ -689,192 +408,26 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst,
689 return 0; 408 return 0;
690} 409}
691 410
692COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, 411int get_compat_itimerspec64(struct itimerspec64 *its,
693 struct compat_sigevent __user *, timer_event_spec, 412 const struct compat_itimerspec __user *uits)
694 timer_t __user *, created_timer_id)
695{
696 struct sigevent __user *event = NULL;
697
698 if (timer_event_spec) {
699 struct sigevent kevent;
700
701 event = compat_alloc_user_space(sizeof(*event));
702 if (get_compat_sigevent(&kevent, timer_event_spec) ||
703 copy_to_user(event, &kevent, sizeof(*event)))
704 return -EFAULT;
705 }
706
707 return sys_timer_create(which_clock, event, created_timer_id);
708}
709
710COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
711 struct compat_itimerspec __user *, new,
712 struct compat_itimerspec __user *, old)
713{
714 long err;
715 mm_segment_t oldfs;
716 struct itimerspec newts, oldts;
717
718 if (!new)
719 return -EINVAL;
720 if (get_compat_itimerspec(&newts, new))
721 return -EFAULT;
722 oldfs = get_fs();
723 set_fs(KERNEL_DS);
724 err = sys_timer_settime(timer_id, flags,
725 (struct itimerspec __user *) &newts,
726 (struct itimerspec __user *) &oldts);
727 set_fs(oldfs);
728 if (!err && old && put_compat_itimerspec(old, &oldts))
729 return -EFAULT;
730 return err;
731}
732
733COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
734 struct compat_itimerspec __user *, setting)
735{
736 long err;
737 mm_segment_t oldfs;
738 struct itimerspec ts;
739
740 oldfs = get_fs();
741 set_fs(KERNEL_DS);
742 err = sys_timer_gettime(timer_id,
743 (struct itimerspec __user *) &ts);
744 set_fs(oldfs);
745 if (!err && put_compat_itimerspec(setting, &ts))
746 return -EFAULT;
747 return err;
748}
749
750COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
751 struct compat_timespec __user *, tp)
752{
753 long err;
754 mm_segment_t oldfs;
755 struct timespec ts;
756
757 if (compat_get_timespec(&ts, tp))
758 return -EFAULT;
759 oldfs = get_fs();
760 set_fs(KERNEL_DS);
761 err = sys_clock_settime(which_clock,
762 (struct timespec __user *) &ts);
763 set_fs(oldfs);
764 return err;
765}
766
767COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
768 struct compat_timespec __user *, tp)
769{
770 long err;
771 mm_segment_t oldfs;
772 struct timespec ts;
773
774 oldfs = get_fs();
775 set_fs(KERNEL_DS);
776 err = sys_clock_gettime(which_clock,
777 (struct timespec __user *) &ts);
778 set_fs(oldfs);
779 if (!err && compat_put_timespec(&ts, tp))
780 return -EFAULT;
781 return err;
782}
783
784COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
785 struct compat_timex __user *, utp)
786{ 413{
787 struct timex txc;
788 mm_segment_t oldfs;
789 int err, ret;
790 414
791 err = compat_get_timex(&txc, utp); 415 if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) ||
792 if (err) 416 __compat_get_timespec64(&its->it_value, &uits->it_value))
793 return err;
794
795 oldfs = get_fs();
796 set_fs(KERNEL_DS);
797 ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
798 set_fs(oldfs);
799
800 err = compat_put_timex(utp, &txc);
801 if (err)
802 return err;
803
804 return ret;
805}
806
807COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
808 struct compat_timespec __user *, tp)
809{
810 long err;
811 mm_segment_t oldfs;
812 struct timespec ts;
813
814 oldfs = get_fs();
815 set_fs(KERNEL_DS);
816 err = sys_clock_getres(which_clock,
817 (struct timespec __user *) &ts);
818 set_fs(oldfs);
819 if (!err && tp && compat_put_timespec(&ts, tp))
820 return -EFAULT; 417 return -EFAULT;
821 return err; 418 return 0;
822}
823
824static long compat_clock_nanosleep_restart(struct restart_block *restart)
825{
826 long err;
827 mm_segment_t oldfs;
828 struct timespec tu;
829 struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp;
830
831 restart->nanosleep.rmtp = (struct timespec __user *) &tu;
832 oldfs = get_fs();
833 set_fs(KERNEL_DS);
834 err = clock_nanosleep_restart(restart);
835 set_fs(oldfs);
836
837 if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
838 compat_put_timespec(&tu, rmtp))
839 return -EFAULT;
840
841 if (err == -ERESTART_RESTARTBLOCK) {
842 restart->fn = compat_clock_nanosleep_restart;
843 restart->nanosleep.compat_rmtp = rmtp;
844 }
845 return err;
846} 419}
420EXPORT_SYMBOL_GPL(get_compat_itimerspec64);
847 421
848COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, 422int put_compat_itimerspec64(const struct itimerspec64 *its,
849 struct compat_timespec __user *, rqtp, 423 struct compat_itimerspec __user *uits)
850 struct compat_timespec __user *, rmtp)
851{ 424{
852 long err; 425 if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) ||
853 mm_segment_t oldfs; 426 __compat_put_timespec64(&its->it_value, &uits->it_value))
854 struct timespec in, out;
855 struct restart_block *restart;
856
857 if (compat_get_timespec(&in, rqtp))
858 return -EFAULT;
859
860 oldfs = get_fs();
861 set_fs(KERNEL_DS);
862 err = sys_clock_nanosleep(which_clock, flags,
863 (struct timespec __user *) &in,
864 (struct timespec __user *) &out);
865 set_fs(oldfs);
866
867 if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
868 compat_put_timespec(&out, rmtp))
869 return -EFAULT; 427 return -EFAULT;
870 428 return 0;
871 if (err == -ERESTART_RESTARTBLOCK) {
872 restart = &current->restart_block;
873 restart->fn = compat_clock_nanosleep_restart;
874 restart->nanosleep.compat_rmtp = rmtp;
875 }
876 return err;
877} 429}
430EXPORT_SYMBOL_GPL(put_compat_itimerspec64);
878 431
879/* 432/*
880 * We currently only need the following fields from the sigevent 433 * We currently only need the following fields from the sigevent
@@ -900,84 +453,59 @@ int get_compat_sigevent(struct sigevent *event,
900long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, 453long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
901 unsigned long bitmap_size) 454 unsigned long bitmap_size)
902{ 455{
903 int i, j;
904 unsigned long m;
905 compat_ulong_t um;
906 unsigned long nr_compat_longs; 456 unsigned long nr_compat_longs;
907 457
908 /* align bitmap up to nearest compat_long_t boundary */ 458 /* align bitmap up to nearest compat_long_t boundary */
909 bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); 459 bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
460 nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
910 461
911 if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) 462 if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
912 return -EFAULT; 463 return -EFAULT;
913 464
914 nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); 465 user_access_begin();
915 466 while (nr_compat_longs > 1) {
916 for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { 467 compat_ulong_t l1, l2;
917 m = 0; 468 unsafe_get_user(l1, umask++, Efault);
918 469 unsafe_get_user(l2, umask++, Efault);
919 for (j = 0; j < sizeof(m)/sizeof(um); j++) { 470 *mask++ = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1;
920 /* 471 nr_compat_longs -= 2;
921 * We dont want to read past the end of the userspace
922 * bitmap. We must however ensure the end of the
923 * kernel bitmap is zeroed.
924 */
925 if (nr_compat_longs) {
926 nr_compat_longs--;
927 if (__get_user(um, umask))
928 return -EFAULT;
929 } else {
930 um = 0;
931 }
932
933 umask++;
934 m |= (long)um << (j * BITS_PER_COMPAT_LONG);
935 }
936 *mask++ = m;
937 } 472 }
938 473 if (nr_compat_longs)
474 unsafe_get_user(*mask, umask++, Efault);
475 user_access_end();
939 return 0; 476 return 0;
477
478Efault:
479 user_access_end();
480 return -EFAULT;
940} 481}
941 482
942long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, 483long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
943 unsigned long bitmap_size) 484 unsigned long bitmap_size)
944{ 485{
945 int i, j;
946 unsigned long m;
947 compat_ulong_t um;
948 unsigned long nr_compat_longs; 486 unsigned long nr_compat_longs;
949 487
950 /* align bitmap up to nearest compat_long_t boundary */ 488 /* align bitmap up to nearest compat_long_t boundary */
951 bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); 489 bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
490 nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
952 491
953 if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) 492 if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
954 return -EFAULT; 493 return -EFAULT;
955 494
956 nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); 495 user_access_begin();
957 496 while (nr_compat_longs > 1) {
958 for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { 497 unsigned long m = *mask++;
959 m = *mask++; 498 unsafe_put_user((compat_ulong_t)m, umask++, Efault);
960 499 unsafe_put_user(m >> BITS_PER_COMPAT_LONG, umask++, Efault);
961 for (j = 0; j < sizeof(m)/sizeof(um); j++) { 500 nr_compat_longs -= 2;
962 um = m;
963
964 /*
965 * We dont want to write past the end of the userspace
966 * bitmap.
967 */
968 if (nr_compat_longs) {
969 nr_compat_longs--;
970 if (__put_user(um, umask))
971 return -EFAULT;
972 }
973
974 umask++;
975 m >>= 4*sizeof(um);
976 m >>= 4*sizeof(um);
977 }
978 } 501 }
979 502 if (nr_compat_longs)
503 unsafe_put_user((compat_ulong_t)*mask, umask++, Efault);
504 user_access_end();
980 return 0; 505 return 0;
506Efault:
507 user_access_end();
508 return -EFAULT;
981} 509}
982 510
983void 511void
@@ -1003,96 +531,6 @@ sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
1003 } 531 }
1004} 532}
1005 533
1006COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
1007 struct compat_siginfo __user *, uinfo,
1008 struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
1009{
1010 compat_sigset_t s32;
1011 sigset_t s;
1012 struct timespec t;
1013 siginfo_t info;
1014 long ret;
1015
1016 if (sigsetsize != sizeof(sigset_t))
1017 return -EINVAL;
1018
1019 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
1020 return -EFAULT;
1021 sigset_from_compat(&s, &s32);
1022
1023 if (uts) {
1024 if (compat_get_timespec(&t, uts))
1025 return -EFAULT;
1026 }
1027
1028 ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
1029
1030 if (ret > 0 && uinfo) {
1031 if (copy_siginfo_to_user32(uinfo, &info))
1032 ret = -EFAULT;
1033 }
1034
1035 return ret;
1036}
1037
1038#ifdef __ARCH_WANT_COMPAT_SYS_TIME
1039
1040/* compat_time_t is a 32 bit "long" and needs to get converted. */
1041
1042COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
1043{
1044 compat_time_t i;
1045 struct timeval tv;
1046
1047 do_gettimeofday(&tv);
1048 i = tv.tv_sec;
1049
1050 if (tloc) {
1051 if (put_user(i,tloc))
1052 return -EFAULT;
1053 }
1054 force_successful_syscall_return();
1055 return i;
1056}
1057
1058COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
1059{
1060 struct timespec tv;
1061 int err;
1062
1063 if (get_user(tv.tv_sec, tptr))
1064 return -EFAULT;
1065
1066 tv.tv_nsec = 0;
1067
1068 err = security_settime(&tv, NULL);
1069 if (err)
1070 return err;
1071
1072 do_settimeofday(&tv);
1073 return 0;
1074}
1075
1076#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
1077
1078COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
1079{
1080 struct timex txc;
1081 int err, ret;
1082
1083 err = compat_get_timex(&txc, utp);
1084 if (err)
1085 return err;
1086
1087 ret = do_adjtimex(&txc);
1088
1089 err = compat_put_timex(utp, &txc);
1090 if (err)
1091 return err;
1092
1093 return ret;
1094}
1095
1096#ifdef CONFIG_NUMA 534#ifdef CONFIG_NUMA
1097COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, 535COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
1098 compat_uptr_t __user *, pages32, 536 compat_uptr_t __user *, pages32,
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index 26a06e09a5bd..d70829033bb7 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -1,10 +1,13 @@
1# KEEP ALPHABETICALLY SORTED 1# KEEP ALPHABETICALLY SORTED
2# CONFIG_DEVKMEM is not set 2# CONFIG_DEVKMEM is not set
3# CONFIG_DEVMEM is not set 3# CONFIG_DEVMEM is not set
4# CONFIG_FHANDLE is not set
4# CONFIG_INET_LRO is not set 5# CONFIG_INET_LRO is not set
5# CONFIG_MODULES is not set 6# CONFIG_NFSD is not set
7# CONFIG_NFS_FS is not set
6# CONFIG_OABI_COMPAT is not set 8# CONFIG_OABI_COMPAT is not set
7# CONFIG_SYSVIPC is not set 9# CONFIG_SYSVIPC is not set
10# CONFIG_USELIB is not set
8CONFIG_ANDROID=y 11CONFIG_ANDROID=y
9CONFIG_ANDROID_BINDER_IPC=y 12CONFIG_ANDROID_BINDER_IPC=y
10CONFIG_ANDROID_LOW_MEMORY_KILLER=y 13CONFIG_ANDROID_LOW_MEMORY_KILLER=y
@@ -13,6 +16,7 @@ CONFIG_ASHMEM=y
13CONFIG_AUDIT=y 16CONFIG_AUDIT=y
14CONFIG_BLK_DEV_INITRD=y 17CONFIG_BLK_DEV_INITRD=y
15CONFIG_CGROUPS=y 18CONFIG_CGROUPS=y
19CONFIG_CGROUP_BPF=y
16CONFIG_CGROUP_CPUACCT=y 20CONFIG_CGROUP_CPUACCT=y
17CONFIG_CGROUP_DEBUG=y 21CONFIG_CGROUP_DEBUG=y
18CONFIG_CGROUP_FREEZER=y 22CONFIG_CGROUP_FREEZER=y
@@ -23,6 +27,8 @@ CONFIG_EMBEDDED=y
23CONFIG_FB=y 27CONFIG_FB=y
24CONFIG_HARDENED_USERCOPY=y 28CONFIG_HARDENED_USERCOPY=y
25CONFIG_HIGH_RES_TIMERS=y 29CONFIG_HIGH_RES_TIMERS=y
30CONFIG_IKCONFIG=y
31CONFIG_IKCONFIG_PROC=y
26CONFIG_INET6_AH=y 32CONFIG_INET6_AH=y
27CONFIG_INET6_ESP=y 33CONFIG_INET6_ESP=y
28CONFIG_INET6_IPCOMP=y 34CONFIG_INET6_IPCOMP=y
@@ -60,6 +66,9 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y
60CONFIG_IP_NF_TARGET_NETMAP=y 66CONFIG_IP_NF_TARGET_NETMAP=y
61CONFIG_IP_NF_TARGET_REDIRECT=y 67CONFIG_IP_NF_TARGET_REDIRECT=y
62CONFIG_IP_NF_TARGET_REJECT=y 68CONFIG_IP_NF_TARGET_REJECT=y
69CONFIG_MODULES=y
70CONFIG_MODULE_UNLOAD=y
71CONFIG_MODVERSIONS=y
63CONFIG_NET=y 72CONFIG_NET=y
64CONFIG_NETDEVICES=y 73CONFIG_NETDEVICES=y
65CONFIG_NETFILTER=y 74CONFIG_NETFILTER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index 28ee064b6744..946fb92418f7 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -6,13 +6,15 @@
6# CONFIG_NF_CONNTRACK_SIP is not set 6# CONFIG_NF_CONNTRACK_SIP is not set
7# CONFIG_PM_WAKELOCKS_GC is not set 7# CONFIG_PM_WAKELOCKS_GC is not set
8# CONFIG_VT is not set 8# CONFIG_VT is not set
9CONFIG_ARM64_SW_TTBR0_PAN=y
9CONFIG_BACKLIGHT_LCD_SUPPORT=y 10CONFIG_BACKLIGHT_LCD_SUPPORT=y
10CONFIG_BLK_DEV_DM=y 11CONFIG_BLK_DEV_DM=y
11CONFIG_BLK_DEV_LOOP=y 12CONFIG_BLK_DEV_LOOP=y
12CONFIG_BLK_DEV_RAM=y 13CONFIG_BLK_DEV_RAM=y
13CONFIG_BLK_DEV_RAM_SIZE=8192 14CONFIG_BLK_DEV_RAM_SIZE=8192
15CONFIG_CC_STACKPROTECTOR_STRONG=y
14CONFIG_COMPACTION=y 16CONFIG_COMPACTION=y
15CONFIG_STRICT_KERNEL_RWX=y 17CONFIG_CPU_SW_DOMAIN_PAN=y
16CONFIG_DM_CRYPT=y 18CONFIG_DM_CRYPT=y
17CONFIG_DM_UEVENT=y 19CONFIG_DM_UEVENT=y
18CONFIG_DM_VERITY=y 20CONFIG_DM_VERITY=y
@@ -105,6 +107,7 @@ CONFIG_SCHEDSTATS=y
105CONFIG_SMARTJOYPLUS_FF=y 107CONFIG_SMARTJOYPLUS_FF=y
106CONFIG_SND=y 108CONFIG_SND=y
107CONFIG_SOUND=y 109CONFIG_SOUND=y
110CONFIG_STRICT_KERNEL_RWX=y
108CONFIG_SUSPEND_TIME=y 111CONFIG_SUSPEND_TIME=y
109CONFIG_TABLET_USB_ACECAD=y 112CONFIG_TABLET_USB_ACECAD=y
110CONFIG_TABLET_USB_AIPTEK=y 113CONFIG_TABLET_USB_AIPTEK=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9ae6fbe5b5cf..eee033134262 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -27,6 +27,7 @@
27#include <linux/smpboot.h> 27#include <linux/smpboot.h>
28#include <linux/relay.h> 28#include <linux/relay.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/percpu-rwsem.h>
30 31
31#include <trace/events/power.h> 32#include <trace/events/power.h>
32#define CREATE_TRACE_POINTS 33#define CREATE_TRACE_POINTS
@@ -65,6 +66,12 @@ struct cpuhp_cpu_state {
65 66
66static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); 67static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
67 68
69#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
70static struct lock_class_key cpuhp_state_key;
71static struct lockdep_map cpuhp_state_lock_map =
72 STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key);
73#endif
74
68/** 75/**
69 * cpuhp_step - Hotplug state machine step 76 * cpuhp_step - Hotplug state machine step
70 * @name: Name of the step 77 * @name: Name of the step
@@ -196,121 +203,41 @@ void cpu_maps_update_done(void)
196 mutex_unlock(&cpu_add_remove_lock); 203 mutex_unlock(&cpu_add_remove_lock);
197} 204}
198 205
199/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. 206/*
207 * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
200 * Should always be manipulated under cpu_add_remove_lock 208 * Should always be manipulated under cpu_add_remove_lock
201 */ 209 */
202static int cpu_hotplug_disabled; 210static int cpu_hotplug_disabled;
203 211
204#ifdef CONFIG_HOTPLUG_CPU 212#ifdef CONFIG_HOTPLUG_CPU
205 213
206static struct { 214DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
207 struct task_struct *active_writer;
208 /* wait queue to wake up the active_writer */
209 wait_queue_head_t wq;
210 /* verifies that no writer will get active while readers are active */
211 struct mutex lock;
212 /*
213 * Also blocks the new readers during
214 * an ongoing cpu hotplug operation.
215 */
216 atomic_t refcount;
217
218#ifdef CONFIG_DEBUG_LOCK_ALLOC
219 struct lockdep_map dep_map;
220#endif
221} cpu_hotplug = {
222 .active_writer = NULL,
223 .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
224 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
225#ifdef CONFIG_DEBUG_LOCK_ALLOC
226 .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map),
227#endif
228};
229 215
230/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ 216void cpus_read_lock(void)
231#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
232#define cpuhp_lock_acquire_tryread() \
233 lock_map_acquire_tryread(&cpu_hotplug.dep_map)
234#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
235#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
236
237
238void get_online_cpus(void)
239{ 217{
240 might_sleep(); 218 percpu_down_read(&cpu_hotplug_lock);
241 if (cpu_hotplug.active_writer == current)
242 return;
243 cpuhp_lock_acquire_read();
244 mutex_lock(&cpu_hotplug.lock);
245 atomic_inc(&cpu_hotplug.refcount);
246 mutex_unlock(&cpu_hotplug.lock);
247} 219}
248EXPORT_SYMBOL_GPL(get_online_cpus); 220EXPORT_SYMBOL_GPL(cpus_read_lock);
249 221
250void put_online_cpus(void) 222void cpus_read_unlock(void)
251{ 223{
252 int refcount; 224 percpu_up_read(&cpu_hotplug_lock);
253
254 if (cpu_hotplug.active_writer == current)
255 return;
256
257 refcount = atomic_dec_return(&cpu_hotplug.refcount);
258 if (WARN_ON(refcount < 0)) /* try to fix things up */
259 atomic_inc(&cpu_hotplug.refcount);
260
261 if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
262 wake_up(&cpu_hotplug.wq);
263
264 cpuhp_lock_release();
265
266} 225}
267EXPORT_SYMBOL_GPL(put_online_cpus); 226EXPORT_SYMBOL_GPL(cpus_read_unlock);
268 227
269/* 228void cpus_write_lock(void)
270 * This ensures that the hotplug operation can begin only when the
271 * refcount goes to zero.
272 *
273 * Note that during a cpu-hotplug operation, the new readers, if any,
274 * will be blocked by the cpu_hotplug.lock
275 *
276 * Since cpu_hotplug_begin() is always called after invoking
277 * cpu_maps_update_begin(), we can be sure that only one writer is active.
278 *
279 * Note that theoretically, there is a possibility of a livelock:
280 * - Refcount goes to zero, last reader wakes up the sleeping
281 * writer.
282 * - Last reader unlocks the cpu_hotplug.lock.
283 * - A new reader arrives at this moment, bumps up the refcount.
284 * - The writer acquires the cpu_hotplug.lock finds the refcount
285 * non zero and goes to sleep again.
286 *
287 * However, this is very difficult to achieve in practice since
288 * get_online_cpus() not an api which is called all that often.
289 *
290 */
291void cpu_hotplug_begin(void)
292{ 229{
293 DEFINE_WAIT(wait); 230 percpu_down_write(&cpu_hotplug_lock);
294 231}
295 cpu_hotplug.active_writer = current;
296 cpuhp_lock_acquire();
297 232
298 for (;;) { 233void cpus_write_unlock(void)
299 mutex_lock(&cpu_hotplug.lock); 234{
300 prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE); 235 percpu_up_write(&cpu_hotplug_lock);
301 if (likely(!atomic_read(&cpu_hotplug.refcount)))
302 break;
303 mutex_unlock(&cpu_hotplug.lock);
304 schedule();
305 }
306 finish_wait(&cpu_hotplug.wq, &wait);
307} 236}
308 237
309void cpu_hotplug_done(void) 238void lockdep_assert_cpus_held(void)
310{ 239{
311 cpu_hotplug.active_writer = NULL; 240 percpu_rwsem_assert_held(&cpu_hotplug_lock);
312 mutex_unlock(&cpu_hotplug.lock);
313 cpuhp_lock_release();
314} 241}
315 242
316/* 243/*
@@ -344,13 +271,26 @@ void cpu_hotplug_enable(void)
344EXPORT_SYMBOL_GPL(cpu_hotplug_enable); 271EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
345#endif /* CONFIG_HOTPLUG_CPU */ 272#endif /* CONFIG_HOTPLUG_CPU */
346 273
347/* Notifier wrappers for transitioning to state machine */ 274static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st);
348 275
349static int bringup_wait_for_ap(unsigned int cpu) 276static int bringup_wait_for_ap(unsigned int cpu)
350{ 277{
351 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 278 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
352 279
280 /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
353 wait_for_completion(&st->done); 281 wait_for_completion(&st->done);
282 if (WARN_ON_ONCE((!cpu_online(cpu))))
283 return -ECANCELED;
284
285 /* Unpark the stopper thread and the hotplug thread of the target cpu */
286 stop_machine_unpark(cpu);
287 kthread_unpark(st->thread);
288
289 /* Should we go further up ? */
290 if (st->target > CPUHP_AP_ONLINE_IDLE) {
291 __cpuhp_kick_ap_work(st);
292 wait_for_completion(&st->done);
293 }
354 return st->result; 294 return st->result;
355} 295}
356 296
@@ -371,9 +311,7 @@ static int bringup_cpu(unsigned int cpu)
371 irq_unlock_sparse(); 311 irq_unlock_sparse();
372 if (ret) 312 if (ret)
373 return ret; 313 return ret;
374 ret = bringup_wait_for_ap(cpu); 314 return bringup_wait_for_ap(cpu);
375 BUG_ON(!cpu_online(cpu));
376 return ret;
377} 315}
378 316
379/* 317/*
@@ -484,6 +422,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
484 422
485 st->should_run = false; 423 st->should_run = false;
486 424
425 lock_map_acquire(&cpuhp_state_lock_map);
487 /* Single callback invocation for [un]install ? */ 426 /* Single callback invocation for [un]install ? */
488 if (st->single) { 427 if (st->single) {
489 if (st->cb_state < CPUHP_AP_ONLINE) { 428 if (st->cb_state < CPUHP_AP_ONLINE) {
@@ -510,6 +449,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
510 else if (st->state > st->target) 449 else if (st->state > st->target)
511 ret = cpuhp_ap_offline(cpu, st); 450 ret = cpuhp_ap_offline(cpu, st);
512 } 451 }
452 lock_map_release(&cpuhp_state_lock_map);
513 st->result = ret; 453 st->result = ret;
514 complete(&st->done); 454 complete(&st->done);
515} 455}
@@ -524,6 +464,9 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
524 if (!cpu_online(cpu)) 464 if (!cpu_online(cpu))
525 return 0; 465 return 0;
526 466
467 lock_map_acquire(&cpuhp_state_lock_map);
468 lock_map_release(&cpuhp_state_lock_map);
469
527 /* 470 /*
528 * If we are up and running, use the hotplug thread. For early calls 471 * If we are up and running, use the hotplug thread. For early calls
529 * we invoke the thread function directly. 472 * we invoke the thread function directly.
@@ -567,6 +510,8 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
567 enum cpuhp_state state = st->state; 510 enum cpuhp_state state = st->state;
568 511
569 trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); 512 trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
513 lock_map_acquire(&cpuhp_state_lock_map);
514 lock_map_release(&cpuhp_state_lock_map);
570 __cpuhp_kick_ap_work(st); 515 __cpuhp_kick_ap_work(st);
571 wait_for_completion(&st->done); 516 wait_for_completion(&st->done);
572 trace_cpuhp_exit(cpu, st->state, state, st->result); 517 trace_cpuhp_exit(cpu, st->state, state, st->result);
@@ -630,30 +575,6 @@ void clear_tasks_mm_cpumask(int cpu)
630 rcu_read_unlock(); 575 rcu_read_unlock();
631} 576}
632 577
633static inline void check_for_tasks(int dead_cpu)
634{
635 struct task_struct *g, *p;
636
637 read_lock(&tasklist_lock);
638 for_each_process_thread(g, p) {
639 if (!p->on_rq)
640 continue;
641 /*
642 * We do the check with unlocked task_rq(p)->lock.
643 * Order the reading to do not warn about a task,
644 * which was running on this cpu in the past, and
645 * it's just been woken on another cpu.
646 */
647 rmb();
648 if (task_cpu(p) != dead_cpu)
649 continue;
650
651 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
652 p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
653 }
654 read_unlock(&tasklist_lock);
655}
656
657/* Take this CPU down. */ 578/* Take this CPU down. */
658static int take_cpu_down(void *_param) 579static int take_cpu_down(void *_param)
659{ 580{
@@ -701,7 +622,7 @@ static int takedown_cpu(unsigned int cpu)
701 /* 622 /*
702 * So now all preempt/rcu users must observe !cpu_active(). 623 * So now all preempt/rcu users must observe !cpu_active().
703 */ 624 */
704 err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); 625 err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
705 if (err) { 626 if (err) {
706 /* CPU refused to die */ 627 /* CPU refused to die */
707 irq_unlock_sparse(); 628 irq_unlock_sparse();
@@ -773,7 +694,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
773 if (!cpu_present(cpu)) 694 if (!cpu_present(cpu))
774 return -EINVAL; 695 return -EINVAL;
775 696
776 cpu_hotplug_begin(); 697 cpus_write_lock();
777 698
778 cpuhp_tasks_frozen = tasks_frozen; 699 cpuhp_tasks_frozen = tasks_frozen;
779 700
@@ -811,7 +732,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
811 } 732 }
812 733
813out: 734out:
814 cpu_hotplug_done(); 735 cpus_write_unlock();
815 return ret; 736 return ret;
816} 737}
817 738
@@ -859,31 +780,20 @@ void notify_cpu_starting(unsigned int cpu)
859} 780}
860 781
861/* 782/*
862 * Called from the idle task. We need to set active here, so we can kick off 783 * Called from the idle task. Wake up the controlling task which brings the
863 * the stopper thread and unpark the smpboot threads. If the target state is 784 * stopper and the hotplug thread of the upcoming CPU up and then delegates
864 * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the 785 * the rest of the online bringup to the hotplug thread.
865 * cpu further.
866 */ 786 */
867void cpuhp_online_idle(enum cpuhp_state state) 787void cpuhp_online_idle(enum cpuhp_state state)
868{ 788{
869 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); 789 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
870 unsigned int cpu = smp_processor_id();
871 790
872 /* Happens for the boot cpu */ 791 /* Happens for the boot cpu */
873 if (state != CPUHP_AP_ONLINE_IDLE) 792 if (state != CPUHP_AP_ONLINE_IDLE)
874 return; 793 return;
875 794
876 st->state = CPUHP_AP_ONLINE_IDLE; 795 st->state = CPUHP_AP_ONLINE_IDLE;
877 796 complete(&st->done);
878 /* Unpark the stopper thread and the hotplug thread of this cpu */
879 stop_machine_unpark(cpu);
880 kthread_unpark(st->thread);
881
882 /* Should we go further up ? */
883 if (st->target > CPUHP_AP_ONLINE_IDLE)
884 __cpuhp_kick_ap_work(st);
885 else
886 complete(&st->done);
887} 797}
888 798
889/* Requires cpu_add_remove_lock to be held */ 799/* Requires cpu_add_remove_lock to be held */
@@ -893,7 +803,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
893 struct task_struct *idle; 803 struct task_struct *idle;
894 int ret = 0; 804 int ret = 0;
895 805
896 cpu_hotplug_begin(); 806 cpus_write_lock();
897 807
898 if (!cpu_present(cpu)) { 808 if (!cpu_present(cpu)) {
899 ret = -EINVAL; 809 ret = -EINVAL;
@@ -941,7 +851,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
941 target = min((int)target, CPUHP_BRINGUP_CPU); 851 target = min((int)target, CPUHP_BRINGUP_CPU);
942 ret = cpuhp_up_callbacks(cpu, st, target); 852 ret = cpuhp_up_callbacks(cpu, st, target);
943out: 853out:
944 cpu_hotplug_done(); 854 cpus_write_unlock();
945 return ret; 855 return ret;
946} 856}
947 857
@@ -1252,6 +1162,11 @@ static struct cpuhp_step cpuhp_ap_states[] = {
1252 .startup.single = smpboot_unpark_threads, 1162 .startup.single = smpboot_unpark_threads,
1253 .teardown.single = NULL, 1163 .teardown.single = NULL,
1254 }, 1164 },
1165 [CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
1166 .name = "irq/affinity:online",
1167 .startup.single = irq_affinity_online_cpu,
1168 .teardown.single = NULL,
1169 },
1255 [CPUHP_AP_PERF_ONLINE] = { 1170 [CPUHP_AP_PERF_ONLINE] = {
1256 .name = "perf:online", 1171 .name = "perf:online",
1257 .startup.single = perf_event_init_cpu, 1172 .startup.single = perf_event_init_cpu,
@@ -1413,18 +1328,20 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
1413 } 1328 }
1414} 1329}
1415 1330
1416int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, 1331int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
1417 bool invoke) 1332 struct hlist_node *node,
1333 bool invoke)
1418{ 1334{
1419 struct cpuhp_step *sp; 1335 struct cpuhp_step *sp;
1420 int cpu; 1336 int cpu;
1421 int ret; 1337 int ret;
1422 1338
1339 lockdep_assert_cpus_held();
1340
1423 sp = cpuhp_get_step(state); 1341 sp = cpuhp_get_step(state);
1424 if (sp->multi_instance == false) 1342 if (sp->multi_instance == false)
1425 return -EINVAL; 1343 return -EINVAL;
1426 1344
1427 get_online_cpus();
1428 mutex_lock(&cpuhp_state_mutex); 1345 mutex_lock(&cpuhp_state_mutex);
1429 1346
1430 if (!invoke || !sp->startup.multi) 1347 if (!invoke || !sp->startup.multi)
@@ -1453,13 +1370,23 @@ add_node:
1453 hlist_add_head(node, &sp->list); 1370 hlist_add_head(node, &sp->list);
1454unlock: 1371unlock:
1455 mutex_unlock(&cpuhp_state_mutex); 1372 mutex_unlock(&cpuhp_state_mutex);
1456 put_online_cpus(); 1373 return ret;
1374}
1375
1376int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
1377 bool invoke)
1378{
1379 int ret;
1380
1381 cpus_read_lock();
1382 ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke);
1383 cpus_read_unlock();
1457 return ret; 1384 return ret;
1458} 1385}
1459EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); 1386EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
1460 1387
1461/** 1388/**
1462 * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state 1389 * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
1463 * @state: The state to setup 1390 * @state: The state to setup
1464 * @invoke: If true, the startup function is invoked for cpus where 1391 * @invoke: If true, the startup function is invoked for cpus where
1465 * cpu state >= @state 1392 * cpu state >= @state
@@ -1468,25 +1395,27 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
1468 * @multi_instance: State is set up for multiple instances which get 1395 * @multi_instance: State is set up for multiple instances which get
1469 * added afterwards. 1396 * added afterwards.
1470 * 1397 *
1398 * The caller needs to hold cpus read locked while calling this function.
1471 * Returns: 1399 * Returns:
1472 * On success: 1400 * On success:
1473 * Positive state number if @state is CPUHP_AP_ONLINE_DYN 1401 * Positive state number if @state is CPUHP_AP_ONLINE_DYN
1474 * 0 for all other states 1402 * 0 for all other states
1475 * On failure: proper (negative) error code 1403 * On failure: proper (negative) error code
1476 */ 1404 */
1477int __cpuhp_setup_state(enum cpuhp_state state, 1405int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
1478 const char *name, bool invoke, 1406 const char *name, bool invoke,
1479 int (*startup)(unsigned int cpu), 1407 int (*startup)(unsigned int cpu),
1480 int (*teardown)(unsigned int cpu), 1408 int (*teardown)(unsigned int cpu),
1481 bool multi_instance) 1409 bool multi_instance)
1482{ 1410{
1483 int cpu, ret = 0; 1411 int cpu, ret = 0;
1484 bool dynstate; 1412 bool dynstate;
1485 1413
1414 lockdep_assert_cpus_held();
1415
1486 if (cpuhp_cb_check(state) || !name) 1416 if (cpuhp_cb_check(state) || !name)
1487 return -EINVAL; 1417 return -EINVAL;
1488 1418
1489 get_online_cpus();
1490 mutex_lock(&cpuhp_state_mutex); 1419 mutex_lock(&cpuhp_state_mutex);
1491 1420
1492 ret = cpuhp_store_callbacks(state, name, startup, teardown, 1421 ret = cpuhp_store_callbacks(state, name, startup, teardown,
@@ -1522,7 +1451,6 @@ int __cpuhp_setup_state(enum cpuhp_state state,
1522 } 1451 }
1523out: 1452out:
1524 mutex_unlock(&cpuhp_state_mutex); 1453 mutex_unlock(&cpuhp_state_mutex);
1525 put_online_cpus();
1526 /* 1454 /*
1527 * If the requested state is CPUHP_AP_ONLINE_DYN, return the 1455 * If the requested state is CPUHP_AP_ONLINE_DYN, return the
1528 * dynamically allocated state in case of success. 1456 * dynamically allocated state in case of success.
@@ -1531,6 +1459,22 @@ out:
1531 return state; 1459 return state;
1532 return ret; 1460 return ret;
1533} 1461}
1462EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked);
1463
1464int __cpuhp_setup_state(enum cpuhp_state state,
1465 const char *name, bool invoke,
1466 int (*startup)(unsigned int cpu),
1467 int (*teardown)(unsigned int cpu),
1468 bool multi_instance)
1469{
1470 int ret;
1471
1472 cpus_read_lock();
1473 ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
1474 teardown, multi_instance);
1475 cpus_read_unlock();
1476 return ret;
1477}
1534EXPORT_SYMBOL(__cpuhp_setup_state); 1478EXPORT_SYMBOL(__cpuhp_setup_state);
1535 1479
1536int __cpuhp_state_remove_instance(enum cpuhp_state state, 1480int __cpuhp_state_remove_instance(enum cpuhp_state state,
@@ -1544,7 +1488,7 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,
1544 if (!sp->multi_instance) 1488 if (!sp->multi_instance)
1545 return -EINVAL; 1489 return -EINVAL;
1546 1490
1547 get_online_cpus(); 1491 cpus_read_lock();
1548 mutex_lock(&cpuhp_state_mutex); 1492 mutex_lock(&cpuhp_state_mutex);
1549 1493
1550 if (!invoke || !cpuhp_get_teardown_cb(state)) 1494 if (!invoke || !cpuhp_get_teardown_cb(state))
@@ -1565,29 +1509,30 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,
1565remove: 1509remove:
1566 hlist_del(node); 1510 hlist_del(node);
1567 mutex_unlock(&cpuhp_state_mutex); 1511 mutex_unlock(&cpuhp_state_mutex);
1568 put_online_cpus(); 1512 cpus_read_unlock();
1569 1513
1570 return 0; 1514 return 0;
1571} 1515}
1572EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); 1516EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
1573 1517
1574/** 1518/**
1575 * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state 1519 * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state
1576 * @state: The state to remove 1520 * @state: The state to remove
1577 * @invoke: If true, the teardown function is invoked for cpus where 1521 * @invoke: If true, the teardown function is invoked for cpus where
1578 * cpu state >= @state 1522 * cpu state >= @state
1579 * 1523 *
1524 * The caller needs to hold cpus read locked while calling this function.
1580 * The teardown callback is currently not allowed to fail. Think 1525 * The teardown callback is currently not allowed to fail. Think
1581 * about module removal! 1526 * about module removal!
1582 */ 1527 */
1583void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) 1528void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke)
1584{ 1529{
1585 struct cpuhp_step *sp = cpuhp_get_step(state); 1530 struct cpuhp_step *sp = cpuhp_get_step(state);
1586 int cpu; 1531 int cpu;
1587 1532
1588 BUG_ON(cpuhp_cb_check(state)); 1533 BUG_ON(cpuhp_cb_check(state));
1589 1534
1590 get_online_cpus(); 1535 lockdep_assert_cpus_held();
1591 1536
1592 mutex_lock(&cpuhp_state_mutex); 1537 mutex_lock(&cpuhp_state_mutex);
1593 if (sp->multi_instance) { 1538 if (sp->multi_instance) {
@@ -1615,7 +1560,14 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
1615remove: 1560remove:
1616 cpuhp_store_callbacks(state, NULL, NULL, NULL, false); 1561 cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
1617 mutex_unlock(&cpuhp_state_mutex); 1562 mutex_unlock(&cpuhp_state_mutex);
1618 put_online_cpus(); 1563}
1564EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked);
1565
1566void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
1567{
1568 cpus_read_lock();
1569 __cpuhp_remove_state_cpuslocked(state, invoke);
1570 cpus_read_unlock();
1619} 1571}
1620EXPORT_SYMBOL(__cpuhp_remove_state); 1572EXPORT_SYMBOL(__cpuhp_remove_state);
1621 1573
@@ -1658,13 +1610,13 @@ static ssize_t write_cpuhp_target(struct device *dev,
1658 ret = !sp->name || sp->cant_stop ? -EINVAL : 0; 1610 ret = !sp->name || sp->cant_stop ? -EINVAL : 0;
1659 mutex_unlock(&cpuhp_state_mutex); 1611 mutex_unlock(&cpuhp_state_mutex);
1660 if (ret) 1612 if (ret)
1661 return ret; 1613 goto out;
1662 1614
1663 if (st->state < target) 1615 if (st->state < target)
1664 ret = do_cpu_up(dev->id, target); 1616 ret = do_cpu_up(dev->id, target);
1665 else 1617 else
1666 ret = do_cpu_down(dev->id, target); 1618 ret = do_cpu_down(dev->id, target);
1667 1619out:
1668 unlock_device_hotplug(); 1620 unlock_device_hotplug();
1669 return ret ? ret : count; 1621 return ret ? ret : count;
1670} 1622}
@@ -1684,7 +1636,7 @@ static struct attribute *cpuhp_cpu_attrs[] = {
1684 NULL 1636 NULL
1685}; 1637};
1686 1638
1687static struct attribute_group cpuhp_cpu_attr_group = { 1639static const struct attribute_group cpuhp_cpu_attr_group = {
1688 .attrs = cpuhp_cpu_attrs, 1640 .attrs = cpuhp_cpu_attrs,
1689 .name = "hotplug", 1641 .name = "hotplug",
1690 NULL 1642 NULL
@@ -1716,7 +1668,7 @@ static struct attribute *cpuhp_cpu_root_attrs[] = {
1716 NULL 1668 NULL
1717}; 1669};
1718 1670
1719static struct attribute_group cpuhp_cpu_root_attr_group = { 1671static const struct attribute_group cpuhp_cpu_root_attr_group = {
1720 .attrs = cpuhp_cpu_root_attrs, 1672 .attrs = cpuhp_cpu_root_attrs,
1721 .name = "hotplug", 1673 .name = "hotplug",
1722 NULL 1674 NULL
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index fcbd568f1e95..6db80fc0810b 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -14,10 +14,12 @@
14#include <asm/sections.h> 14#include <asm/sections.h>
15 15
16/* vmcoreinfo stuff */ 16/* vmcoreinfo stuff */
17static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 17static unsigned char *vmcoreinfo_data;
18u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; 18static size_t vmcoreinfo_size;
19size_t vmcoreinfo_size; 19u32 *vmcoreinfo_note;
20size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); 20
21/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
22static unsigned char *vmcoreinfo_data_safecopy;
21 23
22/* 24/*
23 * parsing the "crashkernel" commandline 25 * parsing the "crashkernel" commandline
@@ -324,8 +326,23 @@ static void update_vmcoreinfo_note(void)
324 final_note(buf); 326 final_note(buf);
325} 327}
326 328
329void crash_update_vmcoreinfo_safecopy(void *ptr)
330{
331 if (ptr)
332 memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size);
333
334 vmcoreinfo_data_safecopy = ptr;
335}
336
327void crash_save_vmcoreinfo(void) 337void crash_save_vmcoreinfo(void)
328{ 338{
339 if (!vmcoreinfo_note)
340 return;
341
342 /* Use the safe copy to generate vmcoreinfo note if have */
343 if (vmcoreinfo_data_safecopy)
344 vmcoreinfo_data = vmcoreinfo_data_safecopy;
345
329 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); 346 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
330 update_vmcoreinfo_note(); 347 update_vmcoreinfo_note();
331} 348}
@@ -340,7 +357,7 @@ void vmcoreinfo_append_str(const char *fmt, ...)
340 r = vscnprintf(buf, sizeof(buf), fmt, args); 357 r = vscnprintf(buf, sizeof(buf), fmt, args);
341 va_end(args); 358 va_end(args);
342 359
343 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); 360 r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size);
344 361
345 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); 362 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
346 363
@@ -356,11 +373,26 @@ void __weak arch_crash_save_vmcoreinfo(void)
356 373
357phys_addr_t __weak paddr_vmcoreinfo_note(void) 374phys_addr_t __weak paddr_vmcoreinfo_note(void)
358{ 375{
359 return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); 376 return __pa(vmcoreinfo_note);
360} 377}
361 378
362static int __init crash_save_vmcoreinfo_init(void) 379static int __init crash_save_vmcoreinfo_init(void)
363{ 380{
381 vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
382 if (!vmcoreinfo_data) {
383 pr_warn("Memory allocation for vmcoreinfo_data failed\n");
384 return -ENOMEM;
385 }
386
387 vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
388 GFP_KERNEL | __GFP_ZERO);
389 if (!vmcoreinfo_note) {
390 free_page((unsigned long)vmcoreinfo_data);
391 vmcoreinfo_data = NULL;
392 pr_warn("Memory allocation for vmcoreinfo_note failed\n");
393 return -ENOMEM;
394 }
395
364 VMCOREINFO_OSRELEASE(init_uts_ns.name.release); 396 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
365 VMCOREINFO_PAGESIZE(PAGE_SIZE); 397 VMCOREINFO_PAGESIZE(PAGE_SIZE);
366 398
diff --git a/kernel/cred.c b/kernel/cred.c
index 2bc66075740f..ecf03657e71c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
1/* Task credentials management - see Documentation/security/credentials.txt 1/* Task credentials management - see Documentation/security/credentials.rst
2 * 2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6e75a5c9412d..426c2ffba16d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -389,6 +389,7 @@ static atomic_t nr_switch_events __read_mostly;
389static LIST_HEAD(pmus); 389static LIST_HEAD(pmus);
390static DEFINE_MUTEX(pmus_lock); 390static DEFINE_MUTEX(pmus_lock);
391static struct srcu_struct pmus_srcu; 391static struct srcu_struct pmus_srcu;
392static cpumask_var_t perf_online_mask;
392 393
393/* 394/*
394 * perf event paranoia level: 395 * perf event paranoia level:
@@ -925,11 +926,6 @@ static inline int is_cgroup_event(struct perf_event *event)
925 return 0; 926 return 0;
926} 927}
927 928
928static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
929{
930 return 0;
931}
932
933static inline void update_cgrp_time_from_event(struct perf_event *event) 929static inline void update_cgrp_time_from_event(struct perf_event *event)
934{ 930{
935} 931}
@@ -1456,6 +1452,13 @@ static enum event_type_t get_event_type(struct perf_event *event)
1456 1452
1457 lockdep_assert_held(&ctx->lock); 1453 lockdep_assert_held(&ctx->lock);
1458 1454
1455 /*
1456 * It's 'group type', really, because if our group leader is
1457 * pinned, so are we.
1458 */
1459 if (event->group_leader != event)
1460 event = event->group_leader;
1461
1459 event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; 1462 event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1460 if (!ctx->task) 1463 if (!ctx->task)
1461 event_type |= EVENT_CPU; 1464 event_type |= EVENT_CPU;
@@ -3636,10 +3639,10 @@ static inline u64 perf_event_count(struct perf_event *event)
3636 * will not be local and we cannot read them atomically 3639 * will not be local and we cannot read them atomically
3637 * - must not have a pmu::count method 3640 * - must not have a pmu::count method
3638 */ 3641 */
3639u64 perf_event_read_local(struct perf_event *event) 3642int perf_event_read_local(struct perf_event *event, u64 *value)
3640{ 3643{
3641 unsigned long flags; 3644 unsigned long flags;
3642 u64 val; 3645 int ret = 0;
3643 3646
3644 /* 3647 /*
3645 * Disabling interrupts avoids all counter scheduling (context 3648 * Disabling interrupts avoids all counter scheduling (context
@@ -3647,25 +3650,37 @@ u64 perf_event_read_local(struct perf_event *event)
3647 */ 3650 */
3648 local_irq_save(flags); 3651 local_irq_save(flags);
3649 3652
3650 /* If this is a per-task event, it must be for current */
3651 WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
3652 event->hw.target != current);
3653
3654 /* If this is a per-CPU event, it must be for this CPU */
3655 WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
3656 event->cpu != smp_processor_id());
3657
3658 /* 3653 /*
3659 * It must not be an event with inherit set, we cannot read 3654 * It must not be an event with inherit set, we cannot read
3660 * all child counters from atomic context. 3655 * all child counters from atomic context.
3661 */ 3656 */
3662 WARN_ON_ONCE(event->attr.inherit); 3657 if (event->attr.inherit) {
3658 ret = -EOPNOTSUPP;
3659 goto out;
3660 }
3663 3661
3664 /* 3662 /*
3665 * It must not have a pmu::count method, those are not 3663 * It must not have a pmu::count method, those are not
3666 * NMI safe. 3664 * NMI safe.
3667 */ 3665 */
3668 WARN_ON_ONCE(event->pmu->count); 3666 if (event->pmu->count) {
3667 ret = -EOPNOTSUPP;
3668 goto out;
3669 }
3670
3671 /* If this is a per-task event, it must be for current */
3672 if ((event->attach_state & PERF_ATTACH_TASK) &&
3673 event->hw.target != current) {
3674 ret = -EINVAL;
3675 goto out;
3676 }
3677
3678 /* If this is a per-CPU event, it must be for this CPU */
3679 if (!(event->attach_state & PERF_ATTACH_TASK) &&
3680 event->cpu != smp_processor_id()) {
3681 ret = -EINVAL;
3682 goto out;
3683 }
3669 3684
3670 /* 3685 /*
3671 * If the event is currently on this CPU, its either a per-task event, 3686 * If the event is currently on this CPU, its either a per-task event,
@@ -3675,10 +3690,11 @@ u64 perf_event_read_local(struct perf_event *event)
3675 if (event->oncpu == smp_processor_id()) 3690 if (event->oncpu == smp_processor_id())
3676 event->pmu->read(event); 3691 event->pmu->read(event);
3677 3692
3678 val = local64_read(&event->count); 3693 *value = local64_read(&event->count);
3694out:
3679 local_irq_restore(flags); 3695 local_irq_restore(flags);
3680 3696
3681 return val; 3697 return ret;
3682} 3698}
3683 3699
3684static int perf_event_read(struct perf_event *event, bool group) 3700static int perf_event_read(struct perf_event *event, bool group)
@@ -3812,14 +3828,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
3812 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 3828 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3813 return ERR_PTR(-EACCES); 3829 return ERR_PTR(-EACCES);
3814 3830
3815 /*
3816 * We could be clever and allow to attach a event to an
3817 * offline CPU and activate it when the CPU comes up, but
3818 * that's for later.
3819 */
3820 if (!cpu_online(cpu))
3821 return ERR_PTR(-ENODEV);
3822
3823 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 3831 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3824 ctx = &cpuctx->ctx; 3832 ctx = &cpuctx->ctx;
3825 get_ctx(ctx); 3833 get_ctx(ctx);
@@ -4377,7 +4385,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value);
4377static int __perf_read_group_add(struct perf_event *leader, 4385static int __perf_read_group_add(struct perf_event *leader,
4378 u64 read_format, u64 *values) 4386 u64 read_format, u64 *values)
4379{ 4387{
4388 struct perf_event_context *ctx = leader->ctx;
4380 struct perf_event *sub; 4389 struct perf_event *sub;
4390 unsigned long flags;
4381 int n = 1; /* skip @nr */ 4391 int n = 1; /* skip @nr */
4382 int ret; 4392 int ret;
4383 4393
@@ -4407,12 +4417,15 @@ static int __perf_read_group_add(struct perf_event *leader,
4407 if (read_format & PERF_FORMAT_ID) 4417 if (read_format & PERF_FORMAT_ID)
4408 values[n++] = primary_event_id(leader); 4418 values[n++] = primary_event_id(leader);
4409 4419
4420 raw_spin_lock_irqsave(&ctx->lock, flags);
4421
4410 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 4422 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4411 values[n++] += perf_event_count(sub); 4423 values[n++] += perf_event_count(sub);
4412 if (read_format & PERF_FORMAT_ID) 4424 if (read_format & PERF_FORMAT_ID)
4413 values[n++] = primary_event_id(sub); 4425 values[n++] = primary_event_id(sub);
4414 } 4426 }
4415 4427
4428 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4416 return 0; 4429 return 0;
4417} 4430}
4418 4431
@@ -5729,9 +5742,6 @@ static void perf_output_read_one(struct perf_output_handle *handle,
5729 __output_copy(handle, values, n * sizeof(u64)); 5742 __output_copy(handle, values, n * sizeof(u64));
5730} 5743}
5731 5744
5732/*
5733 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
5734 */
5735static void perf_output_read_group(struct perf_output_handle *handle, 5745static void perf_output_read_group(struct perf_output_handle *handle,
5736 struct perf_event *event, 5746 struct perf_event *event,
5737 u64 enabled, u64 running) 5747 u64 enabled, u64 running)
@@ -5776,6 +5786,13 @@ static void perf_output_read_group(struct perf_output_handle *handle,
5776#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ 5786#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5777 PERF_FORMAT_TOTAL_TIME_RUNNING) 5787 PERF_FORMAT_TOTAL_TIME_RUNNING)
5778 5788
5789/*
5790 * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
5791 *
5792 * The problem is that its both hard and excessively expensive to iterate the
5793 * child list, not to mention that its impossible to IPI the children running
5794 * on another CPU, from interrupt/NMI context.
5795 */
5779static void perf_output_read(struct perf_output_handle *handle, 5796static void perf_output_read(struct perf_output_handle *handle,
5780 struct perf_event *event) 5797 struct perf_event *event)
5781{ 5798{
@@ -7703,7 +7720,8 @@ static int swevent_hlist_get_cpu(int cpu)
7703 int err = 0; 7720 int err = 0;
7704 7721
7705 mutex_lock(&swhash->hlist_mutex); 7722 mutex_lock(&swhash->hlist_mutex);
7706 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { 7723 if (!swevent_hlist_deref(swhash) &&
7724 cpumask_test_cpu(cpu, perf_online_mask)) {
7707 struct swevent_hlist *hlist; 7725 struct swevent_hlist *hlist;
7708 7726
7709 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 7727 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -7724,7 +7742,7 @@ static int swevent_hlist_get(void)
7724{ 7742{
7725 int err, cpu, failed_cpu; 7743 int err, cpu, failed_cpu;
7726 7744
7727 get_online_cpus(); 7745 mutex_lock(&pmus_lock);
7728 for_each_possible_cpu(cpu) { 7746 for_each_possible_cpu(cpu) {
7729 err = swevent_hlist_get_cpu(cpu); 7747 err = swevent_hlist_get_cpu(cpu);
7730 if (err) { 7748 if (err) {
@@ -7732,8 +7750,7 @@ static int swevent_hlist_get(void)
7732 goto fail; 7750 goto fail;
7733 } 7751 }
7734 } 7752 }
7735 put_online_cpus(); 7753 mutex_unlock(&pmus_lock);
7736
7737 return 0; 7754 return 0;
7738fail: 7755fail:
7739 for_each_possible_cpu(cpu) { 7756 for_each_possible_cpu(cpu) {
@@ -7741,8 +7758,7 @@ fail:
7741 break; 7758 break;
7742 swevent_hlist_put_cpu(cpu); 7759 swevent_hlist_put_cpu(cpu);
7743 } 7760 }
7744 7761 mutex_unlock(&pmus_lock);
7745 put_online_cpus();
7746 return err; 7762 return err;
7747} 7763}
7748 7764
@@ -8037,12 +8053,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8037 bool is_kprobe, is_tracepoint; 8053 bool is_kprobe, is_tracepoint;
8038 struct bpf_prog *prog; 8054 struct bpf_prog *prog;
8039 8055
8040 if (event->attr.type == PERF_TYPE_HARDWARE ||
8041 event->attr.type == PERF_TYPE_SOFTWARE)
8042 return perf_event_set_bpf_handler(event, prog_fd);
8043
8044 if (event->attr.type != PERF_TYPE_TRACEPOINT) 8056 if (event->attr.type != PERF_TYPE_TRACEPOINT)
8045 return -EINVAL; 8057 return perf_event_set_bpf_handler(event, prog_fd);
8046 8058
8047 if (event->tp_event->prog) 8059 if (event->tp_event->prog)
8048 return -EEXIST; 8060 return -EEXIST;
@@ -8920,7 +8932,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
8920 pmu->hrtimer_interval_ms = timer; 8932 pmu->hrtimer_interval_ms = timer;
8921 8933
8922 /* update all cpuctx for this PMU */ 8934 /* update all cpuctx for this PMU */
8923 get_online_cpus(); 8935 cpus_read_lock();
8924 for_each_online_cpu(cpu) { 8936 for_each_online_cpu(cpu) {
8925 struct perf_cpu_context *cpuctx; 8937 struct perf_cpu_context *cpuctx;
8926 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 8938 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
@@ -8929,7 +8941,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
8929 cpu_function_call(cpu, 8941 cpu_function_call(cpu,
8930 (remote_function_f)perf_mux_hrtimer_restart, cpuctx); 8942 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
8931 } 8943 }
8932 put_online_cpus(); 8944 cpus_read_unlock();
8933 mutex_unlock(&mux_interval_mutex); 8945 mutex_unlock(&mux_interval_mutex);
8934 8946
8935 return count; 8947 return count;
@@ -9059,6 +9071,7 @@ skip_type:
9059 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 9071 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
9060 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); 9072 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
9061 cpuctx->ctx.pmu = pmu; 9073 cpuctx->ctx.pmu = pmu;
9074 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
9062 9075
9063 __perf_mux_hrtimer_init(cpuctx, cpu); 9076 __perf_mux_hrtimer_init(cpuctx, cpu);
9064 } 9077 }
@@ -9172,7 +9185,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
9172 9185
9173static struct pmu *perf_init_event(struct perf_event *event) 9186static struct pmu *perf_init_event(struct perf_event *event)
9174{ 9187{
9175 struct pmu *pmu = NULL; 9188 struct pmu *pmu;
9176 int idx; 9189 int idx;
9177 int ret; 9190 int ret;
9178 9191
@@ -9441,9 +9454,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
9441 local64_set(&hwc->period_left, hwc->sample_period); 9454 local64_set(&hwc->period_left, hwc->sample_period);
9442 9455
9443 /* 9456 /*
9444 * we currently do not support PERF_FORMAT_GROUP on inherited events 9457 * We currently do not support PERF_SAMPLE_READ on inherited events.
9458 * See perf_output_read().
9445 */ 9459 */
9446 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 9460 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
9447 goto err_ns; 9461 goto err_ns;
9448 9462
9449 if (!has_branch_stack(event)) 9463 if (!has_branch_stack(event))
@@ -9456,9 +9470,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
9456 } 9470 }
9457 9471
9458 pmu = perf_init_event(event); 9472 pmu = perf_init_event(event);
9459 if (!pmu) 9473 if (IS_ERR(pmu)) {
9460 goto err_ns;
9461 else if (IS_ERR(pmu)) {
9462 err = PTR_ERR(pmu); 9474 err = PTR_ERR(pmu);
9463 goto err_ns; 9475 goto err_ns;
9464 } 9476 }
@@ -9471,8 +9483,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
9471 event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, 9483 event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
9472 sizeof(unsigned long), 9484 sizeof(unsigned long),
9473 GFP_KERNEL); 9485 GFP_KERNEL);
9474 if (!event->addr_filters_offs) 9486 if (!event->addr_filters_offs) {
9487 err = -ENOMEM;
9475 goto err_per_task; 9488 goto err_per_task;
9489 }
9476 9490
9477 /* force hw sync on the address filters */ 9491 /* force hw sync on the address filters */
9478 event->addr_filters_gen = 1; 9492 event->addr_filters_gen = 1;
@@ -9882,12 +9896,10 @@ SYSCALL_DEFINE5(perf_event_open,
9882 goto err_task; 9896 goto err_task;
9883 } 9897 }
9884 9898
9885 get_online_cpus();
9886
9887 if (task) { 9899 if (task) {
9888 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); 9900 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
9889 if (err) 9901 if (err)
9890 goto err_cpus; 9902 goto err_task;
9891 9903
9892 /* 9904 /*
9893 * Reuse ptrace permission checks for now. 9905 * Reuse ptrace permission checks for now.
@@ -10073,6 +10085,23 @@ SYSCALL_DEFINE5(perf_event_open,
10073 goto err_locked; 10085 goto err_locked;
10074 } 10086 }
10075 10087
10088 if (!task) {
10089 /*
10090 * Check if the @cpu we're creating an event for is online.
10091 *
10092 * We use the perf_cpu_context::ctx::mutex to serialize against
10093 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
10094 */
10095 struct perf_cpu_context *cpuctx =
10096 container_of(ctx, struct perf_cpu_context, ctx);
10097
10098 if (!cpuctx->online) {
10099 err = -ENODEV;
10100 goto err_locked;
10101 }
10102 }
10103
10104
10076 /* 10105 /*
10077 * Must be under the same ctx::mutex as perf_install_in_context(), 10106 * Must be under the same ctx::mutex as perf_install_in_context(),
10078 * because we need to serialize with concurrent event creation. 10107 * because we need to serialize with concurrent event creation.
@@ -10162,8 +10191,6 @@ SYSCALL_DEFINE5(perf_event_open,
10162 put_task_struct(task); 10191 put_task_struct(task);
10163 } 10192 }
10164 10193
10165 put_online_cpus();
10166
10167 mutex_lock(&current->perf_event_mutex); 10194 mutex_lock(&current->perf_event_mutex);
10168 list_add_tail(&event->owner_entry, &current->perf_event_list); 10195 list_add_tail(&event->owner_entry, &current->perf_event_list);
10169 mutex_unlock(&current->perf_event_mutex); 10196 mutex_unlock(&current->perf_event_mutex);
@@ -10197,8 +10224,6 @@ err_alloc:
10197err_cred: 10224err_cred:
10198 if (task) 10225 if (task)
10199 mutex_unlock(&task->signal->cred_guard_mutex); 10226 mutex_unlock(&task->signal->cred_guard_mutex);
10200err_cpus:
10201 put_online_cpus();
10202err_task: 10227err_task:
10203 if (task) 10228 if (task)
10204 put_task_struct(task); 10229 put_task_struct(task);
@@ -10253,6 +10278,21 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
10253 goto err_unlock; 10278 goto err_unlock;
10254 } 10279 }
10255 10280
10281 if (!task) {
10282 /*
10283 * Check if the @cpu we're creating an event for is online.
10284 *
10285 * We use the perf_cpu_context::ctx::mutex to serialize against
10286 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
10287 */
10288 struct perf_cpu_context *cpuctx =
10289 container_of(ctx, struct perf_cpu_context, ctx);
10290 if (!cpuctx->online) {
10291 err = -ENODEV;
10292 goto err_unlock;
10293 }
10294 }
10295
10256 if (!exclusive_event_installable(event, ctx)) { 10296 if (!exclusive_event_installable(event, ctx)) {
10257 err = -EBUSY; 10297 err = -EBUSY;
10258 goto err_unlock; 10298 goto err_unlock;
@@ -10920,6 +10960,8 @@ static void __init perf_event_init_all_cpus(void)
10920 struct swevent_htable *swhash; 10960 struct swevent_htable *swhash;
10921 int cpu; 10961 int cpu;
10922 10962
10963 zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
10964
10923 for_each_possible_cpu(cpu) { 10965 for_each_possible_cpu(cpu) {
10924 swhash = &per_cpu(swevent_htable, cpu); 10966 swhash = &per_cpu(swevent_htable, cpu);
10925 mutex_init(&swhash->hlist_mutex); 10967 mutex_init(&swhash->hlist_mutex);
@@ -10935,7 +10977,7 @@ static void __init perf_event_init_all_cpus(void)
10935 } 10977 }
10936} 10978}
10937 10979
10938int perf_event_init_cpu(unsigned int cpu) 10980void perf_swevent_init_cpu(unsigned int cpu)
10939{ 10981{
10940 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 10982 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
10941 10983
@@ -10948,7 +10990,6 @@ int perf_event_init_cpu(unsigned int cpu)
10948 rcu_assign_pointer(swhash->swevent_hlist, hlist); 10990 rcu_assign_pointer(swhash->swevent_hlist, hlist);
10949 } 10991 }
10950 mutex_unlock(&swhash->hlist_mutex); 10992 mutex_unlock(&swhash->hlist_mutex);
10951 return 0;
10952} 10993}
10953 10994
10954#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE 10995#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@ -10966,19 +11007,22 @@ static void __perf_event_exit_context(void *__info)
10966 11007
10967static void perf_event_exit_cpu_context(int cpu) 11008static void perf_event_exit_cpu_context(int cpu)
10968{ 11009{
11010 struct perf_cpu_context *cpuctx;
10969 struct perf_event_context *ctx; 11011 struct perf_event_context *ctx;
10970 struct pmu *pmu; 11012 struct pmu *pmu;
10971 int idx;
10972 11013
10973 idx = srcu_read_lock(&pmus_srcu); 11014 mutex_lock(&pmus_lock);
10974 list_for_each_entry_rcu(pmu, &pmus, entry) { 11015 list_for_each_entry(pmu, &pmus, entry) {
10975 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; 11016 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11017 ctx = &cpuctx->ctx;
10976 11018
10977 mutex_lock(&ctx->mutex); 11019 mutex_lock(&ctx->mutex);
10978 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); 11020 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
11021 cpuctx->online = 0;
10979 mutex_unlock(&ctx->mutex); 11022 mutex_unlock(&ctx->mutex);
10980 } 11023 }
10981 srcu_read_unlock(&pmus_srcu, idx); 11024 cpumask_clear_cpu(cpu, perf_online_mask);
11025 mutex_unlock(&pmus_lock);
10982} 11026}
10983#else 11027#else
10984 11028
@@ -10986,6 +11030,29 @@ static void perf_event_exit_cpu_context(int cpu) { }
10986 11030
10987#endif 11031#endif
10988 11032
11033int perf_event_init_cpu(unsigned int cpu)
11034{
11035 struct perf_cpu_context *cpuctx;
11036 struct perf_event_context *ctx;
11037 struct pmu *pmu;
11038
11039 perf_swevent_init_cpu(cpu);
11040
11041 mutex_lock(&pmus_lock);
11042 cpumask_set_cpu(cpu, perf_online_mask);
11043 list_for_each_entry(pmu, &pmus, entry) {
11044 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11045 ctx = &cpuctx->ctx;
11046
11047 mutex_lock(&ctx->mutex);
11048 cpuctx->online = 1;
11049 mutex_unlock(&ctx->mutex);
11050 }
11051 mutex_unlock(&pmus_lock);
11052
11053 return 0;
11054}
11055
10989int perf_event_exit_cpu(unsigned int cpu) 11056int perf_event_exit_cpu(unsigned int cpu)
10990{ 11057{
10991 perf_event_exit_cpu_context(cpu); 11058 perf_event_exit_cpu_context(cpu);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 2831480c63a2..ee97196bb151 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -580,7 +580,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
580 int ret = -ENOMEM, max_order = 0; 580 int ret = -ENOMEM, max_order = 0;
581 581
582 if (!has_aux(event)) 582 if (!has_aux(event))
583 return -ENOTSUPP; 583 return -EOPNOTSUPP;
584 584
585 if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { 585 if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
586 /* 586 /*
diff --git a/kernel/exit.c b/kernel/exit.c
index 516acdb0e0ec..c5548faa9f37 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,7 +51,6 @@
51#include <linux/task_io_accounting_ops.h> 51#include <linux/task_io_accounting_ops.h>
52#include <linux/tracehook.h> 52#include <linux/tracehook.h>
53#include <linux/fs_struct.h> 53#include <linux/fs_struct.h>
54#include <linux/userfaultfd_k.h>
55#include <linux/init_task.h> 54#include <linux/init_task.h>
56#include <linux/perf_event.h> 55#include <linux/perf_event.h>
57#include <trace/events/sched.h> 56#include <trace/events/sched.h>
@@ -62,6 +61,7 @@
62#include <linux/kcov.h> 61#include <linux/kcov.h>
63#include <linux/random.h> 62#include <linux/random.h>
64#include <linux/rcuwait.h> 63#include <linux/rcuwait.h>
64#include <linux/compat.h>
65 65
66#include <linux/uaccess.h> 66#include <linux/uaccess.h>
67#include <asm/unistd.h> 67#include <asm/unistd.h>
@@ -318,19 +318,6 @@ void rcuwait_wake_up(struct rcuwait *w)
318 rcu_read_unlock(); 318 rcu_read_unlock();
319} 319}
320 320
321struct task_struct *try_get_task_struct(struct task_struct **ptask)
322{
323 struct task_struct *task;
324
325 rcu_read_lock();
326 task = task_rcu_dereference(ptask);
327 if (task)
328 get_task_struct(task);
329 rcu_read_unlock();
330
331 return task;
332}
333
334/* 321/*
335 * Determine if a process group is "orphaned", according to the POSIX 322 * Determine if a process group is "orphaned", according to the POSIX
336 * definition in 2.2.2.52. Orphaned process groups are not to be affected 323 * definition in 2.2.2.52. Orphaned process groups are not to be affected
@@ -995,16 +982,23 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
995 return 0; 982 return 0;
996} 983}
997 984
985struct waitid_info {
986 pid_t pid;
987 uid_t uid;
988 int status;
989 int cause;
990};
991
998struct wait_opts { 992struct wait_opts {
999 enum pid_type wo_type; 993 enum pid_type wo_type;
1000 int wo_flags; 994 int wo_flags;
1001 struct pid *wo_pid; 995 struct pid *wo_pid;
1002 996
1003 struct siginfo __user *wo_info; 997 struct waitid_info *wo_info;
1004 int __user *wo_stat; 998 int wo_stat;
1005 struct rusage __user *wo_rusage; 999 struct rusage *wo_rusage;
1006 1000
1007 wait_queue_t child_wait; 1001 wait_queue_entry_t child_wait;
1008 int notask_error; 1002 int notask_error;
1009}; 1003};
1010 1004
@@ -1049,34 +1043,6 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
1049 return 1; 1043 return 1;
1050} 1044}
1051 1045
1052static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1053 pid_t pid, uid_t uid, int why, int status)
1054{
1055 struct siginfo __user *infop;
1056 int retval = wo->wo_rusage
1057 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1058
1059 put_task_struct(p);
1060 infop = wo->wo_info;
1061 if (infop) {
1062 if (!retval)
1063 retval = put_user(SIGCHLD, &infop->si_signo);
1064 if (!retval)
1065 retval = put_user(0, &infop->si_errno);
1066 if (!retval)
1067 retval = put_user((short)why, &infop->si_code);
1068 if (!retval)
1069 retval = put_user(pid, &infop->si_pid);
1070 if (!retval)
1071 retval = put_user(uid, &infop->si_uid);
1072 if (!retval)
1073 retval = put_user(status, &infop->si_status);
1074 }
1075 if (!retval)
1076 retval = pid;
1077 return retval;
1078}
1079
1080/* 1046/*
1081 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold 1047 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
1082 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1048 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
@@ -1085,30 +1051,23 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1085 */ 1051 */
1086static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) 1052static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1087{ 1053{
1088 int state, retval, status; 1054 int state, status;
1089 pid_t pid = task_pid_vnr(p); 1055 pid_t pid = task_pid_vnr(p);
1090 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); 1056 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
1091 struct siginfo __user *infop; 1057 struct waitid_info *infop;
1092 1058
1093 if (!likely(wo->wo_flags & WEXITED)) 1059 if (!likely(wo->wo_flags & WEXITED))
1094 return 0; 1060 return 0;
1095 1061
1096 if (unlikely(wo->wo_flags & WNOWAIT)) { 1062 if (unlikely(wo->wo_flags & WNOWAIT)) {
1097 int exit_code = p->exit_code; 1063 status = p->exit_code;
1098 int why;
1099
1100 get_task_struct(p); 1064 get_task_struct(p);
1101 read_unlock(&tasklist_lock); 1065 read_unlock(&tasklist_lock);
1102 sched_annotate_sleep(); 1066 sched_annotate_sleep();
1103 1067 if (wo->wo_rusage)
1104 if ((exit_code & 0x7f) == 0) { 1068 getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1105 why = CLD_EXITED; 1069 put_task_struct(p);
1106 status = exit_code >> 8; 1070 goto out_info;
1107 } else {
1108 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1109 status = exit_code & 0x7f;
1110 }
1111 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1112 } 1071 }
1113 /* 1072 /*
1114 * Move the task's state to DEAD/TRACE, only one thread can do this. 1073 * Move the task's state to DEAD/TRACE, only one thread can do this.
@@ -1181,38 +1140,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1181 spin_unlock_irq(&current->sighand->siglock); 1140 spin_unlock_irq(&current->sighand->siglock);
1182 } 1141 }
1183 1142
1184 retval = wo->wo_rusage 1143 if (wo->wo_rusage)
1185 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1144 getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1186 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1145 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1187 ? p->signal->group_exit_code : p->exit_code; 1146 ? p->signal->group_exit_code : p->exit_code;
1188 if (!retval && wo->wo_stat) 1147 wo->wo_stat = status;
1189 retval = put_user(status, wo->wo_stat);
1190
1191 infop = wo->wo_info;
1192 if (!retval && infop)
1193 retval = put_user(SIGCHLD, &infop->si_signo);
1194 if (!retval && infop)
1195 retval = put_user(0, &infop->si_errno);
1196 if (!retval && infop) {
1197 int why;
1198
1199 if ((status & 0x7f) == 0) {
1200 why = CLD_EXITED;
1201 status >>= 8;
1202 } else {
1203 why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1204 status &= 0x7f;
1205 }
1206 retval = put_user((short)why, &infop->si_code);
1207 if (!retval)
1208 retval = put_user(status, &infop->si_status);
1209 }
1210 if (!retval && infop)
1211 retval = put_user(pid, &infop->si_pid);
1212 if (!retval && infop)
1213 retval = put_user(uid, &infop->si_uid);
1214 if (!retval)
1215 retval = pid;
1216 1148
1217 if (state == EXIT_TRACE) { 1149 if (state == EXIT_TRACE) {
1218 write_lock_irq(&tasklist_lock); 1150 write_lock_irq(&tasklist_lock);
@@ -1229,7 +1161,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1229 if (state == EXIT_DEAD) 1161 if (state == EXIT_DEAD)
1230 release_task(p); 1162 release_task(p);
1231 1163
1232 return retval; 1164out_info:
1165 infop = wo->wo_info;
1166 if (infop) {
1167 if ((status & 0x7f) == 0) {
1168 infop->cause = CLD_EXITED;
1169 infop->status = status >> 8;
1170 } else {
1171 infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1172 infop->status = status & 0x7f;
1173 }
1174 infop->pid = pid;
1175 infop->uid = uid;
1176 }
1177
1178 return pid;
1233} 1179}
1234 1180
1235static int *task_stopped_code(struct task_struct *p, bool ptrace) 1181static int *task_stopped_code(struct task_struct *p, bool ptrace)
@@ -1265,8 +1211,8 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
1265static int wait_task_stopped(struct wait_opts *wo, 1211static int wait_task_stopped(struct wait_opts *wo,
1266 int ptrace, struct task_struct *p) 1212 int ptrace, struct task_struct *p)
1267{ 1213{
1268 struct siginfo __user *infop; 1214 struct waitid_info *infop;
1269 int retval, exit_code, *p_code, why; 1215 int exit_code, *p_code, why;
1270 uid_t uid = 0; /* unneeded, required by compiler */ 1216 uid_t uid = 0; /* unneeded, required by compiler */
1271 pid_t pid; 1217 pid_t pid;
1272 1218
@@ -1311,34 +1257,21 @@ unlock_sig:
1311 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1257 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1312 read_unlock(&tasklist_lock); 1258 read_unlock(&tasklist_lock);
1313 sched_annotate_sleep(); 1259 sched_annotate_sleep();
1260 if (wo->wo_rusage)
1261 getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1262 put_task_struct(p);
1314 1263
1315 if (unlikely(wo->wo_flags & WNOWAIT)) 1264 if (likely(!(wo->wo_flags & WNOWAIT)))
1316 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); 1265 wo->wo_stat = (exit_code << 8) | 0x7f;
1317
1318 retval = wo->wo_rusage
1319 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1320 if (!retval && wo->wo_stat)
1321 retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1322 1266
1323 infop = wo->wo_info; 1267 infop = wo->wo_info;
1324 if (!retval && infop) 1268 if (infop) {
1325 retval = put_user(SIGCHLD, &infop->si_signo); 1269 infop->cause = why;
1326 if (!retval && infop) 1270 infop->status = exit_code;
1327 retval = put_user(0, &infop->si_errno); 1271 infop->pid = pid;
1328 if (!retval && infop) 1272 infop->uid = uid;
1329 retval = put_user((short)why, &infop->si_code); 1273 }
1330 if (!retval && infop) 1274 return pid;
1331 retval = put_user(exit_code, &infop->si_status);
1332 if (!retval && infop)
1333 retval = put_user(pid, &infop->si_pid);
1334 if (!retval && infop)
1335 retval = put_user(uid, &infop->si_uid);
1336 if (!retval)
1337 retval = pid;
1338 put_task_struct(p);
1339
1340 BUG_ON(!retval);
1341 return retval;
1342} 1275}
1343 1276
1344/* 1277/*
@@ -1349,7 +1282,7 @@ unlock_sig:
1349 */ 1282 */
1350static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) 1283static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1351{ 1284{
1352 int retval; 1285 struct waitid_info *infop;
1353 pid_t pid; 1286 pid_t pid;
1354 uid_t uid; 1287 uid_t uid;
1355 1288
@@ -1374,22 +1307,20 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1374 get_task_struct(p); 1307 get_task_struct(p);
1375 read_unlock(&tasklist_lock); 1308 read_unlock(&tasklist_lock);
1376 sched_annotate_sleep(); 1309 sched_annotate_sleep();
1310 if (wo->wo_rusage)
1311 getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1312 put_task_struct(p);
1377 1313
1378 if (!wo->wo_info) { 1314 infop = wo->wo_info;
1379 retval = wo->wo_rusage 1315 if (!infop) {
1380 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1316 wo->wo_stat = 0xffff;
1381 put_task_struct(p);
1382 if (!retval && wo->wo_stat)
1383 retval = put_user(0xffff, wo->wo_stat);
1384 if (!retval)
1385 retval = pid;
1386 } else { 1317 } else {
1387 retval = wait_noreap_copyout(wo, p, pid, uid, 1318 infop->cause = CLD_CONTINUED;
1388 CLD_CONTINUED, SIGCONT); 1319 infop->pid = pid;
1389 BUG_ON(retval == 0); 1320 infop->uid = uid;
1321 infop->status = SIGCONT;
1390 } 1322 }
1391 1323 return pid;
1392 return retval;
1393} 1324}
1394 1325
1395/* 1326/*
@@ -1541,7 +1472,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1541 return 0; 1472 return 0;
1542} 1473}
1543 1474
1544static int child_wait_callback(wait_queue_t *wait, unsigned mode, 1475static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
1545 int sync, void *key) 1476 int sync, void *key)
1546{ 1477{
1547 struct wait_opts *wo = container_of(wait, struct wait_opts, 1478 struct wait_opts *wo = container_of(wait, struct wait_opts,
@@ -1617,8 +1548,8 @@ end:
1617 return retval; 1548 return retval;
1618} 1549}
1619 1550
1620SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, 1551static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1621 infop, int, options, struct rusage __user *, ru) 1552 int options, struct rusage *ru)
1622{ 1553{
1623 struct wait_opts wo; 1554 struct wait_opts wo;
1624 struct pid *pid = NULL; 1555 struct pid *pid = NULL;
@@ -1656,38 +1587,48 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1656 wo.wo_pid = pid; 1587 wo.wo_pid = pid;
1657 wo.wo_flags = options; 1588 wo.wo_flags = options;
1658 wo.wo_info = infop; 1589 wo.wo_info = infop;
1659 wo.wo_stat = NULL;
1660 wo.wo_rusage = ru; 1590 wo.wo_rusage = ru;
1661 ret = do_wait(&wo); 1591 ret = do_wait(&wo);
1662 1592
1663 if (ret > 0) {
1664 ret = 0;
1665 } else if (infop) {
1666 /*
1667 * For a WNOHANG return, clear out all the fields
1668 * we would set so the user can easily tell the
1669 * difference.
1670 */
1671 if (!ret)
1672 ret = put_user(0, &infop->si_signo);
1673 if (!ret)
1674 ret = put_user(0, &infop->si_errno);
1675 if (!ret)
1676 ret = put_user(0, &infop->si_code);
1677 if (!ret)
1678 ret = put_user(0, &infop->si_pid);
1679 if (!ret)
1680 ret = put_user(0, &infop->si_uid);
1681 if (!ret)
1682 ret = put_user(0, &infop->si_status);
1683 }
1684
1685 put_pid(pid); 1593 put_pid(pid);
1686 return ret; 1594 return ret;
1687} 1595}
1688 1596
1689SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, 1597SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1690 int, options, struct rusage __user *, ru) 1598 infop, int, options, struct rusage __user *, ru)
1599{
1600 struct rusage r;
1601 struct waitid_info info = {.status = 0};
1602 long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
1603 int signo = 0;
1604 if (err > 0) {
1605 signo = SIGCHLD;
1606 err = 0;
1607 }
1608
1609 if (!err) {
1610 if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1611 return -EFAULT;
1612 }
1613 if (!infop)
1614 return err;
1615
1616 user_access_begin();
1617 unsafe_put_user(signo, &infop->si_signo, Efault);
1618 unsafe_put_user(0, &infop->si_errno, Efault);
1619 unsafe_put_user((short)info.cause, &infop->si_code, Efault);
1620 unsafe_put_user(info.pid, &infop->si_pid, Efault);
1621 unsafe_put_user(info.uid, &infop->si_uid, Efault);
1622 unsafe_put_user(info.status, &infop->si_status, Efault);
1623 user_access_end();
1624 return err;
1625Efault:
1626 user_access_end();
1627 return -EFAULT;
1628}
1629
1630long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
1631 struct rusage *ru)
1691{ 1632{
1692 struct wait_opts wo; 1633 struct wait_opts wo;
1693 struct pid *pid = NULL; 1634 struct pid *pid = NULL;
@@ -1698,6 +1639,10 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1698 __WNOTHREAD|__WCLONE|__WALL)) 1639 __WNOTHREAD|__WCLONE|__WALL))
1699 return -EINVAL; 1640 return -EINVAL;
1700 1641
1642 /* -INT_MIN is not defined */
1643 if (upid == INT_MIN)
1644 return -ESRCH;
1645
1701 if (upid == -1) 1646 if (upid == -1)
1702 type = PIDTYPE_MAX; 1647 type = PIDTYPE_MAX;
1703 else if (upid < 0) { 1648 else if (upid < 0) {
@@ -1715,14 +1660,29 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1715 wo.wo_pid = pid; 1660 wo.wo_pid = pid;
1716 wo.wo_flags = options | WEXITED; 1661 wo.wo_flags = options | WEXITED;
1717 wo.wo_info = NULL; 1662 wo.wo_info = NULL;
1718 wo.wo_stat = stat_addr; 1663 wo.wo_stat = 0;
1719 wo.wo_rusage = ru; 1664 wo.wo_rusage = ru;
1720 ret = do_wait(&wo); 1665 ret = do_wait(&wo);
1721 put_pid(pid); 1666 put_pid(pid);
1667 if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
1668 ret = -EFAULT;
1722 1669
1723 return ret; 1670 return ret;
1724} 1671}
1725 1672
1673SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1674 int, options, struct rusage __user *, ru)
1675{
1676 struct rusage r;
1677 long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
1678
1679 if (err > 0) {
1680 if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1681 return -EFAULT;
1682 }
1683 return err;
1684}
1685
1726#ifdef __ARCH_WANT_SYS_WAITPID 1686#ifdef __ARCH_WANT_SYS_WAITPID
1727 1687
1728/* 1688/*
@@ -1735,3 +1695,61 @@ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1735} 1695}
1736 1696
1737#endif 1697#endif
1698
1699#ifdef CONFIG_COMPAT
1700COMPAT_SYSCALL_DEFINE4(wait4,
1701 compat_pid_t, pid,
1702 compat_uint_t __user *, stat_addr,
1703 int, options,
1704 struct compat_rusage __user *, ru)
1705{
1706 struct rusage r;
1707 long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
1708 if (err > 0) {
1709 if (ru && put_compat_rusage(&r, ru))
1710 return -EFAULT;
1711 }
1712 return err;
1713}
1714
1715COMPAT_SYSCALL_DEFINE5(waitid,
1716 int, which, compat_pid_t, pid,
1717 struct compat_siginfo __user *, infop, int, options,
1718 struct compat_rusage __user *, uru)
1719{
1720 struct rusage ru;
1721 struct waitid_info info = {.status = 0};
1722 long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
1723 int signo = 0;
1724 if (err > 0) {
1725 signo = SIGCHLD;
1726 err = 0;
1727 }
1728
1729 if (!err && uru) {
1730 /* kernel_waitid() overwrites everything in ru */
1731 if (COMPAT_USE_64BIT_TIME)
1732 err = copy_to_user(uru, &ru, sizeof(ru));
1733 else
1734 err = put_compat_rusage(&ru, uru);
1735 if (err)
1736 return -EFAULT;
1737 }
1738
1739 if (!infop)
1740 return err;
1741
1742 user_access_begin();
1743 unsafe_put_user(signo, &infop->si_signo, Efault);
1744 unsafe_put_user(0, &infop->si_errno, Efault);
1745 unsafe_put_user((short)info.cause, &infop->si_code, Efault);
1746 unsafe_put_user(info.pid, &infop->si_pid, Efault);
1747 unsafe_put_user(info.uid, &infop->si_uid, Efault);
1748 unsafe_put_user(info.status, &infop->si_status, Efault);
1749 user_access_end();
1750 return err;
1751Efault:
1752 user_access_end();
1753 return -EFAULT;
1754}
1755#endif
diff --git a/kernel/extable.c b/kernel/extable.c
index 2676d7f8baf6..38c2412401a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
55{ 55{
56 const struct exception_table_entry *e; 56 const struct exception_table_entry *e;
57 57
58 e = search_extable(__start___ex_table, __stop___ex_table-1, addr); 58 e = search_extable(__start___ex_table,
59 __stop___ex_table - __start___ex_table, addr);
59 if (!e) 60 if (!e)
60 e = search_module_extables(addr); 61 e = search_module_extables(addr);
61 return e; 62 return e;
@@ -69,13 +70,13 @@ static inline int init_kernel_text(unsigned long addr)
69 return 0; 70 return 0;
70} 71}
71 72
72int core_kernel_text(unsigned long addr) 73int notrace core_kernel_text(unsigned long addr)
73{ 74{
74 if (addr >= (unsigned long)_stext && 75 if (addr >= (unsigned long)_stext &&
75 addr < (unsigned long)_etext) 76 addr < (unsigned long)_etext)
76 return 1; 77 return 1;
77 78
78 if (system_state == SYSTEM_BOOTING && 79 if (system_state < SYSTEM_RUNNING &&
79 init_kernel_text(addr)) 80 init_kernel_text(addr))
80 return 1; 81 return 1;
81 return 0; 82 return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 3a13a940a6ea..5ff0ebcaafc3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -205,19 +205,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
205 void *stack; 205 void *stack;
206 int i; 206 int i;
207 207
208 local_irq_disable();
209 for (i = 0; i < NR_CACHED_STACKS; i++) { 208 for (i = 0; i < NR_CACHED_STACKS; i++) {
210 struct vm_struct *s = this_cpu_read(cached_stacks[i]); 209 struct vm_struct *s;
210
211 s = this_cpu_xchg(cached_stacks[i], NULL);
211 212
212 if (!s) 213 if (!s)
213 continue; 214 continue;
214 this_cpu_write(cached_stacks[i], NULL);
215 215
216 tsk->stack_vm_area = s; 216 tsk->stack_vm_area = s;
217 local_irq_enable();
218 return s->addr; 217 return s->addr;
219 } 218 }
220 local_irq_enable();
221 219
222 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, 220 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
223 VMALLOC_START, VMALLOC_END, 221 VMALLOC_START, VMALLOC_END,
@@ -245,19 +243,15 @@ static inline void free_thread_stack(struct task_struct *tsk)
245{ 243{
246#ifdef CONFIG_VMAP_STACK 244#ifdef CONFIG_VMAP_STACK
247 if (task_stack_vm_area(tsk)) { 245 if (task_stack_vm_area(tsk)) {
248 unsigned long flags;
249 int i; 246 int i;
250 247
251 local_irq_save(flags);
252 for (i = 0; i < NR_CACHED_STACKS; i++) { 248 for (i = 0; i < NR_CACHED_STACKS; i++) {
253 if (this_cpu_read(cached_stacks[i])) 249 if (this_cpu_cmpxchg(cached_stacks[i],
250 NULL, tsk->stack_vm_area) != NULL)
254 continue; 251 continue;
255 252
256 this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
257 local_irq_restore(flags);
258 return; 253 return;
259 } 254 }
260 local_irq_restore(flags);
261 255
262 vfree_atomic(tsk->stack); 256 vfree_atomic(tsk->stack);
263 return; 257 return;
@@ -326,8 +320,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
326 } 320 }
327 321
328 /* All stack pages belong to the same memcg. */ 322 /* All stack pages belong to the same memcg. */
329 memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, 323 mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
330 account * (THREAD_SIZE / 1024)); 324 account * (THREAD_SIZE / 1024));
331 } else { 325 } else {
332 /* 326 /*
333 * All stack pages are in the same zone and belong to the 327 * All stack pages are in the same zone and belong to the
@@ -338,8 +332,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
338 mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, 332 mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
339 THREAD_SIZE / 1024 * account); 333 THREAD_SIZE / 1024 * account);
340 334
341 memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, 335 mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB,
342 account * (THREAD_SIZE / 1024)); 336 account * (THREAD_SIZE / 1024));
343 } 337 }
344} 338}
345 339
@@ -560,7 +554,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
560 set_task_stack_end_magic(tsk); 554 set_task_stack_end_magic(tsk);
561 555
562#ifdef CONFIG_CC_STACKPROTECTOR 556#ifdef CONFIG_CC_STACKPROTECTOR
563 tsk->stack_canary = get_random_long(); 557 tsk->stack_canary = get_random_canary();
564#endif 558#endif
565 559
566 /* 560 /*
@@ -579,6 +573,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
579 573
580 kcov_task_init(tsk); 574 kcov_task_init(tsk);
581 575
576#ifdef CONFIG_FAULT_INJECTION
577 tsk->fail_nth = 0;
578#endif
579
582 return tsk; 580 return tsk;
583 581
584free_stack: 582free_stack:
@@ -1573,6 +1571,18 @@ static __latent_entropy struct task_struct *copy_process(
1573 if (!p) 1571 if (!p)
1574 goto fork_out; 1572 goto fork_out;
1575 1573
1574 /*
1575 * This _must_ happen before we call free_task(), i.e. before we jump
1576 * to any of the bad_fork_* labels. This is to avoid freeing
1577 * p->set_child_tid which is (ab)used as a kthread's data pointer for
1578 * kernel threads (PF_KTHREAD).
1579 */
1580 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1581 /*
1582 * Clear TID on mm_release()?
1583 */
1584 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
1585
1576 ftrace_graph_init_task(p); 1586 ftrace_graph_init_task(p);
1577 1587
1578 rt_mutex_init_task(p); 1588 rt_mutex_init_task(p);
@@ -1621,9 +1631,9 @@ static __latent_entropy struct task_struct *copy_process(
1621 prev_cputime_init(&p->prev_cputime); 1631 prev_cputime_init(&p->prev_cputime);
1622 1632
1623#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1633#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1624 seqcount_init(&p->vtime_seqcount); 1634 seqcount_init(&p->vtime.seqcount);
1625 p->vtime_snap = 0; 1635 p->vtime.starttime = 0;
1626 p->vtime_snap_whence = VTIME_INACTIVE; 1636 p->vtime.state = VTIME_INACTIVE;
1627#endif 1637#endif
1628 1638
1629#if defined(SPLIT_RSS_COUNTING) 1639#if defined(SPLIT_RSS_COUNTING)
@@ -1739,11 +1749,6 @@ static __latent_entropy struct task_struct *copy_process(
1739 } 1749 }
1740 } 1750 }
1741 1751
1742 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1743 /*
1744 * Clear TID on mm_release()?
1745 */
1746 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
1747#ifdef CONFIG_BLOCK 1752#ifdef CONFIG_BLOCK
1748 p->plug = NULL; 1753 p->plug = NULL;
1749#endif 1754#endif
diff --git a/kernel/futex.c b/kernel/futex.c
index 357348a6cf6b..16dbe4c93895 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -212,7 +212,7 @@ struct futex_pi_state {
212 atomic_t refcount; 212 atomic_t refcount;
213 213
214 union futex_key key; 214 union futex_key key;
215}; 215} __randomize_layout;
216 216
217/** 217/**
218 * struct futex_q - The hashed futex queue entry, one per waiting task 218 * struct futex_q - The hashed futex queue entry, one per waiting task
@@ -225,7 +225,7 @@ struct futex_pi_state {
225 * @requeue_pi_key: the requeue_pi target futex key 225 * @requeue_pi_key: the requeue_pi target futex key
226 * @bitset: bitset for the optional bitmasked wakeup 226 * @bitset: bitset for the optional bitmasked wakeup
227 * 227 *
228 * We use this hashed waitqueue, instead of a normal wait_queue_t, so 228 * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
229 * we can wake only the relevant ones (hashed queues may be shared). 229 * we can wake only the relevant ones (hashed queues may be shared).
230 * 230 *
231 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 231 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
@@ -246,7 +246,7 @@ struct futex_q {
246 struct rt_mutex_waiter *rt_waiter; 246 struct rt_mutex_waiter *rt_waiter;
247 union futex_key *requeue_pi_key; 247 union futex_key *requeue_pi_key;
248 u32 bitset; 248 u32 bitset;
249}; 249} __randomize_layout;
250 250
251static const struct futex_q futex_q_init = { 251static const struct futex_q futex_q_init = {
252 /* list gets initialized in queue_me()*/ 252 /* list gets initialized in queue_me()*/
@@ -488,7 +488,7 @@ static void drop_futex_key_refs(union futex_key *key)
488 * 488 *
489 * Return: a negative error code or 0 489 * Return: a negative error code or 0
490 * 490 *
491 * The key words are stored in *key on success. 491 * The key words are stored in @key on success.
492 * 492 *
493 * For shared mappings, it's (page->index, file_inode(vma->vm_file), 493 * For shared mappings, it's (page->index, file_inode(vma->vm_file),
494 * offset_within_page). For private mappings, it's (uaddr, current->mm). 494 * offset_within_page). For private mappings, it's (uaddr, current->mm).
@@ -1259,9 +1259,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
1259 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 1259 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
1260 * 1260 *
1261 * Return: 1261 * Return:
1262 * 0 - ready to wait; 1262 * - 0 - ready to wait;
1263 * 1 - acquired the lock; 1263 * - 1 - acquired the lock;
1264 * <0 - error 1264 * - <0 - error
1265 * 1265 *
1266 * The hb->lock and futex_key refs shall be held by the caller. 1266 * The hb->lock and futex_key refs shall be held by the caller.
1267 */ 1267 */
@@ -1717,9 +1717,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1717 * hb1 and hb2 must be held by the caller. 1717 * hb1 and hb2 must be held by the caller.
1718 * 1718 *
1719 * Return: 1719 * Return:
1720 * 0 - failed to acquire the lock atomically; 1720 * - 0 - failed to acquire the lock atomically;
1721 * >0 - acquired the lock, return value is vpid of the top_waiter 1721 * - >0 - acquired the lock, return value is vpid of the top_waiter
1722 * <0 - error 1722 * - <0 - error
1723 */ 1723 */
1724static int futex_proxy_trylock_atomic(u32 __user *pifutex, 1724static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1725 struct futex_hash_bucket *hb1, 1725 struct futex_hash_bucket *hb1,
@@ -1785,8 +1785,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1785 * uaddr2 atomically on behalf of the top waiter. 1785 * uaddr2 atomically on behalf of the top waiter.
1786 * 1786 *
1787 * Return: 1787 * Return:
1788 * >=0 - on success, the number of tasks requeued or woken; 1788 * - >=0 - on success, the number of tasks requeued or woken;
1789 * <0 - on error 1789 * - <0 - on error
1790 */ 1790 */
1791static int futex_requeue(u32 __user *uaddr1, unsigned int flags, 1791static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1792 u32 __user *uaddr2, int nr_wake, int nr_requeue, 1792 u32 __user *uaddr2, int nr_wake, int nr_requeue,
@@ -2142,8 +2142,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2142 * be paired with exactly one earlier call to queue_me(). 2142 * be paired with exactly one earlier call to queue_me().
2143 * 2143 *
2144 * Return: 2144 * Return:
2145 * 1 - if the futex_q was still queued (and we removed unqueued it); 2145 * - 1 - if the futex_q was still queued (and we removed unqueued it);
2146 * 0 - if the futex_q was already removed by the waking thread 2146 * - 0 - if the futex_q was already removed by the waking thread
2147 */ 2147 */
2148static int unqueue_me(struct futex_q *q) 2148static int unqueue_me(struct futex_q *q)
2149{ 2149{
@@ -2333,9 +2333,9 @@ static long futex_wait_restart(struct restart_block *restart);
2333 * acquire the lock. Must be called with the hb lock held. 2333 * acquire the lock. Must be called with the hb lock held.
2334 * 2334 *
2335 * Return: 2335 * Return:
2336 * 1 - success, lock taken; 2336 * - 1 - success, lock taken;
2337 * 0 - success, lock not taken; 2337 * - 0 - success, lock not taken;
2338 * <0 - on error (-EFAULT) 2338 * - <0 - on error (-EFAULT)
2339 */ 2339 */
2340static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 2340static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2341{ 2341{
@@ -2422,8 +2422,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
2422 * with no q.key reference on failure. 2422 * with no q.key reference on failure.
2423 * 2423 *
2424 * Return: 2424 * Return:
2425 * 0 - uaddr contains val and hb has been locked; 2425 * - 0 - uaddr contains val and hb has been locked;
2426 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 2426 * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
2427 */ 2427 */
2428static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 2428static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
2429 struct futex_q *q, struct futex_hash_bucket **hb) 2429 struct futex_q *q, struct futex_hash_bucket **hb)
@@ -2895,8 +2895,8 @@ pi_faulted:
2895 * called with the hb lock held. 2895 * called with the hb lock held.
2896 * 2896 *
2897 * Return: 2897 * Return:
2898 * 0 = no early wakeup detected; 2898 * - 0 = no early wakeup detected;
2899 * <0 = -ETIMEDOUT or -ERESTARTNOINTR 2899 * - <0 = -ETIMEDOUT or -ERESTARTNOINTR
2900 */ 2900 */
2901static inline 2901static inline
2902int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, 2902int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2968,8 +2968,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2968 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2968 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2969 * 2969 *
2970 * Return: 2970 * Return:
2971 * 0 - On success; 2971 * - 0 - On success;
2972 * <0 - On error 2972 * - <0 - On error
2973 */ 2973 */
2974static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, 2974static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2975 u32 val, ktime_t *abs_time, u32 bitset, 2975 u32 val, ktime_t *abs_time, u32 bitset,
diff --git a/kernel/groups.c b/kernel/groups.c
index d09727692a2a..434f6665f187 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -5,6 +5,7 @@
5#include <linux/export.h> 5#include <linux/export.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/sort.h>
8#include <linux/syscalls.h> 9#include <linux/syscalls.h>
9#include <linux/user_namespace.h> 10#include <linux/user_namespace.h>
10#include <linux/vmalloc.h> 11#include <linux/vmalloc.h>
@@ -76,32 +77,18 @@ static int groups_from_user(struct group_info *group_info,
76 return 0; 77 return 0;
77} 78}
78 79
79/* a simple Shell sort */ 80static int gid_cmp(const void *_a, const void *_b)
81{
82 kgid_t a = *(kgid_t *)_a;
83 kgid_t b = *(kgid_t *)_b;
84
85 return gid_gt(a, b) - gid_lt(a, b);
86}
87
80static void groups_sort(struct group_info *group_info) 88static void groups_sort(struct group_info *group_info)
81{ 89{
82 int base, max, stride; 90 sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
83 int gidsetsize = group_info->ngroups; 91 gid_cmp, NULL);
84
85 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
86 ; /* nothing */
87 stride /= 3;
88
89 while (stride) {
90 max = gidsetsize - stride;
91 for (base = 0; base < max; base++) {
92 int left = base;
93 int right = left + stride;
94 kgid_t tmp = group_info->gid[right];
95
96 while (left >= 0 && gid_gt(group_info->gid[left], tmp)) {
97 group_info->gid[right] = group_info->gid[left];
98 right = left;
99 left -= stride;
100 }
101 group_info->gid[right] = tmp;
102 }
103 stride /= 3;
104 }
105} 92}
106 93
107/* a simple bsearch */ 94/* a simple bsearch */
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 3bbfd6a9c475..27c4e774071c 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -21,6 +21,10 @@ config GENERIC_IRQ_SHOW
21config GENERIC_IRQ_SHOW_LEVEL 21config GENERIC_IRQ_SHOW_LEVEL
22 bool 22 bool
23 23
24# Supports effective affinity mask
25config GENERIC_IRQ_EFFECTIVE_AFF_MASK
26 bool
27
24# Facility to allocate a hardware interrupt. This is legacy support 28# Facility to allocate a hardware interrupt. This is legacy support
25# and should not be used in new code. Use irq domains instead. 29# and should not be used in new code. Use irq domains instead.
26config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ 30config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
@@ -81,6 +85,9 @@ config GENERIC_MSI_IRQ_DOMAIN
81config HANDLE_DOMAIN_IRQ 85config HANDLE_DOMAIN_IRQ
82 bool 86 bool
83 87
88config IRQ_TIMINGS
89 bool
90
84config IRQ_DOMAIN_DEBUG 91config IRQ_DOMAIN_DEBUG
85 bool "Expose hardware/virtual IRQ mapping via debugfs" 92 bool "Expose hardware/virtual IRQ mapping via debugfs"
86 depends on IRQ_DOMAIN && DEBUG_FS 93 depends on IRQ_DOMAIN && DEBUG_FS
@@ -108,4 +115,15 @@ config SPARSE_IRQ
108 115
109 If you don't know what to do here, say N. 116 If you don't know what to do here, say N.
110 117
118config GENERIC_IRQ_DEBUGFS
119 bool "Expose irq internals in debugfs"
120 depends on DEBUG_FS
121 default n
122 ---help---
123
124 Exposes internal state information through debugfs. Mostly for
125 developers and debugging of hard to diagnose interrupt problems.
126
127 If you don't know what to do here, say N.
128
111endmenu 129endmenu
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 1d3ee3169202..e4aef7351f2b 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,6 @@
1 1
2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_IRQ_TIMINGS) += timings.o
3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o 4obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 5obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
5obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o 6obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
@@ -10,3 +11,4 @@ obj-$(CONFIG_PM_SLEEP) += pm.o
10obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o 11obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
11obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o 12obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
12obj-$(CONFIG_SMP) += affinity.o 13obj-$(CONFIG_SMP) += affinity.o
14obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e2d356dd7581..d69bd77252a7 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -1,4 +1,7 @@
1 1/*
2 * Copyright (C) 2016 Thomas Gleixner.
3 * Copyright (C) 2016-2017 Christoph Hellwig.
4 */
2#include <linux/interrupt.h> 5#include <linux/interrupt.h>
3#include <linux/kernel.h> 6#include <linux/kernel.h>
4#include <linux/slab.h> 7#include <linux/slab.h>
@@ -35,13 +38,54 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
35 } 38 }
36} 39}
37 40
38static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) 41static cpumask_var_t *alloc_node_to_present_cpumask(void)
42{
43 cpumask_var_t *masks;
44 int node;
45
46 masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
47 if (!masks)
48 return NULL;
49
50 for (node = 0; node < nr_node_ids; node++) {
51 if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
52 goto out_unwind;
53 }
54
55 return masks;
56
57out_unwind:
58 while (--node >= 0)
59 free_cpumask_var(masks[node]);
60 kfree(masks);
61 return NULL;
62}
63
64static void free_node_to_present_cpumask(cpumask_var_t *masks)
65{
66 int node;
67
68 for (node = 0; node < nr_node_ids; node++)
69 free_cpumask_var(masks[node]);
70 kfree(masks);
71}
72
73static void build_node_to_present_cpumask(cpumask_var_t *masks)
74{
75 int cpu;
76
77 for_each_present_cpu(cpu)
78 cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
79}
80
81static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask,
82 const struct cpumask *mask, nodemask_t *nodemsk)
39{ 83{
40 int n, nodes = 0; 84 int n, nodes = 0;
41 85
42 /* Calculate the number of nodes in the supplied affinity mask */ 86 /* Calculate the number of nodes in the supplied affinity mask */
43 for_each_online_node(n) { 87 for_each_node(n) {
44 if (cpumask_intersects(mask, cpumask_of_node(n))) { 88 if (cpumask_intersects(mask, node_to_present_cpumask[n])) {
45 node_set(n, *nodemsk); 89 node_set(n, *nodemsk);
46 nodes++; 90 nodes++;
47 } 91 }
@@ -64,7 +108,14 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
64 int last_affv = affv + affd->pre_vectors; 108 int last_affv = affv + affd->pre_vectors;
65 nodemask_t nodemsk = NODE_MASK_NONE; 109 nodemask_t nodemsk = NODE_MASK_NONE;
66 struct cpumask *masks; 110 struct cpumask *masks;
67 cpumask_var_t nmsk; 111 cpumask_var_t nmsk, *node_to_present_cpumask;
112
113 /*
114 * If there aren't any vectors left after applying the pre/post
115 * vectors don't bother with assigning affinity.
116 */
117 if (!affv)
118 return NULL;
68 119
69 if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) 120 if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
70 return NULL; 121 return NULL;
@@ -73,13 +124,19 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
73 if (!masks) 124 if (!masks)
74 goto out; 125 goto out;
75 126
127 node_to_present_cpumask = alloc_node_to_present_cpumask();
128 if (!node_to_present_cpumask)
129 goto out;
130
76 /* Fill out vectors at the beginning that don't need affinity */ 131 /* Fill out vectors at the beginning that don't need affinity */
77 for (curvec = 0; curvec < affd->pre_vectors; curvec++) 132 for (curvec = 0; curvec < affd->pre_vectors; curvec++)
78 cpumask_copy(masks + curvec, irq_default_affinity); 133 cpumask_copy(masks + curvec, irq_default_affinity);
79 134
80 /* Stabilize the cpumasks */ 135 /* Stabilize the cpumasks */
81 get_online_cpus(); 136 get_online_cpus();
82 nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk); 137 build_node_to_present_cpumask(node_to_present_cpumask);
138 nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask,
139 &nodemsk);
83 140
84 /* 141 /*
85 * If the number of nodes in the mask is greater than or equal the 142 * If the number of nodes in the mask is greater than or equal the
@@ -87,7 +144,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
87 */ 144 */
88 if (affv <= nodes) { 145 if (affv <= nodes) {
89 for_each_node_mask(n, nodemsk) { 146 for_each_node_mask(n, nodemsk) {
90 cpumask_copy(masks + curvec, cpumask_of_node(n)); 147 cpumask_copy(masks + curvec,
148 node_to_present_cpumask[n]);
91 if (++curvec == last_affv) 149 if (++curvec == last_affv)
92 break; 150 break;
93 } 151 }
@@ -101,7 +159,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
101 vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; 159 vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
102 160
103 /* Get the cpus on this node which are in the mask */ 161 /* Get the cpus on this node which are in the mask */
104 cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n)); 162 cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]);
105 163
106 /* Calculate the number of cpus per vector */ 164 /* Calculate the number of cpus per vector */
107 ncpus = cpumask_weight(nmsk); 165 ncpus = cpumask_weight(nmsk);
@@ -133,6 +191,7 @@ done:
133 /* Fill out vectors at the end that don't need affinity */ 191 /* Fill out vectors at the end that don't need affinity */
134 for (; curvec < nvecs; curvec++) 192 for (; curvec < nvecs; curvec++)
135 cpumask_copy(masks + curvec, irq_default_affinity); 193 cpumask_copy(masks + curvec, irq_default_affinity);
194 free_node_to_present_cpumask(node_to_present_cpumask);
136out: 195out:
137 free_cpumask_var(nmsk); 196 free_cpumask_var(nmsk);
138 return masks; 197 return masks;
@@ -140,19 +199,21 @@ out:
140 199
141/** 200/**
142 * irq_calc_affinity_vectors - Calculate the optimal number of vectors 201 * irq_calc_affinity_vectors - Calculate the optimal number of vectors
202 * @minvec: The minimum number of vectors available
143 * @maxvec: The maximum number of vectors available 203 * @maxvec: The maximum number of vectors available
144 * @affd: Description of the affinity requirements 204 * @affd: Description of the affinity requirements
145 */ 205 */
146int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) 206int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
147{ 207{
148 int resv = affd->pre_vectors + affd->post_vectors; 208 int resv = affd->pre_vectors + affd->post_vectors;
149 int vecs = maxvec - resv; 209 int vecs = maxvec - resv;
150 int cpus; 210 int ret;
211
212 if (resv > minvec)
213 return 0;
151 214
152 /* Stabilize the cpumasks */
153 get_online_cpus(); 215 get_online_cpus();
154 cpus = cpumask_weight(cpu_online_mask); 216 ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv;
155 put_online_cpus(); 217 put_online_cpus();
156 218 return ret;
157 return min(cpus, vecs) + resv;
158} 219}
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 0119b9d467ae..d30a0dd5cc02 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -53,7 +53,7 @@ unsigned long probe_irq_on(void)
53 if (desc->irq_data.chip->irq_set_type) 53 if (desc->irq_data.chip->irq_set_type)
54 desc->irq_data.chip->irq_set_type(&desc->irq_data, 54 desc->irq_data.chip->irq_set_type(&desc->irq_data,
55 IRQ_TYPE_PROBE); 55 IRQ_TYPE_PROBE);
56 irq_startup(desc, false); 56 irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE);
57 } 57 }
58 raw_spin_unlock_irq(&desc->lock); 58 raw_spin_unlock_irq(&desc->lock);
59 } 59 }
@@ -70,7 +70,7 @@ unsigned long probe_irq_on(void)
70 raw_spin_lock_irq(&desc->lock); 70 raw_spin_lock_irq(&desc->lock);
71 if (!desc->action && irq_settings_can_probe(desc)) { 71 if (!desc->action && irq_settings_can_probe(desc)) {
72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; 72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
73 if (irq_startup(desc, false)) 73 if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE))
74 desc->istate |= IRQS_PENDING; 74 desc->istate |= IRQS_PENDING;
75 } 75 }
76 raw_spin_unlock_irq(&desc->lock); 76 raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c94da688ee9b..a3cc37c0c85e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -7,7 +7,7 @@
7 * This file contains the core interrupt handling code, for irq-chip 7 * This file contains the core interrupt handling code, for irq-chip
8 * based architectures. 8 * based architectures.
9 * 9 *
10 * Detailed information is available in Documentation/DocBook/genericirq 10 * Detailed information is available in Documentation/core-api/genericirq.rst
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
@@ -170,62 +170,167 @@ static void irq_state_clr_disabled(struct irq_desc *desc)
170 irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); 170 irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
171} 171}
172 172
173static void irq_state_set_disabled(struct irq_desc *desc) 173static void irq_state_clr_masked(struct irq_desc *desc)
174{ 174{
175 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); 175 irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
176} 176}
177 177
178static void irq_state_clr_masked(struct irq_desc *desc) 178static void irq_state_clr_started(struct irq_desc *desc)
179{ 179{
180 irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); 180 irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED);
181} 181}
182 182
183static void irq_state_set_masked(struct irq_desc *desc) 183static void irq_state_set_started(struct irq_desc *desc)
184{ 184{
185 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); 185 irqd_set(&desc->irq_data, IRQD_IRQ_STARTED);
186} 186}
187 187
188int irq_startup(struct irq_desc *desc, bool resend) 188enum {
189 IRQ_STARTUP_NORMAL,
190 IRQ_STARTUP_MANAGED,
191 IRQ_STARTUP_ABORT,
192};
193
194#ifdef CONFIG_SMP
195static int
196__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
189{ 197{
190 int ret = 0; 198 struct irq_data *d = irq_desc_get_irq_data(desc);
191 199
192 irq_state_clr_disabled(desc); 200 if (!irqd_affinity_is_managed(d))
193 desc->depth = 0; 201 return IRQ_STARTUP_NORMAL;
202
203 irqd_clr_managed_shutdown(d);
204
205 if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) {
206 /*
207 * Catch code which fiddles with enable_irq() on a managed
208 * and potentially shutdown IRQ. Chained interrupt
209 * installment or irq auto probing should not happen on
210 * managed irqs either. Emit a warning, break the affinity
211 * and start it up as a normal interrupt.
212 */
213 if (WARN_ON_ONCE(force))
214 return IRQ_STARTUP_NORMAL;
215 /*
216 * The interrupt was requested, but there is no online CPU
217 * in it's affinity mask. Put it into managed shutdown
218 * state and let the cpu hotplug mechanism start it up once
219 * a CPU in the mask becomes available.
220 */
221 irqd_set_managed_shutdown(d);
222 return IRQ_STARTUP_ABORT;
223 }
224 return IRQ_STARTUP_MANAGED;
225}
226#else
227static __always_inline int
228__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
229{
230 return IRQ_STARTUP_NORMAL;
231}
232#endif
194 233
195 irq_domain_activate_irq(&desc->irq_data); 234static int __irq_startup(struct irq_desc *desc)
196 if (desc->irq_data.chip->irq_startup) { 235{
197 ret = desc->irq_data.chip->irq_startup(&desc->irq_data); 236 struct irq_data *d = irq_desc_get_irq_data(desc);
237 int ret = 0;
238
239 irq_domain_activate_irq(d);
240 if (d->chip->irq_startup) {
241 ret = d->chip->irq_startup(d);
242 irq_state_clr_disabled(desc);
198 irq_state_clr_masked(desc); 243 irq_state_clr_masked(desc);
199 } else { 244 } else {
200 irq_enable(desc); 245 irq_enable(desc);
201 } 246 }
247 irq_state_set_started(desc);
248 return ret;
249}
250
251int irq_startup(struct irq_desc *desc, bool resend, bool force)
252{
253 struct irq_data *d = irq_desc_get_irq_data(desc);
254 struct cpumask *aff = irq_data_get_affinity_mask(d);
255 int ret = 0;
256
257 desc->depth = 0;
258
259 if (irqd_is_started(d)) {
260 irq_enable(desc);
261 } else {
262 switch (__irq_startup_managed(desc, aff, force)) {
263 case IRQ_STARTUP_NORMAL:
264 ret = __irq_startup(desc);
265 irq_setup_affinity(desc);
266 break;
267 case IRQ_STARTUP_MANAGED:
268 ret = __irq_startup(desc);
269 irq_set_affinity_locked(d, aff, false);
270 break;
271 case IRQ_STARTUP_ABORT:
272 return 0;
273 }
274 }
202 if (resend) 275 if (resend)
203 check_irq_resend(desc); 276 check_irq_resend(desc);
277
204 return ret; 278 return ret;
205} 279}
206 280
281static void __irq_disable(struct irq_desc *desc, bool mask);
282
207void irq_shutdown(struct irq_desc *desc) 283void irq_shutdown(struct irq_desc *desc)
208{ 284{
209 irq_state_set_disabled(desc); 285 if (irqd_is_started(&desc->irq_data)) {
210 desc->depth = 1; 286 desc->depth = 1;
211 if (desc->irq_data.chip->irq_shutdown) 287 if (desc->irq_data.chip->irq_shutdown) {
212 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 288 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
213 else if (desc->irq_data.chip->irq_disable) 289 irq_state_set_disabled(desc);
214 desc->irq_data.chip->irq_disable(&desc->irq_data); 290 irq_state_set_masked(desc);
215 else 291 } else {
216 desc->irq_data.chip->irq_mask(&desc->irq_data); 292 __irq_disable(desc, true);
293 }
294 irq_state_clr_started(desc);
295 }
296 /*
297 * This must be called even if the interrupt was never started up,
298 * because the activation can happen before the interrupt is
299 * available for request/startup. It has it's own state tracking so
300 * it's safe to call it unconditionally.
301 */
217 irq_domain_deactivate_irq(&desc->irq_data); 302 irq_domain_deactivate_irq(&desc->irq_data);
218 irq_state_set_masked(desc);
219} 303}
220 304
221void irq_enable(struct irq_desc *desc) 305void irq_enable(struct irq_desc *desc)
222{ 306{
223 irq_state_clr_disabled(desc); 307 if (!irqd_irq_disabled(&desc->irq_data)) {
224 if (desc->irq_data.chip->irq_enable) 308 unmask_irq(desc);
225 desc->irq_data.chip->irq_enable(&desc->irq_data); 309 } else {
226 else 310 irq_state_clr_disabled(desc);
227 desc->irq_data.chip->irq_unmask(&desc->irq_data); 311 if (desc->irq_data.chip->irq_enable) {
228 irq_state_clr_masked(desc); 312 desc->irq_data.chip->irq_enable(&desc->irq_data);
313 irq_state_clr_masked(desc);
314 } else {
315 unmask_irq(desc);
316 }
317 }
318}
319
320static void __irq_disable(struct irq_desc *desc, bool mask)
321{
322 if (irqd_irq_disabled(&desc->irq_data)) {
323 if (mask)
324 mask_irq(desc);
325 } else {
326 irq_state_set_disabled(desc);
327 if (desc->irq_data.chip->irq_disable) {
328 desc->irq_data.chip->irq_disable(&desc->irq_data);
329 irq_state_set_masked(desc);
330 } else if (mask) {
331 mask_irq(desc);
332 }
333 }
229} 334}
230 335
231/** 336/**
@@ -250,13 +355,7 @@ void irq_enable(struct irq_desc *desc)
250 */ 355 */
251void irq_disable(struct irq_desc *desc) 356void irq_disable(struct irq_desc *desc)
252{ 357{
253 irq_state_set_disabled(desc); 358 __irq_disable(desc, irq_settings_disable_unlazy(desc));
254 if (desc->irq_data.chip->irq_disable) {
255 desc->irq_data.chip->irq_disable(&desc->irq_data);
256 irq_state_set_masked(desc);
257 } else if (irq_settings_disable_unlazy(desc)) {
258 mask_irq(desc);
259 }
260} 359}
261 360
262void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu) 361void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu)
@@ -279,18 +378,21 @@ void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu)
279 378
280static inline void mask_ack_irq(struct irq_desc *desc) 379static inline void mask_ack_irq(struct irq_desc *desc)
281{ 380{
282 if (desc->irq_data.chip->irq_mask_ack) 381 if (desc->irq_data.chip->irq_mask_ack) {
283 desc->irq_data.chip->irq_mask_ack(&desc->irq_data); 382 desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
284 else { 383 irq_state_set_masked(desc);
285 desc->irq_data.chip->irq_mask(&desc->irq_data); 384 } else {
385 mask_irq(desc);
286 if (desc->irq_data.chip->irq_ack) 386 if (desc->irq_data.chip->irq_ack)
287 desc->irq_data.chip->irq_ack(&desc->irq_data); 387 desc->irq_data.chip->irq_ack(&desc->irq_data);
288 } 388 }
289 irq_state_set_masked(desc);
290} 389}
291 390
292void mask_irq(struct irq_desc *desc) 391void mask_irq(struct irq_desc *desc)
293{ 392{
393 if (irqd_irq_masked(&desc->irq_data))
394 return;
395
294 if (desc->irq_data.chip->irq_mask) { 396 if (desc->irq_data.chip->irq_mask) {
295 desc->irq_data.chip->irq_mask(&desc->irq_data); 397 desc->irq_data.chip->irq_mask(&desc->irq_data);
296 irq_state_set_masked(desc); 398 irq_state_set_masked(desc);
@@ -299,6 +401,9 @@ void mask_irq(struct irq_desc *desc)
299 401
300void unmask_irq(struct irq_desc *desc) 402void unmask_irq(struct irq_desc *desc)
301{ 403{
404 if (!irqd_irq_masked(&desc->irq_data))
405 return;
406
302 if (desc->irq_data.chip->irq_unmask) { 407 if (desc->irq_data.chip->irq_unmask) {
303 desc->irq_data.chip->irq_unmask(&desc->irq_data); 408 desc->irq_data.chip->irq_unmask(&desc->irq_data);
304 irq_state_clr_masked(desc); 409 irq_state_clr_masked(desc);
@@ -312,10 +417,7 @@ void unmask_threaded_irq(struct irq_desc *desc)
312 if (chip->flags & IRQCHIP_EOI_THREADED) 417 if (chip->flags & IRQCHIP_EOI_THREADED)
313 chip->irq_eoi(&desc->irq_data); 418 chip->irq_eoi(&desc->irq_data);
314 419
315 if (chip->irq_unmask) { 420 unmask_irq(desc);
316 chip->irq_unmask(&desc->irq_data);
317 irq_state_clr_masked(desc);
318 }
319} 421}
320 422
321/* 423/*
@@ -851,7 +953,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
851 irq_settings_set_norequest(desc); 953 irq_settings_set_norequest(desc);
852 irq_settings_set_nothread(desc); 954 irq_settings_set_nothread(desc);
853 desc->action = &chained_action; 955 desc->action = &chained_action;
854 irq_startup(desc, true); 956 irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
855 } 957 }
856} 958}
857 959
@@ -903,6 +1005,13 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
903 1005
904 if (!desc) 1006 if (!desc)
905 return; 1007 return;
1008
1009 /*
1010 * Warn when a driver sets the no autoenable flag on an already
1011 * active interrupt.
1012 */
1013 WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN));
1014
906 irq_settings_clr_and_set(desc, clr, set); 1015 irq_settings_clr_and_set(desc, clr, set);
907 1016
908 irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | 1017 irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 011f8c4c63da..aee8f7ec40af 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -14,37 +14,99 @@
14 14
15#include "internals.h" 15#include "internals.h"
16 16
17/* For !GENERIC_IRQ_EFFECTIVE_AFF_MASK this looks at general affinity mask */
18static inline bool irq_needs_fixup(struct irq_data *d)
19{
20 const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
21
22 return cpumask_test_cpu(smp_processor_id(), m);
23}
24
17static bool migrate_one_irq(struct irq_desc *desc) 25static bool migrate_one_irq(struct irq_desc *desc)
18{ 26{
19 struct irq_data *d = irq_desc_get_irq_data(desc); 27 struct irq_data *d = irq_desc_get_irq_data(desc);
20 const struct cpumask *affinity = d->common->affinity; 28 struct irq_chip *chip = irq_data_get_irq_chip(d);
21 struct irq_chip *c; 29 bool maskchip = !irq_can_move_pcntxt(d) && !irqd_irq_masked(d);
22 bool ret = false; 30 const struct cpumask *affinity;
31 bool brokeaff = false;
32 int err;
23 33
24 /* 34 /*
25 * If this is a per-CPU interrupt, or the affinity does not 35 * IRQ chip might be already torn down, but the irq descriptor is
26 * include this CPU, then we have nothing to do. 36 * still in the radix tree. Also if the chip has no affinity setter,
37 * nothing can be done here.
27 */ 38 */
28 if (irqd_is_per_cpu(d) || 39 if (!chip || !chip->irq_set_affinity) {
29 !cpumask_test_cpu(smp_processor_id(), affinity)) 40 pr_debug("IRQ %u: Unable to migrate away\n", d->irq);
30 return false; 41 return false;
42 }
43
44 /*
45 * No move required, if:
46 * - Interrupt is per cpu
47 * - Interrupt is not started
48 * - Affinity mask does not include this CPU.
49 *
50 * Note: Do not check desc->action as this might be a chained
51 * interrupt.
52 */
53 if (irqd_is_per_cpu(d) || !irqd_is_started(d) || !irq_needs_fixup(d)) {
54 /*
55 * If an irq move is pending, abort it if the dying CPU is
56 * the sole target.
57 */
58 irq_fixup_move_pending(desc, false);
59 return false;
60 }
61
62 /*
63 * Complete an eventually pending irq move cleanup. If this
64 * interrupt was moved in hard irq context, then the vectors need
65 * to be cleaned up. It can't wait until this interrupt actually
66 * happens and this CPU was involved.
67 */
68 irq_force_complete_move(desc);
69
70 /*
71 * If there is a setaffinity pending, then try to reuse the pending
72 * mask, so the last change of the affinity does not get lost. If
73 * there is no move pending or the pending mask does not contain
74 * any online CPU, use the current affinity mask.
75 */
76 if (irq_fixup_move_pending(desc, true))
77 affinity = irq_desc_get_pending_mask(desc);
78 else
79 affinity = irq_data_get_affinity_mask(d);
80
81 /* Mask the chip for interrupts which cannot move in process context */
82 if (maskchip && chip->irq_mask)
83 chip->irq_mask(d);
31 84
32 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { 85 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
86 /*
87 * If the interrupt is managed, then shut it down and leave
88 * the affinity untouched.
89 */
90 if (irqd_affinity_is_managed(d)) {
91 irqd_set_managed_shutdown(d);
92 irq_shutdown(desc);
93 return false;
94 }
33 affinity = cpu_online_mask; 95 affinity = cpu_online_mask;
34 ret = true; 96 brokeaff = true;
35 } 97 }
36 98
37 c = irq_data_get_irq_chip(d); 99 err = irq_do_set_affinity(d, affinity, true);
38 if (!c->irq_set_affinity) { 100 if (err) {
39 pr_debug("IRQ%u: unable to set affinity\n", d->irq); 101 pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
40 } else { 102 d->irq, err);
41 int r = irq_do_set_affinity(d, affinity, false); 103 brokeaff = false;
42 if (r)
43 pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
44 d->irq, r);
45 } 104 }
46 105
47 return ret; 106 if (maskchip && chip->irq_unmask)
107 chip->irq_unmask(d);
108
109 return brokeaff;
48} 110}
49 111
50/** 112/**
@@ -59,11 +121,8 @@ static bool migrate_one_irq(struct irq_desc *desc)
59 */ 121 */
60void irq_migrate_all_off_this_cpu(void) 122void irq_migrate_all_off_this_cpu(void)
61{ 123{
62 unsigned int irq;
63 struct irq_desc *desc; 124 struct irq_desc *desc;
64 unsigned long flags; 125 unsigned int irq;
65
66 local_irq_save(flags);
67 126
68 for_each_active_irq(irq) { 127 for_each_active_irq(irq) {
69 bool affinity_broken; 128 bool affinity_broken;
@@ -73,10 +132,53 @@ void irq_migrate_all_off_this_cpu(void)
73 affinity_broken = migrate_one_irq(desc); 132 affinity_broken = migrate_one_irq(desc);
74 raw_spin_unlock(&desc->lock); 133 raw_spin_unlock(&desc->lock);
75 134
76 if (affinity_broken) 135 if (affinity_broken) {
77 pr_warn_ratelimited("IRQ%u no longer affine to CPU%u\n", 136 pr_warn_ratelimited("IRQ %u: no longer affine to CPU%u\n",
78 irq, smp_processor_id()); 137 irq, smp_processor_id());
138 }
139 }
140}
141
142static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
143{
144 struct irq_data *data = irq_desc_get_irq_data(desc);
145 const struct cpumask *affinity = irq_data_get_affinity_mask(data);
146
147 if (!irqd_affinity_is_managed(data) || !desc->action ||
148 !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity))
149 return;
150
151 if (irqd_is_managed_and_shutdown(data)) {
152 irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
153 return;
154 }
155
156 /*
157 * If the interrupt can only be directed to a single target
158 * CPU then it is already assigned to a CPU in the affinity
159 * mask. No point in trying to move it around.
160 */
161 if (!irqd_is_single_target(data))
162 irq_set_affinity_locked(data, affinity, false);
163}
164
165/**
166 * irq_affinity_online_cpu - Restore affinity for managed interrupts
167 * @cpu: Upcoming CPU for which interrupts should be restored
168 */
169int irq_affinity_online_cpu(unsigned int cpu)
170{
171 struct irq_desc *desc;
172 unsigned int irq;
173
174 irq_lock_sparse();
175 for_each_active_irq(irq) {
176 desc = irq_to_desc(irq);
177 raw_spin_lock_irq(&desc->lock);
178 irq_restore_affinity_of_irq(desc, cpu);
179 raw_spin_unlock_irq(&desc->lock);
79 } 180 }
181 irq_unlock_sparse();
80 182
81 local_irq_restore(flags); 183 return 0;
82} 184}
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
new file mode 100644
index 000000000000..4d384edc0c64
--- /dev/null
+++ b/kernel/irq/debugfs.c
@@ -0,0 +1,213 @@
1/*
2 * Copyright 2017 Thomas Gleixner <tglx@linutronix.de>
3 *
4 * This file is licensed under the GPL V2.
5 */
6#include <linux/irqdomain.h>
7#include <linux/irq.h>
8
9#include "internals.h"
10
11static struct dentry *irq_dir;
12
13struct irq_bit_descr {
14 unsigned int mask;
15 char *name;
16};
17#define BIT_MASK_DESCR(m) { .mask = m, .name = #m }
18
19static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
20 const struct irq_bit_descr *sd, int size)
21{
22 int i;
23
24 for (i = 0; i < size; i++, sd++) {
25 if (state & sd->mask)
26 seq_printf(m, "%*s%s\n", ind + 12, "", sd->name);
27 }
28}
29
30#ifdef CONFIG_SMP
31static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc)
32{
33 struct irq_data *data = irq_desc_get_irq_data(desc);
34 struct cpumask *msk;
35
36 msk = irq_data_get_affinity_mask(data);
37 seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk));
38#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
39 msk = irq_data_get_effective_affinity_mask(data);
40 seq_printf(m, "effectiv: %*pbl\n", cpumask_pr_args(msk));
41#endif
42#ifdef CONFIG_GENERIC_PENDING_IRQ
43 msk = desc->pending_mask;
44 seq_printf(m, "pending: %*pbl\n", cpumask_pr_args(msk));
45#endif
46}
47#else
48static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc) { }
49#endif
50
51static const struct irq_bit_descr irqchip_flags[] = {
52 BIT_MASK_DESCR(IRQCHIP_SET_TYPE_MASKED),
53 BIT_MASK_DESCR(IRQCHIP_EOI_IF_HANDLED),
54 BIT_MASK_DESCR(IRQCHIP_MASK_ON_SUSPEND),
55 BIT_MASK_DESCR(IRQCHIP_ONOFFLINE_ENABLED),
56 BIT_MASK_DESCR(IRQCHIP_SKIP_SET_WAKE),
57 BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE),
58 BIT_MASK_DESCR(IRQCHIP_EOI_THREADED),
59};
60
61static void
62irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind)
63{
64 struct irq_chip *chip = data->chip;
65
66 if (!chip) {
67 seq_printf(m, "chip: None\n");
68 return;
69 }
70 seq_printf(m, "%*schip: %s\n", ind, "", chip->name);
71 seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
72 irq_debug_show_bits(m, ind, chip->flags, irqchip_flags,
73 ARRAY_SIZE(irqchip_flags));
74}
75
76static void
77irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind)
78{
79 seq_printf(m, "%*sdomain: %s\n", ind, "",
80 data->domain ? data->domain->name : "");
81 seq_printf(m, "%*shwirq: 0x%lx\n", ind + 1, "", data->hwirq);
82 irq_debug_show_chip(m, data, ind + 1);
83#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
84 if (!data->parent_data)
85 return;
86 seq_printf(m, "%*sparent:\n", ind + 1, "");
87 irq_debug_show_data(m, data->parent_data, ind + 4);
88#endif
89}
90
91static const struct irq_bit_descr irqdata_states[] = {
92 BIT_MASK_DESCR(IRQ_TYPE_EDGE_RISING),
93 BIT_MASK_DESCR(IRQ_TYPE_EDGE_FALLING),
94 BIT_MASK_DESCR(IRQ_TYPE_LEVEL_HIGH),
95 BIT_MASK_DESCR(IRQ_TYPE_LEVEL_LOW),
96 BIT_MASK_DESCR(IRQD_LEVEL),
97
98 BIT_MASK_DESCR(IRQD_ACTIVATED),
99 BIT_MASK_DESCR(IRQD_IRQ_STARTED),
100 BIT_MASK_DESCR(IRQD_IRQ_DISABLED),
101 BIT_MASK_DESCR(IRQD_IRQ_MASKED),
102 BIT_MASK_DESCR(IRQD_IRQ_INPROGRESS),
103
104 BIT_MASK_DESCR(IRQD_PER_CPU),
105 BIT_MASK_DESCR(IRQD_NO_BALANCING),
106
107 BIT_MASK_DESCR(IRQD_SINGLE_TARGET),
108 BIT_MASK_DESCR(IRQD_MOVE_PCNTXT),
109 BIT_MASK_DESCR(IRQD_AFFINITY_SET),
110 BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
111 BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
112 BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
113
114 BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
115
116 BIT_MASK_DESCR(IRQD_WAKEUP_STATE),
117 BIT_MASK_DESCR(IRQD_WAKEUP_ARMED),
118};
119
120static const struct irq_bit_descr irqdesc_states[] = {
121 BIT_MASK_DESCR(_IRQ_NOPROBE),
122 BIT_MASK_DESCR(_IRQ_NOREQUEST),
123 BIT_MASK_DESCR(_IRQ_NOTHREAD),
124 BIT_MASK_DESCR(_IRQ_NOAUTOEN),
125 BIT_MASK_DESCR(_IRQ_NESTED_THREAD),
126 BIT_MASK_DESCR(_IRQ_PER_CPU_DEVID),
127 BIT_MASK_DESCR(_IRQ_IS_POLLED),
128 BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY),
129};
130
131static const struct irq_bit_descr irqdesc_istates[] = {
132 BIT_MASK_DESCR(IRQS_AUTODETECT),
133 BIT_MASK_DESCR(IRQS_SPURIOUS_DISABLED),
134 BIT_MASK_DESCR(IRQS_POLL_INPROGRESS),
135 BIT_MASK_DESCR(IRQS_ONESHOT),
136 BIT_MASK_DESCR(IRQS_REPLAY),
137 BIT_MASK_DESCR(IRQS_WAITING),
138 BIT_MASK_DESCR(IRQS_PENDING),
139 BIT_MASK_DESCR(IRQS_SUSPENDED),
140};
141
142
143static int irq_debug_show(struct seq_file *m, void *p)
144{
145 struct irq_desc *desc = m->private;
146 struct irq_data *data;
147
148 raw_spin_lock_irq(&desc->lock);
149 data = irq_desc_get_irq_data(desc);
150 seq_printf(m, "handler: %pf\n", desc->handle_irq);
151 seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors);
152 irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states,
153 ARRAY_SIZE(irqdesc_states));
154 seq_printf(m, "istate: 0x%08x\n", desc->istate);
155 irq_debug_show_bits(m, 0, desc->istate, irqdesc_istates,
156 ARRAY_SIZE(irqdesc_istates));
157 seq_printf(m, "ddepth: %u\n", desc->depth);
158 seq_printf(m, "wdepth: %u\n", desc->wake_depth);
159 seq_printf(m, "dstate: 0x%08x\n", irqd_get(data));
160 irq_debug_show_bits(m, 0, irqd_get(data), irqdata_states,
161 ARRAY_SIZE(irqdata_states));
162 seq_printf(m, "node: %d\n", irq_data_get_node(data));
163 irq_debug_show_masks(m, desc);
164 irq_debug_show_data(m, data, 0);
165 raw_spin_unlock_irq(&desc->lock);
166 return 0;
167}
168
169static int irq_debug_open(struct inode *inode, struct file *file)
170{
171 return single_open(file, irq_debug_show, inode->i_private);
172}
173
174static const struct file_operations dfs_irq_ops = {
175 .open = irq_debug_open,
176 .read = seq_read,
177 .llseek = seq_lseek,
178 .release = single_release,
179};
180
181void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
182{
183 char name [10];
184
185 if (!irq_dir || !desc || desc->debugfs_file)
186 return;
187
188 sprintf(name, "%d", irq);
189 desc->debugfs_file = debugfs_create_file(name, 0444, irq_dir, desc,
190 &dfs_irq_ops);
191}
192
193static int __init irq_debugfs_init(void)
194{
195 struct dentry *root_dir;
196 int irq;
197
198 root_dir = debugfs_create_dir("irq", NULL);
199 if (!root_dir)
200 return -ENOMEM;
201
202 irq_domain_debugfs_init(root_dir);
203
204 irq_dir = debugfs_create_dir("irqs", root_dir);
205
206 irq_lock_sparse();
207 for_each_active_irq(irq)
208 irq_add_debugfs_entry(irq, irq_to_desc(irq));
209 irq_unlock_sparse();
210
211 return 0;
212}
213__initcall(irq_debugfs_init);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 1613bfd48365..194c506d9d20 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -4,6 +4,8 @@
4#include <linux/gfp.h> 4#include <linux/gfp.h>
5#include <linux/irq.h> 5#include <linux/irq.h>
6 6
7#include "internals.h"
8
7/* 9/*
8 * Device resource management aware IRQ request/free implementation. 10 * Device resource management aware IRQ request/free implementation.
9 */ 11 */
@@ -198,3 +200,87 @@ int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
198 return base; 200 return base;
199} 201}
200EXPORT_SYMBOL_GPL(__devm_irq_alloc_descs); 202EXPORT_SYMBOL_GPL(__devm_irq_alloc_descs);
203
204#ifdef CONFIG_GENERIC_IRQ_CHIP
205/**
206 * devm_irq_alloc_generic_chip - Allocate and initialize a generic chip
207 * for a managed device
208 * @dev: Device to allocate the generic chip for
209 * @name: Name of the irq chip
210 * @num_ct: Number of irq_chip_type instances associated with this
211 * @irq_base: Interrupt base nr for this chip
212 * @reg_base: Register base address (virtual)
213 * @handler: Default flow handler associated with this chip
214 *
215 * Returns an initialized irq_chip_generic structure. The chip defaults
216 * to the primary (index 0) irq_chip_type and @handler
217 */
218struct irq_chip_generic *
219devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,
220 unsigned int irq_base, void __iomem *reg_base,
221 irq_flow_handler_t handler)
222{
223 struct irq_chip_generic *gc;
224 unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
225
226 gc = devm_kzalloc(dev, sz, GFP_KERNEL);
227 if (gc)
228 irq_init_generic_chip(gc, name, num_ct,
229 irq_base, reg_base, handler);
230
231 return gc;
232}
233EXPORT_SYMBOL_GPL(devm_irq_alloc_generic_chip);
234
235struct irq_generic_chip_devres {
236 struct irq_chip_generic *gc;
237 u32 msk;
238 unsigned int clr;
239 unsigned int set;
240};
241
242static void devm_irq_remove_generic_chip(struct device *dev, void *res)
243{
244 struct irq_generic_chip_devres *this = res;
245
246 irq_remove_generic_chip(this->gc, this->msk, this->clr, this->set);
247}
248
249/**
250 * devm_irq_setup_generic_chip - Setup a range of interrupts with a generic
251 * chip for a managed device
252 *
253 * @dev: Device to setup the generic chip for
254 * @gc: Generic irq chip holding all data
255 * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
256 * @flags: Flags for initialization
257 * @clr: IRQ_* bits to clear
258 * @set: IRQ_* bits to set
259 *
260 * Set up max. 32 interrupts starting from gc->irq_base. Note, this
261 * initializes all interrupts to the primary irq_chip_type and its
262 * associated handler.
263 */
264int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc,
265 u32 msk, enum irq_gc_flags flags,
266 unsigned int clr, unsigned int set)
267{
268 struct irq_generic_chip_devres *dr;
269
270 dr = devres_alloc(devm_irq_remove_generic_chip,
271 sizeof(*dr), GFP_KERNEL);
272 if (!dr)
273 return -ENOMEM;
274
275 irq_setup_generic_chip(gc, msk, flags, clr, set);
276
277 dr->gc = gc;
278 dr->msk = msk;
279 dr->clr = clr;
280 dr->set = set;
281 devres_add(dev, dr);
282
283 return 0;
284}
285EXPORT_SYMBOL_GPL(devm_irq_setup_generic_chip);
286#endif /* CONFIG_GENERIC_IRQ_CHIP */
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index ee32870079c9..f7086b78ad6e 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -201,10 +201,9 @@ static void irq_writel_be(u32 val, void __iomem *addr)
201 iowrite32be(val, addr); 201 iowrite32be(val, addr);
202} 202}
203 203
204static void 204void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
205irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, 205 int num_ct, unsigned int irq_base,
206 int num_ct, unsigned int irq_base, 206 void __iomem *reg_base, irq_flow_handler_t handler)
207 void __iomem *reg_base, irq_flow_handler_t handler)
208{ 207{
209 raw_spin_lock_init(&gc->lock); 208 raw_spin_lock_init(&gc->lock);
210 gc->num_ct = num_ct; 209 gc->num_ct = num_ct;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index d3f24905852c..79f987b942b8 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * This file contains the core interrupt handling code. 7 * This file contains the core interrupt handling code.
8 * 8 *
9 * Detailed information is available in Documentation/DocBook/genericirq 9 * Detailed information is available in Documentation/core-api/genericirq.rst
10 * 10 *
11 */ 11 */
12 12
@@ -138,6 +138,8 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
138 unsigned int irq = desc->irq_data.irq; 138 unsigned int irq = desc->irq_data.irq;
139 struct irqaction *action; 139 struct irqaction *action;
140 140
141 record_irq_time(desc);
142
141 for_each_action_of_desc(desc, action) { 143 for_each_action_of_desc(desc, action) {
142 irqreturn_t res; 144 irqreturn_t res;
143 145
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index bc226e783bd2..a2c48058354c 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -8,6 +8,7 @@
8#include <linux/irqdesc.h> 8#include <linux/irqdesc.h>
9#include <linux/kernel_stat.h> 9#include <linux/kernel_stat.h>
10#include <linux/pm_runtime.h> 10#include <linux/pm_runtime.h>
11#include <linux/sched/clock.h>
11 12
12#ifdef CONFIG_SPARSE_IRQ 13#ifdef CONFIG_SPARSE_IRQ
13# define IRQ_BITMAP_BITS (NR_IRQS + 8196) 14# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
@@ -57,6 +58,7 @@ enum {
57 IRQS_WAITING = 0x00000080, 58 IRQS_WAITING = 0x00000080,
58 IRQS_PENDING = 0x00000200, 59 IRQS_PENDING = 0x00000200,
59 IRQS_SUSPENDED = 0x00000800, 60 IRQS_SUSPENDED = 0x00000800,
61 IRQS_TIMINGS = 0x00001000,
60}; 62};
61 63
62#include "debug.h" 64#include "debug.h"
@@ -66,7 +68,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags);
66extern void __disable_irq(struct irq_desc *desc); 68extern void __disable_irq(struct irq_desc *desc);
67extern void __enable_irq(struct irq_desc *desc); 69extern void __enable_irq(struct irq_desc *desc);
68 70
69extern int irq_startup(struct irq_desc *desc, bool resend); 71#define IRQ_RESEND true
72#define IRQ_NORESEND false
73
74#define IRQ_START_FORCE true
75#define IRQ_START_COND false
76
77extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
78
70extern void irq_shutdown(struct irq_desc *desc); 79extern void irq_shutdown(struct irq_desc *desc);
71extern void irq_enable(struct irq_desc *desc); 80extern void irq_enable(struct irq_desc *desc);
72extern void irq_disable(struct irq_desc *desc); 81extern void irq_disable(struct irq_desc *desc);
@@ -109,13 +118,19 @@ static inline void unregister_handler_proc(unsigned int irq,
109 118
110extern bool irq_can_set_affinity_usr(unsigned int irq); 119extern bool irq_can_set_affinity_usr(unsigned int irq);
111 120
112extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); 121extern int irq_select_affinity_usr(unsigned int irq);
113 122
114extern void irq_set_thread_affinity(struct irq_desc *desc); 123extern void irq_set_thread_affinity(struct irq_desc *desc);
115 124
116extern int irq_do_set_affinity(struct irq_data *data, 125extern int irq_do_set_affinity(struct irq_data *data,
117 const struct cpumask *dest, bool force); 126 const struct cpumask *dest, bool force);
118 127
128#ifdef CONFIG_SMP
129extern int irq_setup_affinity(struct irq_desc *desc);
130#else
131static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; }
132#endif
133
119/* Inline functions for support of irq chips on slow busses */ 134/* Inline functions for support of irq chips on slow busses */
120static inline void chip_bus_lock(struct irq_desc *desc) 135static inline void chip_bus_lock(struct irq_desc *desc)
121{ 136{
@@ -169,6 +184,11 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
169 184
170#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) 185#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
171 186
187static inline unsigned int irqd_get(struct irq_data *d)
188{
189 return __irqd_to_state(d);
190}
191
172/* 192/*
173 * Manipulation functions for irq_data.state 193 * Manipulation functions for irq_data.state
174 */ 194 */
@@ -182,6 +202,16 @@ static inline void irqd_clr_move_pending(struct irq_data *d)
182 __irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING; 202 __irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING;
183} 203}
184 204
205static inline void irqd_set_managed_shutdown(struct irq_data *d)
206{
207 __irqd_to_state(d) |= IRQD_MANAGED_SHUTDOWN;
208}
209
210static inline void irqd_clr_managed_shutdown(struct irq_data *d)
211{
212 __irqd_to_state(d) &= ~IRQD_MANAGED_SHUTDOWN;
213}
214
185static inline void irqd_clear(struct irq_data *d, unsigned int mask) 215static inline void irqd_clear(struct irq_data *d, unsigned int mask)
186{ 216{
187 __irqd_to_state(d) &= ~mask; 217 __irqd_to_state(d) &= ~mask;
@@ -197,6 +227,16 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
197 return __irqd_to_state(d) & mask; 227 return __irqd_to_state(d) & mask;
198} 228}
199 229
230static inline void irq_state_set_disabled(struct irq_desc *desc)
231{
232 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
233}
234
235static inline void irq_state_set_masked(struct irq_desc *desc)
236{
237 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
238}
239
200#undef __irqd_to_state 240#undef __irqd_to_state
201 241
202static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) 242static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
@@ -226,3 +266,196 @@ irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
226static inline void 266static inline void
227irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } 267irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
228#endif 268#endif
269
270#ifdef CONFIG_IRQ_TIMINGS
271
272#define IRQ_TIMINGS_SHIFT 5
273#define IRQ_TIMINGS_SIZE (1 << IRQ_TIMINGS_SHIFT)
274#define IRQ_TIMINGS_MASK (IRQ_TIMINGS_SIZE - 1)
275
276/**
277 * struct irq_timings - irq timings storing structure
278 * @values: a circular buffer of u64 encoded <timestamp,irq> values
279 * @count: the number of elements in the array
280 */
281struct irq_timings {
282 u64 values[IRQ_TIMINGS_SIZE];
283 int count;
284};
285
286DECLARE_PER_CPU(struct irq_timings, irq_timings);
287
288extern void irq_timings_free(int irq);
289extern int irq_timings_alloc(int irq);
290
291static inline void irq_remove_timings(struct irq_desc *desc)
292{
293 desc->istate &= ~IRQS_TIMINGS;
294
295 irq_timings_free(irq_desc_get_irq(desc));
296}
297
298static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act)
299{
300 int irq = irq_desc_get_irq(desc);
301 int ret;
302
303 /*
304 * We don't need the measurement because the idle code already
305 * knows the next expiry event.
306 */
307 if (act->flags & __IRQF_TIMER)
308 return;
309
310 /*
311 * In case the timing allocation fails, we just want to warn,
312 * not fail, so letting the system boot anyway.
313 */
314 ret = irq_timings_alloc(irq);
315 if (ret) {
316 pr_warn("Failed to allocate irq timing stats for irq%d (%d)",
317 irq, ret);
318 return;
319 }
320
321 desc->istate |= IRQS_TIMINGS;
322}
323
324extern void irq_timings_enable(void);
325extern void irq_timings_disable(void);
326
327DECLARE_STATIC_KEY_FALSE(irq_timing_enabled);
328
329/*
330 * The interrupt number and the timestamp are encoded into a single
331 * u64 variable to optimize the size.
332 * 48 bit time stamp and 16 bit IRQ number is way sufficient.
333 * Who cares an IRQ after 78 hours of idle time?
334 */
335static inline u64 irq_timing_encode(u64 timestamp, int irq)
336{
337 return (timestamp << 16) | irq;
338}
339
340static inline int irq_timing_decode(u64 value, u64 *timestamp)
341{
342 *timestamp = value >> 16;
343 return value & U16_MAX;
344}
345
346/*
347 * The function record_irq_time is only called in one place in the
348 * interrupts handler. We want this function always inline so the code
349 * inside is embedded in the function and the static key branching
350 * code can act at the higher level. Without the explicit
351 * __always_inline we can end up with a function call and a small
352 * overhead in the hotpath for nothing.
353 */
354static __always_inline void record_irq_time(struct irq_desc *desc)
355{
356 if (!static_branch_likely(&irq_timing_enabled))
357 return;
358
359 if (desc->istate & IRQS_TIMINGS) {
360 struct irq_timings *timings = this_cpu_ptr(&irq_timings);
361
362 timings->values[timings->count & IRQ_TIMINGS_MASK] =
363 irq_timing_encode(local_clock(),
364 irq_desc_get_irq(desc));
365
366 timings->count++;
367 }
368}
369#else
370static inline void irq_remove_timings(struct irq_desc *desc) {}
371static inline void irq_setup_timings(struct irq_desc *desc,
372 struct irqaction *act) {};
373static inline void record_irq_time(struct irq_desc *desc) {}
374#endif /* CONFIG_IRQ_TIMINGS */
375
376
377#ifdef CONFIG_GENERIC_IRQ_CHIP
378void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
379 int num_ct, unsigned int irq_base,
380 void __iomem *reg_base, irq_flow_handler_t handler);
381#else
382static inline void
383irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
384 int num_ct, unsigned int irq_base,
385 void __iomem *reg_base, irq_flow_handler_t handler) { }
386#endif /* CONFIG_GENERIC_IRQ_CHIP */
387
388#ifdef CONFIG_GENERIC_PENDING_IRQ
389static inline bool irq_can_move_pcntxt(struct irq_data *data)
390{
391 return irqd_can_move_in_process_context(data);
392}
393static inline bool irq_move_pending(struct irq_data *data)
394{
395 return irqd_is_setaffinity_pending(data);
396}
397static inline void
398irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
399{
400 cpumask_copy(desc->pending_mask, mask);
401}
402static inline void
403irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
404{
405 cpumask_copy(mask, desc->pending_mask);
406}
407static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
408{
409 return desc->pending_mask;
410}
411bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
412#else /* CONFIG_GENERIC_PENDING_IRQ */
413static inline bool irq_can_move_pcntxt(struct irq_data *data)
414{
415 return true;
416}
417static inline bool irq_move_pending(struct irq_data *data)
418{
419 return false;
420}
421static inline void
422irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
423{
424}
425static inline void
426irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
427{
428}
429static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
430{
431 return NULL;
432}
433static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
434{
435 return false;
436}
437#endif /* !CONFIG_GENERIC_PENDING_IRQ */
438
439#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
440#include <linux/debugfs.h>
441
442void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
443static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
444{
445 debugfs_remove(desc->debugfs_file);
446}
447# ifdef CONFIG_IRQ_DOMAIN
448void irq_domain_debugfs_init(struct dentry *root);
449# else
450static inline void irq_domain_debugfs_init(struct dentry *root)
451{
452}
453# endif
454#else /* CONFIG_GENERIC_IRQ_DEBUGFS */
455static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
456{
457}
458static inline void irq_remove_debugfs_entry(struct irq_desc *d)
459{
460}
461#endif /* CONFIG_GENERIC_IRQ_DEBUGFS */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 00bb0aeea1d0..73be2b3909bd 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -4,7 +4,7 @@
4 * 4 *
5 * This file contains the interrupt descriptor management code 5 * This file contains the interrupt descriptor management code
6 * 6 *
7 * Detailed information is available in Documentation/DocBook/genericirq 7 * Detailed information is available in Documentation/core-api/genericirq.rst
8 * 8 *
9 */ 9 */
10#include <linux/irq.h> 10#include <linux/irq.h>
@@ -54,14 +54,25 @@ static void __init init_irq_default_affinity(void)
54#endif 54#endif
55 55
56#ifdef CONFIG_SMP 56#ifdef CONFIG_SMP
57static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) 57static int alloc_masks(struct irq_desc *desc, int node)
58{ 58{
59 if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, 59 if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity,
60 gfp, node)) 60 GFP_KERNEL, node))
61 return -ENOMEM; 61 return -ENOMEM;
62 62
63#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
64 if (!zalloc_cpumask_var_node(&desc->irq_common_data.effective_affinity,
65 GFP_KERNEL, node)) {
66 free_cpumask_var(desc->irq_common_data.affinity);
67 return -ENOMEM;
68 }
69#endif
70
63#ifdef CONFIG_GENERIC_PENDING_IRQ 71#ifdef CONFIG_GENERIC_PENDING_IRQ
64 if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { 72 if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) {
73#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
74 free_cpumask_var(desc->irq_common_data.effective_affinity);
75#endif
65 free_cpumask_var(desc->irq_common_data.affinity); 76 free_cpumask_var(desc->irq_common_data.affinity);
66 return -ENOMEM; 77 return -ENOMEM;
67 } 78 }
@@ -86,7 +97,7 @@ static void desc_smp_init(struct irq_desc *desc, int node,
86 97
87#else 98#else
88static inline int 99static inline int
89alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } 100alloc_masks(struct irq_desc *desc, int node) { return 0; }
90static inline void 101static inline void
91desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } 102desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { }
92#endif 103#endif
@@ -105,6 +116,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
105 desc->irq_data.chip_data = NULL; 116 desc->irq_data.chip_data = NULL;
106 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); 117 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
107 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); 118 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
119 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
108 desc->handle_irq = handle_bad_irq; 120 desc->handle_irq = handle_bad_irq;
109 desc->depth = 1; 121 desc->depth = 1;
110 desc->irq_count = 0; 122 desc->irq_count = 0;
@@ -324,6 +336,9 @@ static void free_masks(struct irq_desc *desc)
324 free_cpumask_var(desc->pending_mask); 336 free_cpumask_var(desc->pending_mask);
325#endif 337#endif
326 free_cpumask_var(desc->irq_common_data.affinity); 338 free_cpumask_var(desc->irq_common_data.affinity);
339#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
340 free_cpumask_var(desc->irq_common_data.effective_affinity);
341#endif
327} 342}
328#else 343#else
329static inline void free_masks(struct irq_desc *desc) { } 344static inline void free_masks(struct irq_desc *desc) { }
@@ -344,9 +359,8 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
344 struct module *owner) 359 struct module *owner)
345{ 360{
346 struct irq_desc *desc; 361 struct irq_desc *desc;
347 gfp_t gfp = GFP_KERNEL;
348 362
349 desc = kzalloc_node(sizeof(*desc), gfp, node); 363 desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
350 if (!desc) 364 if (!desc)
351 return NULL; 365 return NULL;
352 /* allocate based on nr_cpu_ids */ 366 /* allocate based on nr_cpu_ids */
@@ -354,11 +368,12 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
354 if (!desc->kstat_irqs) 368 if (!desc->kstat_irqs)
355 goto err_desc; 369 goto err_desc;
356 370
357 if (alloc_masks(desc, gfp, node)) 371 if (alloc_masks(desc, node))
358 goto err_kstat; 372 goto err_kstat;
359 373
360 raw_spin_lock_init(&desc->lock); 374 raw_spin_lock_init(&desc->lock);
361 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 375 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
376 mutex_init(&desc->request_mutex);
362 init_rcu_head(&desc->rcu); 377 init_rcu_head(&desc->rcu);
363 378
364 desc_set_defaults(irq, desc, node, affinity, owner); 379 desc_set_defaults(irq, desc, node, affinity, owner);
@@ -394,6 +409,7 @@ static void free_desc(unsigned int irq)
394{ 409{
395 struct irq_desc *desc = irq_to_desc(irq); 410 struct irq_desc *desc = irq_to_desc(irq);
396 411
412 irq_remove_debugfs_entry(desc);
397 unregister_irq_proc(irq, desc); 413 unregister_irq_proc(irq, desc);
398 414
399 /* 415 /*
@@ -480,7 +496,8 @@ int __init early_irq_init(void)
480 496
481 /* Let arch update nr_irqs and return the nr of preallocated irqs */ 497 /* Let arch update nr_irqs and return the nr of preallocated irqs */
482 initcnt = arch_probe_nr_irqs(); 498 initcnt = arch_probe_nr_irqs();
483 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); 499 printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n",
500 NR_IRQS, nr_irqs, initcnt);
484 501
485 if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) 502 if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
486 nr_irqs = IRQ_BITMAP_BITS; 503 nr_irqs = IRQ_BITMAP_BITS;
@@ -516,14 +533,14 @@ int __init early_irq_init(void)
516 533
517 init_irq_default_affinity(); 534 init_irq_default_affinity();
518 535
519 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); 536 printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS);
520 537
521 desc = irq_desc; 538 desc = irq_desc;
522 count = ARRAY_SIZE(irq_desc); 539 count = ARRAY_SIZE(irq_desc);
523 540
524 for (i = 0; i < count; i++) { 541 for (i = 0; i < count; i++) {
525 desc[i].kstat_irqs = alloc_percpu(unsigned int); 542 desc[i].kstat_irqs = alloc_percpu(unsigned int);
526 alloc_masks(&desc[i], GFP_KERNEL, node); 543 alloc_masks(&desc[i], node);
527 raw_spin_lock_init(&desc[i].lock); 544 raw_spin_lock_init(&desc[i].lock);
528 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 545 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
529 desc_set_defaults(i, &desc[i], node, NULL, NULL); 546 desc_set_defaults(i, &desc[i], node, NULL, NULL);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 31805f237396..f1f251479aa6 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,5 +1,6 @@
1#define pr_fmt(fmt) "irq: " fmt 1#define pr_fmt(fmt) "irq: " fmt
2 2
3#include <linux/acpi.h>
3#include <linux/debugfs.h> 4#include <linux/debugfs.h>
4#include <linux/hardirq.h> 5#include <linux/hardirq.h>
5#include <linux/interrupt.h> 6#include <linux/interrupt.h>
@@ -26,39 +27,69 @@ static struct irq_domain *irq_default_domain;
26static void irq_domain_check_hierarchy(struct irq_domain *domain); 27static void irq_domain_check_hierarchy(struct irq_domain *domain);
27 28
28struct irqchip_fwid { 29struct irqchip_fwid {
29 struct fwnode_handle fwnode; 30 struct fwnode_handle fwnode;
30 char *name; 31 unsigned int type;
32 char *name;
31 void *data; 33 void *data;
32}; 34};
33 35
36#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
37static void debugfs_add_domain_dir(struct irq_domain *d);
38static void debugfs_remove_domain_dir(struct irq_domain *d);
39#else
40static inline void debugfs_add_domain_dir(struct irq_domain *d) { }
41static inline void debugfs_remove_domain_dir(struct irq_domain *d) { }
42#endif
43
34/** 44/**
35 * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for 45 * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for
36 * identifying an irq domain 46 * identifying an irq domain
37 * @data: optional user-provided data 47 * @type: Type of irqchip_fwnode. See linux/irqdomain.h
48 * @name: Optional user provided domain name
49 * @id: Optional user provided id if name != NULL
50 * @data: Optional user-provided data
38 * 51 *
39 * Allocate a struct device_node, and return a poiner to the embedded 52 * Allocate a struct irqchip_fwid, and return a poiner to the embedded
40 * fwnode_handle (or NULL on failure). 53 * fwnode_handle (or NULL on failure).
54 *
55 * Note: The types IRQCHIP_FWNODE_NAMED and IRQCHIP_FWNODE_NAMED_ID are
56 * solely to transport name information to irqdomain creation code. The
57 * node is not stored. For other types the pointer is kept in the irq
58 * domain struct.
41 */ 59 */
42struct fwnode_handle *irq_domain_alloc_fwnode(void *data) 60struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
61 const char *name, void *data)
43{ 62{
44 struct irqchip_fwid *fwid; 63 struct irqchip_fwid *fwid;
45 char *name; 64 char *n;
46 65
47 fwid = kzalloc(sizeof(*fwid), GFP_KERNEL); 66 fwid = kzalloc(sizeof(*fwid), GFP_KERNEL);
48 name = kasprintf(GFP_KERNEL, "irqchip@%p", data);
49 67
50 if (!fwid || !name) { 68 switch (type) {
69 case IRQCHIP_FWNODE_NAMED:
70 n = kasprintf(GFP_KERNEL, "%s", name);
71 break;
72 case IRQCHIP_FWNODE_NAMED_ID:
73 n = kasprintf(GFP_KERNEL, "%s-%d", name, id);
74 break;
75 default:
76 n = kasprintf(GFP_KERNEL, "irqchip@%p", data);
77 break;
78 }
79
80 if (!fwid || !n) {
51 kfree(fwid); 81 kfree(fwid);
52 kfree(name); 82 kfree(n);
53 return NULL; 83 return NULL;
54 } 84 }
55 85
56 fwid->name = name; 86 fwid->type = type;
87 fwid->name = n;
57 fwid->data = data; 88 fwid->data = data;
58 fwid->fwnode.type = FWNODE_IRQCHIP; 89 fwid->fwnode.type = FWNODE_IRQCHIP;
59 return &fwid->fwnode; 90 return &fwid->fwnode;
60} 91}
61EXPORT_SYMBOL_GPL(irq_domain_alloc_fwnode); 92EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode);
62 93
63/** 94/**
64 * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle 95 * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle
@@ -97,26 +128,97 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
97 void *host_data) 128 void *host_data)
98{ 129{
99 struct device_node *of_node = to_of_node(fwnode); 130 struct device_node *of_node = to_of_node(fwnode);
131 struct irqchip_fwid *fwid;
100 struct irq_domain *domain; 132 struct irq_domain *domain;
101 133
134 static atomic_t unknown_domains;
135
102 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), 136 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
103 GFP_KERNEL, of_node_to_nid(of_node)); 137 GFP_KERNEL, of_node_to_nid(of_node));
104 if (WARN_ON(!domain)) 138 if (WARN_ON(!domain))
105 return NULL; 139 return NULL;
106 140
141 if (fwnode && is_fwnode_irqchip(fwnode)) {
142 fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
143
144 switch (fwid->type) {
145 case IRQCHIP_FWNODE_NAMED:
146 case IRQCHIP_FWNODE_NAMED_ID:
147 domain->name = kstrdup(fwid->name, GFP_KERNEL);
148 if (!domain->name) {
149 kfree(domain);
150 return NULL;
151 }
152 domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
153 break;
154 default:
155 domain->fwnode = fwnode;
156 domain->name = fwid->name;
157 break;
158 }
159#ifdef CONFIG_ACPI
160 } else if (is_acpi_device_node(fwnode)) {
161 struct acpi_buffer buf = {
162 .length = ACPI_ALLOCATE_BUFFER,
163 };
164 acpi_handle handle;
165
166 handle = acpi_device_handle(to_acpi_device_node(fwnode));
167 if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) {
168 domain->name = buf.pointer;
169 domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
170 }
171
172 domain->fwnode = fwnode;
173#endif
174 } else if (of_node) {
175 char *name;
176
177 /*
178 * DT paths contain '/', which debugfs is legitimately
179 * unhappy about. Replace them with ':', which does
180 * the trick and is not as offensive as '\'...
181 */
182 name = kstrdup(of_node_full_name(of_node), GFP_KERNEL);
183 if (!name) {
184 kfree(domain);
185 return NULL;
186 }
187
188 strreplace(name, '/', ':');
189
190 domain->name = name;
191 domain->fwnode = fwnode;
192 domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
193 }
194
195 if (!domain->name) {
196 if (fwnode) {
197 pr_err("Invalid fwnode type (%d) for irqdomain\n",
198 fwnode->type);
199 }
200 domain->name = kasprintf(GFP_KERNEL, "unknown-%d",
201 atomic_inc_return(&unknown_domains));
202 if (!domain->name) {
203 kfree(domain);
204 return NULL;
205 }
206 domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
207 }
208
107 of_node_get(of_node); 209 of_node_get(of_node);
108 210
109 /* Fill structure */ 211 /* Fill structure */
110 INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); 212 INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
111 domain->ops = ops; 213 domain->ops = ops;
112 domain->host_data = host_data; 214 domain->host_data = host_data;
113 domain->fwnode = fwnode;
114 domain->hwirq_max = hwirq_max; 215 domain->hwirq_max = hwirq_max;
115 domain->revmap_size = size; 216 domain->revmap_size = size;
116 domain->revmap_direct_max_irq = direct_max; 217 domain->revmap_direct_max_irq = direct_max;
117 irq_domain_check_hierarchy(domain); 218 irq_domain_check_hierarchy(domain);
118 219
119 mutex_lock(&irq_domain_mutex); 220 mutex_lock(&irq_domain_mutex);
221 debugfs_add_domain_dir(domain);
120 list_add(&domain->link, &irq_domain_list); 222 list_add(&domain->link, &irq_domain_list);
121 mutex_unlock(&irq_domain_mutex); 223 mutex_unlock(&irq_domain_mutex);
122 224
@@ -136,6 +238,7 @@ EXPORT_SYMBOL_GPL(__irq_domain_add);
136void irq_domain_remove(struct irq_domain *domain) 238void irq_domain_remove(struct irq_domain *domain)
137{ 239{
138 mutex_lock(&irq_domain_mutex); 240 mutex_lock(&irq_domain_mutex);
241 debugfs_remove_domain_dir(domain);
139 242
140 WARN_ON(!radix_tree_empty(&domain->revmap_tree)); 243 WARN_ON(!radix_tree_empty(&domain->revmap_tree));
141 244
@@ -152,10 +255,43 @@ void irq_domain_remove(struct irq_domain *domain)
152 pr_debug("Removed domain %s\n", domain->name); 255 pr_debug("Removed domain %s\n", domain->name);
153 256
154 of_node_put(irq_domain_get_of_node(domain)); 257 of_node_put(irq_domain_get_of_node(domain));
258 if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
259 kfree(domain->name);
155 kfree(domain); 260 kfree(domain);
156} 261}
157EXPORT_SYMBOL_GPL(irq_domain_remove); 262EXPORT_SYMBOL_GPL(irq_domain_remove);
158 263
264void irq_domain_update_bus_token(struct irq_domain *domain,
265 enum irq_domain_bus_token bus_token)
266{
267 char *name;
268
269 if (domain->bus_token == bus_token)
270 return;
271
272 mutex_lock(&irq_domain_mutex);
273
274 domain->bus_token = bus_token;
275
276 name = kasprintf(GFP_KERNEL, "%s-%d", domain->name, bus_token);
277 if (!name) {
278 mutex_unlock(&irq_domain_mutex);
279 return;
280 }
281
282 debugfs_remove_domain_dir(domain);
283
284 if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
285 kfree(domain->name);
286 else
287 domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
288
289 domain->name = name;
290 debugfs_add_domain_dir(domain);
291
292 mutex_unlock(&irq_domain_mutex);
293}
294
159/** 295/**
160 * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs 296 * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
161 * @of_node: pointer to interrupt controller's device tree node. 297 * @of_node: pointer to interrupt controller's device tree node.
@@ -344,6 +480,7 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
344 480
345 irq_data->domain = NULL; 481 irq_data->domain = NULL;
346 irq_data->hwirq = 0; 482 irq_data->hwirq = 0;
483 domain->mapcount--;
347 484
348 /* Clear reverse map for this hwirq */ 485 /* Clear reverse map for this hwirq */
349 if (hwirq < domain->revmap_size) { 486 if (hwirq < domain->revmap_size) {
@@ -395,6 +532,7 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
395 domain->name = irq_data->chip->name; 532 domain->name = irq_data->chip->name;
396 } 533 }
397 534
535 domain->mapcount++;
398 if (hwirq < domain->revmap_size) { 536 if (hwirq < domain->revmap_size) {
399 domain->linear_revmap[hwirq] = virq; 537 domain->linear_revmap[hwirq] = virq;
400 } else { 538 } else {
@@ -746,13 +884,54 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
746EXPORT_SYMBOL_GPL(irq_find_mapping); 884EXPORT_SYMBOL_GPL(irq_find_mapping);
747 885
748#ifdef CONFIG_IRQ_DOMAIN_DEBUG 886#ifdef CONFIG_IRQ_DOMAIN_DEBUG
887static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc)
888{
889 struct irq_domain *domain;
890 struct irq_data *data;
891
892 domain = desc->irq_data.domain;
893 data = &desc->irq_data;
894
895 while (domain) {
896 unsigned int irq = data->irq;
897 unsigned long hwirq = data->hwirq;
898 struct irq_chip *chip;
899 bool direct;
900
901 if (data == &desc->irq_data)
902 seq_printf(m, "%5d ", irq);
903 else
904 seq_printf(m, "%5d+ ", irq);
905 seq_printf(m, "0x%05lx ", hwirq);
906
907 chip = irq_data_get_irq_chip(data);
908 seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
909
910 seq_printf(m, data ? "0x%p " : " %p ",
911 irq_data_get_irq_chip_data(data));
912
913 seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
914 direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq);
915 seq_printf(m, "%6s%-8s ",
916 (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
917 direct ? "(DIRECT)" : "");
918 seq_printf(m, "%s\n", domain->name);
919#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
920 domain = domain->parent;
921 data = data->parent_data;
922#else
923 domain = NULL;
924#endif
925 }
926}
927
749static int virq_debug_show(struct seq_file *m, void *private) 928static int virq_debug_show(struct seq_file *m, void *private)
750{ 929{
751 unsigned long flags; 930 unsigned long flags;
752 struct irq_desc *desc; 931 struct irq_desc *desc;
753 struct irq_domain *domain; 932 struct irq_domain *domain;
754 struct radix_tree_iter iter; 933 struct radix_tree_iter iter;
755 void *data, **slot; 934 void **slot;
756 int i; 935 int i;
757 936
758 seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", 937 seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
@@ -760,15 +939,26 @@ static int virq_debug_show(struct seq_file *m, void *private)
760 mutex_lock(&irq_domain_mutex); 939 mutex_lock(&irq_domain_mutex);
761 list_for_each_entry(domain, &irq_domain_list, link) { 940 list_for_each_entry(domain, &irq_domain_list, link) {
762 struct device_node *of_node; 941 struct device_node *of_node;
942 const char *name;
943
763 int count = 0; 944 int count = 0;
945
764 of_node = irq_domain_get_of_node(domain); 946 of_node = irq_domain_get_of_node(domain);
947 if (of_node)
948 name = of_node_full_name(of_node);
949 else if (is_fwnode_irqchip(domain->fwnode))
950 name = container_of(domain->fwnode, struct irqchip_fwid,
951 fwnode)->name;
952 else
953 name = "";
954
765 radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) 955 radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
766 count++; 956 count++;
767 seq_printf(m, "%c%-16s %6u %10u %10u %s\n", 957 seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
768 domain == irq_default_domain ? '*' : ' ', domain->name, 958 domain == irq_default_domain ? '*' : ' ', domain->name,
769 domain->revmap_size + count, domain->revmap_size, 959 domain->revmap_size + count, domain->revmap_size,
770 domain->revmap_direct_max_irq, 960 domain->revmap_direct_max_irq,
771 of_node ? of_node_full_name(of_node) : ""); 961 name);
772 } 962 }
773 mutex_unlock(&irq_domain_mutex); 963 mutex_unlock(&irq_domain_mutex);
774 964
@@ -782,30 +972,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
782 continue; 972 continue;
783 973
784 raw_spin_lock_irqsave(&desc->lock, flags); 974 raw_spin_lock_irqsave(&desc->lock, flags);
785 domain = desc->irq_data.domain; 975 virq_debug_show_one(m, desc);
786
787 if (domain) {
788 struct irq_chip *chip;
789 int hwirq = desc->irq_data.hwirq;
790 bool direct;
791
792 seq_printf(m, "%5d ", i);
793 seq_printf(m, "0x%05x ", hwirq);
794
795 chip = irq_desc_get_chip(desc);
796 seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
797
798 data = irq_desc_get_chip_data(desc);
799 seq_printf(m, data ? "0x%p " : " %p ", data);
800
801 seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
802 direct = (i == hwirq) && (i < domain->revmap_direct_max_irq);
803 seq_printf(m, "%6s%-8s ",
804 (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
805 direct ? "(DIRECT)" : "");
806 seq_printf(m, "%s\n", desc->irq_data.domain->name);
807 }
808
809 raw_spin_unlock_irqrestore(&desc->lock, flags); 976 raw_spin_unlock_irqrestore(&desc->lock, flags);
810 } 977 }
811 978
@@ -973,6 +1140,7 @@ static void irq_domain_insert_irq(int virq)
973 struct irq_domain *domain = data->domain; 1140 struct irq_domain *domain = data->domain;
974 irq_hw_number_t hwirq = data->hwirq; 1141 irq_hw_number_t hwirq = data->hwirq;
975 1142
1143 domain->mapcount++;
976 if (hwirq < domain->revmap_size) { 1144 if (hwirq < domain->revmap_size) {
977 domain->linear_revmap[hwirq] = virq; 1145 domain->linear_revmap[hwirq] = virq;
978 } else { 1146 } else {
@@ -1002,6 +1170,7 @@ static void irq_domain_remove_irq(int virq)
1002 struct irq_domain *domain = data->domain; 1170 struct irq_domain *domain = data->domain;
1003 irq_hw_number_t hwirq = data->hwirq; 1171 irq_hw_number_t hwirq = data->hwirq;
1004 1172
1173 domain->mapcount--;
1005 if (hwirq < domain->revmap_size) { 1174 if (hwirq < domain->revmap_size) {
1006 domain->linear_revmap[hwirq] = 0; 1175 domain->linear_revmap[hwirq] = 0;
1007 } else { 1176 } else {
@@ -1189,43 +1358,18 @@ void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq,
1189 irq_domain_free_irqs_common(domain, virq, nr_irqs); 1358 irq_domain_free_irqs_common(domain, virq, nr_irqs);
1190} 1359}
1191 1360
1192static bool irq_domain_is_auto_recursive(struct irq_domain *domain) 1361static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain,
1193{
1194 return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE;
1195}
1196
1197static void irq_domain_free_irqs_recursive(struct irq_domain *domain,
1198 unsigned int irq_base, 1362 unsigned int irq_base,
1199 unsigned int nr_irqs) 1363 unsigned int nr_irqs)
1200{ 1364{
1201 domain->ops->free(domain, irq_base, nr_irqs); 1365 domain->ops->free(domain, irq_base, nr_irqs);
1202 if (irq_domain_is_auto_recursive(domain)) {
1203 BUG_ON(!domain->parent);
1204 irq_domain_free_irqs_recursive(domain->parent, irq_base,
1205 nr_irqs);
1206 }
1207} 1366}
1208 1367
1209int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, 1368int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
1210 unsigned int irq_base, 1369 unsigned int irq_base,
1211 unsigned int nr_irqs, void *arg) 1370 unsigned int nr_irqs, void *arg)
1212{ 1371{
1213 int ret = 0; 1372 return domain->ops->alloc(domain, irq_base, nr_irqs, arg);
1214 struct irq_domain *parent = domain->parent;
1215 bool recursive = irq_domain_is_auto_recursive(domain);
1216
1217 BUG_ON(recursive && !parent);
1218 if (recursive)
1219 ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
1220 nr_irqs, arg);
1221 if (ret < 0)
1222 return ret;
1223
1224 ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
1225 if (ret < 0 && recursive)
1226 irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
1227
1228 return ret;
1229} 1373}
1230 1374
1231/** 1375/**
@@ -1286,7 +1430,7 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
1286 } 1430 }
1287 1431
1288 mutex_lock(&irq_domain_mutex); 1432 mutex_lock(&irq_domain_mutex);
1289 ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg); 1433 ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg);
1290 if (ret < 0) { 1434 if (ret < 0) {
1291 mutex_unlock(&irq_domain_mutex); 1435 mutex_unlock(&irq_domain_mutex);
1292 goto out_free_irq_data; 1436 goto out_free_irq_data;
@@ -1321,7 +1465,7 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
1321 mutex_lock(&irq_domain_mutex); 1465 mutex_lock(&irq_domain_mutex);
1322 for (i = 0; i < nr_irqs; i++) 1466 for (i = 0; i < nr_irqs; i++)
1323 irq_domain_remove_irq(virq + i); 1467 irq_domain_remove_irq(virq + i);
1324 irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs); 1468 irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs);
1325 mutex_unlock(&irq_domain_mutex); 1469 mutex_unlock(&irq_domain_mutex);
1326 1470
1327 irq_domain_free_irq_data(virq, nr_irqs); 1471 irq_domain_free_irq_data(virq, nr_irqs);
@@ -1341,15 +1485,11 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
1341 unsigned int irq_base, unsigned int nr_irqs, 1485 unsigned int irq_base, unsigned int nr_irqs,
1342 void *arg) 1486 void *arg)
1343{ 1487{
1344 /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */ 1488 if (!domain->parent)
1345 if (irq_domain_is_auto_recursive(domain)) 1489 return -ENOSYS;
1346 return 0;
1347 1490
1348 domain = domain->parent; 1491 return irq_domain_alloc_irqs_hierarchy(domain->parent, irq_base,
1349 if (domain) 1492 nr_irqs, arg);
1350 return irq_domain_alloc_irqs_recursive(domain, irq_base,
1351 nr_irqs, arg);
1352 return -ENOSYS;
1353} 1493}
1354EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); 1494EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent);
1355 1495
@@ -1364,10 +1504,10 @@ EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent);
1364void irq_domain_free_irqs_parent(struct irq_domain *domain, 1504void irq_domain_free_irqs_parent(struct irq_domain *domain,
1365 unsigned int irq_base, unsigned int nr_irqs) 1505 unsigned int irq_base, unsigned int nr_irqs)
1366{ 1506{
1367 /* irq_domain_free_irqs_recursive() will call parent's free */ 1507 if (!domain->parent)
1368 if (!irq_domain_is_auto_recursive(domain) && domain->parent) 1508 return;
1369 irq_domain_free_irqs_recursive(domain->parent, irq_base, 1509
1370 nr_irqs); 1510 irq_domain_free_irqs_hierarchy(domain->parent, irq_base, nr_irqs);
1371} 1511}
1372EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); 1512EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
1373 1513
@@ -1487,3 +1627,77 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
1487{ 1627{
1488} 1628}
1489#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ 1629#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
1630
1631#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
1632static struct dentry *domain_dir;
1633
1634static void
1635irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
1636{
1637 seq_printf(m, "%*sname: %s\n", ind, "", d->name);
1638 seq_printf(m, "%*ssize: %u\n", ind + 1, "",
1639 d->revmap_size + d->revmap_direct_max_irq);
1640 seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);
1641 seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags);
1642#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
1643 if (!d->parent)
1644 return;
1645 seq_printf(m, "%*sparent: %s\n", ind + 1, "", d->parent->name);
1646 irq_domain_debug_show_one(m, d->parent, ind + 4);
1647#endif
1648}
1649
1650static int irq_domain_debug_show(struct seq_file *m, void *p)
1651{
1652 struct irq_domain *d = m->private;
1653
1654 /* Default domain? Might be NULL */
1655 if (!d) {
1656 if (!irq_default_domain)
1657 return 0;
1658 d = irq_default_domain;
1659 }
1660 irq_domain_debug_show_one(m, d, 0);
1661 return 0;
1662}
1663
1664static int irq_domain_debug_open(struct inode *inode, struct file *file)
1665{
1666 return single_open(file, irq_domain_debug_show, inode->i_private);
1667}
1668
1669static const struct file_operations dfs_domain_ops = {
1670 .open = irq_domain_debug_open,
1671 .read = seq_read,
1672 .llseek = seq_lseek,
1673 .release = single_release,
1674};
1675
1676static void debugfs_add_domain_dir(struct irq_domain *d)
1677{
1678 if (!d->name || !domain_dir || d->debugfs_file)
1679 return;
1680 d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d,
1681 &dfs_domain_ops);
1682}
1683
1684static void debugfs_remove_domain_dir(struct irq_domain *d)
1685{
1686 debugfs_remove(d->debugfs_file);
1687}
1688
1689void __init irq_domain_debugfs_init(struct dentry *root)
1690{
1691 struct irq_domain *d;
1692
1693 domain_dir = debugfs_create_dir("domains", root);
1694 if (!domain_dir)
1695 return;
1696
1697 debugfs_create_file("default", 0444, domain_dir, NULL, &dfs_domain_ops);
1698 mutex_lock(&irq_domain_mutex);
1699 list_for_each_entry(d, &irq_domain_list, link)
1700 debugfs_add_domain_dir(d);
1701 mutex_unlock(&irq_domain_mutex);
1702}
1703#endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 070be980c37a..1d1a5b945ab4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -168,34 +168,6 @@ void irq_set_thread_affinity(struct irq_desc *desc)
168 set_bit(IRQTF_AFFINITY, &action->thread_flags); 168 set_bit(IRQTF_AFFINITY, &action->thread_flags);
169} 169}
170 170
171#ifdef CONFIG_GENERIC_PENDING_IRQ
172static inline bool irq_can_move_pcntxt(struct irq_data *data)
173{
174 return irqd_can_move_in_process_context(data);
175}
176static inline bool irq_move_pending(struct irq_data *data)
177{
178 return irqd_is_setaffinity_pending(data);
179}
180static inline void
181irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
182{
183 cpumask_copy(desc->pending_mask, mask);
184}
185static inline void
186irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
187{
188 cpumask_copy(mask, desc->pending_mask);
189}
190#else
191static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; }
192static inline bool irq_move_pending(struct irq_data *data) { return false; }
193static inline void
194irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
195static inline void
196irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
197#endif
198
199int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, 171int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
200 bool force) 172 bool force)
201{ 173{
@@ -345,15 +317,18 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
345/* 317/*
346 * Generic version of the affinity autoselector. 318 * Generic version of the affinity autoselector.
347 */ 319 */
348static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) 320int irq_setup_affinity(struct irq_desc *desc)
349{ 321{
350 struct cpumask *set = irq_default_affinity; 322 struct cpumask *set = irq_default_affinity;
351 int node = irq_desc_get_node(desc); 323 int ret, node = irq_desc_get_node(desc);
324 static DEFINE_RAW_SPINLOCK(mask_lock);
325 static struct cpumask mask;
352 326
353 /* Excludes PER_CPU and NO_BALANCE interrupts */ 327 /* Excludes PER_CPU and NO_BALANCE interrupts */
354 if (!__irq_can_set_affinity(desc)) 328 if (!__irq_can_set_affinity(desc))
355 return 0; 329 return 0;
356 330
331 raw_spin_lock(&mask_lock);
357 /* 332 /*
358 * Preserve the managed affinity setting and a userspace affinity 333 * Preserve the managed affinity setting and a userspace affinity
359 * setup, but make sure that one of the targets is online. 334 * setup, but make sure that one of the targets is online.
@@ -367,46 +342,40 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
367 irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); 342 irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
368 } 343 }
369 344
370 cpumask_and(mask, cpu_online_mask, set); 345 cpumask_and(&mask, cpu_online_mask, set);
371 if (node != NUMA_NO_NODE) { 346 if (node != NUMA_NO_NODE) {
372 const struct cpumask *nodemask = cpumask_of_node(node); 347 const struct cpumask *nodemask = cpumask_of_node(node);
373 348
374 /* make sure at least one of the cpus in nodemask is online */ 349 /* make sure at least one of the cpus in nodemask is online */
375 if (cpumask_intersects(mask, nodemask)) 350 if (cpumask_intersects(&mask, nodemask))
376 cpumask_and(mask, mask, nodemask); 351 cpumask_and(&mask, &mask, nodemask);
377 } 352 }
378 irq_do_set_affinity(&desc->irq_data, mask, false); 353 ret = irq_do_set_affinity(&desc->irq_data, &mask, false);
379 return 0; 354 raw_spin_unlock(&mask_lock);
355 return ret;
380} 356}
381#else 357#else
382/* Wrapper for ALPHA specific affinity selector magic */ 358/* Wrapper for ALPHA specific affinity selector magic */
383static inline int setup_affinity(struct irq_desc *d, struct cpumask *mask) 359int irq_setup_affinity(struct irq_desc *desc)
384{ 360{
385 return irq_select_affinity(irq_desc_get_irq(d)); 361 return irq_select_affinity(irq_desc_get_irq(desc));
386} 362}
387#endif 363#endif
388 364
389/* 365/*
390 * Called when affinity is set via /proc/irq 366 * Called when a bogus affinity is set via /proc/irq
391 */ 367 */
392int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) 368int irq_select_affinity_usr(unsigned int irq)
393{ 369{
394 struct irq_desc *desc = irq_to_desc(irq); 370 struct irq_desc *desc = irq_to_desc(irq);
395 unsigned long flags; 371 unsigned long flags;
396 int ret; 372 int ret;
397 373
398 raw_spin_lock_irqsave(&desc->lock, flags); 374 raw_spin_lock_irqsave(&desc->lock, flags);
399 ret = setup_affinity(desc, mask); 375 ret = irq_setup_affinity(desc);
400 raw_spin_unlock_irqrestore(&desc->lock, flags); 376 raw_spin_unlock_irqrestore(&desc->lock, flags);
401 return ret; 377 return ret;
402} 378}
403
404#else
405static inline int
406setup_affinity(struct irq_desc *desc, struct cpumask *mask)
407{
408 return 0;
409}
410#endif 379#endif
411 380
412/** 381/**
@@ -533,9 +502,15 @@ void __enable_irq(struct irq_desc *desc)
533 goto err_out; 502 goto err_out;
534 /* Prevent probing on this irq: */ 503 /* Prevent probing on this irq: */
535 irq_settings_set_noprobe(desc); 504 irq_settings_set_noprobe(desc);
536 irq_enable(desc); 505 /*
537 check_irq_resend(desc); 506 * Call irq_startup() not irq_enable() here because the
538 /* fall-through */ 507 * interrupt might be marked NOAUTOEN. So irq_startup()
508 * needs to be invoked when it gets enabled the first
509 * time. If it was already started up, then irq_startup()
510 * will invoke irq_enable() under the hood.
511 */
512 irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
513 break;
539 } 514 }
540 default: 515 default:
541 desc->depth--; 516 desc->depth--;
@@ -1115,6 +1090,16 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
1115/* 1090/*
1116 * Internal function to register an irqaction - typically used to 1091 * Internal function to register an irqaction - typically used to
1117 * allocate special interrupts that are part of the architecture. 1092 * allocate special interrupts that are part of the architecture.
1093 *
1094 * Locking rules:
1095 *
1096 * desc->request_mutex Provides serialization against a concurrent free_irq()
1097 * chip_bus_lock Provides serialization for slow bus operations
1098 * desc->lock Provides serialization against hard interrupts
1099 *
1100 * chip_bus_lock and desc->lock are sufficient for all other management and
1101 * interrupt related functions. desc->request_mutex solely serializes
1102 * request/free_irq().
1118 */ 1103 */
1119static int 1104static int
1120__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) 1105__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
@@ -1122,7 +1107,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1122 struct irqaction *old, **old_ptr; 1107 struct irqaction *old, **old_ptr;
1123 unsigned long flags, thread_mask = 0; 1108 unsigned long flags, thread_mask = 0;
1124 int ret, nested, shared = 0; 1109 int ret, nested, shared = 0;
1125 cpumask_var_t mask;
1126 1110
1127 if (!desc) 1111 if (!desc)
1128 return -EINVAL; 1112 return -EINVAL;
@@ -1181,11 +1165,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1181 } 1165 }
1182 } 1166 }
1183 1167
1184 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
1185 ret = -ENOMEM;
1186 goto out_thread;
1187 }
1188
1189 /* 1168 /*
1190 * Drivers are often written to work w/o knowledge about the 1169 * Drivers are often written to work w/o knowledge about the
1191 * underlying irq chip implementation, so a request for a 1170 * underlying irq chip implementation, so a request for a
@@ -1199,7 +1178,34 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1199 new->flags &= ~IRQF_ONESHOT; 1178 new->flags &= ~IRQF_ONESHOT;
1200 1179
1201 /* 1180 /*
1181 * Protects against a concurrent __free_irq() call which might wait
1182 * for synchronize_irq() to complete without holding the optional
1183 * chip bus lock and desc->lock.
1184 */
1185 mutex_lock(&desc->request_mutex);
1186
1187 /*
1188 * Acquire bus lock as the irq_request_resources() callback below
1189 * might rely on the serialization or the magic power management
1190 * functions which are abusing the irq_bus_lock() callback,
1191 */
1192 chip_bus_lock(desc);
1193
1194 /* First installed action requests resources. */
1195 if (!desc->action) {
1196 ret = irq_request_resources(desc);
1197 if (ret) {
1198 pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
1199 new->name, irq, desc->irq_data.chip->name);
1200 goto out_bus_unlock;
1201 }
1202 }
1203
1204 /*
1202 * The following block of code has to be executed atomically 1205 * The following block of code has to be executed atomically
1206 * protected against a concurrent interrupt and any of the other
1207 * management calls which are not serialized via
1208 * desc->request_mutex or the optional bus lock.
1203 */ 1209 */
1204 raw_spin_lock_irqsave(&desc->lock, flags); 1210 raw_spin_lock_irqsave(&desc->lock, flags);
1205 old_ptr = &desc->action; 1211 old_ptr = &desc->action;
@@ -1250,7 +1256,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1250 */ 1256 */
1251 if (thread_mask == ~0UL) { 1257 if (thread_mask == ~0UL) {
1252 ret = -EBUSY; 1258 ret = -EBUSY;
1253 goto out_mask; 1259 goto out_unlock;
1254 } 1260 }
1255 /* 1261 /*
1256 * The thread_mask for the action is or'ed to 1262 * The thread_mask for the action is or'ed to
@@ -1294,17 +1300,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1294 pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", 1300 pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
1295 irq); 1301 irq);
1296 ret = -EINVAL; 1302 ret = -EINVAL;
1297 goto out_mask; 1303 goto out_unlock;
1298 } 1304 }
1299 1305
1300 if (!shared) { 1306 if (!shared) {
1301 ret = irq_request_resources(desc);
1302 if (ret) {
1303 pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
1304 new->name, irq, desc->irq_data.chip->name);
1305 goto out_mask;
1306 }
1307
1308 init_waitqueue_head(&desc->wait_for_threads); 1307 init_waitqueue_head(&desc->wait_for_threads);
1309 1308
1310 /* Setup the type (level, edge polarity) if configured: */ 1309 /* Setup the type (level, edge polarity) if configured: */
@@ -1313,7 +1312,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1313 new->flags & IRQF_TRIGGER_MASK); 1312 new->flags & IRQF_TRIGGER_MASK);
1314 1313
1315 if (ret) 1314 if (ret)
1316 goto out_mask; 1315 goto out_unlock;
1317 } 1316 }
1318 1317
1319 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ 1318 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
@@ -1328,20 +1327,25 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1328 if (new->flags & IRQF_ONESHOT) 1327 if (new->flags & IRQF_ONESHOT)
1329 desc->istate |= IRQS_ONESHOT; 1328 desc->istate |= IRQS_ONESHOT;
1330 1329
1331 if (irq_settings_can_autoenable(desc))
1332 irq_startup(desc, true);
1333 else
1334 /* Undo nested disables: */
1335 desc->depth = 1;
1336
1337 /* Exclude IRQ from balancing if requested */ 1330 /* Exclude IRQ from balancing if requested */
1338 if (new->flags & IRQF_NOBALANCING) { 1331 if (new->flags & IRQF_NOBALANCING) {
1339 irq_settings_set_no_balancing(desc); 1332 irq_settings_set_no_balancing(desc);
1340 irqd_set(&desc->irq_data, IRQD_NO_BALANCING); 1333 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
1341 } 1334 }
1342 1335
1343 /* Set default affinity mask once everything is setup */ 1336 if (irq_settings_can_autoenable(desc)) {
1344 setup_affinity(desc, mask); 1337 irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
1338 } else {
1339 /*
1340 * Shared interrupts do not go well with disabling
1341 * auto enable. The sharing interrupt might request
1342 * it while it's still disabled and then wait for
1343 * interrupts forever.
1344 */
1345 WARN_ON_ONCE(new->flags & IRQF_SHARED);
1346 /* Undo nested disables: */
1347 desc->depth = 1;
1348 }
1345 1349
1346 } else if (new->flags & IRQF_TRIGGER_MASK) { 1350 } else if (new->flags & IRQF_TRIGGER_MASK) {
1347 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; 1351 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
@@ -1371,6 +1375,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1371 } 1375 }
1372 1376
1373 raw_spin_unlock_irqrestore(&desc->lock, flags); 1377 raw_spin_unlock_irqrestore(&desc->lock, flags);
1378 chip_bus_sync_unlock(desc);
1379 mutex_unlock(&desc->request_mutex);
1380
1381 irq_setup_timings(desc, new);
1374 1382
1375 /* 1383 /*
1376 * Strictly no need to wake it up, but hung_task complains 1384 * Strictly no need to wake it up, but hung_task complains
@@ -1382,10 +1390,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1382 wake_up_process(new->secondary->thread); 1390 wake_up_process(new->secondary->thread);
1383 1391
1384 register_irq_proc(irq, desc); 1392 register_irq_proc(irq, desc);
1393 irq_add_debugfs_entry(irq, desc);
1385 new->dir = NULL; 1394 new->dir = NULL;
1386 register_handler_proc(irq, new); 1395 register_handler_proc(irq, new);
1387 free_cpumask_var(mask);
1388
1389 return 0; 1396 return 0;
1390 1397
1391mismatch: 1398mismatch:
@@ -1398,9 +1405,14 @@ mismatch:
1398 } 1405 }
1399 ret = -EBUSY; 1406 ret = -EBUSY;
1400 1407
1401out_mask: 1408out_unlock:
1402 raw_spin_unlock_irqrestore(&desc->lock, flags); 1409 raw_spin_unlock_irqrestore(&desc->lock, flags);
1403 free_cpumask_var(mask); 1410
1411 if (!desc->action)
1412 irq_release_resources(desc);
1413out_bus_unlock:
1414 chip_bus_sync_unlock(desc);
1415 mutex_unlock(&desc->request_mutex);
1404 1416
1405out_thread: 1417out_thread:
1406 if (new->thread) { 1418 if (new->thread) {
@@ -1441,9 +1453,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1441 if (retval < 0) 1453 if (retval < 0)
1442 return retval; 1454 return retval;
1443 1455
1444 chip_bus_lock(desc);
1445 retval = __setup_irq(irq, desc, act); 1456 retval = __setup_irq(irq, desc, act);
1446 chip_bus_sync_unlock(desc);
1447 1457
1448 if (retval) 1458 if (retval)
1449 irq_chip_pm_put(&desc->irq_data); 1459 irq_chip_pm_put(&desc->irq_data);
@@ -1467,6 +1477,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1467 if (!desc) 1477 if (!desc)
1468 return NULL; 1478 return NULL;
1469 1479
1480 mutex_lock(&desc->request_mutex);
1470 chip_bus_lock(desc); 1481 chip_bus_lock(desc);
1471 raw_spin_lock_irqsave(&desc->lock, flags); 1482 raw_spin_lock_irqsave(&desc->lock, flags);
1472 1483
@@ -1482,6 +1493,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1482 WARN(1, "Trying to free already-free IRQ %d\n", irq); 1493 WARN(1, "Trying to free already-free IRQ %d\n", irq);
1483 raw_spin_unlock_irqrestore(&desc->lock, flags); 1494 raw_spin_unlock_irqrestore(&desc->lock, flags);
1484 chip_bus_sync_unlock(desc); 1495 chip_bus_sync_unlock(desc);
1496 mutex_unlock(&desc->request_mutex);
1485 return NULL; 1497 return NULL;
1486 } 1498 }
1487 1499
@@ -1499,7 +1511,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1499 if (!desc->action) { 1511 if (!desc->action) {
1500 irq_settings_clr_disable_unlazy(desc); 1512 irq_settings_clr_disable_unlazy(desc);
1501 irq_shutdown(desc); 1513 irq_shutdown(desc);
1502 irq_release_resources(desc);
1503 } 1514 }
1504 1515
1505#ifdef CONFIG_SMP 1516#ifdef CONFIG_SMP
@@ -1509,6 +1520,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1509#endif 1520#endif
1510 1521
1511 raw_spin_unlock_irqrestore(&desc->lock, flags); 1522 raw_spin_unlock_irqrestore(&desc->lock, flags);
1523 /*
1524 * Drop bus_lock here so the changes which were done in the chip
1525 * callbacks above are synced out to the irq chips which hang
1526 * behind a slow bus (I2C, SPI) before calling synchronize_irq().
1527 *
1528 * Aside of that the bus_lock can also be taken from the threaded
1529 * handler in irq_finalize_oneshot() which results in a deadlock
1530 * because synchronize_irq() would wait forever for the thread to
1531 * complete, which is blocked on the bus lock.
1532 *
1533 * The still held desc->request_mutex() protects against a
1534 * concurrent request_irq() of this irq so the release of resources
1535 * and timing data is properly serialized.
1536 */
1512 chip_bus_sync_unlock(desc); 1537 chip_bus_sync_unlock(desc);
1513 1538
1514 unregister_handler_proc(irq, action); 1539 unregister_handler_proc(irq, action);
@@ -1541,6 +1566,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1541 } 1566 }
1542 } 1567 }
1543 1568
1569 /* Last action releases resources */
1570 if (!desc->action) {
1571 /*
1572 * Reaquire bus lock as irq_release_resources() might
1573 * require it to deallocate resources over the slow bus.
1574 */
1575 chip_bus_lock(desc);
1576 irq_release_resources(desc);
1577 chip_bus_sync_unlock(desc);
1578 irq_remove_timings(desc);
1579 }
1580
1581 mutex_unlock(&desc->request_mutex);
1582
1544 irq_chip_pm_put(&desc->irq_data); 1583 irq_chip_pm_put(&desc->irq_data);
1545 module_put(desc->owner); 1584 module_put(desc->owner);
1546 kfree(action->secondary); 1585 kfree(action->secondary);
@@ -1697,9 +1736,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1697 return retval; 1736 return retval;
1698 } 1737 }
1699 1738
1700 chip_bus_lock(desc);
1701 retval = __setup_irq(irq, desc, action); 1739 retval = __setup_irq(irq, desc, action);
1702 chip_bus_sync_unlock(desc);
1703 1740
1704 if (retval) { 1741 if (retval) {
1705 irq_chip_pm_put(&desc->irq_data); 1742 irq_chip_pm_put(&desc->irq_data);
@@ -1947,9 +1984,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1947 if (retval < 0) 1984 if (retval < 0)
1948 return retval; 1985 return retval;
1949 1986
1950 chip_bus_lock(desc);
1951 retval = __setup_irq(irq, desc, act); 1987 retval = __setup_irq(irq, desc, act);
1952 chip_bus_sync_unlock(desc);
1953 1988
1954 if (retval) 1989 if (retval)
1955 irq_chip_pm_put(&desc->irq_data); 1990 irq_chip_pm_put(&desc->irq_data);
@@ -1958,9 +1993,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1958} 1993}
1959 1994
1960/** 1995/**
1961 * request_percpu_irq - allocate a percpu interrupt line 1996 * __request_percpu_irq - allocate a percpu interrupt line
1962 * @irq: Interrupt line to allocate 1997 * @irq: Interrupt line to allocate
1963 * @handler: Function to be called when the IRQ occurs. 1998 * @handler: Function to be called when the IRQ occurs.
1999 * @flags: Interrupt type flags (IRQF_TIMER only)
1964 * @devname: An ascii name for the claiming device 2000 * @devname: An ascii name for the claiming device
1965 * @dev_id: A percpu cookie passed back to the handler function 2001 * @dev_id: A percpu cookie passed back to the handler function
1966 * 2002 *
@@ -1973,8 +2009,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1973 * the handler gets called with the interrupted CPU's instance of 2009 * the handler gets called with the interrupted CPU's instance of
1974 * that variable. 2010 * that variable.
1975 */ 2011 */
1976int request_percpu_irq(unsigned int irq, irq_handler_t handler, 2012int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
1977 const char *devname, void __percpu *dev_id) 2013 unsigned long flags, const char *devname,
2014 void __percpu *dev_id)
1978{ 2015{
1979 struct irqaction *action; 2016 struct irqaction *action;
1980 struct irq_desc *desc; 2017 struct irq_desc *desc;
@@ -1988,12 +2025,15 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1988 !irq_settings_is_per_cpu_devid(desc)) 2025 !irq_settings_is_per_cpu_devid(desc))
1989 return -EINVAL; 2026 return -EINVAL;
1990 2027
2028 if (flags && flags != IRQF_TIMER)
2029 return -EINVAL;
2030
1991 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); 2031 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
1992 if (!action) 2032 if (!action)
1993 return -ENOMEM; 2033 return -ENOMEM;
1994 2034
1995 action->handler = handler; 2035 action->handler = handler;
1996 action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND; 2036 action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND;
1997 action->name = devname; 2037 action->name = devname;
1998 action->percpu_dev_id = dev_id; 2038 action->percpu_dev_id = dev_id;
1999 2039
@@ -2003,9 +2043,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
2003 return retval; 2043 return retval;
2004 } 2044 }
2005 2045
2006 chip_bus_lock(desc);
2007 retval = __setup_irq(irq, desc, action); 2046 retval = __setup_irq(irq, desc, action);
2008 chip_bus_sync_unlock(desc);
2009 2047
2010 if (retval) { 2048 if (retval) {
2011 irq_chip_pm_put(&desc->irq_data); 2049 irq_chip_pm_put(&desc->irq_data);
@@ -2014,7 +2052,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
2014 2052
2015 return retval; 2053 return retval;
2016} 2054}
2017EXPORT_SYMBOL_GPL(request_percpu_irq); 2055EXPORT_SYMBOL_GPL(__request_percpu_irq);
2018 2056
2019/** 2057/**
2020 * irq_get_irqchip_state - returns the irqchip state of a interrupt. 2058 * irq_get_irqchip_state - returns the irqchip state of a interrupt.
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 37ddb7bda651..6ca054a3f91d 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,6 +4,36 @@
4 4
5#include "internals.h" 5#include "internals.h"
6 6
7/**
8 * irq_fixup_move_pending - Cleanup irq move pending from a dying CPU
9 * @desc: Interrupt descpriptor to clean up
10 * @force_clear: If set clear the move pending bit unconditionally.
11 * If not set, clear it only when the dying CPU is the
12 * last one in the pending mask.
13 *
14 * Returns true if the pending bit was set and the pending mask contains an
15 * online CPU other than the dying CPU.
16 */
17bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear)
18{
19 struct irq_data *data = irq_desc_get_irq_data(desc);
20
21 if (!irqd_is_setaffinity_pending(data))
22 return false;
23
24 /*
25 * The outgoing CPU might be the last online target in a pending
26 * interrupt move. If that's the case clear the pending move bit.
27 */
28 if (cpumask_any_and(desc->pending_mask, cpu_online_mask) >= nr_cpu_ids) {
29 irqd_clr_move_pending(data);
30 return false;
31 }
32 if (force_clear)
33 irqd_clr_move_pending(data);
34 return true;
35}
36
7void irq_move_masked_irq(struct irq_data *idata) 37void irq_move_masked_irq(struct irq_data *idata)
8{ 38{
9 struct irq_desc *desc = irq_data_to_desc(idata); 39 struct irq_desc *desc = irq_data_to_desc(idata);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index ddc2f5427f75..48eadf416c24 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -265,13 +265,20 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
265 struct msi_domain_info *info, 265 struct msi_domain_info *info,
266 struct irq_domain *parent) 266 struct irq_domain *parent)
267{ 267{
268 struct irq_domain *domain;
269
268 if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) 270 if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
269 msi_domain_update_dom_ops(info); 271 msi_domain_update_dom_ops(info);
270 if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) 272 if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
271 msi_domain_update_chip_ops(info); 273 msi_domain_update_chip_ops(info);
272 274
273 return irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, 275 domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
274 fwnode, &msi_domain_ops, info); 276 fwnode, &msi_domain_ops, info);
277
278 if (domain && !domain->name && info->chip)
279 domain->name = info->chip->name;
280
281 return domain;
275} 282}
276 283
277int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, 284int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
@@ -308,7 +315,7 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
308 315
309 ops->set_desc(arg, desc); 316 ops->set_desc(arg, desc);
310 /* Assumes the domain mutex is held! */ 317 /* Assumes the domain mutex is held! */
311 ret = irq_domain_alloc_irqs_recursive(domain, virq, 1, arg); 318 ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
312 if (ret) 319 if (ret)
313 break; 320 break;
314 321
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cea1de0161f1..6bd9b58429cc 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -149,6 +149,8 @@ static void resume_irq(struct irq_desc *desc)
149 149
150 /* Pretend that it got disabled ! */ 150 /* Pretend that it got disabled ! */
151 desc->depth++; 151 desc->depth++;
152 irq_state_set_disabled(desc);
153 irq_state_set_masked(desc);
152resume: 154resume:
153 desc->istate &= ~IRQS_SUSPENDED; 155 desc->istate &= ~IRQS_SUSPENDED;
154 __enable_irq(desc); 156 __enable_irq(desc);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index c53edad7b459..7f9642a1e267 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -37,19 +37,47 @@ static struct proc_dir_entry *root_irq_dir;
37 37
38#ifdef CONFIG_SMP 38#ifdef CONFIG_SMP
39 39
40static int show_irq_affinity(int type, struct seq_file *m, void *v) 40enum {
41 AFFINITY,
42 AFFINITY_LIST,
43 EFFECTIVE,
44 EFFECTIVE_LIST,
45};
46
47static int show_irq_affinity(int type, struct seq_file *m)
41{ 48{
42 struct irq_desc *desc = irq_to_desc((long)m->private); 49 struct irq_desc *desc = irq_to_desc((long)m->private);
43 const struct cpumask *mask = desc->irq_common_data.affinity; 50 const struct cpumask *mask;
44 51
52 switch (type) {
53 case AFFINITY:
54 case AFFINITY_LIST:
55 mask = desc->irq_common_data.affinity;
45#ifdef CONFIG_GENERIC_PENDING_IRQ 56#ifdef CONFIG_GENERIC_PENDING_IRQ
46 if (irqd_is_setaffinity_pending(&desc->irq_data)) 57 if (irqd_is_setaffinity_pending(&desc->irq_data))
47 mask = desc->pending_mask; 58 mask = desc->pending_mask;
48#endif 59#endif
49 if (type) 60 break;
61 case EFFECTIVE:
62 case EFFECTIVE_LIST:
63#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
64 mask = desc->irq_common_data.effective_affinity;
65 break;
66#else
67 return -EINVAL;
68#endif
69 };
70
71 switch (type) {
72 case AFFINITY_LIST:
73 case EFFECTIVE_LIST:
50 seq_printf(m, "%*pbl\n", cpumask_pr_args(mask)); 74 seq_printf(m, "%*pbl\n", cpumask_pr_args(mask));
51 else 75 break;
76 case AFFINITY:
77 case EFFECTIVE:
52 seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); 78 seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
79 break;
80 }
53 return 0; 81 return 0;
54} 82}
55 83
@@ -80,12 +108,12 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
80int no_irq_affinity; 108int no_irq_affinity;
81static int irq_affinity_proc_show(struct seq_file *m, void *v) 109static int irq_affinity_proc_show(struct seq_file *m, void *v)
82{ 110{
83 return show_irq_affinity(0, m, v); 111 return show_irq_affinity(AFFINITY, m);
84} 112}
85 113
86static int irq_affinity_list_proc_show(struct seq_file *m, void *v) 114static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
87{ 115{
88 return show_irq_affinity(1, m, v); 116 return show_irq_affinity(AFFINITY_LIST, m);
89} 117}
90 118
91 119
@@ -120,9 +148,11 @@ static ssize_t write_irq_affinity(int type, struct file *file,
120 * one online CPU still has to be targeted. 148 * one online CPU still has to be targeted.
121 */ 149 */
122 if (!cpumask_intersects(new_value, cpu_online_mask)) { 150 if (!cpumask_intersects(new_value, cpu_online_mask)) {
123 /* Special case for empty set - allow the architecture 151 /*
124 code to set default SMP affinity. */ 152 * Special case for empty set - allow the architecture code
125 err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count; 153 * to set default SMP affinity.
154 */
155 err = irq_select_affinity_usr(irq) ? -EINVAL : count;
126 } else { 156 } else {
127 irq_set_affinity(irq, new_value); 157 irq_set_affinity(irq, new_value);
128 err = count; 158 err = count;
@@ -183,6 +213,44 @@ static const struct file_operations irq_affinity_list_proc_fops = {
183 .write = irq_affinity_list_proc_write, 213 .write = irq_affinity_list_proc_write,
184}; 214};
185 215
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
/* Show /proc/irq/<irq>/effective_affinity (cpumask bitmap format). */
static int irq_effective_aff_proc_show(struct seq_file *m, void *v)
{
	return show_irq_affinity(EFFECTIVE, m);
}

/* Show /proc/irq/<irq>/effective_affinity_list (cpu list format). */
static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v)
{
	return show_irq_affinity(EFFECTIVE_LIST, m);
}

/* The irq number travels in the proc dir entry data (set by proc_create_data()). */
static int irq_effective_aff_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, irq_effective_aff_proc_show, PDE_DATA(inode));
}

static int irq_effective_aff_list_proc_open(struct inode *inode,
					    struct file *file)
{
	return single_open(file, irq_effective_aff_list_proc_show,
			   PDE_DATA(inode));
}

/* Read-only seq_file interface: no .write method, unlike smp_affinity. */
static const struct file_operations irq_effective_aff_proc_fops = {
	.open		= irq_effective_aff_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static const struct file_operations irq_effective_aff_list_proc_fops = {
	.open		= irq_effective_aff_list_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif
253
186static int default_affinity_show(struct seq_file *m, void *v) 254static int default_affinity_show(struct seq_file *m, void *v)
187{ 255{
188 seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity)); 256 seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity));
@@ -324,6 +392,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
324void register_irq_proc(unsigned int irq, struct irq_desc *desc) 392void register_irq_proc(unsigned int irq, struct irq_desc *desc)
325{ 393{
326 static DEFINE_MUTEX(register_lock); 394 static DEFINE_MUTEX(register_lock);
395 void __maybe_unused *irqp = (void *)(unsigned long) irq;
327 char name [MAX_NAMELEN]; 396 char name [MAX_NAMELEN];
328 397
329 if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) 398 if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip))
@@ -349,20 +418,25 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
349#ifdef CONFIG_SMP 418#ifdef CONFIG_SMP
350 /* create /proc/irq/<irq>/smp_affinity */ 419 /* create /proc/irq/<irq>/smp_affinity */
351 proc_create_data("smp_affinity", 0644, desc->dir, 420 proc_create_data("smp_affinity", 0644, desc->dir,
352 &irq_affinity_proc_fops, (void *)(long)irq); 421 &irq_affinity_proc_fops, irqp);
353 422
354 /* create /proc/irq/<irq>/affinity_hint */ 423 /* create /proc/irq/<irq>/affinity_hint */
355 proc_create_data("affinity_hint", 0444, desc->dir, 424 proc_create_data("affinity_hint", 0444, desc->dir,
356 &irq_affinity_hint_proc_fops, (void *)(long)irq); 425 &irq_affinity_hint_proc_fops, irqp);
357 426
358 /* create /proc/irq/<irq>/smp_affinity_list */ 427 /* create /proc/irq/<irq>/smp_affinity_list */
359 proc_create_data("smp_affinity_list", 0644, desc->dir, 428 proc_create_data("smp_affinity_list", 0644, desc->dir,
360 &irq_affinity_list_proc_fops, (void *)(long)irq); 429 &irq_affinity_list_proc_fops, irqp);
361 430
362 proc_create_data("node", 0444, desc->dir, 431 proc_create_data("node", 0444, desc->dir,
363 &irq_node_proc_fops, (void *)(long)irq); 432 &irq_node_proc_fops, irqp);
433# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
434 proc_create_data("effective_affinity", 0444, desc->dir,
435 &irq_effective_aff_proc_fops, irqp);
436 proc_create_data("effective_affinity_list", 0444, desc->dir,
437 &irq_effective_aff_list_proc_fops, irqp);
438# endif
364#endif 439#endif
365
366 proc_create_data("spurious", 0444, desc->dir, 440 proc_create_data("spurious", 0444, desc->dir,
367 &irq_spurious_proc_fops, (void *)(long)irq); 441 &irq_spurious_proc_fops, (void *)(long)irq);
368 442
@@ -381,6 +455,10 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
381 remove_proc_entry("affinity_hint", desc->dir); 455 remove_proc_entry("affinity_hint", desc->dir);
382 remove_proc_entry("smp_affinity_list", desc->dir); 456 remove_proc_entry("smp_affinity_list", desc->dir);
383 remove_proc_entry("node", desc->dir); 457 remove_proc_entry("node", desc->dir);
458# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
459 remove_proc_entry("effective_affinity", desc->dir);
460 remove_proc_entry("effective_affinity_list", desc->dir);
461# endif
384#endif 462#endif
385 remove_proc_entry("spurious", desc->dir); 463 remove_proc_entry("spurious", desc->dir);
386 464
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
new file mode 100644
index 000000000000..c8c1d073fbf1
--- /dev/null
+++ b/kernel/irq/timings.c
@@ -0,0 +1,369 @@
1/*
2 * linux/kernel/irq/timings.c
3 *
4 * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 */
11#include <linux/kernel.h>
12#include <linux/percpu.h>
13#include <linux/slab.h>
14#include <linux/static_key.h>
15#include <linux/interrupt.h>
16#include <linux/idr.h>
17#include <linux/irq.h>
18#include <linux/math64.h>
19
20#include <trace/events/irq.h>
21
22#include "internals.h"
23
24DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
25
26DEFINE_PER_CPU(struct irq_timings, irq_timings);
27
/*
 * Per-irq, per-cpu online statistics used to predict when the next
 * interrupt of this line will occur (see irqs_update()).
 */
struct irqt_stat {
	u64	next_evt;	/* predicted timestamp of the next event */
	u64	last_ts;	/* timestamp of the last interrupt seen */
	u64	variance;	/* online variance of the intervals */
	u32	avg;		/* online average of the intervals */
	u32	nr_samples;	/* number of intervals fed into the model */
	int	anomalies;	/* consecutive out-of-distribution samples */
	int	valid;		/* set when a prediction can be made */
};
37
38static DEFINE_IDR(irqt_stats);
39
/* Enable the static key gating interrupt timestamp recording. */
void irq_timings_enable(void)
{
	static_branch_enable(&irq_timing_enabled);
}
44
/* Disable the static key gating interrupt timestamp recording. */
void irq_timings_disable(void)
{
	static_branch_disable(&irq_timing_enabled);
}
49
/**
 * irqs_update - update the irq timing statistics with a new timestamp
 *
 * @irqs: an irqt_stat struct pointer
 * @ts: the new timestamp
 *
 * The statistics are computed online, in other words, the code is
 * designed to compute the statistics on a stream of values rather
 * than doing multiple passes on the values to compute the average,
 * then the variance. The integer division introduces a loss of
 * precision but with an acceptable error margin regarding the results
 * we would have with double floating point precision: we are dealing
 * with nanoseconds, so big numbers, consequently the mantissa is
 * negligible, especially when converting the time to microseconds
 * afterwards.
 *
 * The computation happens at idle time. When the CPU is not idle, the
 * interrupts' timestamps are stored in the circular buffer, when the
 * CPU goes idle and this routine is called, all the buffer's values
 * are injected in the statistical model continuing to extend the
 * statistics from the previous busy-idle cycle.
 *
 * The observations showed a device will trigger a burst of periodic
 * interrupts followed by one or two peaks of longer time, for
 * instance when a SD card device flushes its cache, then the periodic
 * intervals occur again. A one second inactivity period resets the
 * stats, which gives us the certainty the statistical values won't
 * exceed 1x10^9, thus the computation won't overflow.
 *
 * Basically, the purpose of the algorithm is to watch the periodic
 * interrupts and eliminate the peaks.
 *
 * An interrupt is considered periodically stable if the interval of
 * its occurrences follows the normal distribution, thus the values
 * comply with:
 *
 *      avg - 3 x stddev < value < avg + 3 x stddev
 *
 * Which can be simplified to:
 *
 *      -3 x stddev < value - avg < 3 x stddev
 *
 *      abs(value - avg) < 3 x stddev
 *
 * In order to save a costly square root computation, we use the
 * variance. For the record, stddev = sqrt(variance). The equation
 * above becomes:
 *
 *      abs(value - avg) < 3 x sqrt(variance)
 *
 * And finally we square it:
 *
 *      (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2
 *
 *      (value - avg) x (value - avg) < 9 x variance
 *
 * Statistically speaking, any value outside this interval is
 * considered an anomaly and is discarded. However, a normal
 * distribution appears when the number of samples is 30 (it is the
 * rule of thumb in statistics, cf. "30 samples" on the Internet).
 * When there are three consecutive anomalies, the statistics are
 * reset.
 */
static void irqs_update(struct irqt_stat *irqs, u64 ts)
{
	u64 old_ts = irqs->last_ts;
	u64 variance = 0;
	u64 interval;
	s64 diff;

	/*
	 * The timestamps are absolute time values, we need to compute
	 * the timing interval between two interrupts.
	 */
	irqs->last_ts = ts;

	/*
	 * The interval type is u64 in order to deal with the same
	 * type in our computation, which avoids subtle issues with
	 * overflow, sign and division.
	 */
	interval = ts - old_ts;

	/*
	 * The interrupt triggered more than one second apart, that
	 * ends the sequence as predictable for our purpose. In this
	 * case, assume we have the beginning of a sequence and the
	 * timestamp is the first value. As it is impossible to
	 * predict anything at this point, return.
	 *
	 * Note the first timestamp of the sequence will always fall
	 * in this test because the old_ts is zero. That is what we
	 * want as we need another timestamp to compute an interval.
	 */
	if (interval >= NSEC_PER_SEC) {
		memset(irqs, 0, sizeof(*irqs));
		irqs->last_ts = ts;
		return;
	}

	/*
	 * Pre-compute the delta with the average as the result is
	 * used several times in this function.
	 */
	diff = interval - irqs->avg;

	/*
	 * Increment the number of samples.
	 */
	irqs->nr_samples++;

	/*
	 * Online variance divided by the number of elements if there
	 * is more than one sample. Normally the formula is division
	 * by nr_samples - 1 but we assume the number of elements will
	 * be more than 32 and dividing by 32 instead of 31 is precise
	 * enough.
	 */
	if (likely(irqs->nr_samples > 1))
		variance = irqs->variance >> IRQ_TIMINGS_SHIFT;

	/*
	 * The rule of thumb in statistics for the normal distribution
	 * is having at least 30 samples in order for the model to
	 * apply. Values outside the interval are considered as an
	 * anomaly.
	 */
	if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
		/*
		 * After three consecutive anomalies, we reset the
		 * stats as it is no longer stable enough.
		 *
		 * NOTE(review): the counter is tested before being
		 * incremented, so the reset actually fires on the
		 * fourth consecutive anomaly - confirm this matches
		 * the "three anomalies" intent.
		 */
		if (irqs->anomalies++ >= 3) {
			memset(irqs, 0, sizeof(*irqs));
			irqs->last_ts = ts;
			return;
		}
	} else {
		/*
		 * The anomalies must be consecutive, so at this
		 * point, we reset the anomalies counter.
		 */
		irqs->anomalies = 0;
	}

	/*
	 * The interrupt is considered stable enough to try to predict
	 * the next event on it.
	 *
	 * NOTE(review): anomalous samples which did not trigger a
	 * reset still fall through here and update the average and
	 * variance below.
	 */
	irqs->valid = 1;

	/*
	 * Online average algorithm:
	 *
	 *  new_average = average + ((value - average) / count)
	 *
	 * The variance computation depends on the new average
	 * to be computed here first.
	 */
	irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT);

	/*
	 * Online variance algorithm:
	 *
	 *  new_variance = variance + (value - average) x (value - new_average)
	 *
	 * Warning: irqs->avg is updated with the line above, hence
	 * 'interval - irqs->avg' is no longer equal to 'diff'
	 */
	irqs->variance = irqs->variance + (diff * (interval - irqs->avg));

	/*
	 * Update the next event
	 */
	irqs->next_evt = ts + irqs->avg;
}
227
/**
 * irq_timings_next_event - Return when the next event is supposed to arrive
 *
 * During the last busy cycle, the number of interrupts is incremented
 * and stored in the irq_timings structure. This information is
 * necessary to:
 *
 * - know if the index in the table wrapped up:
 *
 *      If more than the array size interrupts happened during the
 *      last busy/idle cycle, the index wrapped up and we have to
 *      begin with the next element in the array which is the last one
 *      in the sequence, otherwise it is at index 0.
 *
 * - have an indication of the interrupts activity on this CPU
 *   (eg. irq/sec)
 *
 * The values are 'consumed' after inserting in the statistical model,
 * thus the count is reinitialized.
 *
 * The array of values **must** be browsed in the time direction, the
 * timestamp must increase between an element and the next one.
 *
 * Returns a nanosec time based estimation of the earliest interrupt,
 * U64_MAX otherwise.
 */
u64 irq_timings_next_event(u64 now)
{
	struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
	struct irqt_stat *irqs;
	struct irqt_stat __percpu *s;
	u64 ts, next_evt = U64_MAX;
	int i, irq = 0;

	/*
	 * This function must be called with the local irq disabled in
	 * order to prevent the timings circular buffer to be updated
	 * while we are reading it.
	 */
	WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Number of elements in the circular buffer: If it happens it
	 * was flushed before, then the number of elements could be
	 * smaller than IRQ_TIMINGS_SIZE, so the count is used,
	 * otherwise the array size is used as we wrapped. The index
	 * begins from zero when we did not wrap. That could be done
	 * in a nicer way with the proper circular array structure
	 * type but with the cost of extra computation in the
	 * interrupt handler hot path. We choose efficiency.
	 *
	 * Inject measured irq/timestamp to the statistical model
	 * while decrementing the counter because we consume the data
	 * from our circular buffer.
	 */
	for (i = irqts->count & IRQ_TIMINGS_MASK,
	     irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
	     irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {

		irq = irq_timing_decode(irqts->values[i], &ts);

		/* Interrupts without an allocated stat slot are simply dropped. */
		s = idr_find(&irqt_stats, irq);
		if (s) {
			irqs = this_cpu_ptr(s);
			irqs_update(irqs, ts);
		}
	}

	/*
	 * Look in the list of interrupts' statistics for the earliest
	 * next event.
	 */
	idr_for_each_entry(&irqt_stats, s, i) {

		irqs = this_cpu_ptr(s);

		if (!irqs->valid)
			continue;

		if (irqs->next_evt <= now) {
			irq = i;
			next_evt = now;

			/*
			 * This interrupt must not be used in the
			 * future until new events occur and update
			 * the statistics.
			 */
			irqs->valid = 0;
			break;
		}

		if (irqs->next_evt < next_evt) {
			irq = i;
			next_evt = irqs->next_evt;
		}
	}

	return next_evt;
}
328
329void irq_timings_free(int irq)
330{
331 struct irqt_stat __percpu *s;
332
333 s = idr_find(&irqt_stats, irq);
334 if (s) {
335 free_percpu(s);
336 idr_remove(&irqt_stats, irq);
337 }
338}
339
340int irq_timings_alloc(int irq)
341{
342 struct irqt_stat __percpu *s;
343 int id;
344
345 /*
346 * Some platforms can have the same private interrupt per cpu,
347 * so this function may be be called several times with the
348 * same interrupt number. Just bail out in case the per cpu
349 * stat structure is already allocated.
350 */
351 s = idr_find(&irqt_stats, irq);
352 if (s)
353 return 0;
354
355 s = alloc_percpu(*s);
356 if (!s)
357 return -ENOMEM;
358
359 idr_preload(GFP_KERNEL);
360 id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT);
361 idr_preload_end();
362
363 if (id < 0) {
364 free_percpu(s);
365 return id;
366 }
367
368 return 0;
369}
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 6c9cb208ac48..d11c506a6ac3 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -15,6 +15,7 @@
15#include <linux/static_key.h> 15#include <linux/static_key.h>
16#include <linux/jump_label_ratelimit.h> 16#include <linux/jump_label_ratelimit.h>
17#include <linux/bug.h> 17#include <linux/bug.h>
18#include <linux/cpu.h>
18 19
19#ifdef HAVE_JUMP_LABEL 20#ifdef HAVE_JUMP_LABEL
20 21
@@ -124,6 +125,7 @@ void static_key_slow_inc(struct static_key *key)
124 return; 125 return;
125 } 126 }
126 127
128 cpus_read_lock();
127 jump_label_lock(); 129 jump_label_lock();
128 if (atomic_read(&key->enabled) == 0) { 130 if (atomic_read(&key->enabled) == 0) {
129 atomic_set(&key->enabled, -1); 131 atomic_set(&key->enabled, -1);
@@ -133,12 +135,14 @@ void static_key_slow_inc(struct static_key *key)
133 atomic_inc(&key->enabled); 135 atomic_inc(&key->enabled);
134 } 136 }
135 jump_label_unlock(); 137 jump_label_unlock();
138 cpus_read_unlock();
136} 139}
137EXPORT_SYMBOL_GPL(static_key_slow_inc); 140EXPORT_SYMBOL_GPL(static_key_slow_inc);
138 141
139static void __static_key_slow_dec(struct static_key *key, 142static void __static_key_slow_dec(struct static_key *key,
140 unsigned long rate_limit, struct delayed_work *work) 143 unsigned long rate_limit, struct delayed_work *work)
141{ 144{
145 cpus_read_lock();
142 /* 146 /*
143 * The negative count check is valid even when a negative 147 * The negative count check is valid even when a negative
144 * key->enabled is in use by static_key_slow_inc(); a 148 * key->enabled is in use by static_key_slow_inc(); a
@@ -149,6 +153,7 @@ static void __static_key_slow_dec(struct static_key *key,
149 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { 153 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
150 WARN(atomic_read(&key->enabled) < 0, 154 WARN(atomic_read(&key->enabled) < 0,
151 "jump label: negative count!\n"); 155 "jump label: negative count!\n");
156 cpus_read_unlock();
152 return; 157 return;
153 } 158 }
154 159
@@ -159,6 +164,7 @@ static void __static_key_slow_dec(struct static_key *key,
159 jump_label_update(key); 164 jump_label_update(key);
160 } 165 }
161 jump_label_unlock(); 166 jump_label_unlock();
167 cpus_read_unlock();
162} 168}
163 169
164static void jump_label_update_timeout(struct work_struct *work) 170static void jump_label_update_timeout(struct work_struct *work)
@@ -334,6 +340,7 @@ void __init jump_label_init(void)
334 if (static_key_initialized) 340 if (static_key_initialized)
335 return; 341 return;
336 342
343 cpus_read_lock();
337 jump_label_lock(); 344 jump_label_lock();
338 jump_label_sort_entries(iter_start, iter_stop); 345 jump_label_sort_entries(iter_start, iter_stop);
339 346
@@ -353,6 +360,7 @@ void __init jump_label_init(void)
353 } 360 }
354 static_key_initialized = true; 361 static_key_initialized = true;
355 jump_label_unlock(); 362 jump_label_unlock();
363 cpus_read_unlock();
356} 364}
357 365
358#ifdef CONFIG_MODULES 366#ifdef CONFIG_MODULES
@@ -590,28 +598,28 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
590 struct module *mod = data; 598 struct module *mod = data;
591 int ret = 0; 599 int ret = 0;
592 600
601 cpus_read_lock();
602 jump_label_lock();
603
593 switch (val) { 604 switch (val) {
594 case MODULE_STATE_COMING: 605 case MODULE_STATE_COMING:
595 jump_label_lock();
596 ret = jump_label_add_module(mod); 606 ret = jump_label_add_module(mod);
597 if (ret) { 607 if (ret) {
598 WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n"); 608 WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n");
599 jump_label_del_module(mod); 609 jump_label_del_module(mod);
600 } 610 }
601 jump_label_unlock();
602 break; 611 break;
603 case MODULE_STATE_GOING: 612 case MODULE_STATE_GOING:
604 jump_label_lock();
605 jump_label_del_module(mod); 613 jump_label_del_module(mod);
606 jump_label_unlock();
607 break; 614 break;
608 case MODULE_STATE_LIVE: 615 case MODULE_STATE_LIVE:
609 jump_label_lock();
610 jump_label_invalidate_module_init(mod); 616 jump_label_invalidate_module_init(mod);
611 jump_label_unlock();
612 break; 617 break;
613 } 618 }
614 619
620 jump_label_unlock();
621 cpus_read_unlock();
622
615 return notifier_from_errno(ret); 623 return notifier_from_errno(ret);
616} 624}
617 625
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6a3b249a2ae1..127e7cfafa55 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -28,12 +28,6 @@
28 28
29#include <asm/sections.h> 29#include <asm/sections.h>
30 30
31#ifdef CONFIG_KALLSYMS_ALL
32#define all_var 1
33#else
34#define all_var 0
35#endif
36
37/* 31/*
38 * These will be re-linked against their real values 32 * These will be re-linked against their real values
39 * during the second link stage. 33 * during the second link stage.
@@ -82,7 +76,7 @@ static inline int is_kernel(unsigned long addr)
82 76
83static int is_ksym_addr(unsigned long addr) 77static int is_ksym_addr(unsigned long addr)
84{ 78{
85 if (all_var) 79 if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
86 return is_kernel(addr); 80 return is_kernel(addr);
87 81
88 return is_kernel_text(addr) || is_kernel_inittext(addr); 82 return is_kernel_text(addr) || is_kernel_inittext(addr);
@@ -280,7 +274,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
280 if (!symbol_end) { 274 if (!symbol_end) {
281 if (is_kernel_inittext(addr)) 275 if (is_kernel_inittext(addr))
282 symbol_end = (unsigned long)_einittext; 276 symbol_end = (unsigned long)_einittext;
283 else if (all_var) 277 else if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
284 symbol_end = (unsigned long)_end; 278 symbol_end = (unsigned long)_end;
285 else 279 else
286 symbol_end = (unsigned long)_etext; 280 symbol_end = (unsigned long)_etext;
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 3a47fa998fe0..ea34ed8bb952 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -11,6 +11,10 @@
11#include <linux/bug.h> 11#include <linux/bug.h>
12#include <linux/err.h> 12#include <linux/err.h>
13#include <linux/kcmp.h> 13#include <linux/kcmp.h>
14#include <linux/capability.h>
15#include <linux/list.h>
16#include <linux/eventpoll.h>
17#include <linux/file.h>
14 18
15#include <asm/unistd.h> 19#include <asm/unistd.h>
16 20
@@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2)
94 return err; 98 return err;
95} 99}
96 100
101#ifdef CONFIG_EPOLL
102static int kcmp_epoll_target(struct task_struct *task1,
103 struct task_struct *task2,
104 unsigned long idx1,
105 struct kcmp_epoll_slot __user *uslot)
106{
107 struct file *filp, *filp_epoll, *filp_tgt;
108 struct kcmp_epoll_slot slot;
109 struct files_struct *files;
110
111 if (copy_from_user(&slot, uslot, sizeof(slot)))
112 return -EFAULT;
113
114 filp = get_file_raw_ptr(task1, idx1);
115 if (!filp)
116 return -EBADF;
117
118 files = get_files_struct(task2);
119 if (!files)
120 return -EBADF;
121
122 spin_lock(&files->file_lock);
123 filp_epoll = fcheck_files(files, slot.efd);
124 if (filp_epoll)
125 get_file(filp_epoll);
126 else
127 filp_tgt = ERR_PTR(-EBADF);
128 spin_unlock(&files->file_lock);
129 put_files_struct(files);
130
131 if (filp_epoll) {
132 filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
133 fput(filp_epoll);
134 } else
135
136 if (IS_ERR(filp_tgt))
137 return PTR_ERR(filp_tgt);
138
139 return kcmp_ptr(filp, filp_tgt, KCMP_FILE);
140}
141#else
/* Stub when epoll is not configured: the comparison cannot be performed. */
static int kcmp_epoll_target(struct task_struct *task1,
			     struct task_struct *task2,
			     unsigned long idx1,
			     struct kcmp_epoll_slot __user *uslot)
{
	return -EOPNOTSUPP;
}
149#endif
150
97SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, 151SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
98 unsigned long, idx1, unsigned long, idx2) 152 unsigned long, idx1, unsigned long, idx2)
99{ 153{
@@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
165 ret = -EOPNOTSUPP; 219 ret = -EOPNOTSUPP;
166#endif 220#endif
167 break; 221 break;
222 case KCMP_EPOLL_TFD:
223 ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2);
224 break;
168 default: 225 default:
169 ret = -EINVAL; 226 ret = -EINVAL;
170 break; 227 break;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 980936a90ee6..e62ec4dc6620 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -144,6 +144,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
144 if (ret) 144 if (ret)
145 goto out; 145 goto out;
146 146
147 /*
148 * Some architecture(like S390) may touch the crash memory before
149 * machine_kexec_prepare(), we must copy vmcoreinfo data after it.
150 */
151 ret = kimage_crash_copy_vmcoreinfo(image);
152 if (ret)
153 goto out;
154
147 for (i = 0; i < nr_segments; i++) { 155 for (i = 0; i < nr_segments; i++) {
148 ret = kimage_load_segment(image, &image->segment[i]); 156 ret = kimage_load_segment(image, &image->segment[i]);
149 if (ret) 157 if (ret)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index ae1a3ba24df5..1ae7c41c33c1 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -38,6 +38,7 @@
38#include <linux/syscore_ops.h> 38#include <linux/syscore_ops.h>
39#include <linux/compiler.h> 39#include <linux/compiler.h>
40#include <linux/hugetlb.h> 40#include <linux/hugetlb.h>
41#include <linux/frame.h>
41 42
42#include <asm/page.h> 43#include <asm/page.h>
43#include <asm/sections.h> 44#include <asm/sections.h>
@@ -481,6 +482,40 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
481 return pages; 482 return pages;
482} 483}
483 484
/*
 * For a crash (kdump) image, allocate a writable copy of the
 * vmcoreinfo data inside the crash memory and register it as the
 * safe copy used at crash time. A no-op for non-crash images.
 * Returns 0 on success, -ENOMEM on allocation/mapping failure.
 */
int kimage_crash_copy_vmcoreinfo(struct kimage *image)
{
	struct page *vmcoreinfo_page;
	void *safecopy;

	if (image->type != KEXEC_TYPE_CRASH)
		return 0;

	/*
	 * For kdump, allocate one vmcoreinfo safe copy from the
	 * crash memory. As we have arch_kexec_protect_crashkres()
	 * after the kexec syscall, we naturally protect it from
	 * write (even read) access under the kernel direct mapping.
	 * But on the other hand, we still need to operate on it when
	 * a crash happens to generate the vmcoreinfo note, hence we
	 * rely on vmap for this purpose.
	 */
	vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
	if (!vmcoreinfo_page) {
		pr_warn("Could not allocate vmcoreinfo buffer\n");
		return -ENOMEM;
	}
	/* Map the control page so it stays writable despite crashkres protection. */
	safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
	if (!safecopy) {
		pr_warn("Could not vmap vmcoreinfo buffer\n");
		return -ENOMEM;
	}

	image->vmcoreinfo_data_copy = safecopy;
	crash_update_vmcoreinfo_safecopy(safecopy);

	return 0;
}
518
484static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) 519static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
485{ 520{
486 if (*image->entry != 0) 521 if (*image->entry != 0)
@@ -568,6 +603,11 @@ void kimage_free(struct kimage *image)
568 if (!image) 603 if (!image)
569 return; 604 return;
570 605
606 if (image->vmcoreinfo_data_copy) {
607 crash_update_vmcoreinfo_safecopy(NULL);
608 vunmap(image->vmcoreinfo_data_copy);
609 }
610
571 kimage_free_extra_pages(image); 611 kimage_free_extra_pages(image);
572 for_each_kimage_entry(image, ptr, entry) { 612 for_each_kimage_entry(image, ptr, entry) {
573 if (entry & IND_INDIRECTION) { 613 if (entry & IND_INDIRECTION) {
@@ -874,7 +914,7 @@ int kexec_load_disabled;
874 * only when panic_cpu holds the current CPU number; this is the only CPU 914 * only when panic_cpu holds the current CPU number; this is the only CPU
875 * which processes crash_kexec routines. 915 * which processes crash_kexec routines.
876 */ 916 */
877void __crash_kexec(struct pt_regs *regs) 917void __noclone __crash_kexec(struct pt_regs *regs)
878{ 918{
879 /* Take the kexec_mutex here to prevent sys_kexec_load 919 /* Take the kexec_mutex here to prevent sys_kexec_load
880 * running on one cpu from replacing the crash kernel 920 * running on one cpu from replacing the crash kernel
@@ -896,6 +936,7 @@ void __crash_kexec(struct pt_regs *regs)
896 mutex_unlock(&kexec_mutex); 936 mutex_unlock(&kexec_mutex);
897 } 937 }
898} 938}
939STACK_FRAME_NON_STANDARD(__crash_kexec);
899 940
900void crash_kexec(struct pt_regs *regs) 941void crash_kexec(struct pt_regs *regs)
901{ 942{
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b118735fea9d..9f48f4412297 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -26,13 +26,6 @@
26#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
27#include "kexec_internal.h" 27#include "kexec_internal.h"
28 28
29/*
30 * Declare these symbols weak so that if architecture provides a purgatory,
31 * these will be overridden.
32 */
33char __weak kexec_purgatory[0];
34size_t __weak kexec_purgatory_size = 0;
35
36static int kexec_calculate_store_digests(struct kimage *image); 29static int kexec_calculate_store_digests(struct kimage *image);
37 30
38/* Architectures can provide this probe function */ 31/* Architectures can provide this probe function */
@@ -162,16 +155,10 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
162 } 155 }
163 156
164 if (cmdline_len) { 157 if (cmdline_len) {
165 image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); 158 image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
166 if (!image->cmdline_buf) { 159 if (IS_ERR(image->cmdline_buf)) {
167 ret = -ENOMEM; 160 ret = PTR_ERR(image->cmdline_buf);
168 goto out; 161 image->cmdline_buf = NULL;
169 }
170
171 ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
172 cmdline_len);
173 if (ret) {
174 ret = -EFAULT;
175 goto out; 162 goto out;
176 } 163 }
177 164
@@ -304,6 +291,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
304 if (ret) 291 if (ret)
305 goto out; 292 goto out;
306 293
294 /*
295 * Some architecture(like S390) may touch the crash memory before
296 * machine_kexec_prepare(), we must copy vmcoreinfo data after it.
297 */
298 ret = kimage_crash_copy_vmcoreinfo(image);
299 if (ret)
300 goto out;
301
307 ret = kexec_calculate_store_digests(image); 302 ret = kexec_calculate_store_digests(image);
308 if (ret) 303 if (ret)
309 goto out; 304 goto out;
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 799a8a452187..50dfcb039a41 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -17,6 +17,8 @@ extern struct mutex kexec_mutex;
17#ifdef CONFIG_KEXEC_FILE 17#ifdef CONFIG_KEXEC_FILE
18#include <linux/purgatory.h> 18#include <linux/purgatory.h>
19void kimage_file_post_load_cleanup(struct kimage *image); 19void kimage_file_post_load_cleanup(struct kimage *image);
20extern char kexec_purgatory[];
21extern size_t kexec_purgatory_size;
20#else /* CONFIG_KEXEC_FILE */ 22#else /* CONFIG_KEXEC_FILE */
21static inline void kimage_file_post_load_cleanup(struct kimage *image) { } 23static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
22#endif /* CONFIG_KEXEC_FILE */ 24#endif /* CONFIG_KEXEC_FILE */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 563f97e2be36..6d016c5d97c8 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,8 +45,6 @@
45 45
46#include <trace/events/module.h> 46#include <trace/events/module.h>
47 47
48extern int max_threads;
49
50#define CAP_BSET (void *)1 48#define CAP_BSET (void *)1
51#define CAP_PI (void *)2 49#define CAP_PI (void *)2
52 50
@@ -56,6 +54,21 @@ static DEFINE_SPINLOCK(umh_sysctl_lock);
56static DECLARE_RWSEM(umhelper_sem); 54static DECLARE_RWSEM(umhelper_sem);
57 55
58#ifdef CONFIG_MODULES 56#ifdef CONFIG_MODULES
57/*
58 * Assuming:
59 *
60 * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
61 * (u64) THREAD_SIZE * 8UL);
62 *
63 * If you need less than 50 threads would mean we're dealing with systems
64 * smaller than 3200 pages. This assuems you are capable of having ~13M memory,
65 * and this would only be an be an upper limit, after which the OOM killer
66 * would take effect. Systems like these are very unlikely if modules are
67 * enabled.
68 */
69#define MAX_KMOD_CONCURRENT 50
70static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
71static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
59 72
60/* 73/*
61 modprobe_path is set via /proc/sys. 74 modprobe_path is set via /proc/sys.
@@ -127,11 +140,7 @@ int __request_module(bool wait, const char *fmt, ...)
127{ 140{
128 va_list args; 141 va_list args;
129 char module_name[MODULE_NAME_LEN]; 142 char module_name[MODULE_NAME_LEN];
130 unsigned int max_modprobes;
131 int ret; 143 int ret;
132 static atomic_t kmod_concurrent = ATOMIC_INIT(0);
133#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
134 static int kmod_loop_msg;
135 144
136 /* 145 /*
137 * We don't allow synchronous module loading from async. Module 146 * We don't allow synchronous module loading from async. Module
@@ -154,40 +163,25 @@ int __request_module(bool wait, const char *fmt, ...)
154 if (ret) 163 if (ret)
155 return ret; 164 return ret;
156 165
157 /* If modprobe needs a service that is in a module, we get a recursive 166 if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) {
158 * loop. Limit the number of running kmod threads to max_threads/2 or 167 pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
159 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method 168 atomic_read(&kmod_concurrent_max),
160 * would be to run the parents of this process, counting how many times 169 MAX_KMOD_CONCURRENT, module_name);
161 * kmod was invoked. That would mean accessing the internals of the 170 wait_event_interruptible(kmod_wq,
162 * process tables to get the command line, proc_pid_cmdline is static 171 atomic_dec_if_positive(&kmod_concurrent_max) >= 0);
163 * and it is not worth changing the proc code just to handle this case.
164 * KAO.
165 *
166 * "trace the ppid" is simple, but will fail if someone's
167 * parent exits. I think this is as good as it gets. --RR
168 */
169 max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
170 atomic_inc(&kmod_concurrent);
171 if (atomic_read(&kmod_concurrent) > max_modprobes) {
172 /* We may be blaming an innocent here, but unlikely */
173 if (kmod_loop_msg < 5) {
174 printk(KERN_ERR
175 "request_module: runaway loop modprobe %s\n",
176 module_name);
177 kmod_loop_msg++;
178 }
179 atomic_dec(&kmod_concurrent);
180 return -ENOMEM;
181 } 172 }
182 173
183 trace_module_request(module_name, wait, _RET_IP_); 174 trace_module_request(module_name, wait, _RET_IP_);
184 175
185 ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 176 ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
186 177
187 atomic_dec(&kmod_concurrent); 178 atomic_inc(&kmod_concurrent_max);
179 wake_up(&kmod_wq);
180
188 return ret; 181 return ret;
189} 182}
190EXPORT_SYMBOL(__request_module); 183EXPORT_SYMBOL(__request_module);
184
191#endif /* CONFIG_MODULES */ 185#endif /* CONFIG_MODULES */
192 186
193static void call_usermodehelper_freeinfo(struct subprocess_info *info) 187static void call_usermodehelper_freeinfo(struct subprocess_info *info)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2d2d3a568e4e..a1606a4224e1 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -122,7 +122,7 @@ static void *alloc_insn_page(void)
122 return module_alloc(PAGE_SIZE); 122 return module_alloc(PAGE_SIZE);
123} 123}
124 124
125static void free_insn_page(void *page) 125void __weak free_insn_page(void *page)
126{ 126{
127 module_memfree(page); 127 module_memfree(page);
128} 128}
@@ -483,11 +483,6 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
483 */ 483 */
484static void do_optimize_kprobes(void) 484static void do_optimize_kprobes(void)
485{ 485{
486 /* Optimization never be done when disarmed */
487 if (kprobes_all_disarmed || !kprobes_allow_optimization ||
488 list_empty(&optimizing_list))
489 return;
490
491 /* 486 /*
492 * The optimization/unoptimization refers online_cpus via 487 * The optimization/unoptimization refers online_cpus via
493 * stop_machine() and cpu-hotplug modifies online_cpus. 488 * stop_machine() and cpu-hotplug modifies online_cpus.
@@ -495,14 +490,19 @@ static void do_optimize_kprobes(void)
495 * This combination can cause a deadlock (cpu-hotplug try to lock 490 * This combination can cause a deadlock (cpu-hotplug try to lock
496 * text_mutex but stop_machine can not be done because online_cpus 491 * text_mutex but stop_machine can not be done because online_cpus
497 * has been changed) 492 * has been changed)
498 * To avoid this deadlock, we need to call get_online_cpus() 493 * To avoid this deadlock, caller must have locked cpu hotplug
499 * for preventing cpu-hotplug outside of text_mutex locking. 494 * for preventing cpu-hotplug outside of text_mutex locking.
500 */ 495 */
501 get_online_cpus(); 496 lockdep_assert_cpus_held();
497
498 /* Optimization never be done when disarmed */
499 if (kprobes_all_disarmed || !kprobes_allow_optimization ||
500 list_empty(&optimizing_list))
501 return;
502
502 mutex_lock(&text_mutex); 503 mutex_lock(&text_mutex);
503 arch_optimize_kprobes(&optimizing_list); 504 arch_optimize_kprobes(&optimizing_list);
504 mutex_unlock(&text_mutex); 505 mutex_unlock(&text_mutex);
505 put_online_cpus();
506} 506}
507 507
508/* 508/*
@@ -513,12 +513,13 @@ static void do_unoptimize_kprobes(void)
513{ 513{
514 struct optimized_kprobe *op, *tmp; 514 struct optimized_kprobe *op, *tmp;
515 515
516 /* See comment in do_optimize_kprobes() */
517 lockdep_assert_cpus_held();
518
516 /* Unoptimization must be done anytime */ 519 /* Unoptimization must be done anytime */
517 if (list_empty(&unoptimizing_list)) 520 if (list_empty(&unoptimizing_list))
518 return; 521 return;
519 522
520 /* Ditto to do_optimize_kprobes */
521 get_online_cpus();
522 mutex_lock(&text_mutex); 523 mutex_lock(&text_mutex);
523 arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); 524 arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
524 /* Loop free_list for disarming */ 525 /* Loop free_list for disarming */
@@ -537,7 +538,6 @@ static void do_unoptimize_kprobes(void)
537 list_del_init(&op->list); 538 list_del_init(&op->list);
538 } 539 }
539 mutex_unlock(&text_mutex); 540 mutex_unlock(&text_mutex);
540 put_online_cpus();
541} 541}
542 542
543/* Reclaim all kprobes on the free_list */ 543/* Reclaim all kprobes on the free_list */
@@ -562,6 +562,7 @@ static void kick_kprobe_optimizer(void)
562static void kprobe_optimizer(struct work_struct *work) 562static void kprobe_optimizer(struct work_struct *work)
563{ 563{
564 mutex_lock(&kprobe_mutex); 564 mutex_lock(&kprobe_mutex);
565 cpus_read_lock();
565 /* Lock modules while optimizing kprobes */ 566 /* Lock modules while optimizing kprobes */
566 mutex_lock(&module_mutex); 567 mutex_lock(&module_mutex);
567 568
@@ -587,6 +588,7 @@ static void kprobe_optimizer(struct work_struct *work)
587 do_free_cleaned_kprobes(); 588 do_free_cleaned_kprobes();
588 589
589 mutex_unlock(&module_mutex); 590 mutex_unlock(&module_mutex);
591 cpus_read_unlock();
590 mutex_unlock(&kprobe_mutex); 592 mutex_unlock(&kprobe_mutex);
591 593
592 /* Step 5: Kick optimizer again if needed */ 594 /* Step 5: Kick optimizer again if needed */
@@ -650,9 +652,8 @@ static void optimize_kprobe(struct kprobe *p)
650/* Short cut to direct unoptimizing */ 652/* Short cut to direct unoptimizing */
651static void force_unoptimize_kprobe(struct optimized_kprobe *op) 653static void force_unoptimize_kprobe(struct optimized_kprobe *op)
652{ 654{
653 get_online_cpus(); 655 lockdep_assert_cpus_held();
654 arch_unoptimize_kprobe(op); 656 arch_unoptimize_kprobe(op);
655 put_online_cpus();
656 if (kprobe_disabled(&op->kp)) 657 if (kprobe_disabled(&op->kp))
657 arch_disarm_kprobe(&op->kp); 658 arch_disarm_kprobe(&op->kp);
658} 659}
@@ -791,6 +792,7 @@ static void try_to_optimize_kprobe(struct kprobe *p)
791 return; 792 return;
792 793
793 /* For preparing optimization, jump_label_text_reserved() is called */ 794 /* For preparing optimization, jump_label_text_reserved() is called */
795 cpus_read_lock();
794 jump_label_lock(); 796 jump_label_lock();
795 mutex_lock(&text_mutex); 797 mutex_lock(&text_mutex);
796 798
@@ -812,6 +814,7 @@ static void try_to_optimize_kprobe(struct kprobe *p)
812out: 814out:
813 mutex_unlock(&text_mutex); 815 mutex_unlock(&text_mutex);
814 jump_label_unlock(); 816 jump_label_unlock();
817 cpus_read_unlock();
815} 818}
816 819
817#ifdef CONFIG_SYSCTL 820#ifdef CONFIG_SYSCTL
@@ -826,6 +829,7 @@ static void optimize_all_kprobes(void)
826 if (kprobes_allow_optimization) 829 if (kprobes_allow_optimization)
827 goto out; 830 goto out;
828 831
832 cpus_read_lock();
829 kprobes_allow_optimization = true; 833 kprobes_allow_optimization = true;
830 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 834 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
831 head = &kprobe_table[i]; 835 head = &kprobe_table[i];
@@ -833,6 +837,7 @@ static void optimize_all_kprobes(void)
833 if (!kprobe_disabled(p)) 837 if (!kprobe_disabled(p))
834 optimize_kprobe(p); 838 optimize_kprobe(p);
835 } 839 }
840 cpus_read_unlock();
836 printk(KERN_INFO "Kprobes globally optimized\n"); 841 printk(KERN_INFO "Kprobes globally optimized\n");
837out: 842out:
838 mutex_unlock(&kprobe_mutex); 843 mutex_unlock(&kprobe_mutex);
@@ -851,6 +856,7 @@ static void unoptimize_all_kprobes(void)
851 return; 856 return;
852 } 857 }
853 858
859 cpus_read_lock();
854 kprobes_allow_optimization = false; 860 kprobes_allow_optimization = false;
855 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 861 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
856 head = &kprobe_table[i]; 862 head = &kprobe_table[i];
@@ -859,6 +865,7 @@ static void unoptimize_all_kprobes(void)
859 unoptimize_kprobe(p, false); 865 unoptimize_kprobe(p, false);
860 } 866 }
861 } 867 }
868 cpus_read_unlock();
862 mutex_unlock(&kprobe_mutex); 869 mutex_unlock(&kprobe_mutex);
863 870
864 /* Wait for unoptimizing completion */ 871 /* Wait for unoptimizing completion */
@@ -1010,14 +1017,11 @@ static void arm_kprobe(struct kprobe *kp)
1010 arm_kprobe_ftrace(kp); 1017 arm_kprobe_ftrace(kp);
1011 return; 1018 return;
1012 } 1019 }
1013 /* 1020 cpus_read_lock();
1014 * Here, since __arm_kprobe() doesn't use stop_machine(),
1015 * this doesn't cause deadlock on text_mutex. So, we don't
1016 * need get_online_cpus().
1017 */
1018 mutex_lock(&text_mutex); 1021 mutex_lock(&text_mutex);
1019 __arm_kprobe(kp); 1022 __arm_kprobe(kp);
1020 mutex_unlock(&text_mutex); 1023 mutex_unlock(&text_mutex);
1024 cpus_read_unlock();
1021} 1025}
1022 1026
1023/* Disarm a kprobe with text_mutex */ 1027/* Disarm a kprobe with text_mutex */
@@ -1027,10 +1031,12 @@ static void disarm_kprobe(struct kprobe *kp, bool reopt)
1027 disarm_kprobe_ftrace(kp); 1031 disarm_kprobe_ftrace(kp);
1028 return; 1032 return;
1029 } 1033 }
1030 /* Ditto */ 1034
1035 cpus_read_lock();
1031 mutex_lock(&text_mutex); 1036 mutex_lock(&text_mutex);
1032 __disarm_kprobe(kp, reopt); 1037 __disarm_kprobe(kp, reopt);
1033 mutex_unlock(&text_mutex); 1038 mutex_unlock(&text_mutex);
1039 cpus_read_unlock();
1034} 1040}
1035 1041
1036/* 1042/*
@@ -1298,13 +1304,10 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
1298 int ret = 0; 1304 int ret = 0;
1299 struct kprobe *ap = orig_p; 1305 struct kprobe *ap = orig_p;
1300 1306
1307 cpus_read_lock();
1308
1301 /* For preparing optimization, jump_label_text_reserved() is called */ 1309 /* For preparing optimization, jump_label_text_reserved() is called */
1302 jump_label_lock(); 1310 jump_label_lock();
1303 /*
1304 * Get online CPUs to avoid text_mutex deadlock.with stop machine,
1305 * which is invoked by unoptimize_kprobe() in add_new_kprobe()
1306 */
1307 get_online_cpus();
1308 mutex_lock(&text_mutex); 1311 mutex_lock(&text_mutex);
1309 1312
1310 if (!kprobe_aggrprobe(orig_p)) { 1313 if (!kprobe_aggrprobe(orig_p)) {
@@ -1352,8 +1355,8 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
1352 1355
1353out: 1356out:
1354 mutex_unlock(&text_mutex); 1357 mutex_unlock(&text_mutex);
1355 put_online_cpus();
1356 jump_label_unlock(); 1358 jump_label_unlock();
1359 cpus_read_unlock();
1357 1360
1358 if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { 1361 if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
1359 ap->flags &= ~KPROBE_FLAG_DISABLED; 1362 ap->flags &= ~KPROBE_FLAG_DISABLED;
@@ -1555,9 +1558,12 @@ int register_kprobe(struct kprobe *p)
1555 goto out; 1558 goto out;
1556 } 1559 }
1557 1560
1558 mutex_lock(&text_mutex); /* Avoiding text modification */ 1561 cpus_read_lock();
1562 /* Prevent text modification */
1563 mutex_lock(&text_mutex);
1559 ret = prepare_kprobe(p); 1564 ret = prepare_kprobe(p);
1560 mutex_unlock(&text_mutex); 1565 mutex_unlock(&text_mutex);
1566 cpus_read_unlock();
1561 if (ret) 1567 if (ret)
1562 goto out; 1568 goto out;
1563 1569
@@ -1570,7 +1576,6 @@ int register_kprobe(struct kprobe *p)
1570 1576
1571 /* Try to optimize kprobe */ 1577 /* Try to optimize kprobe */
1572 try_to_optimize_kprobe(p); 1578 try_to_optimize_kprobe(p);
1573
1574out: 1579out:
1575 mutex_unlock(&kprobe_mutex); 1580 mutex_unlock(&kprobe_mutex);
1576 1581
@@ -1766,24 +1771,13 @@ unsigned long __weak arch_deref_entry_point(void *entry)
1766 1771
1767int register_jprobes(struct jprobe **jps, int num) 1772int register_jprobes(struct jprobe **jps, int num)
1768{ 1773{
1769 struct jprobe *jp;
1770 int ret = 0, i; 1774 int ret = 0, i;
1771 1775
1772 if (num <= 0) 1776 if (num <= 0)
1773 return -EINVAL; 1777 return -EINVAL;
1778
1774 for (i = 0; i < num; i++) { 1779 for (i = 0; i < num; i++) {
1775 unsigned long addr, offset; 1780 ret = register_jprobe(jps[i]);
1776 jp = jps[i];
1777 addr = arch_deref_entry_point(jp->entry);
1778
1779 /* Verify probepoint is a function entry point */
1780 if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
1781 offset == 0) {
1782 jp->kp.pre_handler = setjmp_pre_handler;
1783 jp->kp.break_handler = longjmp_break_handler;
1784 ret = register_kprobe(&jp->kp);
1785 } else
1786 ret = -EINVAL;
1787 1781
1788 if (ret < 0) { 1782 if (ret < 0) {
1789 if (i > 0) 1783 if (i > 0)
@@ -1791,13 +1785,30 @@ int register_jprobes(struct jprobe **jps, int num)
1791 break; 1785 break;
1792 } 1786 }
1793 } 1787 }
1788
1794 return ret; 1789 return ret;
1795} 1790}
1796EXPORT_SYMBOL_GPL(register_jprobes); 1791EXPORT_SYMBOL_GPL(register_jprobes);
1797 1792
1798int register_jprobe(struct jprobe *jp) 1793int register_jprobe(struct jprobe *jp)
1799{ 1794{
1800 return register_jprobes(&jp, 1); 1795 unsigned long addr, offset;
1796 struct kprobe *kp = &jp->kp;
1797
1798 /*
1799 * Verify probepoint as well as the jprobe handler are
1800 * valid function entry points.
1801 */
1802 addr = arch_deref_entry_point(jp->entry);
1803
1804 if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0 &&
1805 kprobe_on_func_entry(kp->addr, kp->symbol_name, kp->offset)) {
1806 kp->pre_handler = setjmp_pre_handler;
1807 kp->break_handler = longjmp_break_handler;
1808 return register_kprobe(kp);
1809 }
1810
1811 return -EINVAL;
1801} 1812}
1802EXPORT_SYMBOL_GPL(register_jprobe); 1813EXPORT_SYMBOL_GPL(register_jprobe);
1803 1814
@@ -1883,12 +1894,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
1883} 1894}
1884NOKPROBE_SYMBOL(pre_handler_kretprobe); 1895NOKPROBE_SYMBOL(pre_handler_kretprobe);
1885 1896
1886bool __weak arch_function_offset_within_entry(unsigned long offset) 1897bool __weak arch_kprobe_on_func_entry(unsigned long offset)
1887{ 1898{
1888 return !offset; 1899 return !offset;
1889} 1900}
1890 1901
1891bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) 1902bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
1892{ 1903{
1893 kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset); 1904 kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
1894 1905
@@ -1896,7 +1907,7 @@ bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsign
1896 return false; 1907 return false;
1897 1908
1898 if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) || 1909 if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
1899 !arch_function_offset_within_entry(offset)) 1910 !arch_kprobe_on_func_entry(offset))
1900 return false; 1911 return false;
1901 1912
1902 return true; 1913 return true;
@@ -1909,7 +1920,7 @@ int register_kretprobe(struct kretprobe *rp)
1909 int i; 1920 int i;
1910 void *addr; 1921 void *addr;
1911 1922
1912 if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset)) 1923 if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
1913 return -EINVAL; 1924 return -EINVAL;
1914 1925
1915 if (kretprobe_blacklist_size) { 1926 if (kretprobe_blacklist_size) {
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 23cd70651238..46ba853656f6 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
134{ 134{
135 phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); 135 phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
136 return sprintf(buf, "%pa %x\n", &vmcore_base, 136 return sprintf(buf, "%pa %x\n", &vmcore_base,
137 (unsigned int)sizeof(vmcoreinfo_note)); 137 (unsigned int)VMCOREINFO_NOTE_SIZE);
138} 138}
139KERNEL_ATTR_RO(vmcoreinfo); 139KERNEL_ATTR_RO(vmcoreinfo);
140 140
@@ -234,7 +234,7 @@ static struct attribute * kernel_attrs[] = {
234 NULL 234 NULL
235}; 235};
236 236
237static struct attribute_group kernel_attr_group = { 237static const struct attribute_group kernel_attr_group = {
238 .attrs = kernel_attrs, 238 .attrs = kernel_attrs,
239}; 239};
240 240
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
index 045022557936..ec4565122e65 100644
--- a/kernel/livepatch/Kconfig
+++ b/kernel/livepatch/Kconfig
@@ -10,6 +10,7 @@ config LIVEPATCH
10 depends on SYSFS 10 depends on SYSFS
11 depends on KALLSYMS_ALL 11 depends on KALLSYMS_ALL
12 depends on HAVE_LIVEPATCH 12 depends on HAVE_LIVEPATCH
13 depends on !TRIM_UNUSED_KSYMS
13 help 14 help
14 Say Y here if you want to support kernel live patching. 15 Say Y here if you want to support kernel live patching.
15 This option has no runtime impact until a kernel "patch" 16 This option has no runtime impact until a kernel "patch"
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index f8269036bf0b..52c4e907c14b 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -59,7 +59,11 @@ static void notrace klp_ftrace_handler(unsigned long ip,
59 59
60 ops = container_of(fops, struct klp_ops, fops); 60 ops = container_of(fops, struct klp_ops, fops);
61 61
62 rcu_read_lock(); 62 /*
63 * A variant of synchronize_sched() is used to allow patching functions
64 * where RCU is not watching, see klp_synchronize_transition().
65 */
66 preempt_disable_notrace();
63 67
64 func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, 68 func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
65 stack_node); 69 stack_node);
@@ -115,7 +119,7 @@ static void notrace klp_ftrace_handler(unsigned long ip,
115 119
116 klp_arch_set_pc(regs, (unsigned long)func->new_func); 120 klp_arch_set_pc(regs, (unsigned long)func->new_func);
117unlock: 121unlock:
118 rcu_read_unlock(); 122 preempt_enable_notrace();
119} 123}
120 124
121/* 125/*
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index adc0cc64aa4b..b004a1fb6032 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -49,6 +49,28 @@ static void klp_transition_work_fn(struct work_struct *work)
49static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn); 49static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn);
50 50
51/* 51/*
52 * This function is just a stub to implement a hard force
53 * of synchronize_sched(). This requires synchronizing
54 * tasks even in userspace and idle.
55 */
56static void klp_sync(struct work_struct *work)
57{
58}
59
60/*
61 * We allow to patch also functions where RCU is not watching,
62 * e.g. before user_exit(). We can not rely on the RCU infrastructure
63 * to do the synchronization. Instead hard force the sched synchronization.
64 *
65 * This approach allows to use RCU functions for manipulating func_stack
66 * safely.
67 */
68static void klp_synchronize_transition(void)
69{
70 schedule_on_each_cpu(klp_sync);
71}
72
73/*
52 * The transition to the target patch state is complete. Clean up the data 74 * The transition to the target patch state is complete. Clean up the data
53 * structures. 75 * structures.
54 */ 76 */
@@ -73,7 +95,7 @@ static void klp_complete_transition(void)
73 * func->transition gets cleared, the handler may choose a 95 * func->transition gets cleared, the handler may choose a
74 * removed function. 96 * removed function.
75 */ 97 */
76 synchronize_rcu(); 98 klp_synchronize_transition();
77 } 99 }
78 100
79 if (klp_transition_patch->immediate) 101 if (klp_transition_patch->immediate)
@@ -92,7 +114,7 @@ static void klp_complete_transition(void)
92 114
93 /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ 115 /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
94 if (klp_target_state == KLP_PATCHED) 116 if (klp_target_state == KLP_PATCHED)
95 synchronize_rcu(); 117 klp_synchronize_transition();
96 118
97 read_lock(&tasklist_lock); 119 read_lock(&tasklist_lock);
98 for_each_process_thread(g, task) { 120 for_each_process_thread(g, task) {
@@ -136,7 +158,11 @@ void klp_cancel_transition(void)
136 */ 158 */
137void klp_update_patch_state(struct task_struct *task) 159void klp_update_patch_state(struct task_struct *task)
138{ 160{
139 rcu_read_lock(); 161 /*
162 * A variant of synchronize_sched() is used to allow patching functions
163 * where RCU is not watching, see klp_synchronize_transition().
164 */
165 preempt_disable_notrace();
140 166
141 /* 167 /*
142 * This test_and_clear_tsk_thread_flag() call also serves as a read 168 * This test_and_clear_tsk_thread_flag() call also serves as a read
@@ -153,7 +179,7 @@ void klp_update_patch_state(struct task_struct *task)
153 if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING)) 179 if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING))
154 task->patch_state = READ_ONCE(klp_target_state); 180 task->patch_state = READ_ONCE(klp_target_state);
155 181
156 rcu_read_unlock(); 182 preempt_enable_notrace();
157} 183}
158 184
159/* 185/*
@@ -539,7 +565,7 @@ void klp_reverse_transition(void)
539 clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING); 565 clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
540 566
541 /* Let any remaining calls to klp_update_patch_state() complete */ 567 /* Let any remaining calls to klp_update_patch_state() complete */
542 synchronize_rcu(); 568 klp_synchronize_transition();
543 569
544 klp_start_transition(); 570 klp_start_transition();
545} 571}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c0e31bfee25c..7d2499bec5fe 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1157,18 +1157,18 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1157 if (debug_locks_silent) 1157 if (debug_locks_silent)
1158 return 0; 1158 return 0;
1159 1159
1160 printk("\n"); 1160 pr_warn("\n");
1161 pr_warn("======================================================\n"); 1161 pr_warn("======================================================\n");
1162 pr_warn("WARNING: possible circular locking dependency detected\n"); 1162 pr_warn("WARNING: possible circular locking dependency detected\n");
1163 print_kernel_ident(); 1163 print_kernel_ident();
1164 pr_warn("------------------------------------------------------\n"); 1164 pr_warn("------------------------------------------------------\n");
1165 printk("%s/%d is trying to acquire lock:\n", 1165 pr_warn("%s/%d is trying to acquire lock:\n",
1166 curr->comm, task_pid_nr(curr)); 1166 curr->comm, task_pid_nr(curr));
1167 print_lock(check_src); 1167 print_lock(check_src);
1168 printk("\nbut task is already holding lock:\n"); 1168 pr_warn("\nbut task is already holding lock:\n");
1169 print_lock(check_tgt); 1169 print_lock(check_tgt);
1170 printk("\nwhich lock already depends on the new lock.\n\n"); 1170 pr_warn("\nwhich lock already depends on the new lock.\n\n");
1171 printk("\nthe existing dependency chain (in reverse order) is:\n"); 1171 pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
1172 1172
1173 print_circular_bug_entry(entry, depth); 1173 print_circular_bug_entry(entry, depth);
1174 1174
@@ -1495,13 +1495,13 @@ print_bad_irq_dependency(struct task_struct *curr,
1495 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1495 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1496 return 0; 1496 return 0;
1497 1497
1498 printk("\n"); 1498 pr_warn("\n");
1499 pr_warn("=====================================================\n"); 1499 pr_warn("=====================================================\n");
1500 pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", 1500 pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
1501 irqclass, irqclass); 1501 irqclass, irqclass);
1502 print_kernel_ident(); 1502 print_kernel_ident();
1503 pr_warn("-----------------------------------------------------\n"); 1503 pr_warn("-----------------------------------------------------\n");
1504 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1504 pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1505 curr->comm, task_pid_nr(curr), 1505 curr->comm, task_pid_nr(curr),
1506 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1506 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
1507 curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, 1507 curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
@@ -1509,46 +1509,46 @@ print_bad_irq_dependency(struct task_struct *curr,
1509 curr->softirqs_enabled); 1509 curr->softirqs_enabled);
1510 print_lock(next); 1510 print_lock(next);
1511 1511
1512 printk("\nand this task is already holding:\n"); 1512 pr_warn("\nand this task is already holding:\n");
1513 print_lock(prev); 1513 print_lock(prev);
1514 printk("which would create a new lock dependency:\n"); 1514 pr_warn("which would create a new lock dependency:\n");
1515 print_lock_name(hlock_class(prev)); 1515 print_lock_name(hlock_class(prev));
1516 printk(KERN_CONT " ->"); 1516 pr_cont(" ->");
1517 print_lock_name(hlock_class(next)); 1517 print_lock_name(hlock_class(next));
1518 printk(KERN_CONT "\n"); 1518 pr_cont("\n");
1519 1519
1520 printk("\nbut this new dependency connects a %s-irq-safe lock:\n", 1520 pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n",
1521 irqclass); 1521 irqclass);
1522 print_lock_name(backwards_entry->class); 1522 print_lock_name(backwards_entry->class);
1523 printk("\n... which became %s-irq-safe at:\n", irqclass); 1523 pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
1524 1524
1525 print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); 1525 print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
1526 1526
1527 printk("\nto a %s-irq-unsafe lock:\n", irqclass); 1527 pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
1528 print_lock_name(forwards_entry->class); 1528 print_lock_name(forwards_entry->class);
1529 printk("\n... which became %s-irq-unsafe at:\n", irqclass); 1529 pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
1530 printk("..."); 1530 pr_warn("...");
1531 1531
1532 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); 1532 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1533 1533
1534 printk("\nother info that might help us debug this:\n\n"); 1534 pr_warn("\nother info that might help us debug this:\n\n");
1535 print_irq_lock_scenario(backwards_entry, forwards_entry, 1535 print_irq_lock_scenario(backwards_entry, forwards_entry,
1536 hlock_class(prev), hlock_class(next)); 1536 hlock_class(prev), hlock_class(next));
1537 1537
1538 lockdep_print_held_locks(curr); 1538 lockdep_print_held_locks(curr);
1539 1539
1540 printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); 1540 pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
1541 if (!save_trace(&prev_root->trace)) 1541 if (!save_trace(&prev_root->trace))
1542 return 0; 1542 return 0;
1543 print_shortest_lock_dependencies(backwards_entry, prev_root); 1543 print_shortest_lock_dependencies(backwards_entry, prev_root);
1544 1544
1545 printk("\nthe dependencies between the lock to be acquired"); 1545 pr_warn("\nthe dependencies between the lock to be acquired");
1546 printk(" and %s-irq-unsafe lock:\n", irqclass); 1546 pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
1547 if (!save_trace(&next_root->trace)) 1547 if (!save_trace(&next_root->trace))
1548 return 0; 1548 return 0;
1549 print_shortest_lock_dependencies(forwards_entry, next_root); 1549 print_shortest_lock_dependencies(forwards_entry, next_root);
1550 1550
1551 printk("\nstack backtrace:\n"); 1551 pr_warn("\nstack backtrace:\n");
1552 dump_stack(); 1552 dump_stack();
1553 1553
1554 return 0; 1554 return 0;
@@ -1724,22 +1724,22 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1724 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1724 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1725 return 0; 1725 return 0;
1726 1726
1727 printk("\n"); 1727 pr_warn("\n");
1728 pr_warn("============================================\n"); 1728 pr_warn("============================================\n");
1729 pr_warn("WARNING: possible recursive locking detected\n"); 1729 pr_warn("WARNING: possible recursive locking detected\n");
1730 print_kernel_ident(); 1730 print_kernel_ident();
1731 pr_warn("--------------------------------------------\n"); 1731 pr_warn("--------------------------------------------\n");
1732 printk("%s/%d is trying to acquire lock:\n", 1732 pr_warn("%s/%d is trying to acquire lock:\n",
1733 curr->comm, task_pid_nr(curr)); 1733 curr->comm, task_pid_nr(curr));
1734 print_lock(next); 1734 print_lock(next);
1735 printk("\nbut task is already holding lock:\n"); 1735 pr_warn("\nbut task is already holding lock:\n");
1736 print_lock(prev); 1736 print_lock(prev);
1737 1737
1738 printk("\nother info that might help us debug this:\n"); 1738 pr_warn("\nother info that might help us debug this:\n");
1739 print_deadlock_scenario(next, prev); 1739 print_deadlock_scenario(next, prev);
1740 lockdep_print_held_locks(curr); 1740 lockdep_print_held_locks(curr);
1741 1741
1742 printk("\nstack backtrace:\n"); 1742 pr_warn("\nstack backtrace:\n");
1743 dump_stack(); 1743 dump_stack();
1744 1744
1745 return 0; 1745 return 0;
@@ -2074,21 +2074,21 @@ static void print_collision(struct task_struct *curr,
2074 struct held_lock *hlock_next, 2074 struct held_lock *hlock_next,
2075 struct lock_chain *chain) 2075 struct lock_chain *chain)
2076{ 2076{
2077 printk("\n"); 2077 pr_warn("\n");
2078 pr_warn("============================\n"); 2078 pr_warn("============================\n");
2079 pr_warn("WARNING: chain_key collision\n"); 2079 pr_warn("WARNING: chain_key collision\n");
2080 print_kernel_ident(); 2080 print_kernel_ident();
2081 pr_warn("----------------------------\n"); 2081 pr_warn("----------------------------\n");
2082 printk("%s/%d: ", current->comm, task_pid_nr(current)); 2082 pr_warn("%s/%d: ", current->comm, task_pid_nr(current));
2083 printk("Hash chain already cached but the contents don't match!\n"); 2083 pr_warn("Hash chain already cached but the contents don't match!\n");
2084 2084
2085 printk("Held locks:"); 2085 pr_warn("Held locks:");
2086 print_chain_keys_held_locks(curr, hlock_next); 2086 print_chain_keys_held_locks(curr, hlock_next);
2087 2087
2088 printk("Locks in cached chain:"); 2088 pr_warn("Locks in cached chain:");
2089 print_chain_keys_chain(chain); 2089 print_chain_keys_chain(chain);
2090 2090
2091 printk("\nstack backtrace:\n"); 2091 pr_warn("\nstack backtrace:\n");
2092 dump_stack(); 2092 dump_stack();
2093} 2093}
2094#endif 2094#endif
@@ -2373,16 +2373,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2373 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2373 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2374 return 0; 2374 return 0;
2375 2375
2376 printk("\n"); 2376 pr_warn("\n");
2377 pr_warn("================================\n"); 2377 pr_warn("================================\n");
2378 pr_warn("WARNING: inconsistent lock state\n"); 2378 pr_warn("WARNING: inconsistent lock state\n");
2379 print_kernel_ident(); 2379 print_kernel_ident();
2380 pr_warn("--------------------------------\n"); 2380 pr_warn("--------------------------------\n");
2381 2381
2382 printk("inconsistent {%s} -> {%s} usage.\n", 2382 pr_warn("inconsistent {%s} -> {%s} usage.\n",
2383 usage_str[prev_bit], usage_str[new_bit]); 2383 usage_str[prev_bit], usage_str[new_bit]);
2384 2384
2385 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", 2385 pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
2386 curr->comm, task_pid_nr(curr), 2386 curr->comm, task_pid_nr(curr),
2387 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, 2387 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
2388 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, 2388 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
@@ -2390,16 +2390,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2390 trace_softirqs_enabled(curr)); 2390 trace_softirqs_enabled(curr));
2391 print_lock(this); 2391 print_lock(this);
2392 2392
2393 printk("{%s} state was registered at:\n", usage_str[prev_bit]); 2393 pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
2394 print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); 2394 print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
2395 2395
2396 print_irqtrace_events(curr); 2396 print_irqtrace_events(curr);
2397 printk("\nother info that might help us debug this:\n"); 2397 pr_warn("\nother info that might help us debug this:\n");
2398 print_usage_bug_scenario(this); 2398 print_usage_bug_scenario(this);
2399 2399
2400 lockdep_print_held_locks(curr); 2400 lockdep_print_held_locks(curr);
2401 2401
2402 printk("\nstack backtrace:\n"); 2402 pr_warn("\nstack backtrace:\n");
2403 dump_stack(); 2403 dump_stack();
2404 2404
2405 return 0; 2405 return 0;
@@ -2438,28 +2438,28 @@ print_irq_inversion_bug(struct task_struct *curr,
2438 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2438 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2439 return 0; 2439 return 0;
2440 2440
2441 printk("\n"); 2441 pr_warn("\n");
2442 pr_warn("========================================================\n"); 2442 pr_warn("========================================================\n");
2443 pr_warn("WARNING: possible irq lock inversion dependency detected\n"); 2443 pr_warn("WARNING: possible irq lock inversion dependency detected\n");
2444 print_kernel_ident(); 2444 print_kernel_ident();
2445 pr_warn("--------------------------------------------------------\n"); 2445 pr_warn("--------------------------------------------------------\n");
2446 printk("%s/%d just changed the state of lock:\n", 2446 pr_warn("%s/%d just changed the state of lock:\n",
2447 curr->comm, task_pid_nr(curr)); 2447 curr->comm, task_pid_nr(curr));
2448 print_lock(this); 2448 print_lock(this);
2449 if (forwards) 2449 if (forwards)
2450 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); 2450 pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
2451 else 2451 else
2452 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); 2452 pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
2453 print_lock_name(other->class); 2453 print_lock_name(other->class);
2454 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2454 pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n");
2455 2455
2456 printk("\nother info that might help us debug this:\n"); 2456 pr_warn("\nother info that might help us debug this:\n");
2457 2457
2458 /* Find a middle lock (if one exists) */ 2458 /* Find a middle lock (if one exists) */
2459 depth = get_lock_depth(other); 2459 depth = get_lock_depth(other);
2460 do { 2460 do {
2461 if (depth == 0 && (entry != root)) { 2461 if (depth == 0 && (entry != root)) {
2462 printk("lockdep:%s bad path found in chain graph\n", __func__); 2462 pr_warn("lockdep:%s bad path found in chain graph\n", __func__);
2463 break; 2463 break;
2464 } 2464 }
2465 middle = entry; 2465 middle = entry;
@@ -2475,12 +2475,12 @@ print_irq_inversion_bug(struct task_struct *curr,
2475 2475
2476 lockdep_print_held_locks(curr); 2476 lockdep_print_held_locks(curr);
2477 2477
2478 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); 2478 pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
2479 if (!save_trace(&root->trace)) 2479 if (!save_trace(&root->trace))
2480 return 0; 2480 return 0;
2481 print_shortest_lock_dependencies(other, root); 2481 print_shortest_lock_dependencies(other, root);
2482 2482
2483 printk("\nstack backtrace:\n"); 2483 pr_warn("\nstack backtrace:\n");
2484 dump_stack(); 2484 dump_stack();
2485 2485
2486 return 0; 2486 return 0;
@@ -3189,25 +3189,25 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
3189 if (debug_locks_silent) 3189 if (debug_locks_silent)
3190 return 0; 3190 return 0;
3191 3191
3192 printk("\n"); 3192 pr_warn("\n");
3193 pr_warn("==================================\n"); 3193 pr_warn("==================================\n");
3194 pr_warn("WARNING: Nested lock was not taken\n"); 3194 pr_warn("WARNING: Nested lock was not taken\n");
3195 print_kernel_ident(); 3195 print_kernel_ident();
3196 pr_warn("----------------------------------\n"); 3196 pr_warn("----------------------------------\n");
3197 3197
3198 printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); 3198 pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
3199 print_lock(hlock); 3199 print_lock(hlock);
3200 3200
3201 printk("\nbut this task is not holding:\n"); 3201 pr_warn("\nbut this task is not holding:\n");
3202 printk("%s\n", hlock->nest_lock->name); 3202 pr_warn("%s\n", hlock->nest_lock->name);
3203 3203
3204 printk("\nstack backtrace:\n"); 3204 pr_warn("\nstack backtrace:\n");
3205 dump_stack(); 3205 dump_stack();
3206 3206
3207 printk("\nother info that might help us debug this:\n"); 3207 pr_warn("\nother info that might help us debug this:\n");
3208 lockdep_print_held_locks(curr); 3208 lockdep_print_held_locks(curr);
3209 3209
3210 printk("\nstack backtrace:\n"); 3210 pr_warn("\nstack backtrace:\n");
3211 dump_stack(); 3211 dump_stack();
3212 3212
3213 return 0; 3213 return 0;
@@ -3402,21 +3402,21 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3402 if (debug_locks_silent) 3402 if (debug_locks_silent)
3403 return 0; 3403 return 0;
3404 3404
3405 printk("\n"); 3405 pr_warn("\n");
3406 pr_warn("=====================================\n"); 3406 pr_warn("=====================================\n");
3407 pr_warn("WARNING: bad unlock balance detected!\n"); 3407 pr_warn("WARNING: bad unlock balance detected!\n");
3408 print_kernel_ident(); 3408 print_kernel_ident();
3409 pr_warn("-------------------------------------\n"); 3409 pr_warn("-------------------------------------\n");
3410 printk("%s/%d is trying to release lock (", 3410 pr_warn("%s/%d is trying to release lock (",
3411 curr->comm, task_pid_nr(curr)); 3411 curr->comm, task_pid_nr(curr));
3412 print_lockdep_cache(lock); 3412 print_lockdep_cache(lock);
3413 printk(KERN_CONT ") at:\n"); 3413 pr_cont(") at:\n");
3414 print_ip_sym(ip); 3414 print_ip_sym(ip);
3415 printk("but there are no more locks to release!\n"); 3415 pr_warn("but there are no more locks to release!\n");
3416 printk("\nother info that might help us debug this:\n"); 3416 pr_warn("\nother info that might help us debug this:\n");
3417 lockdep_print_held_locks(curr); 3417 lockdep_print_held_locks(curr);
3418 3418
3419 printk("\nstack backtrace:\n"); 3419 pr_warn("\nstack backtrace:\n");
3420 dump_stack(); 3420 dump_stack();
3421 3421
3422 return 0; 3422 return 0;
@@ -3974,21 +3974,21 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3974 if (debug_locks_silent) 3974 if (debug_locks_silent)
3975 return 0; 3975 return 0;
3976 3976
3977 printk("\n"); 3977 pr_warn("\n");
3978 pr_warn("=================================\n"); 3978 pr_warn("=================================\n");
3979 pr_warn("WARNING: bad contention detected!\n"); 3979 pr_warn("WARNING: bad contention detected!\n");
3980 print_kernel_ident(); 3980 print_kernel_ident();
3981 pr_warn("---------------------------------\n"); 3981 pr_warn("---------------------------------\n");
3982 printk("%s/%d is trying to contend lock (", 3982 pr_warn("%s/%d is trying to contend lock (",
3983 curr->comm, task_pid_nr(curr)); 3983 curr->comm, task_pid_nr(curr));
3984 print_lockdep_cache(lock); 3984 print_lockdep_cache(lock);
3985 printk(KERN_CONT ") at:\n"); 3985 pr_cont(") at:\n");
3986 print_ip_sym(ip); 3986 print_ip_sym(ip);
3987 printk("but there are no locks held!\n"); 3987 pr_warn("but there are no locks held!\n");
3988 printk("\nother info that might help us debug this:\n"); 3988 pr_warn("\nother info that might help us debug this:\n");
3989 lockdep_print_held_locks(curr); 3989 lockdep_print_held_locks(curr);
3990 3990
3991 printk("\nstack backtrace:\n"); 3991 pr_warn("\nstack backtrace:\n");
3992 dump_stack(); 3992 dump_stack();
3993 3993
3994 return 0; 3994 return 0;
@@ -4318,17 +4318,17 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
4318 if (debug_locks_silent) 4318 if (debug_locks_silent)
4319 return; 4319 return;
4320 4320
4321 printk("\n"); 4321 pr_warn("\n");
4322 pr_warn("=========================\n"); 4322 pr_warn("=========================\n");
4323 pr_warn("WARNING: held lock freed!\n"); 4323 pr_warn("WARNING: held lock freed!\n");
4324 print_kernel_ident(); 4324 print_kernel_ident();
4325 pr_warn("-------------------------\n"); 4325 pr_warn("-------------------------\n");
4326 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 4326 pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
4327 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 4327 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
4328 print_lock(hlock); 4328 print_lock(hlock);
4329 lockdep_print_held_locks(curr); 4329 lockdep_print_held_locks(curr);
4330 4330
4331 printk("\nstack backtrace:\n"); 4331 pr_warn("\nstack backtrace:\n");
4332 dump_stack(); 4332 dump_stack();
4333} 4333}
4334 4334
@@ -4376,14 +4376,14 @@ static void print_held_locks_bug(void)
4376 if (debug_locks_silent) 4376 if (debug_locks_silent)
4377 return; 4377 return;
4378 4378
4379 printk("\n"); 4379 pr_warn("\n");
4380 pr_warn("====================================\n"); 4380 pr_warn("====================================\n");
4381 pr_warn("WARNING: %s/%d still has locks held!\n", 4381 pr_warn("WARNING: %s/%d still has locks held!\n",
4382 current->comm, task_pid_nr(current)); 4382 current->comm, task_pid_nr(current));
4383 print_kernel_ident(); 4383 print_kernel_ident();
4384 pr_warn("------------------------------------\n"); 4384 pr_warn("------------------------------------\n");
4385 lockdep_print_held_locks(current); 4385 lockdep_print_held_locks(current);
4386 printk("\nstack backtrace:\n"); 4386 pr_warn("\nstack backtrace:\n");
4387 dump_stack(); 4387 dump_stack();
4388} 4388}
4389 4389
@@ -4402,10 +4402,10 @@ void debug_show_all_locks(void)
4402 int unlock = 1; 4402 int unlock = 1;
4403 4403
4404 if (unlikely(!debug_locks)) { 4404 if (unlikely(!debug_locks)) {
4405 printk("INFO: lockdep is turned off.\n"); 4405 pr_warn("INFO: lockdep is turned off.\n");
4406 return; 4406 return;
4407 } 4407 }
4408 printk("\nShowing all locks held in the system:\n"); 4408 pr_warn("\nShowing all locks held in the system:\n");
4409 4409
4410 /* 4410 /*
4411 * Here we try to get the tasklist_lock as hard as possible, 4411 * Here we try to get the tasklist_lock as hard as possible,
@@ -4416,18 +4416,18 @@ void debug_show_all_locks(void)
4416retry: 4416retry:
4417 if (!read_trylock(&tasklist_lock)) { 4417 if (!read_trylock(&tasklist_lock)) {
4418 if (count == 10) 4418 if (count == 10)
4419 printk("hm, tasklist_lock locked, retrying... "); 4419 pr_warn("hm, tasklist_lock locked, retrying... ");
4420 if (count) { 4420 if (count) {
4421 count--; 4421 count--;
4422 printk(" #%d", 10-count); 4422 pr_cont(" #%d", 10-count);
4423 mdelay(200); 4423 mdelay(200);
4424 goto retry; 4424 goto retry;
4425 } 4425 }
4426 printk(" ignoring it.\n"); 4426 pr_cont(" ignoring it.\n");
4427 unlock = 0; 4427 unlock = 0;
4428 } else { 4428 } else {
4429 if (count != 10) 4429 if (count != 10)
4430 printk(KERN_CONT " locked it.\n"); 4430 pr_cont(" locked it.\n");
4431 } 4431 }
4432 4432
4433 do_each_thread(g, p) { 4433 do_each_thread(g, p) {
@@ -4445,7 +4445,7 @@ retry:
4445 unlock = 1; 4445 unlock = 1;
4446 } while_each_thread(g, p); 4446 } while_each_thread(g, p);
4447 4447
4448 printk("\n"); 4448 pr_warn("\n");
4449 pr_warn("=============================================\n\n"); 4449 pr_warn("=============================================\n\n");
4450 4450
4451 if (unlock) 4451 if (unlock)
@@ -4475,12 +4475,12 @@ asmlinkage __visible void lockdep_sys_exit(void)
4475 if (unlikely(curr->lockdep_depth)) { 4475 if (unlikely(curr->lockdep_depth)) {
4476 if (!debug_locks_off()) 4476 if (!debug_locks_off())
4477 return; 4477 return;
4478 printk("\n"); 4478 pr_warn("\n");
4479 pr_warn("================================================\n"); 4479 pr_warn("================================================\n");
4480 pr_warn("WARNING: lock held when returning to user space!\n"); 4480 pr_warn("WARNING: lock held when returning to user space!\n");
4481 print_kernel_ident(); 4481 print_kernel_ident();
4482 pr_warn("------------------------------------------------\n"); 4482 pr_warn("------------------------------------------------\n");
4483 printk("%s/%d is leaving the kernel with locks still held!\n", 4483 pr_warn("%s/%d is leaving the kernel with locks still held!\n",
4484 curr->comm, curr->pid); 4484 curr->comm, curr->pid);
4485 lockdep_print_held_locks(curr); 4485 lockdep_print_held_locks(curr);
4486 } 4486 }
@@ -4490,19 +4490,15 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4490{ 4490{
4491 struct task_struct *curr = current; 4491 struct task_struct *curr = current;
4492 4492
4493#ifndef CONFIG_PROVE_RCU_REPEATEDLY
4494 if (!debug_locks_off())
4495 return;
4496#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
4497 /* Note: the following can be executed concurrently, so be careful. */ 4493 /* Note: the following can be executed concurrently, so be careful. */
4498 printk("\n"); 4494 pr_warn("\n");
4499 pr_warn("=============================\n"); 4495 pr_warn("=============================\n");
4500 pr_warn("WARNING: suspicious RCU usage\n"); 4496 pr_warn("WARNING: suspicious RCU usage\n");
4501 print_kernel_ident(); 4497 print_kernel_ident();
4502 pr_warn("-----------------------------\n"); 4498 pr_warn("-----------------------------\n");
4503 printk("%s:%d %s!\n", file, line, s); 4499 pr_warn("%s:%d %s!\n", file, line, s);
4504 printk("\nother info that might help us debug this:\n\n"); 4500 pr_warn("\nother info that might help us debug this:\n\n");
4505 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", 4501 pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4506 !rcu_lockdep_current_cpu_online() 4502 !rcu_lockdep_current_cpu_online()
4507 ? "RCU used illegally from offline CPU!\n" 4503 ? "RCU used illegally from offline CPU!\n"
4508 : !rcu_is_watching() 4504 : !rcu_is_watching()
@@ -4529,10 +4525,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4529 * rcu_read_lock_bh() and so on from extended quiescent states. 4525 * rcu_read_lock_bh() and so on from extended quiescent states.
4530 */ 4526 */
4531 if (!rcu_is_watching()) 4527 if (!rcu_is_watching())
4532 printk("RCU used illegally from extended quiescent state!\n"); 4528 pr_warn("RCU used illegally from extended quiescent state!\n");
4533 4529
4534 lockdep_print_held_locks(curr); 4530 lockdep_print_held_locks(curr);
4535 printk("\nstack backtrace:\n"); 4531 pr_warn("\nstack backtrace:\n");
4536 dump_stack(); 4532 dump_stack();
4537} 4533}
4538EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); 4534EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 198527a62149..858a07590e39 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -227,9 +227,9 @@ static void __sched __mutex_lock_slowpath(struct mutex *lock);
227 * (or statically defined) before it can be locked. memset()-ing 227 * (or statically defined) before it can be locked. memset()-ing
228 * the mutex to 0 is not allowed. 228 * the mutex to 0 is not allowed.
229 * 229 *
230 * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging 230 * (The CONFIG_DEBUG_MUTEXES .config option turns on debugging
231 * checks that will enforce the restrictions and will also do 231 * checks that will enforce the restrictions and will also do
232 * deadlock debugging. ) 232 * deadlock debugging)
233 * 233 *
234 * This function is similar to (but not equivalent to) down(). 234 * This function is similar to (but not equivalent to) down().
235 */ 235 */
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index cc3ed0ccdfa2..2655f26ec882 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -20,6 +20,7 @@
20#include <linux/cpumask.h> 20#include <linux/cpumask.h>
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/spinlock.h>
23#include <asm/qrwlock.h> 24#include <asm/qrwlock.h>
24 25
25/* 26/*
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b2caec7315af..fd24153e8a48 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -28,6 +28,7 @@
28#include <linux/percpu.h> 28#include <linux/percpu.h>
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <linux/mutex.h> 30#include <linux/mutex.h>
31#include <linux/prefetch.h>
31#include <asm/byteorder.h> 32#include <asm/byteorder.h>
32#include <asm/qspinlock.h> 33#include <asm/qspinlock.h>
33 34
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e6b2f7ad3e51..4ccfcaae5b89 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void)
193 */ 193 */
194 pv_lock_hash = alloc_large_system_hash("PV qspinlock", 194 pv_lock_hash = alloc_large_system_hash("PV qspinlock",
195 sizeof(struct pv_hash_entry), 195 sizeof(struct pv_hash_entry),
196 pv_hash_size, 0, HASH_EARLY, 196 pv_hash_size, 0,
197 HASH_EARLY | HASH_ZERO,
197 &pv_lock_hash_bits, NULL, 198 &pv_lock_hash_bits, NULL,
198 pv_hash_size, pv_hash_size); 199 pv_hash_size, pv_hash_size);
199} 200}
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 58e366ad36f4..ac35e648b0e5 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -166,12 +166,16 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
166 memset(waiter, 0x22, sizeof(*waiter)); 166 memset(waiter, 0x22, sizeof(*waiter));
167} 167}
168 168
169void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) 169void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key)
170{ 170{
171 /* 171 /*
172 * Make sure we are not reinitializing a held lock: 172 * Make sure we are not reinitializing a held lock:
173 */ 173 */
174 debug_check_no_locks_freed((void *)lock, sizeof(*lock)); 174 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
175 lock->name = name; 175 lock->name = name;
176
177#ifdef CONFIG_DEBUG_LOCK_ALLOC
178 lockdep_init_map(&lock->dep_map, name, key, 0);
179#endif
176} 180}
177 181
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index b585af9a1b50..5078c6ddf4a5 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -11,7 +11,7 @@
11 11
12extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); 12extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
13extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); 13extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
14extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); 14extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key);
15extern void debug_rt_mutex_lock(struct rt_mutex *lock); 15extern void debug_rt_mutex_lock(struct rt_mutex *lock);
16extern void debug_rt_mutex_unlock(struct rt_mutex *lock); 16extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
17extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, 17extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b95509416909..649dc9d3951a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -963,7 +963,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
963 return -EDEADLK; 963 return -EDEADLK;
964 964
965 raw_spin_lock(&task->pi_lock); 965 raw_spin_lock(&task->pi_lock);
966 rt_mutex_adjust_prio(task);
967 waiter->task = task; 966 waiter->task = task;
968 waiter->lock = lock; 967 waiter->lock = lock;
969 waiter->prio = task->prio; 968 waiter->prio = task->prio;
@@ -1481,6 +1480,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
1481{ 1480{
1482 might_sleep(); 1481 might_sleep();
1483 1482
1483 mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
1484 rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); 1484 rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
1485} 1485}
1486EXPORT_SYMBOL_GPL(rt_mutex_lock); 1486EXPORT_SYMBOL_GPL(rt_mutex_lock);
@@ -1496,9 +1496,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
1496 */ 1496 */
1497int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) 1497int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
1498{ 1498{
1499 int ret;
1500
1499 might_sleep(); 1501 might_sleep();
1500 1502
1501 return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); 1503 mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
1504 ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
1505 if (ret)
1506 mutex_release(&lock->dep_map, 1, _RET_IP_);
1507
1508 return ret;
1502} 1509}
1503EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); 1510EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
1504 1511
@@ -1526,11 +1533,18 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
1526int 1533int
1527rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) 1534rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
1528{ 1535{
1536 int ret;
1537
1529 might_sleep(); 1538 might_sleep();
1530 1539
1531 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, 1540 mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
1541 ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
1532 RT_MUTEX_MIN_CHAINWALK, 1542 RT_MUTEX_MIN_CHAINWALK,
1533 rt_mutex_slowlock); 1543 rt_mutex_slowlock);
1544 if (ret)
1545 mutex_release(&lock->dep_map, 1, _RET_IP_);
1546
1547 return ret;
1534} 1548}
1535EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); 1549EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
1536 1550
@@ -1547,10 +1561,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
1547 */ 1561 */
1548int __sched rt_mutex_trylock(struct rt_mutex *lock) 1562int __sched rt_mutex_trylock(struct rt_mutex *lock)
1549{ 1563{
1564 int ret;
1565
1550 if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) 1566 if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
1551 return 0; 1567 return 0;
1552 1568
1553 return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); 1569 ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
1570 if (ret)
1571 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
1572
1573 return ret;
1554} 1574}
1555EXPORT_SYMBOL_GPL(rt_mutex_trylock); 1575EXPORT_SYMBOL_GPL(rt_mutex_trylock);
1556 1576
@@ -1561,6 +1581,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock);
1561 */ 1581 */
1562void __sched rt_mutex_unlock(struct rt_mutex *lock) 1582void __sched rt_mutex_unlock(struct rt_mutex *lock)
1563{ 1583{
1584 mutex_release(&lock->dep_map, 1, _RET_IP_);
1564 rt_mutex_fastunlock(lock, rt_mutex_slowunlock); 1585 rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
1565} 1586}
1566EXPORT_SYMBOL_GPL(rt_mutex_unlock); 1587EXPORT_SYMBOL_GPL(rt_mutex_unlock);
@@ -1620,7 +1641,6 @@ void rt_mutex_destroy(struct rt_mutex *lock)
1620 lock->magic = NULL; 1641 lock->magic = NULL;
1621#endif 1642#endif
1622} 1643}
1623
1624EXPORT_SYMBOL_GPL(rt_mutex_destroy); 1644EXPORT_SYMBOL_GPL(rt_mutex_destroy);
1625 1645
1626/** 1646/**
@@ -1632,14 +1652,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
1632 * 1652 *
1633 * Initializing of a locked rt lock is not allowed 1653 * Initializing of a locked rt lock is not allowed
1634 */ 1654 */
1635void __rt_mutex_init(struct rt_mutex *lock, const char *name) 1655void __rt_mutex_init(struct rt_mutex *lock, const char *name,
1656 struct lock_class_key *key)
1636{ 1657{
1637 lock->owner = NULL; 1658 lock->owner = NULL;
1638 raw_spin_lock_init(&lock->wait_lock); 1659 raw_spin_lock_init(&lock->wait_lock);
1639 lock->waiters = RB_ROOT; 1660 lock->waiters = RB_ROOT;
1640 lock->waiters_leftmost = NULL; 1661 lock->waiters_leftmost = NULL;
1641 1662
1642 debug_rt_mutex_init(lock, name); 1663 if (name && key)
1664 debug_rt_mutex_init(lock, name, key);
1643} 1665}
1644EXPORT_SYMBOL_GPL(__rt_mutex_init); 1666EXPORT_SYMBOL_GPL(__rt_mutex_init);
1645 1667
@@ -1660,7 +1682,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
1660void rt_mutex_init_proxy_locked(struct rt_mutex *lock, 1682void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
1661 struct task_struct *proxy_owner) 1683 struct task_struct *proxy_owner)
1662{ 1684{
1663 __rt_mutex_init(lock, NULL); 1685 __rt_mutex_init(lock, NULL, NULL);
1664 debug_rt_mutex_proxy_lock(lock, proxy_owner); 1686 debug_rt_mutex_proxy_lock(lock, proxy_owner);
1665 rt_mutex_set_owner(lock, proxy_owner); 1687 rt_mutex_set_owner(lock, proxy_owner);
1666} 1688}
@@ -1785,12 +1807,14 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
1785 int ret; 1807 int ret;
1786 1808
1787 raw_spin_lock_irq(&lock->wait_lock); 1809 raw_spin_lock_irq(&lock->wait_lock);
1788
1789 set_current_state(TASK_INTERRUPTIBLE);
1790
1791 /* sleep on the mutex */ 1810 /* sleep on the mutex */
1811 set_current_state(TASK_INTERRUPTIBLE);
1792 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); 1812 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1793 1813 /*
1814 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
1815 * have to fix that up.
1816 */
1817 fixup_rt_mutex_waiters(lock);
1794 raw_spin_unlock_irq(&lock->wait_lock); 1818 raw_spin_unlock_irq(&lock->wait_lock);
1795 1819
1796 return ret; 1820 return ret;
@@ -1822,15 +1846,25 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
1822 1846
1823 raw_spin_lock_irq(&lock->wait_lock); 1847 raw_spin_lock_irq(&lock->wait_lock);
1824 /* 1848 /*
1849 * Do an unconditional try-lock, this deals with the lock stealing
1850 * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
1851 * sets a NULL owner.
1852 *
1853 * We're not interested in the return value, because the subsequent
1854 * test on rt_mutex_owner() will infer that. If the trylock succeeded,
1855 * we will own the lock and it will have removed the waiter. If we
1856 * failed the trylock, we're still not owner and we need to remove
1857 * ourselves.
1858 */
1859 try_to_take_rt_mutex(lock, current, waiter);
1860 /*
1825 * Unless we're the owner; we're still enqueued on the wait_list. 1861 * Unless we're the owner; we're still enqueued on the wait_list.
1826 * So check if we became owner, if not, take us off the wait_list. 1862 * So check if we became owner, if not, take us off the wait_list.
1827 */ 1863 */
1828 if (rt_mutex_owner(lock) != current) { 1864 if (rt_mutex_owner(lock) != current) {
1829 remove_waiter(lock, waiter); 1865 remove_waiter(lock, waiter);
1830 fixup_rt_mutex_waiters(lock);
1831 cleanup = true; 1866 cleanup = true;
1832 } 1867 }
1833
1834 /* 1868 /*
1835 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might 1869 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
1836 * have to fix that up. 1870 * have to fix that up.
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index 6607802efa8b..5c253caffe91 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -17,7 +17,7 @@
17#define debug_rt_mutex_proxy_lock(l,p) do { } while (0) 17#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
18#define debug_rt_mutex_proxy_unlock(l) do { } while (0) 18#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
19#define debug_rt_mutex_unlock(l) do { } while (0) 19#define debug_rt_mutex_unlock(l) do { } while (0)
20#define debug_rt_mutex_init(m, n) do { } while (0) 20#define debug_rt_mutex_init(m, n, k) do { } while (0)
21#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) 21#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
22#define debug_rt_mutex_print_deadlock(w) do { } while (0) 22#define debug_rt_mutex_print_deadlock(w) do { } while (0)
23#define debug_rt_mutex_reset_waiter(w) do { } while (0) 23#define debug_rt_mutex_reset_waiter(w) do { } while (0)
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index c65f7989f850..20819df98125 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -231,8 +231,8 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
231 231
232out_nolock: 232out_nolock:
233 list_del(&waiter.list); 233 list_del(&waiter.list);
234 if (!list_empty(&sem->wait_list)) 234 if (!list_empty(&sem->wait_list) && sem->count >= 0)
235 __rwsem_do_wake(sem, 1); 235 __rwsem_do_wake(sem, 0);
236 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 236 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
237 237
238 return -EINTR; 238 return -EINTR;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 23a6483c3666..124bed776532 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -358,7 +358,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
358 goto err_pfn_remap; 358 goto err_pfn_remap;
359 359
360 mem_hotplug_begin(); 360 mem_hotplug_begin();
361 error = arch_add_memory(nid, align_start, align_size, true); 361 error = arch_add_memory(nid, align_start, align_size, false);
362 if (!error)
363 move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
364 align_start >> PAGE_SHIFT,
365 align_size >> PAGE_SHIFT);
362 mem_hotplug_done(); 366 mem_hotplug_done();
363 if (error) 367 if (error)
364 goto err_add_memory; 368 goto err_add_memory;
diff --git a/kernel/module.c b/kernel/module.c
index 4a3665f8f837..40f983cbea81 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -49,9 +49,7 @@
49#include <linux/rculist.h> 49#include <linux/rculist.h>
50#include <linux/uaccess.h> 50#include <linux/uaccess.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#ifdef CONFIG_STRICT_MODULE_RWX 52#include <linux/set_memory.h>
53#include <asm/set_memory.h>
54#endif
55#include <asm/mmu_context.h> 53#include <asm/mmu_context.h>
56#include <linux/license.h> 54#include <linux/license.h>
57#include <asm/sections.h> 55#include <asm/sections.h>
@@ -302,6 +300,7 @@ int unregister_module_notifier(struct notifier_block *nb)
302EXPORT_SYMBOL(unregister_module_notifier); 300EXPORT_SYMBOL(unregister_module_notifier);
303 301
304struct load_info { 302struct load_info {
303 const char *name;
305 Elf_Ehdr *hdr; 304 Elf_Ehdr *hdr;
306 unsigned long len; 305 unsigned long len;
307 Elf_Shdr *sechdrs; 306 Elf_Shdr *sechdrs;
@@ -602,7 +601,7 @@ static struct module *find_module_all(const char *name, size_t len,
602 601
603 module_assert_mutex_or_preempt(); 602 module_assert_mutex_or_preempt();
604 603
605 list_for_each_entry(mod, &modules, list) { 604 list_for_each_entry_rcu(mod, &modules, list) {
606 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) 605 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
607 continue; 606 continue;
608 if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) 607 if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
@@ -1202,10 +1201,7 @@ static ssize_t store_uevent(struct module_attribute *mattr,
1202 struct module_kobject *mk, 1201 struct module_kobject *mk,
1203 const char *buffer, size_t count) 1202 const char *buffer, size_t count)
1204{ 1203{
1205 enum kobject_action action; 1204 kobject_synth_uevent(&mk->kobj, buffer, count);
1206
1207 if (kobject_action_type(buffer, count, &action) == 0)
1208 kobject_uevent(&mk->kobj, action);
1209 return count; 1205 return count;
1210} 1206}
1211 1207
@@ -1278,12 +1274,13 @@ static u32 resolve_rel_crc(const s32 *crc)
1278 return *(u32 *)((void *)crc + *crc); 1274 return *(u32 *)((void *)crc + *crc);
1279} 1275}
1280 1276
1281static int check_version(Elf_Shdr *sechdrs, 1277static int check_version(const struct load_info *info,
1282 unsigned int versindex,
1283 const char *symname, 1278 const char *symname,
1284 struct module *mod, 1279 struct module *mod,
1285 const s32 *crc) 1280 const s32 *crc)
1286{ 1281{
1282 Elf_Shdr *sechdrs = info->sechdrs;
1283 unsigned int versindex = info->index.vers;
1287 unsigned int i, num_versions; 1284 unsigned int i, num_versions;
1288 struct modversion_info *versions; 1285 struct modversion_info *versions;
1289 1286
@@ -1317,17 +1314,16 @@ static int check_version(Elf_Shdr *sechdrs,
1317 } 1314 }
1318 1315
1319 /* Broken toolchain. Warn once, then let it go.. */ 1316 /* Broken toolchain. Warn once, then let it go.. */
1320 pr_warn_once("%s: no symbol version for %s\n", mod->name, symname); 1317 pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
1321 return 1; 1318 return 1;
1322 1319
1323bad_version: 1320bad_version:
1324 pr_warn("%s: disagrees about version of symbol %s\n", 1321 pr_warn("%s: disagrees about version of symbol %s\n",
1325 mod->name, symname); 1322 info->name, symname);
1326 return 0; 1323 return 0;
1327} 1324}
1328 1325
1329static inline int check_modstruct_version(Elf_Shdr *sechdrs, 1326static inline int check_modstruct_version(const struct load_info *info,
1330 unsigned int versindex,
1331 struct module *mod) 1327 struct module *mod)
1332{ 1328{
1333 const s32 *crc; 1329 const s32 *crc;
@@ -1343,8 +1339,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1343 BUG(); 1339 BUG();
1344 } 1340 }
1345 preempt_enable(); 1341 preempt_enable();
1346 return check_version(sechdrs, versindex, 1342 return check_version(info, VMLINUX_SYMBOL_STR(module_layout),
1347 VMLINUX_SYMBOL_STR(module_layout), mod, crc); 1343 mod, crc);
1348} 1344}
1349 1345
1350/* First part is kernel version, which we ignore if module has crcs. */ 1346/* First part is kernel version, which we ignore if module has crcs. */
@@ -1358,8 +1354,7 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1358 return strcmp(amagic, bmagic) == 0; 1354 return strcmp(amagic, bmagic) == 0;
1359} 1355}
1360#else 1356#else
1361static inline int check_version(Elf_Shdr *sechdrs, 1357static inline int check_version(const struct load_info *info,
1362 unsigned int versindex,
1363 const char *symname, 1358 const char *symname,
1364 struct module *mod, 1359 struct module *mod,
1365 const s32 *crc) 1360 const s32 *crc)
@@ -1367,8 +1362,7 @@ static inline int check_version(Elf_Shdr *sechdrs,
1367 return 1; 1362 return 1;
1368} 1363}
1369 1364
1370static inline int check_modstruct_version(Elf_Shdr *sechdrs, 1365static inline int check_modstruct_version(const struct load_info *info,
1371 unsigned int versindex,
1372 struct module *mod) 1366 struct module *mod)
1373{ 1367{
1374 return 1; 1368 return 1;
@@ -1404,7 +1398,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
1404 if (!sym) 1398 if (!sym)
1405 goto unlock; 1399 goto unlock;
1406 1400
1407 if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) { 1401 if (!check_version(info, name, mod, crc)) {
1408 sym = ERR_PTR(-EINVAL); 1402 sym = ERR_PTR(-EINVAL);
1409 goto getname; 1403 goto getname;
1410 } 1404 }
@@ -1667,31 +1661,36 @@ static inline void remove_notes_attrs(struct module *mod)
1667} 1661}
1668#endif /* CONFIG_KALLSYMS */ 1662#endif /* CONFIG_KALLSYMS */
1669 1663
1670static void add_usage_links(struct module *mod) 1664static void del_usage_links(struct module *mod)
1671{ 1665{
1672#ifdef CONFIG_MODULE_UNLOAD 1666#ifdef CONFIG_MODULE_UNLOAD
1673 struct module_use *use; 1667 struct module_use *use;
1674 int nowarn;
1675 1668
1676 mutex_lock(&module_mutex); 1669 mutex_lock(&module_mutex);
1677 list_for_each_entry(use, &mod->target_list, target_list) { 1670 list_for_each_entry(use, &mod->target_list, target_list)
1678 nowarn = sysfs_create_link(use->target->holders_dir, 1671 sysfs_remove_link(use->target->holders_dir, mod->name);
1679 &mod->mkobj.kobj, mod->name);
1680 }
1681 mutex_unlock(&module_mutex); 1672 mutex_unlock(&module_mutex);
1682#endif 1673#endif
1683} 1674}
1684 1675
1685static void del_usage_links(struct module *mod) 1676static int add_usage_links(struct module *mod)
1686{ 1677{
1678 int ret = 0;
1687#ifdef CONFIG_MODULE_UNLOAD 1679#ifdef CONFIG_MODULE_UNLOAD
1688 struct module_use *use; 1680 struct module_use *use;
1689 1681
1690 mutex_lock(&module_mutex); 1682 mutex_lock(&module_mutex);
1691 list_for_each_entry(use, &mod->target_list, target_list) 1683 list_for_each_entry(use, &mod->target_list, target_list) {
1692 sysfs_remove_link(use->target->holders_dir, mod->name); 1684 ret = sysfs_create_link(use->target->holders_dir,
1685 &mod->mkobj.kobj, mod->name);
1686 if (ret)
1687 break;
1688 }
1693 mutex_unlock(&module_mutex); 1689 mutex_unlock(&module_mutex);
1690 if (ret)
1691 del_usage_links(mod);
1694#endif 1692#endif
1693 return ret;
1695} 1694}
1696 1695
1697static int module_add_modinfo_attrs(struct module *mod) 1696static int module_add_modinfo_attrs(struct module *mod)
@@ -1802,13 +1801,18 @@ static int mod_sysfs_setup(struct module *mod,
1802 if (err) 1801 if (err)
1803 goto out_unreg_param; 1802 goto out_unreg_param;
1804 1803
1805 add_usage_links(mod); 1804 err = add_usage_links(mod);
1805 if (err)
1806 goto out_unreg_modinfo_attrs;
1807
1806 add_sect_attrs(mod, info); 1808 add_sect_attrs(mod, info);
1807 add_notes_attrs(mod, info); 1809 add_notes_attrs(mod, info);
1808 1810
1809 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); 1811 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1810 return 0; 1812 return 0;
1811 1813
1814out_unreg_modinfo_attrs:
1815 module_remove_modinfo_attrs(mod);
1812out_unreg_param: 1816out_unreg_param:
1813 module_param_sysfs_remove(mod); 1817 module_param_sysfs_remove(mod);
1814out_unreg_holders: 1818out_unreg_holders:
@@ -2915,9 +2919,15 @@ static int rewrite_section_headers(struct load_info *info, int flags)
2915 info->index.vers = 0; /* Pretend no __versions section! */ 2919 info->index.vers = 0; /* Pretend no __versions section! */
2916 else 2920 else
2917 info->index.vers = find_sec(info, "__versions"); 2921 info->index.vers = find_sec(info, "__versions");
2922 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
2923
2918 info->index.info = find_sec(info, ".modinfo"); 2924 info->index.info = find_sec(info, ".modinfo");
2925 if (!info->index.info)
2926 info->name = "(missing .modinfo section)";
2927 else
2928 info->name = get_modinfo(info, "name");
2919 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; 2929 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2920 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; 2930
2921 return 0; 2931 return 0;
2922} 2932}
2923 2933
@@ -2957,21 +2967,29 @@ static struct module *setup_load_info(struct load_info *info, int flags)
2957 2967
2958 info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); 2968 info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
2959 if (!info->index.mod) { 2969 if (!info->index.mod) {
2960 pr_warn("No module found in object\n"); 2970 pr_warn("%s: No module found in object\n",
2971 info->name ?: "(missing .modinfo name field)");
2961 return ERR_PTR(-ENOEXEC); 2972 return ERR_PTR(-ENOEXEC);
2962 } 2973 }
2963 /* This is temporary: point mod into copy of data. */ 2974 /* This is temporary: point mod into copy of data. */
2964 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2975 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2965 2976
2977 /*
2978 * If we didn't load the .modinfo 'name' field, fall back to
2979 * on-disk struct mod 'name' field.
2980 */
2981 if (!info->name)
2982 info->name = mod->name;
2983
2966 if (info->index.sym == 0) { 2984 if (info->index.sym == 0) {
2967 pr_warn("%s: module has no symbols (stripped?)\n", mod->name); 2985 pr_warn("%s: module has no symbols (stripped?)\n", info->name);
2968 return ERR_PTR(-ENOEXEC); 2986 return ERR_PTR(-ENOEXEC);
2969 } 2987 }
2970 2988
2971 info->index.pcpu = find_pcpusec(info); 2989 info->index.pcpu = find_pcpusec(info);
2972 2990
2973 /* Check module struct version now, before we try to use module. */ 2991 /* Check module struct version now, before we try to use module. */
2974 if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) 2992 if (!check_modstruct_version(info, mod))
2975 return ERR_PTR(-ENOEXEC); 2993 return ERR_PTR(-ENOEXEC);
2976 2994
2977 return mod; 2995 return mod;
@@ -2992,7 +3010,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2992 return err; 3010 return err;
2993 } else if (!same_magic(modmagic, vermagic, info->index.vers)) { 3011 } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
2994 pr_err("%s: version magic '%s' should be '%s'\n", 3012 pr_err("%s: version magic '%s' should be '%s'\n",
2995 mod->name, modmagic, vermagic); 3013 info->name, modmagic, vermagic);
2996 return -ENOEXEC; 3014 return -ENOEXEC;
2997 } 3015 }
2998 3016
@@ -3077,9 +3095,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
3077 mod->trace_events = section_objs(info, "_ftrace_events", 3095 mod->trace_events = section_objs(info, "_ftrace_events",
3078 sizeof(*mod->trace_events), 3096 sizeof(*mod->trace_events),
3079 &mod->num_trace_events); 3097 &mod->num_trace_events);
3080 mod->trace_enums = section_objs(info, "_ftrace_enum_map", 3098 mod->trace_evals = section_objs(info, "_ftrace_eval_map",
3081 sizeof(*mod->trace_enums), 3099 sizeof(*mod->trace_evals),
3082 &mod->num_trace_enums); 3100 &mod->num_trace_evals);
3083#endif 3101#endif
3084#ifdef CONFIG_TRACING 3102#ifdef CONFIG_TRACING
3085 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", 3103 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
@@ -3242,7 +3260,7 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
3242 3260
3243/* module_blacklist is a comma-separated list of module names */ 3261/* module_blacklist is a comma-separated list of module names */
3244static char *module_blacklist; 3262static char *module_blacklist;
3245static bool blacklisted(char *module_name) 3263static bool blacklisted(const char *module_name)
3246{ 3264{
3247 const char *p; 3265 const char *p;
3248 size_t len; 3266 size_t len;
@@ -3272,7 +3290,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
3272 if (IS_ERR(mod)) 3290 if (IS_ERR(mod))
3273 return mod; 3291 return mod;
3274 3292
3275 if (blacklisted(mod->name)) 3293 if (blacklisted(info->name))
3276 return ERR_PTR(-EPERM); 3294 return ERR_PTR(-EPERM);
3277 3295
3278 err = check_modinfo(mod, info, flags); 3296 err = check_modinfo(mod, info, flags);
@@ -4201,7 +4219,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
4201 goto out; 4219 goto out;
4202 4220
4203 e = search_extable(mod->extable, 4221 e = search_extable(mod->extable,
4204 mod->extable + mod->num_exentries - 1, 4222 mod->num_exentries,
4205 addr); 4223 addr);
4206out: 4224out:
4207 preempt_enable(); 4225 preempt_enable();
diff --git a/kernel/padata.c b/kernel/padata.c
index ac8f1e524836..868f947166d7 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -934,29 +934,18 @@ static struct kobj_type padata_attr_type = {
934}; 934};
935 935
936/** 936/**
937 * padata_alloc_possible - Allocate and initialize padata instance.
938 * Use the cpu_possible_mask for serial and
939 * parallel workers.
940 *
941 * @wq: workqueue to use for the allocated padata instance
942 */
943struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
944{
945 return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
946}
947EXPORT_SYMBOL(padata_alloc_possible);
948
949/**
950 * padata_alloc - allocate and initialize a padata instance and specify 937 * padata_alloc - allocate and initialize a padata instance and specify
951 * cpumasks for serial and parallel workers. 938 * cpumasks for serial and parallel workers.
952 * 939 *
953 * @wq: workqueue to use for the allocated padata instance 940 * @wq: workqueue to use for the allocated padata instance
954 * @pcpumask: cpumask that will be used for padata parallelization 941 * @pcpumask: cpumask that will be used for padata parallelization
955 * @cbcpumask: cpumask that will be used for padata serialization 942 * @cbcpumask: cpumask that will be used for padata serialization
943 *
944 * Must be called from a cpus_read_lock() protected region
956 */ 945 */
957struct padata_instance *padata_alloc(struct workqueue_struct *wq, 946static struct padata_instance *padata_alloc(struct workqueue_struct *wq,
958 const struct cpumask *pcpumask, 947 const struct cpumask *pcpumask,
959 const struct cpumask *cbcpumask) 948 const struct cpumask *cbcpumask)
960{ 949{
961 struct padata_instance *pinst; 950 struct padata_instance *pinst;
962 struct parallel_data *pd = NULL; 951 struct parallel_data *pd = NULL;
@@ -965,7 +954,6 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq,
965 if (!pinst) 954 if (!pinst)
966 goto err; 955 goto err;
967 956
968 get_online_cpus();
969 if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) 957 if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
970 goto err_free_inst; 958 goto err_free_inst;
971 if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { 959 if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
@@ -989,14 +977,12 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq,
989 977
990 pinst->flags = 0; 978 pinst->flags = 0;
991 979
992 put_online_cpus();
993
994 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); 980 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
995 kobject_init(&pinst->kobj, &padata_attr_type); 981 kobject_init(&pinst->kobj, &padata_attr_type);
996 mutex_init(&pinst->lock); 982 mutex_init(&pinst->lock);
997 983
998#ifdef CONFIG_HOTPLUG_CPU 984#ifdef CONFIG_HOTPLUG_CPU
999 cpuhp_state_add_instance_nocalls(hp_online, &pinst->node); 985 cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node);
1000#endif 986#endif
1001 return pinst; 987 return pinst;
1002 988
@@ -1005,12 +991,27 @@ err_free_masks:
1005 free_cpumask_var(pinst->cpumask.cbcpu); 991 free_cpumask_var(pinst->cpumask.cbcpu);
1006err_free_inst: 992err_free_inst:
1007 kfree(pinst); 993 kfree(pinst);
1008 put_online_cpus();
1009err: 994err:
1010 return NULL; 995 return NULL;
1011} 996}
1012 997
1013/** 998/**
999 * padata_alloc_possible - Allocate and initialize padata instance.
1000 * Use the cpu_possible_mask for serial and
1001 * parallel workers.
1002 *
1003 * @wq: workqueue to use for the allocated padata instance
1004 *
1005 * Must be called from a cpus_read_lock() protected region
1006 */
1007struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
1008{
1009 lockdep_assert_cpus_held();
1010 return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
1011}
1012EXPORT_SYMBOL(padata_alloc_possible);
1013
1014/**
1014 * padata_free - free a padata instance 1015 * padata_free - free a padata instance
1015 * 1016 *
1016 * @padata_inst: padata instance to free 1017 * @padata_inst: padata instance to free
diff --git a/kernel/pid.c b/kernel/pid.c
index fd1cde1e4576..731c4e528f4e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -575,16 +575,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
575 */ 575 */
576void __init pidhash_init(void) 576void __init pidhash_init(void)
577{ 577{
578 unsigned int i, pidhash_size; 578 unsigned int pidhash_size;
579 579
580 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, 580 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
581 HASH_EARLY | HASH_SMALL, 581 HASH_EARLY | HASH_SMALL | HASH_ZERO,
582 &pidhash_shift, NULL, 582 &pidhash_shift, NULL,
583 0, 4096); 583 0, 4096);
584 pidhash_size = 1U << pidhash_shift; 584 pidhash_size = 1U << pidhash_shift;
585
586 for (i = 0; i < pidhash_size; i++)
587 INIT_HLIST_HEAD(&pid_hash[i]);
588} 585}
589 586
590void __init pidmap_init(void) 587void __init pidmap_init(void)
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a8b978c35a6a..e1914c7b85b1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1108,7 +1108,7 @@ static struct attribute * g[] = {
1108}; 1108};
1109 1109
1110 1110
1111static struct attribute_group attr_group = { 1111static const struct attribute_group attr_group = {
1112 .attrs = g, 1112 .attrs = g,
1113}; 1113};
1114 1114
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d401c21136d1..42bd800a6755 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -705,7 +705,7 @@ static struct attribute * g[] = {
705 NULL, 705 NULL,
706}; 706};
707 707
708static struct attribute_group attr_group = { 708static const struct attribute_group attr_group = {
709 .attrs = g, 709 .attrs = g,
710}; 710};
711 711
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3b1e0f3ad07f..222317721c5a 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -30,19 +30,17 @@
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/compiler.h> 31#include <linux/compiler.h>
32#include <linux/ktime.h> 32#include <linux/ktime.h>
33#include <linux/set_memory.h>
33 34
34#include <linux/uaccess.h> 35#include <linux/uaccess.h>
35#include <asm/mmu_context.h> 36#include <asm/mmu_context.h>
36#include <asm/pgtable.h> 37#include <asm/pgtable.h>
37#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
38#include <asm/io.h> 39#include <asm/io.h>
39#ifdef CONFIG_STRICT_KERNEL_RWX
40#include <asm/set_memory.h>
41#endif
42 40
43#include "power.h" 41#include "power.h"
44 42
45#ifdef CONFIG_STRICT_KERNEL_RWX 43#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_ARCH_HAS_SET_MEMORY)
46static bool hibernate_restore_protection; 44static bool hibernate_restore_protection;
47static bool hibernate_restore_protection_active; 45static bool hibernate_restore_protection_active;
48 46
@@ -77,7 +75,7 @@ static inline void hibernate_restore_protection_begin(void) {}
77static inline void hibernate_restore_protection_end(void) {} 75static inline void hibernate_restore_protection_end(void) {}
78static inline void hibernate_restore_protect_page(void *page_address) {} 76static inline void hibernate_restore_protect_page(void *page_address) {}
79static inline void hibernate_restore_unprotect_page(void *page_address) {} 77static inline void hibernate_restore_unprotect_page(void *page_address) {}
80#endif /* CONFIG_STRICT_KERNEL_RWX */ 78#endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */
81 79
82static int swsusp_page_is_free(struct page *); 80static int swsusp_page_is_free(struct page *);
83static void swsusp_set_page_forbidden(struct page *); 81static void swsusp_set_page_forbidden(struct page *);
@@ -1425,7 +1423,7 @@ static unsigned int nr_meta_pages;
1425 * Numbers of normal and highmem page frames allocated for hibernation image 1423 * Numbers of normal and highmem page frames allocated for hibernation image
1426 * before suspending devices. 1424 * before suspending devices.
1427 */ 1425 */
1428unsigned int alloc_normal, alloc_highmem; 1426static unsigned int alloc_normal, alloc_highmem;
1429/* 1427/*
1430 * Memory bitmap used for marking saveable pages (during hibernation) or 1428 * Memory bitmap used for marking saveable pages (during hibernation) or
1431 * hibernation image pages (during restore) 1429 * hibernation image pages (during restore)
@@ -1929,8 +1927,7 @@ static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
1929 * also be located in the high memory, because of the way in which 1927 * also be located in the high memory, because of the way in which
1930 * copy_data_pages() works. 1928 * copy_data_pages() works.
1931 */ 1929 */
1932static int swsusp_alloc(struct memory_bitmap *orig_bm, 1930static int swsusp_alloc(struct memory_bitmap *copy_bm,
1933 struct memory_bitmap *copy_bm,
1934 unsigned int nr_pages, unsigned int nr_highmem) 1931 unsigned int nr_pages, unsigned int nr_highmem)
1935{ 1932{
1936 if (nr_highmem > 0) { 1933 if (nr_highmem > 0) {
@@ -1976,7 +1973,7 @@ asmlinkage __visible int swsusp_save(void)
1976 return -ENOMEM; 1973 return -ENOMEM;
1977 } 1974 }
1978 1975
1979 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) { 1976 if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem)) {
1980 printk(KERN_ERR "PM: Memory allocation failed\n"); 1977 printk(KERN_ERR "PM: Memory allocation failed\n");
1981 return -ENOMEM; 1978 return -ENOMEM;
1982 } 1979 }
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c0248c74d6d4..3ecf275d7e44 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -86,11 +86,9 @@ static void freeze_enter(void)
86 86
87 /* Push all the CPUs into the idle loop. */ 87 /* Push all the CPUs into the idle loop. */
88 wake_up_all_idle_cpus(); 88 wake_up_all_idle_cpus();
89 pr_debug("PM: suspend-to-idle\n");
90 /* Make the current CPU wait so it can enter the idle loop too. */ 89 /* Make the current CPU wait so it can enter the idle loop too. */
91 wait_event(suspend_freeze_wait_head, 90 wait_event(suspend_freeze_wait_head,
92 suspend_freeze_state == FREEZE_STATE_WAKE); 91 suspend_freeze_state == FREEZE_STATE_WAKE);
93 pr_debug("PM: resume from suspend-to-idle\n");
94 92
95 cpuidle_pause(); 93 cpuidle_pause();
96 put_online_cpus(); 94 put_online_cpus();
@@ -106,6 +104,8 @@ static void freeze_enter(void)
106 104
107static void s2idle_loop(void) 105static void s2idle_loop(void)
108{ 106{
107 pr_debug("PM: suspend-to-idle\n");
108
109 do { 109 do {
110 freeze_enter(); 110 freeze_enter();
111 111
@@ -121,6 +121,8 @@ static void s2idle_loop(void)
121 121
122 pm_wakeup_clear(false); 122 pm_wakeup_clear(false);
123 } while (!dpm_suspend_noirq(PMSG_SUSPEND)); 123 } while (!dpm_suspend_noirq(PMSG_SUSPEND));
124
125 pr_debug("PM: resume from suspend-to-idle\n");
124} 126}
125 127
126void freeze_wake(void) 128void freeze_wake(void)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index f80fd33639e0..57d22571f306 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -225,14 +225,14 @@ static struct block_device *hib_resume_bdev;
225struct hib_bio_batch { 225struct hib_bio_batch {
226 atomic_t count; 226 atomic_t count;
227 wait_queue_head_t wait; 227 wait_queue_head_t wait;
228 int error; 228 blk_status_t error;
229}; 229};
230 230
231static void hib_init_batch(struct hib_bio_batch *hb) 231static void hib_init_batch(struct hib_bio_batch *hb)
232{ 232{
233 atomic_set(&hb->count, 0); 233 atomic_set(&hb->count, 0);
234 init_waitqueue_head(&hb->wait); 234 init_waitqueue_head(&hb->wait);
235 hb->error = 0; 235 hb->error = BLK_STS_OK;
236} 236}
237 237
238static void hib_end_io(struct bio *bio) 238static void hib_end_io(struct bio *bio)
@@ -240,7 +240,7 @@ static void hib_end_io(struct bio *bio)
240 struct hib_bio_batch *hb = bio->bi_private; 240 struct hib_bio_batch *hb = bio->bi_private;
241 struct page *page = bio->bi_io_vec[0].bv_page; 241 struct page *page = bio->bi_io_vec[0].bv_page;
242 242
243 if (bio->bi_error) { 243 if (bio->bi_status) {
244 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", 244 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
245 imajor(bio->bi_bdev->bd_inode), 245 imajor(bio->bi_bdev->bd_inode),
246 iminor(bio->bi_bdev->bd_inode), 246 iminor(bio->bi_bdev->bd_inode),
@@ -253,8 +253,8 @@ static void hib_end_io(struct bio *bio)
253 flush_icache_range((unsigned long)page_address(page), 253 flush_icache_range((unsigned long)page_address(page),
254 (unsigned long)page_address(page) + PAGE_SIZE); 254 (unsigned long)page_address(page) + PAGE_SIZE);
255 255
256 if (bio->bi_error && !hb->error) 256 if (bio->bi_status && !hb->error)
257 hb->error = bio->bi_error; 257 hb->error = bio->bi_status;
258 if (atomic_dec_and_test(&hb->count)) 258 if (atomic_dec_and_test(&hb->count))
259 wake_up(&hb->wait); 259 wake_up(&hb->wait);
260 260
@@ -293,10 +293,10 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
293 return error; 293 return error;
294} 294}
295 295
296static int hib_wait_io(struct hib_bio_batch *hb) 296static blk_status_t hib_wait_io(struct hib_bio_batch *hb)
297{ 297{
298 wait_event(hb->wait, atomic_read(&hb->count) == 0); 298 wait_event(hb->wait, atomic_read(&hb->count) == 0);
299 return hb->error; 299 return blk_status_to_errno(hb->error);
300} 300}
301 301
302/* 302/*
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 1db044f808b7..2a7d04049af4 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -18,12 +18,14 @@
18 18
19#ifdef CONFIG_PRINTK 19#ifdef CONFIG_PRINTK
20 20
21#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff 21#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff
22#define PRINTK_NMI_CONTEXT_MASK 0x80000000 22#define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000
23#define PRINTK_NMI_CONTEXT_MASK 0x80000000
23 24
24extern raw_spinlock_t logbuf_lock; 25extern raw_spinlock_t logbuf_lock;
25 26
26__printf(1, 0) int vprintk_default(const char *fmt, va_list args); 27__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
28__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
27__printf(1, 0) int vprintk_func(const char *fmt, va_list args); 29__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
28void __printk_safe_enter(void); 30void __printk_safe_enter(void);
29void __printk_safe_exit(void); 31void __printk_safe_exit(void);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a1aecf44ab07..fc47863f629c 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -269,7 +269,6 @@ static struct console *exclusive_console;
269#define MAX_CMDLINECONSOLES 8 269#define MAX_CMDLINECONSOLES 8
270 270
271static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; 271static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
272static int console_cmdline_cnt;
273 272
274static int preferred_console = -1; 273static int preferred_console = -1;
275int console_set_on_cmdline; 274int console_set_on_cmdline;
@@ -1176,7 +1175,7 @@ static void boot_delay_msec(int level)
1176 unsigned long long k; 1175 unsigned long long k;
1177 unsigned long timeout; 1176 unsigned long timeout;
1178 1177
1179 if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) 1178 if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING)
1180 || suppress_message_printing(level)) { 1179 || suppress_message_printing(level)) {
1181 return; 1180 return;
1182 } 1181 }
@@ -1906,25 +1905,12 @@ static int __add_preferred_console(char *name, int idx, char *options,
1906 * See if this tty is not yet registered, and 1905 * See if this tty is not yet registered, and
1907 * if we have a slot free. 1906 * if we have a slot free.
1908 */ 1907 */
1909 for (i = 0, c = console_cmdline; i < console_cmdline_cnt; i++, c++) { 1908 for (i = 0, c = console_cmdline;
1909 i < MAX_CMDLINECONSOLES && c->name[0];
1910 i++, c++) {
1910 if (strcmp(c->name, name) == 0 && c->index == idx) { 1911 if (strcmp(c->name, name) == 0 && c->index == idx) {
1911 if (brl_options) 1912 if (!brl_options)
1912 return 0; 1913 preferred_console = i;
1913
1914 /*
1915 * Maintain an invariant that will help to find if
1916 * the matching console is preferred, see
1917 * register_console():
1918 *
1919 * The last non-braille console is always
1920 * the preferred one.
1921 */
1922 if (i != console_cmdline_cnt - 1)
1923 swap(console_cmdline[i],
1924 console_cmdline[console_cmdline_cnt - 1]);
1925
1926 preferred_console = console_cmdline_cnt - 1;
1927
1928 return 0; 1914 return 0;
1929 } 1915 }
1930 } 1916 }
@@ -1937,7 +1923,6 @@ static int __add_preferred_console(char *name, int idx, char *options,
1937 braille_set_options(c, brl_options); 1923 braille_set_options(c, brl_options);
1938 1924
1939 c->index = idx; 1925 c->index = idx;
1940 console_cmdline_cnt++;
1941 return 0; 1926 return 0;
1942} 1927}
1943/* 1928/*
@@ -2477,23 +2462,12 @@ void register_console(struct console *newcon)
2477 } 2462 }
2478 2463
2479 /* 2464 /*
2480 * See if this console matches one we selected on the command line. 2465 * See if this console matches one we selected on
2481 * 2466 * the command line.
2482 * There may be several entries in the console_cmdline array matching
2483 * with the same console, one with newcon->match(), another by
2484 * name/index:
2485 *
2486 * pl011,mmio,0x87e024000000,115200 -- added from SPCR
2487 * ttyAMA0 -- added from command line
2488 *
2489 * Traverse the console_cmdline array in reverse order to be
2490 * sure that if this console is preferred then it will be the first
2491 * matching entry. We use the invariant that is maintained in
2492 * __add_preferred_console().
2493 */ 2467 */
2494 for (i = console_cmdline_cnt - 1; i >= 0; i--) { 2468 for (i = 0, c = console_cmdline;
2495 c = console_cmdline + i; 2469 i < MAX_CMDLINECONSOLES && c->name[0];
2496 2470 i++, c++) {
2497 if (!newcon->match || 2471 if (!newcon->match ||
2498 newcon->match(newcon, c->name, c->index, c->options) != 0) { 2472 newcon->match(newcon, c->name, c->index, c->options) != 0) {
2499 /* default matching */ 2473 /* default matching */
@@ -2746,16 +2720,13 @@ void wake_up_klogd(void)
2746 preempt_enable(); 2720 preempt_enable();
2747} 2721}
2748 2722
2749int printk_deferred(const char *fmt, ...) 2723int vprintk_deferred(const char *fmt, va_list args)
2750{ 2724{
2751 va_list args;
2752 int r; 2725 int r;
2753 2726
2754 preempt_disable();
2755 va_start(args, fmt);
2756 r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); 2727 r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
2757 va_end(args);
2758 2728
2729 preempt_disable();
2759 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); 2730 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
2760 irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); 2731 irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
2761 preempt_enable(); 2732 preempt_enable();
@@ -2763,6 +2734,18 @@ int printk_deferred(const char *fmt, ...)
2763 return r; 2734 return r;
2764} 2735}
2765 2736
2737int printk_deferred(const char *fmt, ...)
2738{
2739 va_list args;
2740 int r;
2741
2742 va_start(args, fmt);
2743 r = vprintk_deferred(fmt, args);
2744 va_end(args);
2745
2746 return r;
2747}
2748
2766/* 2749/*
2767 * printk rate limiting, lifted from the networking subsystem. 2750 * printk rate limiting, lifted from the networking subsystem.
2768 * 2751 *
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 033e50a7d706..3cdaeaef9ce1 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -80,8 +80,8 @@ static void queue_flush_work(struct printk_safe_seq_buf *s)
80 * happen, printk_safe_log_store() will notice the buffer->len mismatch 80 * happen, printk_safe_log_store() will notice the buffer->len mismatch
81 * and repeat the write. 81 * and repeat the write.
82 */ 82 */
83static int printk_safe_log_store(struct printk_safe_seq_buf *s, 83static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
84 const char *fmt, va_list args) 84 const char *fmt, va_list args)
85{ 85{
86 int add; 86 int add;
87 size_t len; 87 size_t len;
@@ -299,7 +299,7 @@ void printk_safe_flush_on_panic(void)
299 * one writer running. But the buffer might get flushed from another 299 * one writer running. But the buffer might get flushed from another
300 * CPU, so we need to be careful. 300 * CPU, so we need to be careful.
301 */ 301 */
302static int vprintk_nmi(const char *fmt, va_list args) 302static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
303{ 303{
304 struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); 304 struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
305 305
@@ -308,17 +308,29 @@ static int vprintk_nmi(const char *fmt, va_list args)
308 308
309void printk_nmi_enter(void) 309void printk_nmi_enter(void)
310{ 310{
311 this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); 311 /*
312 * The size of the extra per-CPU buffer is limited. Use it only when
313 * the main one is locked. If this CPU is not in the safe context,
314 * the lock must be taken on another CPU and we could wait for it.
315 */
316 if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) &&
317 raw_spin_is_locked(&logbuf_lock)) {
318 this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
319 } else {
320 this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK);
321 }
312} 322}
313 323
314void printk_nmi_exit(void) 324void printk_nmi_exit(void)
315{ 325{
316 this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); 326 this_cpu_and(printk_context,
327 ~(PRINTK_NMI_CONTEXT_MASK |
328 PRINTK_NMI_DEFERRED_CONTEXT_MASK));
317} 329}
318 330
319#else 331#else
320 332
321static int vprintk_nmi(const char *fmt, va_list args) 333static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
322{ 334{
323 return 0; 335 return 0;
324} 336}
@@ -330,7 +342,7 @@ static int vprintk_nmi(const char *fmt, va_list args)
330 * into itself. It uses a per-CPU buffer to store the message, just like 342 * into itself. It uses a per-CPU buffer to store the message, just like
331 * NMI. 343 * NMI.
332 */ 344 */
333static int vprintk_safe(const char *fmt, va_list args) 345static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args)
334{ 346{
335 struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); 347 struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
336 348
@@ -351,12 +363,22 @@ void __printk_safe_exit(void)
351 363
352__printf(1, 0) int vprintk_func(const char *fmt, va_list args) 364__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
353{ 365{
366 /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
354 if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) 367 if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
355 return vprintk_nmi(fmt, args); 368 return vprintk_nmi(fmt, args);
356 369
370 /* Use extra buffer to prevent a recursion deadlock in safe mode. */
357 if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) 371 if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
358 return vprintk_safe(fmt, args); 372 return vprintk_safe(fmt, args);
359 373
374 /*
375 * Use the main logbuf when logbuf_lock is available in NMI.
376 * But avoid calling console drivers that might have their own locks.
377 */
378 if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK)
379 return vprintk_deferred(fmt, args);
380
381 /* No obstacles. */
360 return vprintk_default(fmt, args); 382 return vprintk_default(fmt, args);
361} 383}
362 384
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 266ddcc1d8bb..60f356d91060 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -60,19 +60,25 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
60} 60}
61 61
62 62
63void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
64 const struct cred *ptracer_cred)
65{
66 BUG_ON(!list_empty(&child->ptrace_entry));
67 list_add(&child->ptrace_entry, &new_parent->ptraced);
68 child->parent = new_parent;
69 child->ptracer_cred = get_cred(ptracer_cred);
70}
71
63/* 72/*
64 * ptrace a task: make the debugger its new parent and 73 * ptrace a task: make the debugger its new parent and
65 * move it to the ptrace list. 74 * move it to the ptrace list.
66 * 75 *
67 * Must be called with the tasklist lock write-held. 76 * Must be called with the tasklist lock write-held.
68 */ 77 */
69void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) 78static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)
70{ 79{
71 BUG_ON(!list_empty(&child->ptrace_entry));
72 list_add(&child->ptrace_entry, &new_parent->ptraced);
73 child->parent = new_parent;
74 rcu_read_lock(); 80 rcu_read_lock();
75 child->ptracer_cred = get_cred(__task_cred(new_parent)); 81 __ptrace_link(child, new_parent, __task_cred(new_parent));
76 rcu_read_unlock(); 82 rcu_read_unlock();
77} 83}
78 84
@@ -386,7 +392,7 @@ static int ptrace_attach(struct task_struct *task, long request,
386 flags |= PT_SEIZED; 392 flags |= PT_SEIZED;
387 task->ptrace = flags; 393 task->ptrace = flags;
388 394
389 __ptrace_link(task, current); 395 ptrace_link(task, current);
390 396
391 /* SEIZE doesn't trap tracee on attach */ 397 /* SEIZE doesn't trap tracee on attach */
392 if (!seize) 398 if (!seize)
@@ -459,7 +465,7 @@ static int ptrace_traceme(void)
459 */ 465 */
460 if (!ret && !(current->real_parent->flags & PF_EXITING)) { 466 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
461 current->ptrace = PT_PTRACED; 467 current->ptrace = PT_PTRACED;
462 __ptrace_link(current, current->real_parent); 468 ptrace_link(current, current->real_parent);
463 } 469 }
464 } 470 }
465 write_unlock_irq(&tasklist_lock); 471 write_unlock_irq(&tasklist_lock);
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
new file mode 100644
index 000000000000..be90c945063f
--- /dev/null
+++ b/kernel/rcu/Kconfig
@@ -0,0 +1,242 @@
1#
2# RCU-related configuration options
3#
4
5menu "RCU Subsystem"
6
7config TREE_RCU
8 bool
9 default y if !PREEMPT && SMP
10 help
11 This option selects the RCU implementation that is
12 designed for very large SMP system with hundreds or
13 thousands of CPUs. It also scales down nicely to
14 smaller systems.
15
16config PREEMPT_RCU
17 bool
18 default y if PREEMPT
19 help
20 This option selects the RCU implementation that is
21 designed for very large SMP systems with hundreds or
22 thousands of CPUs, but for which real-time response
23 is also required. It also scales down nicely to
24 smaller systems.
25
26 Select this option if you are unsure.
27
28config TINY_RCU
29 bool
30 default y if !PREEMPT && !SMP
31 help
32 This option selects the RCU implementation that is
33 designed for UP systems from which real-time response
34 is not required. This option greatly reduces the
35 memory footprint of RCU.
36
37config RCU_EXPERT
38 bool "Make expert-level adjustments to RCU configuration"
39 default n
40 help
41 This option needs to be enabled if you wish to make
42 expert-level adjustments to RCU configuration. By default,
43 no such adjustments can be made, which has the often-beneficial
44 side-effect of preventing "make oldconfig" from asking you all
45 sorts of detailed questions about how you would like numerous
46 obscure RCU options to be set up.
47
48 Say Y if you need to make expert-level adjustments to RCU.
49
50 Say N if you are unsure.
51
52config SRCU
53 bool
54 help
55 This option selects the sleepable version of RCU. This version
56 permits arbitrary sleeping or blocking within RCU read-side critical
57 sections.
58
59config TINY_SRCU
60 bool
61 default y if SRCU && TINY_RCU
62 help
63 This option selects the single-CPU non-preemptible version of SRCU.
64
65config TREE_SRCU
66 bool
67 default y if SRCU && !TINY_RCU
68 help
69 This option selects the full-fledged version of SRCU.
70
71config TASKS_RCU
72 bool
73 default n
74 select SRCU
75 help
76 This option enables a task-based RCU implementation that uses
77 only voluntary context switch (not preemption!), idle, and
78 user-mode execution as quiescent states.
79
80config RCU_STALL_COMMON
81 def_bool ( TREE_RCU || PREEMPT_RCU )
82 help
83 This option enables RCU CPU stall code that is common between
84 the TINY and TREE variants of RCU. The purpose is to allow
85 the tiny variants to disable RCU CPU stall warnings, while
86 making these warnings mandatory for the tree variants.
87
88config RCU_NEED_SEGCBLIST
89 def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
90
91config CONTEXT_TRACKING
92 bool
93
94config CONTEXT_TRACKING_FORCE
95 bool "Force context tracking"
96 depends on CONTEXT_TRACKING
97 default y if !NO_HZ_FULL
98 help
99 The major pre-requirement for full dynticks to work is to
100 support the context tracking subsystem. But there are also
101 other dependencies to provide in order to make the full
102 dynticks working.
103
104 This option stands for testing when an arch implements the
105 context tracking backend but doesn't yet fullfill all the
106 requirements to make the full dynticks feature working.
107 Without the full dynticks, there is no way to test the support
108 for context tracking and the subsystems that rely on it: RCU
109 userspace extended quiescent state and tickless cputime
110 accounting. This option copes with the absence of the full
111 dynticks subsystem by forcing the context tracking on all
112 CPUs in the system.
113
114 Say Y only if you're working on the development of an
115 architecture backend for the context tracking.
116
117 Say N otherwise, this option brings an overhead that you
118 don't want in production.
119
120
121config RCU_FANOUT
122 int "Tree-based hierarchical RCU fanout value"
123 range 2 64 if 64BIT
124 range 2 32 if !64BIT
125 depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
126 default 64 if 64BIT
127 default 32 if !64BIT
128 help
129 This option controls the fanout of hierarchical implementations
130 of RCU, allowing RCU to work efficiently on machines with
131 large numbers of CPUs. This value must be at least the fourth
132 root of NR_CPUS, which allows NR_CPUS to be insanely large.
133 The default value of RCU_FANOUT should be used for production
134 systems, but if you are stress-testing the RCU implementation
135 itself, small RCU_FANOUT values allow you to test large-system
136 code paths on small(er) systems.
137
138 Select a specific number if testing RCU itself.
139 Take the default if unsure.
140
141config RCU_FANOUT_LEAF
142 int "Tree-based hierarchical RCU leaf-level fanout value"
143 range 2 64 if 64BIT
144 range 2 32 if !64BIT
145 depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
146 default 16
147 help
148 This option controls the leaf-level fanout of hierarchical
149 implementations of RCU, and allows trading off cache misses
150 against lock contention. Systems that synchronize their
151 scheduling-clock interrupts for energy-efficiency reasons will
152 want the default because the smaller leaf-level fanout keeps
153 lock contention levels acceptably low. Very large systems
154 (hundreds or thousands of CPUs) will instead want to set this
155 value to the maximum value possible in order to reduce the
156 number of cache misses incurred during RCU's grace-period
157 initialization. These systems tend to run CPU-bound, and thus
158 are not helped by synchronized interrupts, and thus tend to
159 skew them, which reduces lock contention enough that large
160 leaf-level fanouts work well. That said, setting leaf-level
161 fanout to a large number will likely cause problematic
162 lock contention on the leaf-level rcu_node structures unless
163 you boot with the skew_tick kernel parameter.
164
165 Select a specific number if testing RCU itself.
166
167 Select the maximum permissible value for large systems, but
168 please understand that you may also need to set the skew_tick
169 kernel boot parameter to avoid contention on the rcu_node
170 structure's locks.
171
172 Take the default if unsure.
173
174config RCU_FAST_NO_HZ
175 bool "Accelerate last non-dyntick-idle CPU's grace periods"
176 depends on NO_HZ_COMMON && SMP && RCU_EXPERT
177 default n
178 help
179 This option permits CPUs to enter dynticks-idle state even if
180 they have RCU callbacks queued, and prevents RCU from waking
181 these CPUs up more than roughly once every four jiffies (by
182 default, you can adjust this using the rcutree.rcu_idle_gp_delay
183 parameter), thus improving energy efficiency. On the other
184 hand, this option increases the duration of RCU grace periods,
185 for example, slowing down synchronize_rcu().
186
187 Say Y if energy efficiency is critically important, and you
188 don't care about increased grace-period durations.
189
190 Say N if you are unsure.
191
192config RCU_BOOST
193 bool "Enable RCU priority boosting"
194 depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
195 default n
196 help
197 This option boosts the priority of preempted RCU readers that
198 block the current preemptible RCU grace period for too long.
199 This option also prevents heavy loads from blocking RCU
200 callback invocation for all flavors of RCU.
201
202 Say Y here if you are working with real-time apps or heavy loads
203 Say N here if you are unsure.
204
205config RCU_BOOST_DELAY
206 int "Milliseconds to delay boosting after RCU grace-period start"
207 range 0 3000
208 depends on RCU_BOOST
209 default 500
210 help
211 This option specifies the time to wait after the beginning of
212 a given grace period before priority-boosting preempted RCU
213 readers blocking that grace period. Note that any RCU reader
214 blocking an expedited RCU grace period is boosted immediately.
215
216 Accept the default if unsure.
217
218config RCU_NOCB_CPU
219 bool "Offload RCU callback processing from boot-selected CPUs"
220 depends on TREE_RCU || PREEMPT_RCU
221 depends on RCU_EXPERT || NO_HZ_FULL
222 default n
223 help
224 Use this option to reduce OS jitter for aggressive HPC or
225 real-time workloads. It can also be used to offload RCU
226 callback invocation to energy-efficient CPUs in battery-powered
227 asymmetric multiprocessors.
228
229 This option offloads callback invocation from the set of
230 CPUs specified at boot time by the rcu_nocbs parameter.
231 For each such CPU, a kthread ("rcuox/N") will be created to
232 invoke callbacks, where the "N" is the CPU being offloaded,
233 and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
234 "s" for RCU-sched. Nothing prevents this kthread from running
235 on the specified CPUs, but (1) the kthreads may be preempted
236 between each callback, and (2) affinity or cgroups can be used
237 to force the kthreads to run on whatever set of CPUs is desired.
238
239 Say Y here if you want to help to debug reduced OS jitter.
240 Say N here if you are unsure.
241
242endmenu # "RCU Subsystem"
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
new file mode 100644
index 000000000000..0ec7d1d33a14
--- /dev/null
+++ b/kernel/rcu/Kconfig.debug
@@ -0,0 +1,82 @@
1#
2# RCU-related debugging configuration options
3#
4
5menu "RCU Debugging"
6
7config PROVE_RCU
8 def_bool PROVE_LOCKING
9
10config TORTURE_TEST
11 tristate
12 default n
13
14config RCU_PERF_TEST
15 tristate "performance tests for RCU"
16 depends on DEBUG_KERNEL
17 select TORTURE_TEST
18 select SRCU
19 select TASKS_RCU
20 default n
21 help
22 This option provides a kernel module that runs performance
23 tests on the RCU infrastructure. The kernel module may be built
24 after the fact on the running kernel to be tested, if desired.
25
26 Say Y here if you want RCU performance tests to be built into
27 the kernel.
28 Say M if you want the RCU performance tests to build as a module.
29 Say N if you are unsure.
30
31config RCU_TORTURE_TEST
32 tristate "torture tests for RCU"
33 depends on DEBUG_KERNEL
34 select TORTURE_TEST
35 select SRCU
36 select TASKS_RCU
37 default n
38 help
39 This option provides a kernel module that runs torture tests
40 on the RCU infrastructure. The kernel module may be built
41 after the fact on the running kernel to be tested, if desired.
42
43 Say Y here if you want RCU torture tests to be built into
44 the kernel.
45 Say M if you want the RCU torture tests to build as a module.
46 Say N if you are unsure.
47
48config RCU_CPU_STALL_TIMEOUT
49 int "RCU CPU stall timeout in seconds"
50 depends on RCU_STALL_COMMON
51 range 3 300
52 default 21
53 help
54 If a given RCU grace period extends more than the specified
55 number of seconds, a CPU stall warning is printed. If the
56 RCU grace period persists, additional CPU stall warnings are
57 printed at more widely spaced intervals.
58
59config RCU_TRACE
60 bool "Enable tracing for RCU"
61 depends on DEBUG_KERNEL
62 default y if TREE_RCU
63 select TRACE_CLOCK
64 help
65 This option enables additional tracepoints for ftrace-style
66 event tracing.
67
68 Say Y here if you want to enable RCU tracing
69 Say N if you are unsure.
70
71config RCU_EQS_DEBUG
72 bool "Provide debugging asserts for adding NO_HZ support to an arch"
73 depends on DEBUG_KERNEL
74 help
75 This option provides consistency checks in RCU's handling of
76 NO_HZ. These checks have proven quite helpful in detecting
77 bugs in arch-specific NO_HZ code.
78
79 Say N here if you need ultimate kernel/user switch latencies
80 Say Y if you are unsure
81
82endmenu # "RCU Debugging"
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 23803c7d5180..13c0fc852767 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -3,13 +3,11 @@
3KCOV_INSTRUMENT := n 3KCOV_INSTRUMENT := n
4 4
5obj-y += update.o sync.o 5obj-y += update.o sync.o
6obj-$(CONFIG_CLASSIC_SRCU) += srcu.o
7obj-$(CONFIG_TREE_SRCU) += srcutree.o 6obj-$(CONFIG_TREE_SRCU) += srcutree.o
8obj-$(CONFIG_TINY_SRCU) += srcutiny.o 7obj-$(CONFIG_TINY_SRCU) += srcutiny.o
9obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 8obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
10obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o 9obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
11obj-$(CONFIG_TREE_RCU) += tree.o 10obj-$(CONFIG_TREE_RCU) += tree.o
12obj-$(CONFIG_PREEMPT_RCU) += tree.o 11obj-$(CONFIG_PREEMPT_RCU) += tree.o
13obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
14obj-$(CONFIG_TINY_RCU) += tiny.o 12obj-$(CONFIG_TINY_RCU) += tiny.o
15obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o 13obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 73e16ec4054b..808b8c85f626 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -212,6 +212,18 @@ int rcu_jiffies_till_stall_check(void);
212 */ 212 */
213#define TPS(x) tracepoint_string(x) 213#define TPS(x) tracepoint_string(x)
214 214
215/*
216 * Dump the ftrace buffer, but only one time per callsite per boot.
217 */
218#define rcu_ftrace_dump(oops_dump_mode) \
219do { \
220 static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
221 \
222 if (!atomic_read(&___rfd_beenhere) && \
223 !atomic_xchg(&___rfd_beenhere, 1)) \
224 ftrace_dump(oops_dump_mode); \
225} while (0)
226
215void rcu_early_boot_tests(void); 227void rcu_early_boot_tests(void);
216void rcu_test_sync_prims(void); 228void rcu_test_sync_prims(void);
217 229
@@ -291,6 +303,271 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
291 cpu <= rnp->grphi; \ 303 cpu <= rnp->grphi; \
292 cpu = cpumask_next((cpu), cpu_possible_mask)) 304 cpu = cpumask_next((cpu), cpu_possible_mask))
293 305
306/*
307 * Wrappers for the rcu_node::lock acquire and release.
308 *
309 * Because the rcu_nodes form a tree, the tree traversal locking will observe
310 * different lock values, this in turn means that an UNLOCK of one level
311 * followed by a LOCK of another level does not imply a full memory barrier;
312 * and most importantly transitivity is lost.
313 *
314 * In order to restore full ordering between tree levels, augment the regular
315 * lock acquire functions with smp_mb__after_unlock_lock().
316 *
317 * As ->lock of struct rcu_node is a __private field, therefore one should use
318 * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock.
319 */
320#define raw_spin_lock_rcu_node(p) \
321do { \
322 raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \
323 smp_mb__after_unlock_lock(); \
324} while (0)
325
326#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock))
327
328#define raw_spin_lock_irq_rcu_node(p) \
329do { \
330 raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
331 smp_mb__after_unlock_lock(); \
332} while (0)
333
334#define raw_spin_unlock_irq_rcu_node(p) \
335 raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
336
337#define raw_spin_lock_irqsave_rcu_node(p, flags) \
338do { \
339 raw_spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
340 smp_mb__after_unlock_lock(); \
341} while (0)
342
343#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \
344 raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
345
346#define raw_spin_trylock_rcu_node(p) \
347({ \
348 bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \
349 \
350 if (___locked) \
351 smp_mb__after_unlock_lock(); \
352 ___locked; \
353})
354
294#endif /* #if defined(SRCU) || !defined(TINY_RCU) */ 355#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
295 356
357#ifdef CONFIG_TINY_RCU
358/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
359static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */
360{
361 return true;
362}
363static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
364{
365 return false;
366}
367
368static inline void rcu_expedite_gp(void)
369{
370}
371
372static inline void rcu_unexpedite_gp(void)
373{
374}
375#else /* #ifdef CONFIG_TINY_RCU */
376bool rcu_gp_is_normal(void); /* Internal RCU use. */
377bool rcu_gp_is_expedited(void); /* Internal RCU use. */
378void rcu_expedite_gp(void);
379void rcu_unexpedite_gp(void);
380void rcupdate_announce_bootup_oddness(void);
381#endif /* #else #ifdef CONFIG_TINY_RCU */
382
383#define RCU_SCHEDULER_INACTIVE 0
384#define RCU_SCHEDULER_INIT 1
385#define RCU_SCHEDULER_RUNNING 2
386
387#ifdef CONFIG_TINY_RCU
388static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
389#else /* #ifdef CONFIG_TINY_RCU */
390void rcu_request_urgent_qs_task(struct task_struct *t);
391#endif /* #else #ifdef CONFIG_TINY_RCU */
392
393enum rcutorture_type {
394 RCU_FLAVOR,
395 RCU_BH_FLAVOR,
396 RCU_SCHED_FLAVOR,
397 RCU_TASKS_FLAVOR,
398 SRCU_FLAVOR,
399 INVALID_RCU_FLAVOR
400};
401
402#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
403void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
404 unsigned long *gpnum, unsigned long *completed);
405void rcutorture_record_test_transition(void);
406void rcutorture_record_progress(unsigned long vernum);
407void do_trace_rcu_torture_read(const char *rcutorturename,
408 struct rcu_head *rhp,
409 unsigned long secs,
410 unsigned long c_old,
411 unsigned long c);
412#else
413static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
414 int *flags,
415 unsigned long *gpnum,
416 unsigned long *completed)
417{
418 *flags = 0;
419 *gpnum = 0;
420 *completed = 0;
421}
422static inline void rcutorture_record_test_transition(void)
423{
424}
425static inline void rcutorture_record_progress(unsigned long vernum)
426{
427}
428#ifdef CONFIG_RCU_TRACE
429void do_trace_rcu_torture_read(const char *rcutorturename,
430 struct rcu_head *rhp,
431 unsigned long secs,
432 unsigned long c_old,
433 unsigned long c);
434#else
435#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
436 do { } while (0)
437#endif
438#endif
439
440#ifdef CONFIG_TINY_SRCU
441
442static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
443 struct srcu_struct *sp, int *flags,
444 unsigned long *gpnum,
445 unsigned long *completed)
446{
447 if (test_type != SRCU_FLAVOR)
448 return;
449 *flags = 0;
450 *completed = sp->srcu_idx;
451 *gpnum = *completed;
452}
453
454#elif defined(CONFIG_TREE_SRCU)
455
456void srcutorture_get_gp_data(enum rcutorture_type test_type,
457 struct srcu_struct *sp, int *flags,
458 unsigned long *gpnum, unsigned long *completed);
459
460#endif
461
462#ifdef CONFIG_TINY_RCU
463
464/*
465 * Return the number of grace periods started.
466 */
467static inline unsigned long rcu_batches_started(void)
468{
469 return 0;
470}
471
472/*
473 * Return the number of bottom-half grace periods started.
474 */
475static inline unsigned long rcu_batches_started_bh(void)
476{
477 return 0;
478}
479
480/*
481 * Return the number of sched grace periods started.
482 */
483static inline unsigned long rcu_batches_started_sched(void)
484{
485 return 0;
486}
487
488/*
489 * Return the number of grace periods completed.
490 */
491static inline unsigned long rcu_batches_completed(void)
492{
493 return 0;
494}
495
496/*
497 * Return the number of bottom-half grace periods completed.
498 */
499static inline unsigned long rcu_batches_completed_bh(void)
500{
501 return 0;
502}
503
504/*
505 * Return the number of sched grace periods completed.
506 */
507static inline unsigned long rcu_batches_completed_sched(void)
508{
509 return 0;
510}
511
512/*
513 * Return the number of expedited grace periods completed.
514 */
515static inline unsigned long rcu_exp_batches_completed(void)
516{
517 return 0;
518}
519
520/*
521 * Return the number of expedited sched grace periods completed.
522 */
523static inline unsigned long rcu_exp_batches_completed_sched(void)
524{
525 return 0;
526}
527
528static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
529{
530 return 0;
531}
532
533static inline void rcu_force_quiescent_state(void)
534{
535}
536
537static inline void rcu_bh_force_quiescent_state(void)
538{
539}
540
541static inline void rcu_sched_force_quiescent_state(void)
542{
543}
544
545static inline void show_rcu_gp_kthreads(void)
546{
547}
548
549#else /* #ifdef CONFIG_TINY_RCU */
550extern unsigned long rcutorture_testseq;
551extern unsigned long rcutorture_vernum;
552unsigned long rcu_batches_started(void);
553unsigned long rcu_batches_started_bh(void);
554unsigned long rcu_batches_started_sched(void);
555unsigned long rcu_batches_completed(void);
556unsigned long rcu_batches_completed_bh(void);
557unsigned long rcu_batches_completed_sched(void);
558unsigned long rcu_exp_batches_completed(void);
559unsigned long rcu_exp_batches_completed_sched(void);
560unsigned long srcu_batches_completed(struct srcu_struct *sp);
561void show_rcu_gp_kthreads(void);
562void rcu_force_quiescent_state(void);
563void rcu_bh_force_quiescent_state(void);
564void rcu_sched_force_quiescent_state(void);
565#endif /* #else #ifdef CONFIG_TINY_RCU */
566
567#ifdef CONFIG_RCU_NOCB_CPU
568bool rcu_is_nocb_cpu(int cpu);
569#else
570static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
571#endif
572
296#endif /* __LINUX_RCU_H */ 573#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index a4a86fb47e4a..3cc18110b612 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -48,6 +48,8 @@
48#include <linux/torture.h> 48#include <linux/torture.h>
49#include <linux/vmalloc.h> 49#include <linux/vmalloc.h>
50 50
51#include "rcu.h"
52
51MODULE_LICENSE("GPL"); 53MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); 54MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
53 55
@@ -59,12 +61,16 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
59#define VERBOSE_PERFOUT_ERRSTRING(s) \ 61#define VERBOSE_PERFOUT_ERRSTRING(s) \
60 do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) 62 do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
61 63
64torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
65torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
62torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); 66torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
63torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); 67torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
64torture_param(int, nreaders, -1, "Number of RCU reader threads"); 68torture_param(int, nreaders, 0, "Number of RCU reader threads");
65torture_param(int, nwriters, -1, "Number of RCU updater threads"); 69torture_param(int, nwriters, -1, "Number of RCU updater threads");
66torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); 70torture_param(bool, shutdown, !IS_ENABLED(MODULE),
71 "Shutdown at end of performance tests.");
67torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); 72torture_param(bool, verbose, true, "Enable verbose debugging printk()s");
73torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
68 74
69static char *perf_type = "rcu"; 75static char *perf_type = "rcu";
70module_param(perf_type, charp, 0444); 76module_param(perf_type, charp, 0444);
@@ -86,13 +92,16 @@ static u64 t_rcu_perf_writer_started;
86static u64 t_rcu_perf_writer_finished; 92static u64 t_rcu_perf_writer_finished;
87static unsigned long b_rcu_perf_writer_started; 93static unsigned long b_rcu_perf_writer_started;
88static unsigned long b_rcu_perf_writer_finished; 94static unsigned long b_rcu_perf_writer_finished;
95static DEFINE_PER_CPU(atomic_t, n_async_inflight);
89 96
90static int rcu_perf_writer_state; 97static int rcu_perf_writer_state;
91#define RTWS_INIT 0 98#define RTWS_INIT 0
92#define RTWS_EXP_SYNC 1 99#define RTWS_ASYNC 1
93#define RTWS_SYNC 2 100#define RTWS_BARRIER 2
94#define RTWS_IDLE 2 101#define RTWS_EXP_SYNC 3
95#define RTWS_STOPPING 3 102#define RTWS_SYNC 4
103#define RTWS_IDLE 5
104#define RTWS_STOPPING 6
96 105
97#define MAX_MEAS 10000 106#define MAX_MEAS 10000
98#define MIN_MEAS 100 107#define MIN_MEAS 100
@@ -114,6 +123,8 @@ struct rcu_perf_ops {
114 unsigned long (*started)(void); 123 unsigned long (*started)(void);
115 unsigned long (*completed)(void); 124 unsigned long (*completed)(void);
116 unsigned long (*exp_completed)(void); 125 unsigned long (*exp_completed)(void);
126 void (*async)(struct rcu_head *head, rcu_callback_t func);
127 void (*gp_barrier)(void);
117 void (*sync)(void); 128 void (*sync)(void);
118 void (*exp_sync)(void); 129 void (*exp_sync)(void);
119 const char *name; 130 const char *name;
@@ -153,6 +164,8 @@ static struct rcu_perf_ops rcu_ops = {
153 .started = rcu_batches_started, 164 .started = rcu_batches_started,
154 .completed = rcu_batches_completed, 165 .completed = rcu_batches_completed,
155 .exp_completed = rcu_exp_batches_completed, 166 .exp_completed = rcu_exp_batches_completed,
167 .async = call_rcu,
168 .gp_barrier = rcu_barrier,
156 .sync = synchronize_rcu, 169 .sync = synchronize_rcu,
157 .exp_sync = synchronize_rcu_expedited, 170 .exp_sync = synchronize_rcu_expedited,
158 .name = "rcu" 171 .name = "rcu"
@@ -181,6 +194,8 @@ static struct rcu_perf_ops rcu_bh_ops = {
181 .started = rcu_batches_started_bh, 194 .started = rcu_batches_started_bh,
182 .completed = rcu_batches_completed_bh, 195 .completed = rcu_batches_completed_bh,
183 .exp_completed = rcu_exp_batches_completed_sched, 196 .exp_completed = rcu_exp_batches_completed_sched,
197 .async = call_rcu_bh,
198 .gp_barrier = rcu_barrier_bh,
184 .sync = synchronize_rcu_bh, 199 .sync = synchronize_rcu_bh,
185 .exp_sync = synchronize_rcu_bh_expedited, 200 .exp_sync = synchronize_rcu_bh_expedited,
186 .name = "rcu_bh" 201 .name = "rcu_bh"
@@ -208,6 +223,16 @@ static unsigned long srcu_perf_completed(void)
208 return srcu_batches_completed(srcu_ctlp); 223 return srcu_batches_completed(srcu_ctlp);
209} 224}
210 225
226static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func)
227{
228 call_srcu(srcu_ctlp, head, func);
229}
230
231static void srcu_rcu_barrier(void)
232{
233 srcu_barrier(srcu_ctlp);
234}
235
211static void srcu_perf_synchronize(void) 236static void srcu_perf_synchronize(void)
212{ 237{
213 synchronize_srcu(srcu_ctlp); 238 synchronize_srcu(srcu_ctlp);
@@ -226,11 +251,42 @@ static struct rcu_perf_ops srcu_ops = {
226 .started = NULL, 251 .started = NULL,
227 .completed = srcu_perf_completed, 252 .completed = srcu_perf_completed,
228 .exp_completed = srcu_perf_completed, 253 .exp_completed = srcu_perf_completed,
254 .async = srcu_call_rcu,
255 .gp_barrier = srcu_rcu_barrier,
229 .sync = srcu_perf_synchronize, 256 .sync = srcu_perf_synchronize,
230 .exp_sync = srcu_perf_synchronize_expedited, 257 .exp_sync = srcu_perf_synchronize_expedited,
231 .name = "srcu" 258 .name = "srcu"
232}; 259};
233 260
261static struct srcu_struct srcud;
262
263static void srcu_sync_perf_init(void)
264{
265 srcu_ctlp = &srcud;
266 init_srcu_struct(srcu_ctlp);
267}
268
269static void srcu_sync_perf_cleanup(void)
270{
271 cleanup_srcu_struct(srcu_ctlp);
272}
273
274static struct rcu_perf_ops srcud_ops = {
275 .ptype = SRCU_FLAVOR,
276 .init = srcu_sync_perf_init,
277 .cleanup = srcu_sync_perf_cleanup,
278 .readlock = srcu_perf_read_lock,
279 .readunlock = srcu_perf_read_unlock,
280 .started = NULL,
281 .completed = srcu_perf_completed,
282 .exp_completed = srcu_perf_completed,
283 .async = srcu_call_rcu,
284 .gp_barrier = srcu_rcu_barrier,
285 .sync = srcu_perf_synchronize,
286 .exp_sync = srcu_perf_synchronize_expedited,
287 .name = "srcud"
288};
289
234/* 290/*
235 * Definitions for sched perf testing. 291 * Definitions for sched perf testing.
236 */ 292 */
@@ -254,6 +310,8 @@ static struct rcu_perf_ops sched_ops = {
254 .started = rcu_batches_started_sched, 310 .started = rcu_batches_started_sched,
255 .completed = rcu_batches_completed_sched, 311 .completed = rcu_batches_completed_sched,
256 .exp_completed = rcu_exp_batches_completed_sched, 312 .exp_completed = rcu_exp_batches_completed_sched,
313 .async = call_rcu_sched,
314 .gp_barrier = rcu_barrier_sched,
257 .sync = synchronize_sched, 315 .sync = synchronize_sched,
258 .exp_sync = synchronize_sched_expedited, 316 .exp_sync = synchronize_sched_expedited,
259 .name = "sched" 317 .name = "sched"
@@ -281,6 +339,8 @@ static struct rcu_perf_ops tasks_ops = {
281 .readunlock = tasks_perf_read_unlock, 339 .readunlock = tasks_perf_read_unlock,
282 .started = rcu_no_completed, 340 .started = rcu_no_completed,
283 .completed = rcu_no_completed, 341 .completed = rcu_no_completed,
342 .async = call_rcu_tasks,
343 .gp_barrier = rcu_barrier_tasks,
284 .sync = synchronize_rcu_tasks, 344 .sync = synchronize_rcu_tasks,
285 .exp_sync = synchronize_rcu_tasks, 345 .exp_sync = synchronize_rcu_tasks,
286 .name = "tasks" 346 .name = "tasks"
@@ -344,6 +404,15 @@ rcu_perf_reader(void *arg)
344} 404}
345 405
346/* 406/*
407 * Callback function for asynchronous grace periods from rcu_perf_writer().
408 */
409static void rcu_perf_async_cb(struct rcu_head *rhp)
410{
411 atomic_dec(this_cpu_ptr(&n_async_inflight));
412 kfree(rhp);
413}
414
415/*
347 * RCU perf writer kthread. Repeatedly does a grace period. 416 * RCU perf writer kthread. Repeatedly does a grace period.
348 */ 417 */
349static int 418static int
@@ -352,6 +421,7 @@ rcu_perf_writer(void *arg)
352 int i = 0; 421 int i = 0;
353 int i_max; 422 int i_max;
354 long me = (long)arg; 423 long me = (long)arg;
424 struct rcu_head *rhp = NULL;
355 struct sched_param sp; 425 struct sched_param sp;
356 bool started = false, done = false, alldone = false; 426 bool started = false, done = false, alldone = false;
357 u64 t; 427 u64 t;
@@ -380,9 +450,27 @@ rcu_perf_writer(void *arg)
380 } 450 }
381 451
382 do { 452 do {
453 if (writer_holdoff)
454 udelay(writer_holdoff);
383 wdp = &wdpp[i]; 455 wdp = &wdpp[i];
384 *wdp = ktime_get_mono_fast_ns(); 456 *wdp = ktime_get_mono_fast_ns();
385 if (gp_exp) { 457 if (gp_async) {
458retry:
459 if (!rhp)
460 rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
461 if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
462 rcu_perf_writer_state = RTWS_ASYNC;
463 atomic_inc(this_cpu_ptr(&n_async_inflight));
464 cur_ops->async(rhp, rcu_perf_async_cb);
465 rhp = NULL;
466 } else if (!kthread_should_stop()) {
467 rcu_perf_writer_state = RTWS_BARRIER;
468 cur_ops->gp_barrier();
469 goto retry;
470 } else {
471 kfree(rhp); /* Because we are stopping. */
472 }
473 } else if (gp_exp) {
386 rcu_perf_writer_state = RTWS_EXP_SYNC; 474 rcu_perf_writer_state = RTWS_EXP_SYNC;
387 cur_ops->exp_sync(); 475 cur_ops->exp_sync();
388 } else { 476 } else {
@@ -429,6 +517,10 @@ rcu_perf_writer(void *arg)
429 i++; 517 i++;
430 rcu_perf_wait_shutdown(); 518 rcu_perf_wait_shutdown();
431 } while (!torture_must_stop()); 519 } while (!torture_must_stop());
520 if (gp_async) {
521 rcu_perf_writer_state = RTWS_BARRIER;
522 cur_ops->gp_barrier();
523 }
432 rcu_perf_writer_state = RTWS_STOPPING; 524 rcu_perf_writer_state = RTWS_STOPPING;
433 writer_n_durations[me] = i_max; 525 writer_n_durations[me] = i_max;
434 torture_kthread_stopping("rcu_perf_writer"); 526 torture_kthread_stopping("rcu_perf_writer");
@@ -452,6 +544,17 @@ rcu_perf_cleanup(void)
452 u64 *wdp; 544 u64 *wdp;
453 u64 *wdpp; 545 u64 *wdpp;
454 546
547 /*
548 * Would like warning at start, but everything is expedited
549 * during the mid-boot phase, so have to wait till the end.
550 */
551 if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
552 VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
553 if (rcu_gp_is_normal() && gp_exp)
554 VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
555 if (gp_exp && gp_async)
556 VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!");
557
455 if (torture_cleanup_begin()) 558 if (torture_cleanup_begin())
456 return; 559 return;
457 560
@@ -554,7 +657,7 @@ rcu_perf_init(void)
554 long i; 657 long i;
555 int firsterr = 0; 658 int firsterr = 0;
556 static struct rcu_perf_ops *perf_ops[] = { 659 static struct rcu_perf_ops *perf_ops[] = {
557 &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, 660 &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops,
558 RCUPERF_TASKS_OPS 661 RCUPERF_TASKS_OPS
559 }; 662 };
560 663
@@ -624,16 +727,6 @@ rcu_perf_init(void)
624 firsterr = -ENOMEM; 727 firsterr = -ENOMEM;
625 goto unwind; 728 goto unwind;
626 } 729 }
627 if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) {
628 VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
629 firsterr = -EINVAL;
630 goto unwind;
631 }
632 if (rcu_gp_is_normal() && gp_exp) {
633 VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
634 firsterr = -EINVAL;
635 goto unwind;
636 }
637 for (i = 0; i < nrealwriters; i++) { 730 for (i = 0; i < nrealwriters; i++) {
638 writer_durations[i] = 731 writer_durations[i] =
639 kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), 732 kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index ae6e574d4cf5..b8f7f8ce8575 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -52,6 +52,8 @@
52#include <linux/torture.h> 52#include <linux/torture.h>
53#include <linux/vmalloc.h> 53#include <linux/vmalloc.h>
54 54
55#include "rcu.h"
56
55MODULE_LICENSE("GPL"); 57MODULE_LICENSE("GPL");
56MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); 58MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
57 59
@@ -562,31 +564,19 @@ static void srcu_torture_stats(void)
562 int __maybe_unused cpu; 564 int __maybe_unused cpu;
563 int idx; 565 int idx;
564 566
565#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU)
566#ifdef CONFIG_TREE_SRCU 567#ifdef CONFIG_TREE_SRCU
567 idx = srcu_ctlp->srcu_idx & 0x1; 568 idx = srcu_ctlp->srcu_idx & 0x1;
568#else /* #ifdef CONFIG_TREE_SRCU */
569 idx = srcu_ctlp->completed & 0x1;
570#endif /* #else #ifdef CONFIG_TREE_SRCU */
571 pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", 569 pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
572 torture_type, TORTURE_FLAG, idx); 570 torture_type, TORTURE_FLAG, idx);
573 for_each_possible_cpu(cpu) { 571 for_each_possible_cpu(cpu) {
574 unsigned long l0, l1; 572 unsigned long l0, l1;
575 unsigned long u0, u1; 573 unsigned long u0, u1;
576 long c0, c1; 574 long c0, c1;
577#ifdef CONFIG_TREE_SRCU
578 struct srcu_data *counts; 575 struct srcu_data *counts;
579 576
580 counts = per_cpu_ptr(srcu_ctlp->sda, cpu); 577 counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
581 u0 = counts->srcu_unlock_count[!idx]; 578 u0 = counts->srcu_unlock_count[!idx];
582 u1 = counts->srcu_unlock_count[idx]; 579 u1 = counts->srcu_unlock_count[idx];
583#else /* #ifdef CONFIG_TREE_SRCU */
584 struct srcu_array *counts;
585
586 counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
587 u0 = counts->unlock_count[!idx];
588 u1 = counts->unlock_count[idx];
589#endif /* #else #ifdef CONFIG_TREE_SRCU */
590 580
591 /* 581 /*
592 * Make sure that a lock is always counted if the corresponding 582 * Make sure that a lock is always counted if the corresponding
@@ -594,13 +584,8 @@ static void srcu_torture_stats(void)
594 */ 584 */
595 smp_rmb(); 585 smp_rmb();
596 586
597#ifdef CONFIG_TREE_SRCU
598 l0 = counts->srcu_lock_count[!idx]; 587 l0 = counts->srcu_lock_count[!idx];
599 l1 = counts->srcu_lock_count[idx]; 588 l1 = counts->srcu_lock_count[idx];
600#else /* #ifdef CONFIG_TREE_SRCU */
601 l0 = counts->lock_count[!idx];
602 l1 = counts->lock_count[idx];
603#endif /* #else #ifdef CONFIG_TREE_SRCU */
604 589
605 c0 = l0 - u0; 590 c0 = l0 - u0;
606 c1 = l1 - u1; 591 c1 = l1 - u1;
@@ -609,7 +594,7 @@ static void srcu_torture_stats(void)
609 pr_cont("\n"); 594 pr_cont("\n");
610#elif defined(CONFIG_TINY_SRCU) 595#elif defined(CONFIG_TINY_SRCU)
611 idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; 596 idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
612 pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n", 597 pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
613 torture_type, TORTURE_FLAG, idx, 598 torture_type, TORTURE_FLAG, idx,
614 READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), 599 READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
615 READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); 600 READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
deleted file mode 100644
index 584d8a983883..000000000000
--- a/kernel/rcu/srcu.c
+++ /dev/null
@@ -1,662 +0,0 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
23 *
24 * For detailed explanation of Read-Copy Update mechanism see -
25 * Documentation/RCU/ *.txt
26 *
27 */
28
29#include <linux/export.h>
30#include <linux/mutex.h>
31#include <linux/percpu.h>
32#include <linux/preempt.h>
33#include <linux/rcupdate_wait.h>
34#include <linux/sched.h>
35#include <linux/smp.h>
36#include <linux/delay.h>
37#include <linux/srcu.h>
38
39#include "rcu.h"
40
41/*
42 * Initialize an rcu_batch structure to empty.
43 */
44static inline void rcu_batch_init(struct rcu_batch *b)
45{
46 b->head = NULL;
47 b->tail = &b->head;
48}
49
50/*
51 * Enqueue a callback onto the tail of the specified rcu_batch structure.
52 */
53static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
54{
55 *b->tail = head;
56 b->tail = &head->next;
57}
58
59/*
60 * Is the specified rcu_batch structure empty?
61 */
62static inline bool rcu_batch_empty(struct rcu_batch *b)
63{
64 return b->tail == &b->head;
65}
66
67/*
68 * Remove the callback at the head of the specified rcu_batch structure
69 * and return a pointer to it, or return NULL if the structure is empty.
70 */
71static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
72{
73 struct rcu_head *head;
74
75 if (rcu_batch_empty(b))
76 return NULL;
77
78 head = b->head;
79 b->head = head->next;
80 if (b->tail == &head->next)
81 rcu_batch_init(b);
82
83 return head;
84}
85
86/*
87 * Move all callbacks from the rcu_batch structure specified by "from" to
88 * the structure specified by "to".
89 */
90static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
91{
92 if (!rcu_batch_empty(from)) {
93 *to->tail = from->head;
94 to->tail = from->tail;
95 rcu_batch_init(from);
96 }
97}
98
99static int init_srcu_struct_fields(struct srcu_struct *sp)
100{
101 sp->completed = 0;
102 spin_lock_init(&sp->queue_lock);
103 sp->running = false;
104 rcu_batch_init(&sp->batch_queue);
105 rcu_batch_init(&sp->batch_check0);
106 rcu_batch_init(&sp->batch_check1);
107 rcu_batch_init(&sp->batch_done);
108 INIT_DELAYED_WORK(&sp->work, process_srcu);
109 sp->per_cpu_ref = alloc_percpu(struct srcu_array);
110 return sp->per_cpu_ref ? 0 : -ENOMEM;
111}
112
113#ifdef CONFIG_DEBUG_LOCK_ALLOC
114
115int __init_srcu_struct(struct srcu_struct *sp, const char *name,
116 struct lock_class_key *key)
117{
118 /* Don't re-initialize a lock while it is held. */
119 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
120 lockdep_init_map(&sp->dep_map, name, key, 0);
121 return init_srcu_struct_fields(sp);
122}
123EXPORT_SYMBOL_GPL(__init_srcu_struct);
124
125#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
126
127/**
128 * init_srcu_struct - initialize a sleep-RCU structure
129 * @sp: structure to initialize.
130 *
131 * Must invoke this on a given srcu_struct before passing that srcu_struct
132 * to any other function. Each srcu_struct represents a separate domain
133 * of SRCU protection.
134 */
135int init_srcu_struct(struct srcu_struct *sp)
136{
137 return init_srcu_struct_fields(sp);
138}
139EXPORT_SYMBOL_GPL(init_srcu_struct);
140
141#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
142
143/*
144 * Returns approximate total of the readers' ->lock_count[] values for the
145 * rank of per-CPU counters specified by idx.
146 */
147static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
148{
149 int cpu;
150 unsigned long sum = 0;
151
152 for_each_possible_cpu(cpu) {
153 struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
154
155 sum += READ_ONCE(cpuc->lock_count[idx]);
156 }
157 return sum;
158}
159
160/*
161 * Returns approximate total of the readers' ->unlock_count[] values for the
162 * rank of per-CPU counters specified by idx.
163 */
164static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
165{
166 int cpu;
167 unsigned long sum = 0;
168
169 for_each_possible_cpu(cpu) {
170 struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
171
172 sum += READ_ONCE(cpuc->unlock_count[idx]);
173 }
174 return sum;
175}
176
177/*
178 * Return true if the number of pre-existing readers is determined to
179 * be zero.
180 */
181static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
182{
183 unsigned long unlocks;
184
185 unlocks = srcu_readers_unlock_idx(sp, idx);
186
187 /*
188 * Make sure that a lock is always counted if the corresponding unlock
189 * is counted. Needs to be a smp_mb() as the read side may contain a
190 * read from a variable that is written to before the synchronize_srcu()
191 * in the write side. In this case smp_mb()s A and B act like the store
192 * buffering pattern.
193 *
194 * This smp_mb() also pairs with smp_mb() C to prevent accesses after the
195 * synchronize_srcu() from being executed before the grace period ends.
196 */
197 smp_mb(); /* A */
198
199 /*
200 * If the locks are the same as the unlocks, then there must have
201 * been no readers on this index at some time in between. This does not
202 * mean that there are no more readers, as one could have read the
203 * current index but not have incremented the lock counter yet.
204 *
205 * Possible bug: There is no guarantee that there haven't been ULONG_MAX
206 * increments of ->lock_count[] since the unlocks were counted, meaning
207 * that this could return true even if there are still active readers.
208 * Since there are no memory barriers around srcu_flip(), the CPU is not
209 * required to increment ->completed before running
210 * srcu_readers_unlock_idx(), which means that there could be an
211 * arbitrarily large number of critical sections that execute after
212 * srcu_readers_unlock_idx() but use the old value of ->completed.
213 */
214 return srcu_readers_lock_idx(sp, idx) == unlocks;
215}
216
217/**
218 * srcu_readers_active - returns true if there are readers. and false
219 * otherwise
220 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
221 *
222 * Note that this is not an atomic primitive, and can therefore suffer
223 * severe errors when invoked on an active srcu_struct. That said, it
224 * can be useful as an error check at cleanup time.
225 */
226static bool srcu_readers_active(struct srcu_struct *sp)
227{
228 int cpu;
229 unsigned long sum = 0;
230
231 for_each_possible_cpu(cpu) {
232 struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
233
234 sum += READ_ONCE(cpuc->lock_count[0]);
235 sum += READ_ONCE(cpuc->lock_count[1]);
236 sum -= READ_ONCE(cpuc->unlock_count[0]);
237 sum -= READ_ONCE(cpuc->unlock_count[1]);
238 }
239 return sum;
240}
241
242/**
243 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
244 * @sp: structure to clean up.
245 *
246 * Must invoke this only after you are finished using a given srcu_struct
247 * that was initialized via init_srcu_struct(). This code does some
248 * probabalistic checking, spotting late uses of srcu_read_lock(),
249 * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu().
250 * If any such late uses are detected, the per-CPU memory associated with
251 * the srcu_struct is simply leaked and WARN_ON() is invoked. If the
252 * caller frees the srcu_struct itself, a use-after-free crash will likely
253 * ensue, but at least there will be a warning printed.
254 */
255void cleanup_srcu_struct(struct srcu_struct *sp)
256{
257 if (WARN_ON(srcu_readers_active(sp)))
258 return; /* Leakage unless caller handles error. */
259 free_percpu(sp->per_cpu_ref);
260 sp->per_cpu_ref = NULL;
261}
262EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
263
264/*
265 * Counts the new reader in the appropriate per-CPU element of the
266 * srcu_struct. Must be called from process context.
267 * Returns an index that must be passed to the matching srcu_read_unlock().
268 */
269int __srcu_read_lock(struct srcu_struct *sp)
270{
271 int idx;
272
273 idx = READ_ONCE(sp->completed) & 0x1;
274 __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
275 smp_mb(); /* B */ /* Avoid leaking the critical section. */
276 return idx;
277}
278EXPORT_SYMBOL_GPL(__srcu_read_lock);
279
280/*
281 * Removes the count for the old reader from the appropriate per-CPU
282 * element of the srcu_struct. Note that this may well be a different
283 * CPU than that which was incremented by the corresponding srcu_read_lock().
284 * Must be called from process context.
285 */
286void __srcu_read_unlock(struct srcu_struct *sp, int idx)
287{
288 smp_mb(); /* C */ /* Avoid leaking the critical section. */
289 this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
290}
291EXPORT_SYMBOL_GPL(__srcu_read_unlock);
292
293/*
294 * We use an adaptive strategy for synchronize_srcu() and especially for
295 * synchronize_srcu_expedited(). We spin for a fixed time period
296 * (defined below) to allow SRCU readers to exit their read-side critical
297 * sections. If there are still some readers after 10 microseconds,
298 * we repeatedly block for 1-millisecond time periods. This approach
299 * has done well in testing, so there is no need for a config parameter.
300 */
301#define SRCU_RETRY_CHECK_DELAY 5
302#define SYNCHRONIZE_SRCU_TRYCOUNT 2
303#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
304
305/*
306 * @@@ Wait until all pre-existing readers complete. Such readers
307 * will have used the index specified by "idx".
308 * the caller should ensures the ->completed is not changed while checking
309 * and idx = (->completed & 1) ^ 1
310 */
311static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
312{
313 for (;;) {
314 if (srcu_readers_active_idx_check(sp, idx))
315 return true;
316 if (--trycount <= 0)
317 return false;
318 udelay(SRCU_RETRY_CHECK_DELAY);
319 }
320}
321
322/*
323 * Increment the ->completed counter so that future SRCU readers will
324 * use the other rank of the ->(un)lock_count[] arrays. This allows
325 * us to wait for pre-existing readers in a starvation-free manner.
326 */
327static void srcu_flip(struct srcu_struct *sp)
328{
329 WRITE_ONCE(sp->completed, sp->completed + 1);
330
331 /*
332 * Ensure that if the updater misses an __srcu_read_unlock()
333 * increment, that task's next __srcu_read_lock() will see the
334 * above counter update. Note that both this memory barrier
335 * and the one in srcu_readers_active_idx_check() provide the
336 * guarantee for __srcu_read_lock().
337 */
338 smp_mb(); /* D */ /* Pairs with C. */
339}
340
341/*
342 * Enqueue an SRCU callback on the specified srcu_struct structure,
343 * initiating grace-period processing if it is not already running.
344 *
345 * Note that all CPUs must agree that the grace period extended beyond
346 * all pre-existing SRCU read-side critical section. On systems with
347 * more than one CPU, this means that when "func()" is invoked, each CPU
348 * is guaranteed to have executed a full memory barrier since the end of
349 * its last corresponding SRCU read-side critical section whose beginning
350 * preceded the call to call_rcu(). It also means that each CPU executing
351 * an SRCU read-side critical section that continues beyond the start of
352 * "func()" must have executed a memory barrier after the call_rcu()
353 * but before the beginning of that SRCU read-side critical section.
354 * Note that these guarantees include CPUs that are offline, idle, or
355 * executing in user mode, as well as CPUs that are executing in the kernel.
356 *
357 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
358 * resulting SRCU callback function "func()", then both CPU A and CPU
359 * B are guaranteed to execute a full memory barrier during the time
360 * interval between the call to call_rcu() and the invocation of "func()".
361 * This guarantee applies even if CPU A and CPU B are the same CPU (but
362 * again only if the system has more than one CPU).
363 *
364 * Of course, these guarantees apply only for invocations of call_srcu(),
365 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
366 * srcu_struct structure.
367 */
368void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
369 rcu_callback_t func)
370{
371 unsigned long flags;
372
373 head->next = NULL;
374 head->func = func;
375 spin_lock_irqsave(&sp->queue_lock, flags);
376 smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
377 rcu_batch_queue(&sp->batch_queue, head);
378 if (!sp->running) {
379 sp->running = true;
380 queue_delayed_work(system_power_efficient_wq, &sp->work, 0);
381 }
382 spin_unlock_irqrestore(&sp->queue_lock, flags);
383}
384EXPORT_SYMBOL_GPL(call_srcu);
385
386static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
387static void srcu_reschedule(struct srcu_struct *sp);
388
389/*
390 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
391 */
392static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
393{
394 struct rcu_synchronize rcu;
395 struct rcu_head *head = &rcu.head;
396 bool done = false;
397
398 RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
399 lock_is_held(&rcu_bh_lock_map) ||
400 lock_is_held(&rcu_lock_map) ||
401 lock_is_held(&rcu_sched_lock_map),
402 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
403
404 might_sleep();
405 init_completion(&rcu.completion);
406
407 head->next = NULL;
408 head->func = wakeme_after_rcu;
409 spin_lock_irq(&sp->queue_lock);
410 smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
411 if (!sp->running) {
412 /* steal the processing owner */
413 sp->running = true;
414 rcu_batch_queue(&sp->batch_check0, head);
415 spin_unlock_irq(&sp->queue_lock);
416
417 srcu_advance_batches(sp, trycount);
418 if (!rcu_batch_empty(&sp->batch_done)) {
419 BUG_ON(sp->batch_done.head != head);
420 rcu_batch_dequeue(&sp->batch_done);
421 done = true;
422 }
423 /* give the processing owner to work_struct */
424 srcu_reschedule(sp);
425 } else {
426 rcu_batch_queue(&sp->batch_queue, head);
427 spin_unlock_irq(&sp->queue_lock);
428 }
429
430 if (!done) {
431 wait_for_completion(&rcu.completion);
432 smp_mb(); /* Caller's later accesses after GP. */
433 }
434
435}
436
437/**
438 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
439 * @sp: srcu_struct with which to synchronize.
440 *
441 * Wait for the count to drain to zero of both indexes. To avoid the
442 * possible starvation of synchronize_srcu(), it waits for the count of
443 * the index=((->completed & 1) ^ 1) to drain to zero at first,
444 * and then flip the completed and wait for the count of the other index.
445 *
446 * Can block; must be called from process context.
447 *
448 * Note that it is illegal to call synchronize_srcu() from the corresponding
449 * SRCU read-side critical section; doing so will result in deadlock.
450 * However, it is perfectly legal to call synchronize_srcu() on one
451 * srcu_struct from some other srcu_struct's read-side critical section,
452 * as long as the resulting graph of srcu_structs is acyclic.
453 *
454 * There are memory-ordering constraints implied by synchronize_srcu().
455 * On systems with more than one CPU, when synchronize_srcu() returns,
456 * each CPU is guaranteed to have executed a full memory barrier since
457 * the end of its last corresponding SRCU-sched read-side critical section
458 * whose beginning preceded the call to synchronize_srcu(). In addition,
459 * each CPU having an SRCU read-side critical section that extends beyond
460 * the return from synchronize_srcu() is guaranteed to have executed a
461 * full memory barrier after the beginning of synchronize_srcu() and before
462 * the beginning of that SRCU read-side critical section. Note that these
463 * guarantees include CPUs that are offline, idle, or executing in user mode,
464 * as well as CPUs that are executing in the kernel.
465 *
466 * Furthermore, if CPU A invoked synchronize_srcu(), which returned
467 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
468 * to have executed a full memory barrier during the execution of
469 * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
470 * are the same CPU, but again only if the system has more than one CPU.
471 *
472 * Of course, these memory-ordering guarantees apply only when
473 * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
474 * passed the same srcu_struct structure.
475 */
476void synchronize_srcu(struct srcu_struct *sp)
477{
478 __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal())
479 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
480 : SYNCHRONIZE_SRCU_TRYCOUNT);
481}
482EXPORT_SYMBOL_GPL(synchronize_srcu);
483
484/**
485 * synchronize_srcu_expedited - Brute-force SRCU grace period
486 * @sp: srcu_struct with which to synchronize.
487 *
488 * Wait for an SRCU grace period to elapse, but be more aggressive about
489 * spinning rather than blocking when waiting.
490 *
491 * Note that synchronize_srcu_expedited() has the same deadlock and
492 * memory-ordering properties as does synchronize_srcu().
493 */
494void synchronize_srcu_expedited(struct srcu_struct *sp)
495{
496 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
497}
498EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
499
500/**
501 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
502 * @sp: srcu_struct on which to wait for in-flight callbacks.
503 */
504void srcu_barrier(struct srcu_struct *sp)
505{
506 synchronize_srcu(sp);
507}
508EXPORT_SYMBOL_GPL(srcu_barrier);
509
510/**
511 * srcu_batches_completed - return batches completed.
512 * @sp: srcu_struct on which to report batch completion.
513 *
514 * Report the number of batches, correlated with, but not necessarily
515 * precisely the same as, the number of grace periods that have elapsed.
516 */
517unsigned long srcu_batches_completed(struct srcu_struct *sp)
518{
519 return sp->completed;
520}
521EXPORT_SYMBOL_GPL(srcu_batches_completed);
522
523#define SRCU_CALLBACK_BATCH 10
524#define SRCU_INTERVAL 1
525
526/*
527 * Move any new SRCU callbacks to the first stage of the SRCU grace
528 * period pipeline.
529 */
530static void srcu_collect_new(struct srcu_struct *sp)
531{
532 if (!rcu_batch_empty(&sp->batch_queue)) {
533 spin_lock_irq(&sp->queue_lock);
534 rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
535 spin_unlock_irq(&sp->queue_lock);
536 }
537}
538
539/*
540 * Core SRCU state machine. Advance callbacks from ->batch_check0 to
541 * ->batch_check1 and then to ->batch_done as readers drain.
542 */
543static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
544{
545 int idx = 1 ^ (sp->completed & 1);
546
547 /*
548 * Because readers might be delayed for an extended period after
549 * fetching ->completed for their index, at any point in time there
550 * might well be readers using both idx=0 and idx=1. We therefore
551 * need to wait for readers to clear from both index values before
552 * invoking a callback.
553 */
554
555 if (rcu_batch_empty(&sp->batch_check0) &&
556 rcu_batch_empty(&sp->batch_check1))
557 return; /* no callbacks need to be advanced */
558
559 if (!try_check_zero(sp, idx, trycount))
560 return; /* failed to advance, will try after SRCU_INTERVAL */
561
562 /*
563 * The callbacks in ->batch_check1 have already done with their
564 * first zero check and flip back when they were enqueued on
565 * ->batch_check0 in a previous invocation of srcu_advance_batches().
566 * (Presumably try_check_zero() returned false during that
567 * invocation, leaving the callbacks stranded on ->batch_check1.)
568 * They are therefore ready to invoke, so move them to ->batch_done.
569 */
570 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
571
572 if (rcu_batch_empty(&sp->batch_check0))
573 return; /* no callbacks need to be advanced */
574 srcu_flip(sp);
575
576 /*
577 * The callbacks in ->batch_check0 just finished their
578 * first check zero and flip, so move them to ->batch_check1
579 * for future checking on the other idx.
580 */
581 rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
582
583 /*
584 * SRCU read-side critical sections are normally short, so check
585 * at least twice in quick succession after a flip.
586 */
587 trycount = trycount < 2 ? 2 : trycount;
588 if (!try_check_zero(sp, idx^1, trycount))
589 return; /* failed to advance, will try after SRCU_INTERVAL */
590
591 /*
592 * The callbacks in ->batch_check1 have now waited for all
593 * pre-existing readers using both idx values. They are therefore
594 * ready to invoke, so move them to ->batch_done.
595 */
596 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
597}
598
599/*
600 * Invoke a limited number of SRCU callbacks that have passed through
601 * their grace period. If there are more to do, SRCU will reschedule
602 * the workqueue. Note that needed memory barriers have been executed
603 * in this task's context by srcu_readers_active_idx_check().
604 */
605static void srcu_invoke_callbacks(struct srcu_struct *sp)
606{
607 int i;
608 struct rcu_head *head;
609
610 for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
611 head = rcu_batch_dequeue(&sp->batch_done);
612 if (!head)
613 break;
614 local_bh_disable();
615 head->func(head);
616 local_bh_enable();
617 }
618}
619
620/*
621 * Finished one round of SRCU grace period. Start another if there are
622 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
623 */
624static void srcu_reschedule(struct srcu_struct *sp)
625{
626 bool pending = true;
627
628 if (rcu_batch_empty(&sp->batch_done) &&
629 rcu_batch_empty(&sp->batch_check1) &&
630 rcu_batch_empty(&sp->batch_check0) &&
631 rcu_batch_empty(&sp->batch_queue)) {
632 spin_lock_irq(&sp->queue_lock);
633 if (rcu_batch_empty(&sp->batch_done) &&
634 rcu_batch_empty(&sp->batch_check1) &&
635 rcu_batch_empty(&sp->batch_check0) &&
636 rcu_batch_empty(&sp->batch_queue)) {
637 sp->running = false;
638 pending = false;
639 }
640 spin_unlock_irq(&sp->queue_lock);
641 }
642
643 if (pending)
644 queue_delayed_work(system_power_efficient_wq,
645 &sp->work, SRCU_INTERVAL);
646}
647
648/*
649 * This is the work-queue function that handles SRCU grace periods.
650 */
651void process_srcu(struct work_struct *work)
652{
653 struct srcu_struct *sp;
654
655 sp = container_of(work, struct srcu_struct, work.work);
656
657 srcu_collect_new(sp);
658 srcu_advance_batches(sp, 1);
659 srcu_invoke_callbacks(sp);
660 srcu_reschedule(sp);
661}
662EXPORT_SYMBOL_GPL(process_srcu);
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 36e1f82faed1..1a1c1047d2ed 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -38,8 +38,8 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
38 sp->srcu_lock_nesting[0] = 0; 38 sp->srcu_lock_nesting[0] = 0;
39 sp->srcu_lock_nesting[1] = 0; 39 sp->srcu_lock_nesting[1] = 0;
40 init_swait_queue_head(&sp->srcu_wq); 40 init_swait_queue_head(&sp->srcu_wq);
41 sp->srcu_gp_seq = 0; 41 sp->srcu_cb_head = NULL;
42 rcu_segcblist_init(&sp->srcu_cblist); 42 sp->srcu_cb_tail = &sp->srcu_cb_head;
43 sp->srcu_gp_running = false; 43 sp->srcu_gp_running = false;
44 sp->srcu_gp_waiting = false; 44 sp->srcu_gp_waiting = false;
45 sp->srcu_idx = 0; 45 sp->srcu_idx = 0;
@@ -88,31 +88,16 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
88{ 88{
89 WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); 89 WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
90 flush_work(&sp->srcu_work); 90 flush_work(&sp->srcu_work);
91 WARN_ON(rcu_seq_state(sp->srcu_gp_seq));
92 WARN_ON(sp->srcu_gp_running); 91 WARN_ON(sp->srcu_gp_running);
93 WARN_ON(sp->srcu_gp_waiting); 92 WARN_ON(sp->srcu_gp_waiting);
94 WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist)); 93 WARN_ON(sp->srcu_cb_head);
94 WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
95} 95}
96EXPORT_SYMBOL_GPL(cleanup_srcu_struct); 96EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
97 97
98/* 98/*
99 * Counts the new reader in the appropriate per-CPU element of the
100 * srcu_struct. Must be called from process context.
101 * Returns an index that must be passed to the matching srcu_read_unlock().
102 */
103int __srcu_read_lock(struct srcu_struct *sp)
104{
105 int idx;
106
107 idx = READ_ONCE(sp->srcu_idx);
108 WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1);
109 return idx;
110}
111EXPORT_SYMBOL_GPL(__srcu_read_lock);
112
113/*
114 * Removes the count for the old reader from the appropriate element of 99 * Removes the count for the old reader from the appropriate element of
115 * the srcu_struct. Must be called from process context. 100 * the srcu_struct.
116 */ 101 */
117void __srcu_read_unlock(struct srcu_struct *sp, int idx) 102void __srcu_read_unlock(struct srcu_struct *sp, int idx)
118{ 103{
@@ -132,52 +117,44 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
132void srcu_drive_gp(struct work_struct *wp) 117void srcu_drive_gp(struct work_struct *wp)
133{ 118{
134 int idx; 119 int idx;
135 struct rcu_cblist ready_cbs; 120 struct rcu_head *lh;
136 struct srcu_struct *sp;
137 struct rcu_head *rhp; 121 struct rcu_head *rhp;
122 struct srcu_struct *sp;
138 123
139 sp = container_of(wp, struct srcu_struct, srcu_work); 124 sp = container_of(wp, struct srcu_struct, srcu_work);
140 if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist)) 125 if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head))
141 return; /* Already running or nothing to do. */ 126 return; /* Already running or nothing to do. */
142 127
143 /* Tag recently arrived callbacks and wait for readers. */ 128 /* Remove recently arrived callbacks and wait for readers. */
144 WRITE_ONCE(sp->srcu_gp_running, true); 129 WRITE_ONCE(sp->srcu_gp_running, true);
145 rcu_segcblist_accelerate(&sp->srcu_cblist, 130 local_irq_disable();
146 rcu_seq_snap(&sp->srcu_gp_seq)); 131 lh = sp->srcu_cb_head;
147 rcu_seq_start(&sp->srcu_gp_seq); 132 sp->srcu_cb_head = NULL;
133 sp->srcu_cb_tail = &sp->srcu_cb_head;
134 local_irq_enable();
148 idx = sp->srcu_idx; 135 idx = sp->srcu_idx;
149 WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); 136 WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
150 WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ 137 WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
151 swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); 138 swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
152 WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ 139 WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
153 rcu_seq_end(&sp->srcu_gp_seq); 140
154 141 /* Invoke the callbacks we removed above. */
155 /* Update callback list based on GP, and invoke ready callbacks. */ 142 while (lh) {
156 rcu_segcblist_advance(&sp->srcu_cblist, 143 rhp = lh;
157 rcu_seq_current(&sp->srcu_gp_seq)); 144 lh = lh->next;
158 if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { 145 local_bh_disable();
159 rcu_cblist_init(&ready_cbs); 146 rhp->func(rhp);
160 local_irq_disable(); 147 local_bh_enable();
161 rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs);
162 local_irq_enable();
163 rhp = rcu_cblist_dequeue(&ready_cbs);
164 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
165 local_bh_disable();
166 rhp->func(rhp);
167 local_bh_enable();
168 }
169 local_irq_disable();
170 rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs);
171 local_irq_enable();
172 } 148 }
173 WRITE_ONCE(sp->srcu_gp_running, false);
174 149
175 /* 150 /*
176 * If more callbacks, reschedule ourselves. This can race with 151 * Enable rescheduling, and if there are more callbacks,
177 * a call_srcu() at interrupt level, but the ->srcu_gp_running 152 * reschedule ourselves. This can race with a call_srcu()
178 * checks will straighten that out. 153 * at interrupt level, but the ->srcu_gp_running checks will
154 * straighten that out.
179 */ 155 */
180 if (!rcu_segcblist_empty(&sp->srcu_cblist)) 156 WRITE_ONCE(sp->srcu_gp_running, false);
157 if (READ_ONCE(sp->srcu_cb_head))
181 schedule_work(&sp->srcu_work); 158 schedule_work(&sp->srcu_work);
182} 159}
183EXPORT_SYMBOL_GPL(srcu_drive_gp); 160EXPORT_SYMBOL_GPL(srcu_drive_gp);
@@ -186,14 +163,16 @@ EXPORT_SYMBOL_GPL(srcu_drive_gp);
186 * Enqueue an SRCU callback on the specified srcu_struct structure, 163 * Enqueue an SRCU callback on the specified srcu_struct structure,
187 * initiating grace-period processing if it is not already running. 164 * initiating grace-period processing if it is not already running.
188 */ 165 */
189void call_srcu(struct srcu_struct *sp, struct rcu_head *head, 166void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
190 rcu_callback_t func) 167 rcu_callback_t func)
191{ 168{
192 unsigned long flags; 169 unsigned long flags;
193 170
194 head->func = func; 171 rhp->func = func;
172 rhp->next = NULL;
195 local_irq_save(flags); 173 local_irq_save(flags);
196 rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); 174 *sp->srcu_cb_tail = rhp;
175 sp->srcu_cb_tail = &rhp->next;
197 local_irq_restore(flags); 176 local_irq_restore(flags);
198 if (!READ_ONCE(sp->srcu_gp_running)) 177 if (!READ_ONCE(sp->srcu_gp_running))
199 schedule_work(&sp->srcu_work); 178 schedule_work(&sp->srcu_work);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 3ae8474557df..d0ca524bf042 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -40,9 +40,15 @@
40#include "rcu.h" 40#include "rcu.h"
41#include "rcu_segcblist.h" 41#include "rcu_segcblist.h"
42 42
43ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ 43/* Holdoff in nanoseconds for auto-expediting. */
44#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000)
45static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF;
44module_param(exp_holdoff, ulong, 0444); 46module_param(exp_holdoff, ulong, 0444);
45 47
48/* Overflow-check frequency. N bits roughly says every 2**N grace periods. */
49static ulong counter_wrap_check = (ULONG_MAX >> 2);
50module_param(counter_wrap_check, ulong, 0444);
51
46static void srcu_invoke_callbacks(struct work_struct *work); 52static void srcu_invoke_callbacks(struct work_struct *work);
47static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); 53static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
48 54
@@ -70,7 +76,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
70 76
71 /* Each pass through this loop initializes one srcu_node structure. */ 77 /* Each pass through this loop initializes one srcu_node structure. */
72 rcu_for_each_node_breadth_first(sp, snp) { 78 rcu_for_each_node_breadth_first(sp, snp) {
73 spin_lock_init(&snp->lock); 79 raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
74 WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != 80 WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
75 ARRAY_SIZE(snp->srcu_data_have_cbs)); 81 ARRAY_SIZE(snp->srcu_data_have_cbs));
76 for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { 82 for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
@@ -104,7 +110,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
104 snp_first = sp->level[level]; 110 snp_first = sp->level[level];
105 for_each_possible_cpu(cpu) { 111 for_each_possible_cpu(cpu) {
106 sdp = per_cpu_ptr(sp->sda, cpu); 112 sdp = per_cpu_ptr(sp->sda, cpu);
107 spin_lock_init(&sdp->lock); 113 raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
108 rcu_segcblist_init(&sdp->srcu_cblist); 114 rcu_segcblist_init(&sdp->srcu_cblist);
109 sdp->srcu_cblist_invoking = false; 115 sdp->srcu_cblist_invoking = false;
110 sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; 116 sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
@@ -163,7 +169,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
163 /* Don't re-initialize a lock while it is held. */ 169 /* Don't re-initialize a lock while it is held. */
164 debug_check_no_locks_freed((void *)sp, sizeof(*sp)); 170 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
165 lockdep_init_map(&sp->dep_map, name, key, 0); 171 lockdep_init_map(&sp->dep_map, name, key, 0);
166 spin_lock_init(&sp->gp_lock); 172 raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
167 return init_srcu_struct_fields(sp, false); 173 return init_srcu_struct_fields(sp, false);
168} 174}
169EXPORT_SYMBOL_GPL(__init_srcu_struct); 175EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -180,7 +186,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
180 */ 186 */
181int init_srcu_struct(struct srcu_struct *sp) 187int init_srcu_struct(struct srcu_struct *sp)
182{ 188{
183 spin_lock_init(&sp->gp_lock); 189 raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
184 return init_srcu_struct_fields(sp, false); 190 return init_srcu_struct_fields(sp, false);
185} 191}
186EXPORT_SYMBOL_GPL(init_srcu_struct); 192EXPORT_SYMBOL_GPL(init_srcu_struct);
@@ -191,7 +197,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
191 * First-use initialization of statically allocated srcu_struct 197 * First-use initialization of statically allocated srcu_struct
192 * structure. Wiring up the combining tree is more than can be 198 * structure. Wiring up the combining tree is more than can be
193 * done with compile-time initialization, so this check is added 199 * done with compile-time initialization, so this check is added
194 * to each update-side SRCU primitive. Use ->gp_lock, which -is- 200 * to each update-side SRCU primitive. Use sp->lock, which -is-
195 * compile-time initialized, to resolve races involving multiple 201 * compile-time initialized, to resolve races involving multiple
196 * CPUs trying to garner first-use privileges. 202 * CPUs trying to garner first-use privileges.
197 */ 203 */
@@ -203,13 +209,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp)
203 /* The smp_load_acquire() pairs with the smp_store_release(). */ 209 /* The smp_load_acquire() pairs with the smp_store_release(). */
204 if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ 210 if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
205 return; /* Already initialized. */ 211 return; /* Already initialized. */
206 spin_lock_irqsave(&sp->gp_lock, flags); 212 raw_spin_lock_irqsave_rcu_node(sp, flags);
207 if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { 213 if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
208 spin_unlock_irqrestore(&sp->gp_lock, flags); 214 raw_spin_unlock_irqrestore_rcu_node(sp, flags);
209 return; 215 return;
210 } 216 }
211 init_srcu_struct_fields(sp, true); 217 init_srcu_struct_fields(sp, true);
212 spin_unlock_irqrestore(&sp->gp_lock, flags); 218 raw_spin_unlock_irqrestore_rcu_node(sp, flags);
213} 219}
214 220
215/* 221/*
@@ -275,15 +281,20 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
275 * not mean that there are no more readers, as one could have read 281 * not mean that there are no more readers, as one could have read
276 * the current index but not have incremented the lock counter yet. 282 * the current index but not have incremented the lock counter yet.
277 * 283 *
278 * Possible bug: There is no guarantee that there haven't been 284 * So suppose that the updater is preempted here for so long
279 * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were 285 * that more than ULONG_MAX non-nested readers come and go in
280 * counted, meaning that this could return true even if there are 286 * the meantime. It turns out that this cannot result in overflow
281 * still active readers. Since there are no memory barriers around 287 * because if a reader modifies its unlock count after we read it
282 * srcu_flip(), the CPU is not required to increment ->srcu_idx 288 * above, then that reader's next load of ->srcu_idx is guaranteed
283 * before running srcu_readers_unlock_idx(), which means that there 289 * to get the new value, which will cause it to operate on the
284 * could be an arbitrarily large number of critical sections that 290 * other bank of counters, where it cannot contribute to the
285 * execute after srcu_readers_unlock_idx() but use the old value 291 * overflow of these counters. This means that there is a maximum
286 * of ->srcu_idx. 292 * of 2*NR_CPUS increments, which cannot overflow given current
293 * systems, especially not on 64-bit systems.
294 *
295 * OK, how about nesting? This does impose a limit on nesting
296 * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient,
297 * especially on 64-bit systems.
287 */ 298 */
288 return srcu_readers_lock_idx(sp, idx) == unlocks; 299 return srcu_readers_lock_idx(sp, idx) == unlocks;
289} 300}
@@ -357,7 +368,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
357 368
358/* 369/*
359 * Counts the new reader in the appropriate per-CPU element of the 370 * Counts the new reader in the appropriate per-CPU element of the
360 * srcu_struct. Must be called from process context. 371 * srcu_struct.
361 * Returns an index that must be passed to the matching srcu_read_unlock(). 372 * Returns an index that must be passed to the matching srcu_read_unlock().
362 */ 373 */
363int __srcu_read_lock(struct srcu_struct *sp) 374int __srcu_read_lock(struct srcu_struct *sp)
@@ -365,7 +376,7 @@ int __srcu_read_lock(struct srcu_struct *sp)
365 int idx; 376 int idx;
366 377
367 idx = READ_ONCE(sp->srcu_idx) & 0x1; 378 idx = READ_ONCE(sp->srcu_idx) & 0x1;
368 __this_cpu_inc(sp->sda->srcu_lock_count[idx]); 379 this_cpu_inc(sp->sda->srcu_lock_count[idx]);
369 smp_mb(); /* B */ /* Avoid leaking the critical section. */ 380 smp_mb(); /* B */ /* Avoid leaking the critical section. */
370 return idx; 381 return idx;
371} 382}
@@ -375,7 +386,6 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
375 * Removes the count for the old reader from the appropriate per-CPU 386 * Removes the count for the old reader from the appropriate per-CPU
376 * element of the srcu_struct. Note that this may well be a different 387 * element of the srcu_struct. Note that this may well be a different
377 * CPU than that which was incremented by the corresponding srcu_read_lock(). 388 * CPU than that which was incremented by the corresponding srcu_read_lock().
378 * Must be called from process context.
379 */ 389 */
380void __srcu_read_unlock(struct srcu_struct *sp, int idx) 390void __srcu_read_unlock(struct srcu_struct *sp, int idx)
381{ 391{
@@ -401,8 +411,7 @@ static void srcu_gp_start(struct srcu_struct *sp)
401 struct srcu_data *sdp = this_cpu_ptr(sp->sda); 411 struct srcu_data *sdp = this_cpu_ptr(sp->sda);
402 int state; 412 int state;
403 413
404 RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock), 414 lockdep_assert_held(&sp->lock);
405 "Invoked srcu_gp_start() without ->gp_lock!");
406 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); 415 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
407 rcu_segcblist_advance(&sdp->srcu_cblist, 416 rcu_segcblist_advance(&sdp->srcu_cblist,
408 rcu_seq_current(&sp->srcu_gp_seq)); 417 rcu_seq_current(&sp->srcu_gp_seq));
@@ -490,17 +499,20 @@ static void srcu_gp_end(struct srcu_struct *sp)
490{ 499{
491 unsigned long cbdelay; 500 unsigned long cbdelay;
492 bool cbs; 501 bool cbs;
502 int cpu;
503 unsigned long flags;
493 unsigned long gpseq; 504 unsigned long gpseq;
494 int idx; 505 int idx;
495 int idxnext; 506 int idxnext;
496 unsigned long mask; 507 unsigned long mask;
508 struct srcu_data *sdp;
497 struct srcu_node *snp; 509 struct srcu_node *snp;
498 510
499 /* Prevent more than one additional grace period. */ 511 /* Prevent more than one additional grace period. */
500 mutex_lock(&sp->srcu_cb_mutex); 512 mutex_lock(&sp->srcu_cb_mutex);
501 513
502 /* End the current grace period. */ 514 /* End the current grace period. */
503 spin_lock_irq(&sp->gp_lock); 515 raw_spin_lock_irq_rcu_node(sp);
504 idx = rcu_seq_state(sp->srcu_gp_seq); 516 idx = rcu_seq_state(sp->srcu_gp_seq);
505 WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); 517 WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
506 cbdelay = srcu_get_delay(sp); 518 cbdelay = srcu_get_delay(sp);
@@ -509,7 +521,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
509 gpseq = rcu_seq_current(&sp->srcu_gp_seq); 521 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
510 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) 522 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
511 sp->srcu_gp_seq_needed_exp = gpseq; 523 sp->srcu_gp_seq_needed_exp = gpseq;
512 spin_unlock_irq(&sp->gp_lock); 524 raw_spin_unlock_irq_rcu_node(sp);
513 mutex_unlock(&sp->srcu_gp_mutex); 525 mutex_unlock(&sp->srcu_gp_mutex);
514 /* A new grace period can start at this point. But only one. */ 526 /* A new grace period can start at this point. But only one. */
515 527
@@ -517,7 +529,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
517 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); 529 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
518 idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); 530 idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
519 rcu_for_each_node_breadth_first(sp, snp) { 531 rcu_for_each_node_breadth_first(sp, snp) {
520 spin_lock_irq(&snp->lock); 532 raw_spin_lock_irq_rcu_node(snp);
521 cbs = false; 533 cbs = false;
522 if (snp >= sp->level[rcu_num_lvls - 1]) 534 if (snp >= sp->level[rcu_num_lvls - 1])
523 cbs = snp->srcu_have_cbs[idx] == gpseq; 535 cbs = snp->srcu_have_cbs[idx] == gpseq;
@@ -527,28 +539,37 @@ static void srcu_gp_end(struct srcu_struct *sp)
527 snp->srcu_gp_seq_needed_exp = gpseq; 539 snp->srcu_gp_seq_needed_exp = gpseq;
528 mask = snp->srcu_data_have_cbs[idx]; 540 mask = snp->srcu_data_have_cbs[idx];
529 snp->srcu_data_have_cbs[idx] = 0; 541 snp->srcu_data_have_cbs[idx] = 0;
530 spin_unlock_irq(&snp->lock); 542 raw_spin_unlock_irq_rcu_node(snp);
531 if (cbs) { 543 if (cbs)
532 smp_mb(); /* GP end before CB invocation. */
533 srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); 544 srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
534 } 545
546 /* Occasionally prevent srcu_data counter wrap. */
547 if (!(gpseq & counter_wrap_check))
548 for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
549 sdp = per_cpu_ptr(sp->sda, cpu);
550 raw_spin_lock_irqsave_rcu_node(sdp, flags);
551 if (ULONG_CMP_GE(gpseq,
552 sdp->srcu_gp_seq_needed + 100))
553 sdp->srcu_gp_seq_needed = gpseq;
554 raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
555 }
535 } 556 }
536 557
537 /* Callback initiation done, allow grace periods after next. */ 558 /* Callback initiation done, allow grace periods after next. */
538 mutex_unlock(&sp->srcu_cb_mutex); 559 mutex_unlock(&sp->srcu_cb_mutex);
539 560
540 /* Start a new grace period if needed. */ 561 /* Start a new grace period if needed. */
541 spin_lock_irq(&sp->gp_lock); 562 raw_spin_lock_irq_rcu_node(sp);
542 gpseq = rcu_seq_current(&sp->srcu_gp_seq); 563 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
543 if (!rcu_seq_state(gpseq) && 564 if (!rcu_seq_state(gpseq) &&
544 ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { 565 ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
545 srcu_gp_start(sp); 566 srcu_gp_start(sp);
546 spin_unlock_irq(&sp->gp_lock); 567 raw_spin_unlock_irq_rcu_node(sp);
547 /* Throttle expedited grace periods: Should be rare! */ 568 /* Throttle expedited grace periods: Should be rare! */
548 srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff 569 srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
549 ? 0 : SRCU_INTERVAL); 570 ? 0 : SRCU_INTERVAL);
550 } else { 571 } else {
551 spin_unlock_irq(&sp->gp_lock); 572 raw_spin_unlock_irq_rcu_node(sp);
552 } 573 }
553} 574}
554 575
@@ -568,18 +589,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
568 if (rcu_seq_done(&sp->srcu_gp_seq, s) || 589 if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
569 ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) 590 ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
570 return; 591 return;
571 spin_lock_irqsave(&snp->lock, flags); 592 raw_spin_lock_irqsave_rcu_node(snp, flags);
572 if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { 593 if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
573 spin_unlock_irqrestore(&snp->lock, flags); 594 raw_spin_unlock_irqrestore_rcu_node(snp, flags);
574 return; 595 return;
575 } 596 }
576 WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); 597 WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
577 spin_unlock_irqrestore(&snp->lock, flags); 598 raw_spin_unlock_irqrestore_rcu_node(snp, flags);
578 } 599 }
579 spin_lock_irqsave(&sp->gp_lock, flags); 600 raw_spin_lock_irqsave_rcu_node(sp, flags);
580 if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) 601 if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
581 sp->srcu_gp_seq_needed_exp = s; 602 sp->srcu_gp_seq_needed_exp = s;
582 spin_unlock_irqrestore(&sp->gp_lock, flags); 603 raw_spin_unlock_irqrestore_rcu_node(sp, flags);
583} 604}
584 605
585/* 606/*
@@ -601,14 +622,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
601 for (; snp != NULL; snp = snp->srcu_parent) { 622 for (; snp != NULL; snp = snp->srcu_parent) {
602 if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) 623 if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
603 return; /* GP already done and CBs recorded. */ 624 return; /* GP already done and CBs recorded. */
604 spin_lock_irqsave(&snp->lock, flags); 625 raw_spin_lock_irqsave_rcu_node(snp, flags);
605 if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { 626 if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
606 snp_seq = snp->srcu_have_cbs[idx]; 627 snp_seq = snp->srcu_have_cbs[idx];
607 if (snp == sdp->mynode && snp_seq == s) 628 if (snp == sdp->mynode && snp_seq == s)
608 snp->srcu_data_have_cbs[idx] |= sdp->grpmask; 629 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
609 spin_unlock_irqrestore(&snp->lock, flags); 630 raw_spin_unlock_irqrestore_rcu_node(snp, flags);
610 if (snp == sdp->mynode && snp_seq != s) { 631 if (snp == sdp->mynode && snp_seq != s) {
611 smp_mb(); /* CBs after GP! */
612 srcu_schedule_cbs_sdp(sdp, do_norm 632 srcu_schedule_cbs_sdp(sdp, do_norm
613 ? SRCU_INTERVAL 633 ? SRCU_INTERVAL
614 : 0); 634 : 0);
@@ -623,11 +643,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
623 snp->srcu_data_have_cbs[idx] |= sdp->grpmask; 643 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
624 if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) 644 if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
625 snp->srcu_gp_seq_needed_exp = s; 645 snp->srcu_gp_seq_needed_exp = s;
626 spin_unlock_irqrestore(&snp->lock, flags); 646 raw_spin_unlock_irqrestore_rcu_node(snp, flags);
627 } 647 }
628 648
629 /* Top of tree, must ensure the grace period will be started. */ 649 /* Top of tree, must ensure the grace period will be started. */
630 spin_lock_irqsave(&sp->gp_lock, flags); 650 raw_spin_lock_irqsave_rcu_node(sp, flags);
631 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { 651 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
632 /* 652 /*
633 * Record need for grace period s. Pair with load 653 * Record need for grace period s. Pair with load
@@ -646,7 +666,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
646 queue_delayed_work(system_power_efficient_wq, &sp->work, 666 queue_delayed_work(system_power_efficient_wq, &sp->work,
647 srcu_get_delay(sp)); 667 srcu_get_delay(sp));
648 } 668 }
649 spin_unlock_irqrestore(&sp->gp_lock, flags); 669 raw_spin_unlock_irqrestore_rcu_node(sp, flags);
650} 670}
651 671
652/* 672/*
@@ -672,6 +692,16 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
672 */ 692 */
673static void srcu_flip(struct srcu_struct *sp) 693static void srcu_flip(struct srcu_struct *sp)
674{ 694{
695 /*
696 * Ensure that if this updater saw a given reader's increment
697 * from __srcu_read_lock(), that reader was using an old value
698 * of ->srcu_idx. Also ensure that if a given reader sees the
699 * new value of ->srcu_idx, this updater's earlier scans cannot
700 * have seen that reader's increments (which is OK, because this
701 * grace period need not wait on that reader).
702 */
703 smp_mb(); /* E */ /* Pairs with B and C. */
704
675 WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); 705 WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
676 706
677 /* 707 /*
@@ -746,6 +776,13 @@ static bool srcu_might_be_idle(struct srcu_struct *sp)
746} 776}
747 777
748/* 778/*
779 * SRCU callback function to leak a callback.
780 */
781static void srcu_leak_callback(struct rcu_head *rhp)
782{
783}
784
785/*
749 * Enqueue an SRCU callback on the srcu_data structure associated with 786 * Enqueue an SRCU callback on the srcu_data structure associated with
750 * the current CPU and the specified srcu_struct structure, initiating 787 * the current CPU and the specified srcu_struct structure, initiating
751 * grace-period processing if it is not already running. 788 * grace-period processing if it is not already running.
@@ -783,10 +820,16 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
783 struct srcu_data *sdp; 820 struct srcu_data *sdp;
784 821
785 check_init_srcu_struct(sp); 822 check_init_srcu_struct(sp);
823 if (debug_rcu_head_queue(rhp)) {
824 /* Probable double call_srcu(), so leak the callback. */
825 WRITE_ONCE(rhp->func, srcu_leak_callback);
826 WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n");
827 return;
828 }
786 rhp->func = func; 829 rhp->func = func;
787 local_irq_save(flags); 830 local_irq_save(flags);
788 sdp = this_cpu_ptr(sp->sda); 831 sdp = this_cpu_ptr(sp->sda);
789 spin_lock(&sdp->lock); 832 raw_spin_lock_rcu_node(sdp);
790 rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); 833 rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
791 rcu_segcblist_advance(&sdp->srcu_cblist, 834 rcu_segcblist_advance(&sdp->srcu_cblist,
792 rcu_seq_current(&sp->srcu_gp_seq)); 835 rcu_seq_current(&sp->srcu_gp_seq));
@@ -800,13 +843,30 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
800 sdp->srcu_gp_seq_needed_exp = s; 843 sdp->srcu_gp_seq_needed_exp = s;
801 needexp = true; 844 needexp = true;
802 } 845 }
803 spin_unlock_irqrestore(&sdp->lock, flags); 846 raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
804 if (needgp) 847 if (needgp)
805 srcu_funnel_gp_start(sp, sdp, s, do_norm); 848 srcu_funnel_gp_start(sp, sdp, s, do_norm);
806 else if (needexp) 849 else if (needexp)
807 srcu_funnel_exp_start(sp, sdp->mynode, s); 850 srcu_funnel_exp_start(sp, sdp->mynode, s);
808} 851}
809 852
853/**
854 * call_srcu() - Queue a callback for invocation after an SRCU grace period
855 * @sp: srcu_struct in queue the callback
856 * @head: structure to be used for queueing the SRCU callback.
857 * @func: function to be invoked after the SRCU grace period
858 *
859 * The callback function will be invoked some time after a full SRCU
860 * grace period elapses, in other words after all pre-existing SRCU
861 * read-side critical sections have completed. However, the callback
862 * function might well execute concurrently with other SRCU read-side
863 * critical sections that started after call_srcu() was invoked. SRCU
864 * read-side critical sections are delimited by srcu_read_lock() and
865 * srcu_read_unlock(), and may be nested.
866 *
867 * The callback will be invoked from process context, but must nevertheless
868 * be fast and must not block.
869 */
810void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, 870void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
811 rcu_callback_t func) 871 rcu_callback_t func)
812{ 872{
@@ -954,13 +1014,16 @@ void srcu_barrier(struct srcu_struct *sp)
954 */ 1014 */
955 for_each_possible_cpu(cpu) { 1015 for_each_possible_cpu(cpu) {
956 sdp = per_cpu_ptr(sp->sda, cpu); 1016 sdp = per_cpu_ptr(sp->sda, cpu);
957 spin_lock_irq(&sdp->lock); 1017 raw_spin_lock_irq_rcu_node(sdp);
958 atomic_inc(&sp->srcu_barrier_cpu_cnt); 1018 atomic_inc(&sp->srcu_barrier_cpu_cnt);
959 sdp->srcu_barrier_head.func = srcu_barrier_cb; 1019 sdp->srcu_barrier_head.func = srcu_barrier_cb;
1020 debug_rcu_head_queue(&sdp->srcu_barrier_head);
960 if (!rcu_segcblist_entrain(&sdp->srcu_cblist, 1021 if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
961 &sdp->srcu_barrier_head, 0)) 1022 &sdp->srcu_barrier_head, 0)) {
1023 debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
962 atomic_dec(&sp->srcu_barrier_cpu_cnt); 1024 atomic_dec(&sp->srcu_barrier_cpu_cnt);
963 spin_unlock_irq(&sdp->lock); 1025 }
1026 raw_spin_unlock_irq_rcu_node(sdp);
964 } 1027 }
965 1028
966 /* Remove the initial count, at which point reaching zero can happen. */ 1029 /* Remove the initial count, at which point reaching zero can happen. */
@@ -1009,17 +1072,17 @@ static void srcu_advance_state(struct srcu_struct *sp)
1009 */ 1072 */
1010 idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ 1073 idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
1011 if (idx == SRCU_STATE_IDLE) { 1074 if (idx == SRCU_STATE_IDLE) {
1012 spin_lock_irq(&sp->gp_lock); 1075 raw_spin_lock_irq_rcu_node(sp);
1013 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { 1076 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
1014 WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); 1077 WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
1015 spin_unlock_irq(&sp->gp_lock); 1078 raw_spin_unlock_irq_rcu_node(sp);
1016 mutex_unlock(&sp->srcu_gp_mutex); 1079 mutex_unlock(&sp->srcu_gp_mutex);
1017 return; 1080 return;
1018 } 1081 }
1019 idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); 1082 idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
1020 if (idx == SRCU_STATE_IDLE) 1083 if (idx == SRCU_STATE_IDLE)
1021 srcu_gp_start(sp); 1084 srcu_gp_start(sp);
1022 spin_unlock_irq(&sp->gp_lock); 1085 raw_spin_unlock_irq_rcu_node(sp);
1023 if (idx != SRCU_STATE_IDLE) { 1086 if (idx != SRCU_STATE_IDLE) {
1024 mutex_unlock(&sp->srcu_gp_mutex); 1087 mutex_unlock(&sp->srcu_gp_mutex);
1025 return; /* Someone else started the grace period. */ 1088 return; /* Someone else started the grace period. */
@@ -1068,22 +1131,22 @@ static void srcu_invoke_callbacks(struct work_struct *work)
1068 sdp = container_of(work, struct srcu_data, work.work); 1131 sdp = container_of(work, struct srcu_data, work.work);
1069 sp = sdp->sp; 1132 sp = sdp->sp;
1070 rcu_cblist_init(&ready_cbs); 1133 rcu_cblist_init(&ready_cbs);
1071 spin_lock_irq(&sdp->lock); 1134 raw_spin_lock_irq_rcu_node(sdp);
1072 smp_mb(); /* Old grace periods before callback invocation! */
1073 rcu_segcblist_advance(&sdp->srcu_cblist, 1135 rcu_segcblist_advance(&sdp->srcu_cblist,
1074 rcu_seq_current(&sp->srcu_gp_seq)); 1136 rcu_seq_current(&sp->srcu_gp_seq));
1075 if (sdp->srcu_cblist_invoking || 1137 if (sdp->srcu_cblist_invoking ||
1076 !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { 1138 !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
1077 spin_unlock_irq(&sdp->lock); 1139 raw_spin_unlock_irq_rcu_node(sdp);
1078 return; /* Someone else on the job or nothing to do. */ 1140 return; /* Someone else on the job or nothing to do. */
1079 } 1141 }
1080 1142
1081 /* We are on the job! Extract and invoke ready callbacks. */ 1143 /* We are on the job! Extract and invoke ready callbacks. */
1082 sdp->srcu_cblist_invoking = true; 1144 sdp->srcu_cblist_invoking = true;
1083 rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); 1145 rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
1084 spin_unlock_irq(&sdp->lock); 1146 raw_spin_unlock_irq_rcu_node(sdp);
1085 rhp = rcu_cblist_dequeue(&ready_cbs); 1147 rhp = rcu_cblist_dequeue(&ready_cbs);
1086 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { 1148 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
1149 debug_rcu_head_unqueue(rhp);
1087 local_bh_disable(); 1150 local_bh_disable();
1088 rhp->func(rhp); 1151 rhp->func(rhp);
1089 local_bh_enable(); 1152 local_bh_enable();
@@ -1093,13 +1156,13 @@ static void srcu_invoke_callbacks(struct work_struct *work)
1093 * Update counts, accelerate new callbacks, and if needed, 1156 * Update counts, accelerate new callbacks, and if needed,
1094 * schedule another round of callback invocation. 1157 * schedule another round of callback invocation.
1095 */ 1158 */
1096 spin_lock_irq(&sdp->lock); 1159 raw_spin_lock_irq_rcu_node(sdp);
1097 rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); 1160 rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
1098 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, 1161 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
1099 rcu_seq_snap(&sp->srcu_gp_seq)); 1162 rcu_seq_snap(&sp->srcu_gp_seq));
1100 sdp->srcu_cblist_invoking = false; 1163 sdp->srcu_cblist_invoking = false;
1101 more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); 1164 more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
1102 spin_unlock_irq(&sdp->lock); 1165 raw_spin_unlock_irq_rcu_node(sdp);
1103 if (more) 1166 if (more)
1104 srcu_schedule_cbs_sdp(sdp, 0); 1167 srcu_schedule_cbs_sdp(sdp, 0);
1105} 1168}
@@ -1112,7 +1175,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
1112{ 1175{
1113 bool pushgp = true; 1176 bool pushgp = true;
1114 1177
1115 spin_lock_irq(&sp->gp_lock); 1178 raw_spin_lock_irq_rcu_node(sp);
1116 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { 1179 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
1117 if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { 1180 if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
1118 /* All requests fulfilled, time to go idle. */ 1181 /* All requests fulfilled, time to go idle. */
@@ -1122,7 +1185,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
1122 /* Outstanding request and no GP. Start one. */ 1185 /* Outstanding request and no GP. Start one. */
1123 srcu_gp_start(sp); 1186 srcu_gp_start(sp);
1124 } 1187 }
1125 spin_unlock_irq(&sp->gp_lock); 1188 raw_spin_unlock_irq_rcu_node(sp);
1126 1189
1127 if (pushgp) 1190 if (pushgp)
1128 queue_delayed_work(system_power_efficient_wq, &sp->work, delay); 1191 queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
@@ -1153,3 +1216,12 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
1153 *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); 1216 *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed);
1154} 1217}
1155EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); 1218EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
1219
1220static int __init srcu_bootup_announce(void)
1221{
1222 pr_info("Hierarchical SRCU implementation.\n");
1223 if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
1224 pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
1225 return 0;
1226}
1227early_initcall(srcu_bootup_announce);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index e5385731e391..f8488965250f 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,15 +35,26 @@
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/cpu.h> 36#include <linux/cpu.h>
37#include <linux/prefetch.h> 37#include <linux/prefetch.h>
38#include <linux/trace_events.h>
39 38
40#include "rcu.h" 39#include "rcu.h"
41 40
42/* Forward declarations for tiny_plugin.h. */ 41/* Global control variables for rcupdate callback mechanism. */
43struct rcu_ctrlblk; 42struct rcu_ctrlblk {
44static void __call_rcu(struct rcu_head *head, 43 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
45 rcu_callback_t func, 44 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
46 struct rcu_ctrlblk *rcp); 45 struct rcu_head **curtail; /* ->next pointer of last CB. */
46};
47
48/* Definition for rcupdate control block. */
49static struct rcu_ctrlblk rcu_sched_ctrlblk = {
50 .donetail = &rcu_sched_ctrlblk.rcucblist,
51 .curtail = &rcu_sched_ctrlblk.rcucblist,
52};
53
54static struct rcu_ctrlblk rcu_bh_ctrlblk = {
55 .donetail = &rcu_bh_ctrlblk.rcucblist,
56 .curtail = &rcu_bh_ctrlblk.rcucblist,
57};
47 58
48#include "tiny_plugin.h" 59#include "tiny_plugin.h"
49 60
@@ -59,19 +70,6 @@ void rcu_barrier_sched(void)
59} 70}
60EXPORT_SYMBOL(rcu_barrier_sched); 71EXPORT_SYMBOL(rcu_barrier_sched);
61 72
62#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
63
64/*
65 * Test whether RCU thinks that the current CPU is idle.
66 */
67bool notrace __rcu_is_watching(void)
68{
69 return true;
70}
71EXPORT_SYMBOL(__rcu_is_watching);
72
73#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
74
75/* 73/*
76 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 74 * Helper function for rcu_sched_qs() and rcu_bh_qs().
77 * Also irqs are disabled to avoid confusion due to interrupt handlers 75 * Also irqs are disabled to avoid confusion due to interrupt handlers
@@ -79,7 +77,6 @@ EXPORT_SYMBOL(__rcu_is_watching);
79 */ 77 */
80static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 78static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
81{ 79{
82 RCU_TRACE(reset_cpu_stall_ticks(rcp);)
83 if (rcp->donetail != rcp->curtail) { 80 if (rcp->donetail != rcp->curtail) {
84 rcp->donetail = rcp->curtail; 81 rcp->donetail = rcp->curtail;
85 return 1; 82 return 1;
@@ -125,7 +122,6 @@ void rcu_bh_qs(void)
125 */ 122 */
126void rcu_check_callbacks(int user) 123void rcu_check_callbacks(int user)
127{ 124{
128 RCU_TRACE(check_cpu_stalls();)
129 if (user) 125 if (user)
130 rcu_sched_qs(); 126 rcu_sched_qs();
131 else if (!in_softirq()) 127 else if (!in_softirq())
@@ -140,10 +136,8 @@ void rcu_check_callbacks(int user)
140 */ 136 */
141static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 137static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
142{ 138{
143 const char *rn = NULL;
144 struct rcu_head *next, *list; 139 struct rcu_head *next, *list;
145 unsigned long flags; 140 unsigned long flags;
146 RCU_TRACE(int cb_count = 0;)
147 141
148 /* Move the ready-to-invoke callbacks to a local list. */ 142 /* Move the ready-to-invoke callbacks to a local list. */
149 local_irq_save(flags); 143 local_irq_save(flags);
@@ -152,7 +146,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
152 local_irq_restore(flags); 146 local_irq_restore(flags);
153 return; 147 return;
154 } 148 }
155 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);)
156 list = rcp->rcucblist; 149 list = rcp->rcucblist;
157 rcp->rcucblist = *rcp->donetail; 150 rcp->rcucblist = *rcp->donetail;
158 *rcp->donetail = NULL; 151 *rcp->donetail = NULL;
@@ -162,22 +155,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
162 local_irq_restore(flags); 155 local_irq_restore(flags);
163 156
164 /* Invoke the callbacks on the local list. */ 157 /* Invoke the callbacks on the local list. */
165 RCU_TRACE(rn = rcp->name;)
166 while (list) { 158 while (list) {
167 next = list->next; 159 next = list->next;
168 prefetch(next); 160 prefetch(next);
169 debug_rcu_head_unqueue(list); 161 debug_rcu_head_unqueue(list);
170 local_bh_disable(); 162 local_bh_disable();
171 __rcu_reclaim(rn, list); 163 __rcu_reclaim("", list);
172 local_bh_enable(); 164 local_bh_enable();
173 list = next; 165 list = next;
174 RCU_TRACE(cb_count++;)
175 } 166 }
176 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);)
177 RCU_TRACE(trace_rcu_batch_end(rcp->name,
178 cb_count, 0, need_resched(),
179 is_idle_task(current),
180 false));
181} 167}
182 168
183static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) 169static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
@@ -221,7 +207,6 @@ static void __call_rcu(struct rcu_head *head,
221 local_irq_save(flags); 207 local_irq_save(flags);
222 *rcp->curtail = head; 208 *rcp->curtail = head;
223 rcp->curtail = &head->next; 209 rcp->curtail = &head->next;
224 RCU_TRACE(rcp->qlen++;)
225 local_irq_restore(flags); 210 local_irq_restore(flags);
226 211
227 if (unlikely(is_idle_task(current))) { 212 if (unlikely(is_idle_task(current))) {
@@ -254,8 +239,5 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
254void __init rcu_init(void) 239void __init rcu_init(void)
255{ 240{
256 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 241 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
257 RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);)
258 RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);)
259
260 rcu_early_boot_tests(); 242 rcu_early_boot_tests();
261} 243}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 371034e77f87..f0a01b2a3062 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -22,36 +22,6 @@
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#include <linux/kthread.h>
26#include <linux/init.h>
27#include <linux/debugfs.h>
28#include <linux/seq_file.h>
29
30/* Global control variables for rcupdate callback mechanism. */
31struct rcu_ctrlblk {
32 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
34 struct rcu_head **curtail; /* ->next pointer of last CB. */
35 RCU_TRACE(long qlen); /* Number of pending CBs. */
36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
39 RCU_TRACE(const char *name); /* Name of RCU type. */
40};
41
42/* Definition for rcupdate control block. */
43static struct rcu_ctrlblk rcu_sched_ctrlblk = {
44 .donetail = &rcu_sched_ctrlblk.rcucblist,
45 .curtail = &rcu_sched_ctrlblk.rcucblist,
46 RCU_TRACE(.name = "rcu_sched")
47};
48
49static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist,
52 RCU_TRACE(.name = "rcu_bh")
53};
54
55#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) 25#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
56#include <linux/kernel_stat.h> 26#include <linux/kernel_stat.h>
57 27
@@ -75,96 +45,3 @@ void __init rcu_scheduler_starting(void)
75} 45}
76 46
77#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ 47#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
78
79#ifdef CONFIG_RCU_TRACE
80
81static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
82{
83 unsigned long flags;
84
85 local_irq_save(flags);
86 rcp->qlen -= n;
87 local_irq_restore(flags);
88}
89
90/*
91 * Dump statistics for TINY_RCU, such as they are.
92 */
93static int show_tiny_stats(struct seq_file *m, void *unused)
94{
95 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
96 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
97 return 0;
98}
99
100static int show_tiny_stats_open(struct inode *inode, struct file *file)
101{
102 return single_open(file, show_tiny_stats, NULL);
103}
104
105static const struct file_operations show_tiny_stats_fops = {
106 .owner = THIS_MODULE,
107 .open = show_tiny_stats_open,
108 .read = seq_read,
109 .llseek = seq_lseek,
110 .release = single_release,
111};
112
113static struct dentry *rcudir;
114
115static int __init rcutiny_trace_init(void)
116{
117 struct dentry *retval;
118
119 rcudir = debugfs_create_dir("rcu", NULL);
120 if (!rcudir)
121 goto free_out;
122 retval = debugfs_create_file("rcudata", 0444, rcudir,
123 NULL, &show_tiny_stats_fops);
124 if (!retval)
125 goto free_out;
126 return 0;
127free_out:
128 debugfs_remove_recursive(rcudir);
129 return 1;
130}
131device_initcall(rcutiny_trace_init);
132
133static void check_cpu_stall(struct rcu_ctrlblk *rcp)
134{
135 unsigned long j;
136 unsigned long js;
137
138 if (rcu_cpu_stall_suppress)
139 return;
140 rcp->ticks_this_gp++;
141 j = jiffies;
142 js = READ_ONCE(rcp->jiffies_stall);
143 if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
144 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
145 rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
146 jiffies - rcp->gp_start, rcp->qlen);
147 dump_stack();
148 WRITE_ONCE(rcp->jiffies_stall,
149 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
150 } else if (ULONG_CMP_GE(j, js)) {
151 WRITE_ONCE(rcp->jiffies_stall,
152 jiffies + rcu_jiffies_till_stall_check());
153 }
154}
155
156static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
157{
158 rcp->ticks_this_gp = 0;
159 rcp->gp_start = jiffies;
160 WRITE_ONCE(rcp->jiffies_stall,
161 jiffies + rcu_jiffies_till_stall_check());
162}
163
164static void check_cpu_stalls(void)
165{
166 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);)
167 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);)
168}
169
170#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e354e475e645..51d4c3acf32d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -168,35 +168,17 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp,
168static void sync_sched_exp_online_cleanup(int cpu); 168static void sync_sched_exp_online_cleanup(int cpu);
169 169
170/* rcuc/rcub kthread realtime priority */ 170/* rcuc/rcub kthread realtime priority */
171#ifdef CONFIG_RCU_KTHREAD_PRIO
172static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
173#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */
174static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; 171static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
175#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */
176module_param(kthread_prio, int, 0644); 172module_param(kthread_prio, int, 0644);
177 173
178/* Delay in jiffies for grace-period initialization delays, debug only. */ 174/* Delay in jiffies for grace-period initialization delays, debug only. */
179 175
180#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT 176static int gp_preinit_delay;
181static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY; 177module_param(gp_preinit_delay, int, 0444);
182module_param(gp_preinit_delay, int, 0644); 178static int gp_init_delay;
183#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ 179module_param(gp_init_delay, int, 0444);
184static const int gp_preinit_delay; 180static int gp_cleanup_delay;
185#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ 181module_param(gp_cleanup_delay, int, 0444);
186
187#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
188static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
189module_param(gp_init_delay, int, 0644);
190#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
191static const int gp_init_delay;
192#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
193
194#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP
195static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY;
196module_param(gp_cleanup_delay, int, 0644);
197#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
198static const int gp_cleanup_delay;
199#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
200 182
201/* 183/*
202 * Number of grace periods between delays, normalized by the duration of 184 * Number of grace periods between delays, normalized by the duration of
@@ -250,6 +232,7 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
250 */ 232 */
251void rcu_sched_qs(void) 233void rcu_sched_qs(void)
252{ 234{
235 RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!");
253 if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) 236 if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s))
254 return; 237 return;
255 trace_rcu_grace_period(TPS("rcu_sched"), 238 trace_rcu_grace_period(TPS("rcu_sched"),
@@ -265,6 +248,7 @@ void rcu_sched_qs(void)
265 248
266void rcu_bh_qs(void) 249void rcu_bh_qs(void)
267{ 250{
251 RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!");
268 if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { 252 if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
269 trace_rcu_grace_period(TPS("rcu_bh"), 253 trace_rcu_grace_period(TPS("rcu_bh"),
270 __this_cpu_read(rcu_bh_data.gpnum), 254 __this_cpu_read(rcu_bh_data.gpnum),
@@ -286,10 +270,6 @@ void rcu_bh_qs(void)
286static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 270static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
287 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 271 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
288 .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), 272 .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
289#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
290 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
291 .dynticks_idle = ATOMIC_INIT(1),
292#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
293}; 273};
294 274
295/* 275/*
@@ -478,7 +458,7 @@ void rcu_note_context_switch(bool preempt)
478 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 458 barrier(); /* Avoid RCU read-side critical sections leaking down. */
479 trace_rcu_utilization(TPS("Start context switch")); 459 trace_rcu_utilization(TPS("Start context switch"));
480 rcu_sched_qs(); 460 rcu_sched_qs();
481 rcu_preempt_note_context_switch(); 461 rcu_preempt_note_context_switch(preempt);
482 /* Load rcu_urgent_qs before other flags. */ 462 /* Load rcu_urgent_qs before other flags. */
483 if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) 463 if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs)))
484 goto out; 464 goto out;
@@ -534,9 +514,12 @@ void rcu_all_qs(void)
534} 514}
535EXPORT_SYMBOL_GPL(rcu_all_qs); 515EXPORT_SYMBOL_GPL(rcu_all_qs);
536 516
537static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 517#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */
538static long qhimark = 10000; /* If this many pending, ignore blimit. */ 518static long blimit = DEFAULT_RCU_BLIMIT;
539static long qlowmark = 100; /* Once only this many pending, use blimit. */ 519#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
520static long qhimark = DEFAULT_RCU_QHIMARK;
521#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */
522static long qlowmark = DEFAULT_RCU_QLOMARK;
540 523
541module_param(blimit, long, 0444); 524module_param(blimit, long, 0444);
542module_param(qhimark, long, 0444); 525module_param(qhimark, long, 0444);
@@ -559,10 +542,7 @@ module_param(jiffies_till_sched_qs, ulong, 0644);
559 542
560static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 543static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
561 struct rcu_data *rdp); 544 struct rcu_data *rdp);
562static void force_qs_rnp(struct rcu_state *rsp, 545static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp));
563 int (*f)(struct rcu_data *rsp, bool *isidle,
564 unsigned long *maxj),
565 bool *isidle, unsigned long *maxj);
566static void force_quiescent_state(struct rcu_state *rsp); 546static void force_quiescent_state(struct rcu_state *rsp);
567static int rcu_pending(void); 547static int rcu_pending(void);
568 548
@@ -757,6 +737,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
757 int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; 737 int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
758 int *fp = &rnp->need_future_gp[idx]; 738 int *fp = &rnp->need_future_gp[idx];
759 739
740 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!");
760 return READ_ONCE(*fp); 741 return READ_ONCE(*fp);
761} 742}
762 743
@@ -768,6 +749,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
768static bool 749static bool
769cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 750cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
770{ 751{
752 RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!");
771 if (rcu_gp_in_progress(rsp)) 753 if (rcu_gp_in_progress(rsp))
772 return false; /* No, a grace period is already in progress. */ 754 return false; /* No, a grace period is already in progress. */
773 if (rcu_future_needs_gp(rsp)) 755 if (rcu_future_needs_gp(rsp))
@@ -794,6 +776,7 @@ static void rcu_eqs_enter_common(bool user)
794 struct rcu_data *rdp; 776 struct rcu_data *rdp;
795 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 777 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
796 778
779 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!");
797 trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); 780 trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
798 if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 781 if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
799 !user && !is_idle_task(current)) { 782 !user && !is_idle_task(current)) {
@@ -864,7 +847,6 @@ void rcu_idle_enter(void)
864 847
865 local_irq_save(flags); 848 local_irq_save(flags);
866 rcu_eqs_enter(false); 849 rcu_eqs_enter(false);
867 rcu_sysidle_enter(0);
868 local_irq_restore(flags); 850 local_irq_restore(flags);
869} 851}
870EXPORT_SYMBOL_GPL(rcu_idle_enter); 852EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -914,7 +896,6 @@ void rcu_irq_exit(void)
914 trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); 896 trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1);
915 rdtp->dynticks_nesting--; 897 rdtp->dynticks_nesting--;
916 } 898 }
917 rcu_sysidle_enter(1);
918} 899}
919 900
920/* 901/*
@@ -967,6 +948,7 @@ static void rcu_eqs_exit(bool user)
967 struct rcu_dynticks *rdtp; 948 struct rcu_dynticks *rdtp;
968 long long oldval; 949 long long oldval;
969 950
951 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!");
970 rdtp = this_cpu_ptr(&rcu_dynticks); 952 rdtp = this_cpu_ptr(&rcu_dynticks);
971 oldval = rdtp->dynticks_nesting; 953 oldval = rdtp->dynticks_nesting;
972 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); 954 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
@@ -995,7 +977,6 @@ void rcu_idle_exit(void)
995 977
996 local_irq_save(flags); 978 local_irq_save(flags);
997 rcu_eqs_exit(false); 979 rcu_eqs_exit(false);
998 rcu_sysidle_exit(0);
999 local_irq_restore(flags); 980 local_irq_restore(flags);
1000} 981}
1001EXPORT_SYMBOL_GPL(rcu_idle_exit); 982EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -1047,7 +1028,6 @@ void rcu_irq_enter(void)
1047 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); 1028 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
1048 else 1029 else
1049 rcu_eqs_exit_common(oldval, true); 1030 rcu_eqs_exit_common(oldval, true);
1050 rcu_sysidle_exit(1);
1051} 1031}
1052 1032
1053/* 1033/*
@@ -1130,22 +1110,11 @@ void rcu_nmi_exit(void)
1130} 1110}
1131 1111
1132/** 1112/**
1133 * __rcu_is_watching - are RCU read-side critical sections safe?
1134 *
1135 * Return true if RCU is watching the running CPU, which means that
1136 * this CPU can safely enter RCU read-side critical sections. Unlike
1137 * rcu_is_watching(), the caller of __rcu_is_watching() must have at
1138 * least disabled preemption.
1139 */
1140bool notrace __rcu_is_watching(void)
1141{
1142 return !rcu_dynticks_curr_cpu_in_eqs();
1143}
1144
1145/**
1146 * rcu_is_watching - see if RCU thinks that the current CPU is idle 1113 * rcu_is_watching - see if RCU thinks that the current CPU is idle
1147 * 1114 *
1148 * If the current CPU is in its idle loop and is neither in an interrupt 1115 * Return true if RCU is watching the running CPU, which means that this
1116 * CPU can safely enter RCU read-side critical sections. In other words,
1117 * if the current CPU is in its idle loop and is neither in an interrupt
1149 * or NMI handler, return true. 1118 * or NMI handler, return true.
1150 */ 1119 */
1151bool notrace rcu_is_watching(void) 1120bool notrace rcu_is_watching(void)
@@ -1153,7 +1122,7 @@ bool notrace rcu_is_watching(void)
1153 bool ret; 1122 bool ret;
1154 1123
1155 preempt_disable_notrace(); 1124 preempt_disable_notrace();
1156 ret = __rcu_is_watching(); 1125 ret = !rcu_dynticks_curr_cpu_in_eqs();
1157 preempt_enable_notrace(); 1126 preempt_enable_notrace();
1158 return ret; 1127 return ret;
1159} 1128}
@@ -1237,11 +1206,9 @@ static int rcu_is_cpu_rrupt_from_idle(void)
1237 * credit them with an implicit quiescent state. Return 1 if this CPU 1206 * credit them with an implicit quiescent state. Return 1 if this CPU
1238 * is in dynticks idle mode, which is an extended quiescent state. 1207 * is in dynticks idle mode, which is an extended quiescent state.
1239 */ 1208 */
1240static int dyntick_save_progress_counter(struct rcu_data *rdp, 1209static int dyntick_save_progress_counter(struct rcu_data *rdp)
1241 bool *isidle, unsigned long *maxj)
1242{ 1210{
1243 rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); 1211 rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
1244 rcu_sysidle_check_cpu(rdp, isidle, maxj);
1245 if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { 1212 if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
1246 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); 1213 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
1247 if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, 1214 if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
@@ -1258,8 +1225,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
1258 * idle state since the last call to dyntick_save_progress_counter() 1225 * idle state since the last call to dyntick_save_progress_counter()
1259 * for this same CPU, or by virtue of having been offline. 1226 * for this same CPU, or by virtue of having been offline.
1260 */ 1227 */
1261static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, 1228static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
1262 bool *isidle, unsigned long *maxj)
1263{ 1229{
1264 unsigned long jtsq; 1230 unsigned long jtsq;
1265 bool *rnhqp; 1231 bool *rnhqp;
@@ -1674,6 +1640,8 @@ void rcu_cpu_stall_reset(void)
1674static unsigned long rcu_cbs_completed(struct rcu_state *rsp, 1640static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1675 struct rcu_node *rnp) 1641 struct rcu_node *rnp)
1676{ 1642{
1643 lockdep_assert_held(&rnp->lock);
1644
1677 /* 1645 /*
1678 * If RCU is idle, we just wait for the next grace period. 1646 * If RCU is idle, we just wait for the next grace period.
1679 * But we can only be sure that RCU is idle if we are looking 1647 * But we can only be sure that RCU is idle if we are looking
@@ -1719,6 +1687,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1719 bool ret = false; 1687 bool ret = false;
1720 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); 1688 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1721 1689
1690 lockdep_assert_held(&rnp->lock);
1691
1722 /* 1692 /*
1723 * Pick up grace-period number for new callbacks. If this 1693 * Pick up grace-period number for new callbacks. If this
1724 * grace period is already marked as needed, return to the caller. 1694 * grace period is already marked as needed, return to the caller.
@@ -1845,6 +1815,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1845{ 1815{
1846 bool ret = false; 1816 bool ret = false;
1847 1817
1818 lockdep_assert_held(&rnp->lock);
1819
1848 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ 1820 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1849 if (!rcu_segcblist_pend_cbs(&rdp->cblist)) 1821 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1850 return false; 1822 return false;
@@ -1883,6 +1855,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1883static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1855static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1884 struct rcu_data *rdp) 1856 struct rcu_data *rdp)
1885{ 1857{
1858 lockdep_assert_held(&rnp->lock);
1859
1886 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ 1860 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1887 if (!rcu_segcblist_pend_cbs(&rdp->cblist)) 1861 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1888 return false; 1862 return false;
@@ -1909,6 +1883,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1909 bool ret; 1883 bool ret;
1910 bool need_gp; 1884 bool need_gp;
1911 1885
1886 lockdep_assert_held(&rnp->lock);
1887
1912 /* Handle the ends of any preceding grace periods first. */ 1888 /* Handle the ends of any preceding grace periods first. */
1913 if (rdp->completed == rnp->completed && 1889 if (rdp->completed == rnp->completed &&
1914 !unlikely(READ_ONCE(rdp->gpwrap))) { 1890 !unlikely(READ_ONCE(rdp->gpwrap))) {
@@ -2115,25 +2091,16 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
2115 */ 2091 */
2116static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) 2092static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
2117{ 2093{
2118 bool isidle = false;
2119 unsigned long maxj;
2120 struct rcu_node *rnp = rcu_get_root(rsp); 2094 struct rcu_node *rnp = rcu_get_root(rsp);
2121 2095
2122 WRITE_ONCE(rsp->gp_activity, jiffies); 2096 WRITE_ONCE(rsp->gp_activity, jiffies);
2123 rsp->n_force_qs++; 2097 rsp->n_force_qs++;
2124 if (first_time) { 2098 if (first_time) {
2125 /* Collect dyntick-idle snapshots. */ 2099 /* Collect dyntick-idle snapshots. */
2126 if (is_sysidle_rcu_state(rsp)) { 2100 force_qs_rnp(rsp, dyntick_save_progress_counter);
2127 isidle = true;
2128 maxj = jiffies - ULONG_MAX / 4;
2129 }
2130 force_qs_rnp(rsp, dyntick_save_progress_counter,
2131 &isidle, &maxj);
2132 rcu_sysidle_report_gp(rsp, isidle, maxj);
2133 } else { 2101 } else {
2134 /* Handle dyntick-idle and offline CPUs. */ 2102 /* Handle dyntick-idle and offline CPUs. */
2135 isidle = true; 2103 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
2136 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
2137 } 2104 }
2138 /* Clear flag to prevent immediate re-entry. */ 2105 /* Clear flag to prevent immediate re-entry. */
2139 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2106 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
@@ -2341,6 +2308,7 @@ static bool
2341rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 2308rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
2342 struct rcu_data *rdp) 2309 struct rcu_data *rdp)
2343{ 2310{
2311 lockdep_assert_held(&rnp->lock);
2344 if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { 2312 if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
2345 /* 2313 /*
2346 * Either we have not yet spawned the grace-period 2314 * Either we have not yet spawned the grace-period
@@ -2402,6 +2370,7 @@ static bool rcu_start_gp(struct rcu_state *rsp)
2402static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 2370static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
2403 __releases(rcu_get_root(rsp)->lock) 2371 __releases(rcu_get_root(rsp)->lock)
2404{ 2372{
2373 lockdep_assert_held(&rcu_get_root(rsp)->lock);
2405 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 2374 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
2406 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2375 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
2407 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); 2376 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
@@ -2426,6 +2395,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2426 unsigned long oldmask = 0; 2395 unsigned long oldmask = 0;
2427 struct rcu_node *rnp_c; 2396 struct rcu_node *rnp_c;
2428 2397
2398 lockdep_assert_held(&rnp->lock);
2399
2429 /* Walk up the rcu_node hierarchy. */ 2400 /* Walk up the rcu_node hierarchy. */
2430 for (;;) { 2401 for (;;) {
2431 if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { 2402 if (!(rnp->qsmask & mask) || rnp->gpnum != gps) {
@@ -2486,6 +2457,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
2486 unsigned long mask; 2457 unsigned long mask;
2487 struct rcu_node *rnp_p; 2458 struct rcu_node *rnp_p;
2488 2459
2460 lockdep_assert_held(&rnp->lock);
2489 if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || 2461 if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
2490 rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 2462 rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
2491 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2463 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2599,6 +2571,8 @@ static void
2599rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 2571rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2600 struct rcu_node *rnp, struct rcu_data *rdp) 2572 struct rcu_node *rnp, struct rcu_data *rdp)
2601{ 2573{
2574 lockdep_assert_held(&rsp->orphan_lock);
2575
2602 /* No-CBs CPUs do not have orphanable callbacks. */ 2576 /* No-CBs CPUs do not have orphanable callbacks. */
2603 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) 2577 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
2604 return; 2578 return;
@@ -2639,6 +2613,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2639{ 2613{
2640 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 2614 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
2641 2615
2616 lockdep_assert_held(&rsp->orphan_lock);
2617
2642 /* No-CBs CPUs are handled specially. */ 2618 /* No-CBs CPUs are handled specially. */
2643 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || 2619 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
2644 rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) 2620 rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
@@ -2705,6 +2681,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2705 long mask; 2681 long mask;
2706 struct rcu_node *rnp = rnp_leaf; 2682 struct rcu_node *rnp = rnp_leaf;
2707 2683
2684 lockdep_assert_held(&rnp->lock);
2708 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || 2685 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
2709 rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) 2686 rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
2710 return; 2687 return;
@@ -2895,10 +2872,7 @@ void rcu_check_callbacks(int user)
2895 * 2872 *
2896 * The caller must have suppressed start of new grace periods. 2873 * The caller must have suppressed start of new grace periods.
2897 */ 2874 */
2898static void force_qs_rnp(struct rcu_state *rsp, 2875static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp))
2899 int (*f)(struct rcu_data *rsp, bool *isidle,
2900 unsigned long *maxj),
2901 bool *isidle, unsigned long *maxj)
2902{ 2876{
2903 int cpu; 2877 int cpu;
2904 unsigned long flags; 2878 unsigned long flags;
@@ -2937,7 +2911,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2937 for_each_leaf_node_possible_cpu(rnp, cpu) { 2911 for_each_leaf_node_possible_cpu(rnp, cpu) {
2938 unsigned long bit = leaf_node_cpu_bit(rnp, cpu); 2912 unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
2939 if ((rnp->qsmask & bit) != 0) { 2913 if ((rnp->qsmask & bit) != 0) {
2940 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) 2914 if (f(per_cpu_ptr(rsp->rda, cpu)))
2941 mask |= bit; 2915 mask |= bit;
2942 } 2916 }
2943 } 2917 }
@@ -3143,9 +3117,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
3143 WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); 3117 WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
3144 3118
3145 if (debug_rcu_head_queue(head)) { 3119 if (debug_rcu_head_queue(head)) {
3146 /* Probable double call_rcu(), so leak the callback. */ 3120 /*
3121 * Probable double call_rcu(), so leak the callback.
3122 * Use rcu:rcu_callback trace event to find the previous
3123 * time callback was passed to __call_rcu().
3124 */
3125 WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n",
3126 head, head->func);
3147 WRITE_ONCE(head->func, rcu_leak_callback); 3127 WRITE_ONCE(head->func, rcu_leak_callback);
3148 WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
3149 return; 3128 return;
3150 } 3129 }
3151 head->func = func; 3130 head->func = func;
@@ -3194,8 +3173,24 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
3194 local_irq_restore(flags); 3173 local_irq_restore(flags);
3195} 3174}
3196 3175
3197/* 3176/**
3198 * Queue an RCU-sched callback for invocation after a grace period. 3177 * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
3178 * @head: structure to be used for queueing the RCU updates.
3179 * @func: actual callback function to be invoked after the grace period
3180 *
3181 * The callback function will be invoked some time after a full grace
3182 * period elapses, in other words after all currently executing RCU
3183 * read-side critical sections have completed. call_rcu_sched() assumes
3184 * that the read-side critical sections end on enabling of preemption
3185 * or on voluntary preemption.
3186 * RCU read-side critical sections are delimited by :
3187 * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
3188 * - anything that disables preemption.
3189 *
3190 * These may be nested.
3191 *
3192 * See the description of call_rcu() for more detailed information on
3193 * memory ordering guarantees.
3199 */ 3194 */
3200void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) 3195void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
3201{ 3196{
@@ -3203,8 +3198,26 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
3203} 3198}
3204EXPORT_SYMBOL_GPL(call_rcu_sched); 3199EXPORT_SYMBOL_GPL(call_rcu_sched);
3205 3200
3206/* 3201/**
3207 * Queue an RCU callback for invocation after a quicker grace period. 3202 * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
3203 * @head: structure to be used for queueing the RCU updates.
3204 * @func: actual callback function to be invoked after the grace period
3205 *
3206 * The callback function will be invoked some time after a full grace
3207 * period elapses, in other words after all currently executing RCU
3208 * read-side critical sections have completed. call_rcu_bh() assumes
3209 * that the read-side critical sections end on completion of a softirq
3210 * handler. This means that read-side critical sections in process
3211 * context must not be interrupted by softirqs. This interface is to be
3212 * used when most of the read-side critical sections are in softirq context.
3213 * RCU read-side critical sections are delimited by :
3214 * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
3215 * OR
3216 * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
3217 * These may be nested.
3218 *
3219 * See the description of call_rcu() for more detailed information on
3220 * memory ordering guarantees.
3208 */ 3221 */
3209void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) 3222void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
3210{ 3223{
@@ -3280,12 +3293,6 @@ static inline int rcu_blocking_is_gp(void)
3280 * to have executed a full memory barrier during the execution of 3293 * to have executed a full memory barrier during the execution of
3281 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but 3294 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
3282 * again only if the system has more than one CPU). 3295 * again only if the system has more than one CPU).
3283 *
3284 * This primitive provides the guarantees made by the (now removed)
3285 * synchronize_kernel() API. In contrast, synchronize_rcu() only
3286 * guarantees that rcu_read_lock() sections will have completed.
3287 * In "classic RCU", these two guarantees happen to be one and
3288 * the same, but can differ in realtime RCU implementations.
3289 */ 3296 */
3290void synchronize_sched(void) 3297void synchronize_sched(void)
3291{ 3298{
@@ -3578,8 +3585,14 @@ static void rcu_barrier_func(void *type)
3578 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 3585 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
3579 3586
3580 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); 3587 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
3581 atomic_inc(&rsp->barrier_cpu_count); 3588 rdp->barrier_head.func = rcu_barrier_callback;
3582 rsp->call(&rdp->barrier_head, rcu_barrier_callback); 3589 debug_rcu_head_queue(&rdp->barrier_head);
3590 if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
3591 atomic_inc(&rsp->barrier_cpu_count);
3592 } else {
3593 debug_rcu_head_unqueue(&rdp->barrier_head);
3594 _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence);
3595 }
3583} 3596}
3584 3597
3585/* 3598/*
@@ -3698,6 +3711,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
3698 long mask; 3711 long mask;
3699 struct rcu_node *rnp = rnp_leaf; 3712 struct rcu_node *rnp = rnp_leaf;
3700 3713
3714 lockdep_assert_held(&rnp->lock);
3701 for (;;) { 3715 for (;;) {
3702 mask = rnp->grpmask; 3716 mask = rnp->grpmask;
3703 rnp = rnp->parent; 3717 rnp = rnp->parent;
@@ -3753,7 +3767,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3753 !init_nocb_callback_list(rdp)) 3767 !init_nocb_callback_list(rdp))
3754 rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ 3768 rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
3755 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3769 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
3756 rcu_sysidle_init_percpu_data(rdp->dynticks);
3757 rcu_dynticks_eqs_online(); 3770 rcu_dynticks_eqs_online();
3758 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ 3771 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
3759 3772
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index ba38262c3554..9af0f31d6847 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -45,14 +45,6 @@ struct rcu_dynticks {
45 bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ 45 bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
46 unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ 46 unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
47 bool rcu_urgent_qs; /* GP old need light quiescent state. */ 47 bool rcu_urgent_qs; /* GP old need light quiescent state. */
48#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
49 long long dynticks_idle_nesting;
50 /* irq/process nesting level from idle. */
51 atomic_t dynticks_idle; /* Even value for idle, else odd. */
52 /* "Idle" excludes userspace execution. */
53 unsigned long dynticks_idle_jiffies;
54 /* End of last non-NMI non-idle period. */
55#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
56#ifdef CONFIG_RCU_FAST_NO_HZ 48#ifdef CONFIG_RCU_FAST_NO_HZ
57 bool all_lazy; /* Are all CPU's CBs lazy? */ 49 bool all_lazy; /* Are all CPU's CBs lazy? */
58 unsigned long nonlazy_posted; 50 unsigned long nonlazy_posted;
@@ -160,19 +152,6 @@ struct rcu_node {
160 /* Number of tasks boosted for expedited GP. */ 152 /* Number of tasks boosted for expedited GP. */
161 unsigned long n_normal_boosts; 153 unsigned long n_normal_boosts;
162 /* Number of tasks boosted for normal GP. */ 154 /* Number of tasks boosted for normal GP. */
163 unsigned long n_balk_blkd_tasks;
164 /* Refused to boost: no blocked tasks. */
165 unsigned long n_balk_exp_gp_tasks;
166 /* Refused to boost: nothing blocking GP. */
167 unsigned long n_balk_boost_tasks;
168 /* Refused to boost: already boosting. */
169 unsigned long n_balk_notblocked;
170 /* Refused to boost: RCU RS CS still running. */
171 unsigned long n_balk_notyet;
172 /* Refused to boost: not yet time. */
173 unsigned long n_balk_nos;
174 /* Refused to boost: not sure why, though. */
175 /* This can happen due to race conditions. */
176#ifdef CONFIG_RCU_NOCB_CPU 155#ifdef CONFIG_RCU_NOCB_CPU
177 struct swait_queue_head nocb_gp_wq[2]; 156 struct swait_queue_head nocb_gp_wq[2];
178 /* Place for rcu_nocb_kthread() to wait GP. */ 157 /* Place for rcu_nocb_kthread() to wait GP. */
@@ -312,9 +291,9 @@ struct rcu_data {
312}; 291};
313 292
314/* Values for nocb_defer_wakeup field in struct rcu_data. */ 293/* Values for nocb_defer_wakeup field in struct rcu_data. */
315#define RCU_NOGP_WAKE_NOT 0 294#define RCU_NOCB_WAKE_NOT 0
316#define RCU_NOGP_WAKE 1 295#define RCU_NOCB_WAKE 1
317#define RCU_NOGP_WAKE_FORCE 2 296#define RCU_NOCB_WAKE_FORCE 2
318 297
319#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) 298#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
320 /* For jiffies_till_first_fqs and */ 299 /* For jiffies_till_first_fqs and */
@@ -477,7 +456,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
477 456
478/* Forward declarations for rcutree_plugin.h */ 457/* Forward declarations for rcutree_plugin.h */
479static void rcu_bootup_announce(void); 458static void rcu_bootup_announce(void);
480static void rcu_preempt_note_context_switch(void); 459static void rcu_preempt_note_context_switch(bool preempt);
481static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 460static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
482#ifdef CONFIG_HOTPLUG_CPU 461#ifdef CONFIG_HOTPLUG_CPU
483static bool rcu_preempt_has_tasks(struct rcu_node *rnp); 462static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
@@ -529,15 +508,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
529#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 508#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
530static void __maybe_unused rcu_kick_nohz_cpu(int cpu); 509static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
531static bool init_nocb_callback_list(struct rcu_data *rdp); 510static bool init_nocb_callback_list(struct rcu_data *rdp);
532static void rcu_sysidle_enter(int irq);
533static void rcu_sysidle_exit(int irq);
534static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
535 unsigned long *maxj);
536static bool is_sysidle_rcu_state(struct rcu_state *rsp);
537static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
538 unsigned long maxj);
539static void rcu_bind_gp_kthread(void); 511static void rcu_bind_gp_kthread(void);
540static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
541static bool rcu_nohz_full_cpu(struct rcu_state *rsp); 512static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
542static void rcu_dynticks_task_enter(void); 513static void rcu_dynticks_task_enter(void);
543static void rcu_dynticks_task_exit(void); 514static void rcu_dynticks_task_exit(void);
@@ -551,75 +522,3 @@ void srcu_offline_cpu(unsigned int cpu) { }
551#endif /* #else #ifdef CONFIG_SRCU */ 522#endif /* #else #ifdef CONFIG_SRCU */
552 523
553#endif /* #ifndef RCU_TREE_NONCORE */ 524#endif /* #ifndef RCU_TREE_NONCORE */
554
555#ifdef CONFIG_RCU_TRACE
556/* Read out queue lengths for tracing. */
557static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
558{
559#ifdef CONFIG_RCU_NOCB_CPU
560 *ql = atomic_long_read(&rdp->nocb_q_count);
561 *qll = atomic_long_read(&rdp->nocb_q_count_lazy);
562#else /* #ifdef CONFIG_RCU_NOCB_CPU */
563 *ql = 0;
564 *qll = 0;
565#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
566}
567#endif /* #ifdef CONFIG_RCU_TRACE */
568
569/*
570 * Wrappers for the rcu_node::lock acquire and release.
571 *
572 * Because the rcu_nodes form a tree, the tree traversal locking will observe
573 * different lock values, this in turn means that an UNLOCK of one level
574 * followed by a LOCK of another level does not imply a full memory barrier;
575 * and most importantly transitivity is lost.
576 *
577 * In order to restore full ordering between tree levels, augment the regular
578 * lock acquire functions with smp_mb__after_unlock_lock().
579 *
580 * As ->lock of struct rcu_node is a __private field, therefore one should use
581 * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock.
582 */
583static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
584{
585 raw_spin_lock(&ACCESS_PRIVATE(rnp, lock));
586 smp_mb__after_unlock_lock();
587}
588
589static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp)
590{
591 raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock));
592}
593
594static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
595{
596 raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock));
597 smp_mb__after_unlock_lock();
598}
599
600static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp)
601{
602 raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock));
603}
604
605#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \
606do { \
607 typecheck(unsigned long, flags); \
608 raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \
609 smp_mb__after_unlock_lock(); \
610} while (0)
611
612#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \
613do { \
614 typecheck(unsigned long, flags); \
615 raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \
616} while (0)
617
618static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
619{
620 bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock));
621
622 if (locked)
623 smp_mb__after_unlock_lock();
624 return locked;
625}
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index e513b4ab1197..dd21ca47e4b4 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -147,7 +147,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
147 * 147 *
148 * Caller must hold the rcu_state's exp_mutex. 148 * Caller must hold the rcu_state's exp_mutex.
149 */ 149 */
150static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) 150static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
151{ 151{
152 return rnp->exp_tasks == NULL && 152 return rnp->exp_tasks == NULL &&
153 READ_ONCE(rnp->expmask) == 0; 153 READ_ONCE(rnp->expmask) == 0;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c9a48657512a..908b309d60d7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -70,7 +70,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
70static void __init rcu_bootup_announce_oddness(void) 70static void __init rcu_bootup_announce_oddness(void)
71{ 71{
72 if (IS_ENABLED(CONFIG_RCU_TRACE)) 72 if (IS_ENABLED(CONFIG_RCU_TRACE))
73 pr_info("\tRCU debugfs-based tracing is enabled.\n"); 73 pr_info("\tRCU event tracing is enabled.\n");
74 if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || 74 if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
75 (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) 75 (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
76 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", 76 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
@@ -90,8 +90,32 @@ static void __init rcu_bootup_announce_oddness(void)
90 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 90 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
91 if (nr_cpu_ids != NR_CPUS) 91 if (nr_cpu_ids != NR_CPUS)
92 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 92 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
93 if (IS_ENABLED(CONFIG_RCU_BOOST)) 93#ifdef CONFIG_RCU_BOOST
94 pr_info("\tRCU kthread priority: %d.\n", kthread_prio); 94 pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY);
95#endif
96 if (blimit != DEFAULT_RCU_BLIMIT)
97 pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
98 if (qhimark != DEFAULT_RCU_QHIMARK)
99 pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
100 if (qlowmark != DEFAULT_RCU_QLOMARK)
101 pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
102 if (jiffies_till_first_fqs != ULONG_MAX)
103 pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
104 if (jiffies_till_next_fqs != ULONG_MAX)
105 pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
106 if (rcu_kick_kthreads)
107 pr_info("\tKick kthreads if too-long grace period.\n");
108 if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
109 pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
110 if (gp_preinit_delay)
111 pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
112 if (gp_init_delay)
113 pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
114 if (gp_cleanup_delay)
115 pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
116 if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
117 pr_info("\tRCU debug extended QS entry/exit.\n");
118 rcupdate_announce_bootup_oddness();
95} 119}
96 120
97#ifdef CONFIG_PREEMPT_RCU 121#ifdef CONFIG_PREEMPT_RCU
@@ -155,6 +179,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
155 (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); 179 (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
156 struct task_struct *t = current; 180 struct task_struct *t = current;
157 181
182 lockdep_assert_held(&rnp->lock);
183
158 /* 184 /*
159 * Decide where to queue the newly blocked task. In theory, 185 * Decide where to queue the newly blocked task. In theory,
160 * this could be an if-statement. In practice, when I tried 186 * this could be an if-statement. In practice, when I tried
@@ -263,6 +289,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
263 */ 289 */
264static void rcu_preempt_qs(void) 290static void rcu_preempt_qs(void)
265{ 291{
292 RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n");
266 if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { 293 if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) {
267 trace_rcu_grace_period(TPS("rcu_preempt"), 294 trace_rcu_grace_period(TPS("rcu_preempt"),
268 __this_cpu_read(rcu_data_p->gpnum), 295 __this_cpu_read(rcu_data_p->gpnum),
@@ -286,12 +313,14 @@ static void rcu_preempt_qs(void)
286 * 313 *
287 * Caller must disable interrupts. 314 * Caller must disable interrupts.
288 */ 315 */
289static void rcu_preempt_note_context_switch(void) 316static void rcu_preempt_note_context_switch(bool preempt)
290{ 317{
291 struct task_struct *t = current; 318 struct task_struct *t = current;
292 struct rcu_data *rdp; 319 struct rcu_data *rdp;
293 struct rcu_node *rnp; 320 struct rcu_node *rnp;
294 321
322 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
323 WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
295 if (t->rcu_read_lock_nesting > 0 && 324 if (t->rcu_read_lock_nesting > 0 &&
296 !t->rcu_read_unlock_special.b.blocked) { 325 !t->rcu_read_unlock_special.b.blocked) {
297 326
@@ -607,6 +636,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
607 */ 636 */
608static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 637static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
609{ 638{
639 RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
610 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); 640 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
611 if (rcu_preempt_has_tasks(rnp)) 641 if (rcu_preempt_has_tasks(rnp))
612 rnp->gp_tasks = rnp->blkd_tasks.next; 642 rnp->gp_tasks = rnp->blkd_tasks.next;
@@ -643,8 +673,37 @@ static void rcu_preempt_do_callbacks(void)
643 673
644#endif /* #ifdef CONFIG_RCU_BOOST */ 674#endif /* #ifdef CONFIG_RCU_BOOST */
645 675
646/* 676/**
647 * Queue a preemptible-RCU callback for invocation after a grace period. 677 * call_rcu() - Queue an RCU callback for invocation after a grace period.
678 * @head: structure to be used for queueing the RCU updates.
679 * @func: actual callback function to be invoked after the grace period
680 *
681 * The callback function will be invoked some time after a full grace
682 * period elapses, in other words after all pre-existing RCU read-side
683 * critical sections have completed. However, the callback function
684 * might well execute concurrently with RCU read-side critical sections
685 * that started after call_rcu() was invoked. RCU read-side critical
686 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
687 * and may be nested.
688 *
689 * Note that all CPUs must agree that the grace period extended beyond
690 * all pre-existing RCU read-side critical section. On systems with more
691 * than one CPU, this means that when "func()" is invoked, each CPU is
692 * guaranteed to have executed a full memory barrier since the end of its
693 * last RCU read-side critical section whose beginning preceded the call
694 * to call_rcu(). It also means that each CPU executing an RCU read-side
695 * critical section that continues beyond the start of "func()" must have
696 * executed a memory barrier after the call_rcu() but before the beginning
697 * of that RCU read-side critical section. Note that these guarantees
698 * include CPUs that are offline, idle, or executing in user mode, as
699 * well as CPUs that are executing in the kernel.
700 *
701 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
702 * resulting RCU callback function "func()", then both CPU A and CPU B are
703 * guaranteed to execute a full memory barrier during the time interval
704 * between the call to call_rcu() and the invocation of "func()" -- even
705 * if CPU A and CPU B are the same CPU (but again only if the system has
706 * more than one CPU).
648 */ 707 */
649void call_rcu(struct rcu_head *head, rcu_callback_t func) 708void call_rcu(struct rcu_head *head, rcu_callback_t func)
650{ 709{
@@ -663,8 +722,13 @@ EXPORT_SYMBOL_GPL(call_rcu);
663 * synchronize_rcu() was waiting. RCU read-side critical sections are 722 * synchronize_rcu() was waiting. RCU read-side critical sections are
664 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. 723 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
665 * 724 *
666 * See the description of synchronize_sched() for more detailed information 725 * See the description of synchronize_sched() for more detailed
667 * on memory ordering guarantees. 726 * information on memory-ordering guarantees. However, please note
727 * that -only- the memory-ordering guarantees apply. For example,
728 * synchronize_rcu() is -not- guaranteed to wait on things like code
729 * protected by preempt_disable(), instead, synchronize_rcu() is -only-
730 * guaranteed to wait on RCU read-side critical sections, that is, sections
731 * of code protected by rcu_read_lock().
668 */ 732 */
669void synchronize_rcu(void) 733void synchronize_rcu(void)
670{ 734{
@@ -738,7 +802,7 @@ static void __init rcu_bootup_announce(void)
738 * Because preemptible RCU does not exist, we never have to check for 802 * Because preemptible RCU does not exist, we never have to check for
739 * CPUs being in quiescent states. 803 * CPUs being in quiescent states.
740 */ 804 */
741static void rcu_preempt_note_context_switch(void) 805static void rcu_preempt_note_context_switch(bool preempt)
742{ 806{
743} 807}
744 808
@@ -835,33 +899,6 @@ void exit_rcu(void)
835 899
836#include "../locking/rtmutex_common.h" 900#include "../locking/rtmutex_common.h"
837 901
838#ifdef CONFIG_RCU_TRACE
839
840static void rcu_initiate_boost_trace(struct rcu_node *rnp)
841{
842 if (!rcu_preempt_has_tasks(rnp))
843 rnp->n_balk_blkd_tasks++;
844 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
845 rnp->n_balk_exp_gp_tasks++;
846 else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
847 rnp->n_balk_boost_tasks++;
848 else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
849 rnp->n_balk_notblocked++;
850 else if (rnp->gp_tasks != NULL &&
851 ULONG_CMP_LT(jiffies, rnp->boost_time))
852 rnp->n_balk_notyet++;
853 else
854 rnp->n_balk_nos++;
855}
856
857#else /* #ifdef CONFIG_RCU_TRACE */
858
859static void rcu_initiate_boost_trace(struct rcu_node *rnp)
860{
861}
862
863#endif /* #else #ifdef CONFIG_RCU_TRACE */
864
865static void rcu_wake_cond(struct task_struct *t, int status) 902static void rcu_wake_cond(struct task_struct *t, int status)
866{ 903{
867 /* 904 /*
@@ -992,8 +1029,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
992{ 1029{
993 struct task_struct *t; 1030 struct task_struct *t;
994 1031
1032 lockdep_assert_held(&rnp->lock);
995 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { 1033 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
996 rnp->n_balk_exp_gp_tasks++;
997 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1034 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
998 return; 1035 return;
999 } 1036 }
@@ -1009,7 +1046,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1009 if (t) 1046 if (t)
1010 rcu_wake_cond(t, rnp->boost_kthread_status); 1047 rcu_wake_cond(t, rnp->boost_kthread_status);
1011 } else { 1048 } else {
1012 rcu_initiate_boost_trace(rnp);
1013 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1049 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1014 } 1050 }
1015} 1051}
@@ -1260,8 +1296,7 @@ static void rcu_prepare_kthreads(int cpu)
1260int rcu_needs_cpu(u64 basemono, u64 *nextevt) 1296int rcu_needs_cpu(u64 basemono, u64 *nextevt)
1261{ 1297{
1262 *nextevt = KTIME_MAX; 1298 *nextevt = KTIME_MAX;
1263 return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) 1299 return rcu_cpu_has_callbacks(NULL);
1264 ? 0 : rcu_cpu_has_callbacks(NULL);
1265} 1300}
1266 1301
1267/* 1302/*
@@ -1372,10 +1407,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
1372 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 1407 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1373 unsigned long dj; 1408 unsigned long dj;
1374 1409
1375 if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) { 1410 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!");
1376 *nextevt = KTIME_MAX;
1377 return 0;
1378 }
1379 1411
1380 /* Snapshot to detect later posting of non-lazy callback. */ 1412 /* Snapshot to detect later posting of non-lazy callback. */
1381 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; 1413 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
@@ -1424,8 +1456,8 @@ static void rcu_prepare_for_idle(void)
1424 struct rcu_state *rsp; 1456 struct rcu_state *rsp;
1425 int tne; 1457 int tne;
1426 1458
1427 if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || 1459 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!");
1428 rcu_is_nocb_cpu(smp_processor_id())) 1460 if (rcu_is_nocb_cpu(smp_processor_id()))
1429 return; 1461 return;
1430 1462
1431 /* Handle nohz enablement switches conservatively. */ 1463 /* Handle nohz enablement switches conservatively. */
@@ -1479,8 +1511,8 @@ static void rcu_prepare_for_idle(void)
1479 */ 1511 */
1480static void rcu_cleanup_after_idle(void) 1512static void rcu_cleanup_after_idle(void)
1481{ 1513{
1482 if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || 1514 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!");
1483 rcu_is_nocb_cpu(smp_processor_id())) 1515 if (rcu_is_nocb_cpu(smp_processor_id()))
1484 return; 1516 return;
1485 if (rcu_try_advance_all_cbs()) 1517 if (rcu_try_advance_all_cbs())
1486 invoke_rcu_core(); 1518 invoke_rcu_core();
@@ -1747,7 +1779,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
1747 init_swait_queue_head(&rnp->nocb_gp_wq[1]); 1779 init_swait_queue_head(&rnp->nocb_gp_wq[1]);
1748} 1780}
1749 1781
1750#ifndef CONFIG_RCU_NOCB_CPU_ALL
1751/* Is the specified CPU a no-CBs CPU? */ 1782/* Is the specified CPU a no-CBs CPU? */
1752bool rcu_is_nocb_cpu(int cpu) 1783bool rcu_is_nocb_cpu(int cpu)
1753{ 1784{
@@ -1755,7 +1786,6 @@ bool rcu_is_nocb_cpu(int cpu)
1755 return cpumask_test_cpu(cpu, rcu_nocb_mask); 1786 return cpumask_test_cpu(cpu, rcu_nocb_mask);
1756 return false; 1787 return false;
1757} 1788}
1758#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1759 1789
1760/* 1790/*
1761 * Kick the leader kthread for this NOCB group. 1791 * Kick the leader kthread for this NOCB group.
@@ -1769,6 +1799,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
1769 if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { 1799 if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
1770 /* Prior smp_mb__after_atomic() orders against prior enqueue. */ 1800 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
1771 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); 1801 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
1802 smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
1772 swake_up(&rdp_leader->nocb_wq); 1803 swake_up(&rdp_leader->nocb_wq);
1773 } 1804 }
1774} 1805}
@@ -1860,7 +1891,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1860 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1891 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1861 TPS("WakeEmpty")); 1892 TPS("WakeEmpty"));
1862 } else { 1893 } else {
1863 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); 1894 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE);
1864 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ 1895 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
1865 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); 1896 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
1866 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1897 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
@@ -1874,7 +1905,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1874 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1905 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1875 TPS("WakeOvf")); 1906 TPS("WakeOvf"));
1876 } else { 1907 } else {
1877 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); 1908 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE);
1878 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ 1909 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
1879 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); 1910 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
1880 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1911 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
@@ -2023,6 +2054,7 @@ wait_again:
2023 * nocb_gp_head, where they await a grace period. 2054 * nocb_gp_head, where they await a grace period.
2024 */ 2055 */
2025 gotcbs = false; 2056 gotcbs = false;
2057 smp_mb(); /* wakeup before ->nocb_head reads. */
2026 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { 2058 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2027 rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); 2059 rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
2028 if (!rdp->nocb_gp_head) 2060 if (!rdp->nocb_gp_head)
@@ -2201,8 +2233,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2201 if (!rcu_nocb_need_deferred_wakeup(rdp)) 2233 if (!rcu_nocb_need_deferred_wakeup(rdp))
2202 return; 2234 return;
2203 ndw = READ_ONCE(rdp->nocb_defer_wakeup); 2235 ndw = READ_ONCE(rdp->nocb_defer_wakeup);
2204 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT); 2236 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
2205 wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); 2237 wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE);
2206 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); 2238 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
2207} 2239}
2208 2240
@@ -2212,10 +2244,6 @@ void __init rcu_init_nohz(void)
2212 bool need_rcu_nocb_mask = true; 2244 bool need_rcu_nocb_mask = true;
2213 struct rcu_state *rsp; 2245 struct rcu_state *rsp;
2214 2246
2215#ifdef CONFIG_RCU_NOCB_CPU_NONE
2216 need_rcu_nocb_mask = false;
2217#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
2218
2219#if defined(CONFIG_NO_HZ_FULL) 2247#if defined(CONFIG_NO_HZ_FULL)
2220 if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) 2248 if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
2221 need_rcu_nocb_mask = true; 2249 need_rcu_nocb_mask = true;
@@ -2231,14 +2259,6 @@ void __init rcu_init_nohz(void)
2231 if (!have_rcu_nocb_mask) 2259 if (!have_rcu_nocb_mask)
2232 return; 2260 return;
2233 2261
2234#ifdef CONFIG_RCU_NOCB_CPU_ZERO
2235 pr_info("\tOffload RCU callbacks from CPU 0\n");
2236 cpumask_set_cpu(0, rcu_nocb_mask);
2237#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
2238#ifdef CONFIG_RCU_NOCB_CPU_ALL
2239 pr_info("\tOffload RCU callbacks from all CPUs\n");
2240 cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
2241#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
2242#if defined(CONFIG_NO_HZ_FULL) 2262#if defined(CONFIG_NO_HZ_FULL)
2243 if (tick_nohz_full_running) 2263 if (tick_nohz_full_running)
2244 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); 2264 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
@@ -2491,421 +2511,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
2491#endif /* #ifdef CONFIG_NO_HZ_FULL */ 2511#endif /* #ifdef CONFIG_NO_HZ_FULL */
2492} 2512}
2493 2513
2494
2495#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
2496
2497static int full_sysidle_state; /* Current system-idle state. */
2498#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
2499#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
2500#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
2501#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
2502#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
2503
2504/*
2505 * Invoked to note exit from irq or task transition to idle. Note that
2506 * usermode execution does -not- count as idle here! After all, we want
2507 * to detect full-system idle states, not RCU quiescent states and grace
2508 * periods. The caller must have disabled interrupts.
2509 */
2510static void rcu_sysidle_enter(int irq)
2511{
2512 unsigned long j;
2513 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
2514
2515 /* If there are no nohz_full= CPUs, no need to track this. */
2516 if (!tick_nohz_full_enabled())
2517 return;
2518
2519 /* Adjust nesting, check for fully idle. */
2520 if (irq) {
2521 rdtp->dynticks_idle_nesting--;
2522 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2523 if (rdtp->dynticks_idle_nesting != 0)
2524 return; /* Still not fully idle. */
2525 } else {
2526 if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
2527 DYNTICK_TASK_NEST_VALUE) {
2528 rdtp->dynticks_idle_nesting = 0;
2529 } else {
2530 rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
2531 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2532 return; /* Still not fully idle. */
2533 }
2534 }
2535
2536 /* Record start of fully idle period. */
2537 j = jiffies;
2538 WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
2539 smp_mb__before_atomic();
2540 atomic_inc(&rdtp->dynticks_idle);
2541 smp_mb__after_atomic();
2542 WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
2543}
2544
2545/*
2546 * Unconditionally force exit from full system-idle state. This is
2547 * invoked when a normal CPU exits idle, but must be called separately
2548 * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
2549 * is that the timekeeping CPU is permitted to take scheduling-clock
2550 * interrupts while the system is in system-idle state, and of course
2551 * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
2552 * interrupt from any other type of interrupt.
2553 */
2554void rcu_sysidle_force_exit(void)
2555{
2556 int oldstate = READ_ONCE(full_sysidle_state);
2557 int newoldstate;
2558
2559 /*
2560 * Each pass through the following loop attempts to exit full
2561 * system-idle state. If contention proves to be a problem,
2562 * a trylock-based contention tree could be used here.
2563 */
2564 while (oldstate > RCU_SYSIDLE_SHORT) {
2565 newoldstate = cmpxchg(&full_sysidle_state,
2566 oldstate, RCU_SYSIDLE_NOT);
2567 if (oldstate == newoldstate &&
2568 oldstate == RCU_SYSIDLE_FULL_NOTED) {
2569 rcu_kick_nohz_cpu(tick_do_timer_cpu);
2570 return; /* We cleared it, done! */
2571 }
2572 oldstate = newoldstate;
2573 }
2574 smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
2575}
2576
2577/*
2578 * Invoked to note entry to irq or task transition from idle. Note that
2579 * usermode execution does -not- count as idle here! The caller must
2580 * have disabled interrupts.
2581 */
2582static void rcu_sysidle_exit(int irq)
2583{
2584 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
2585
2586 /* If there are no nohz_full= CPUs, no need to track this. */
2587 if (!tick_nohz_full_enabled())
2588 return;
2589
2590 /* Adjust nesting, check for already non-idle. */
2591 if (irq) {
2592 rdtp->dynticks_idle_nesting++;
2593 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2594 if (rdtp->dynticks_idle_nesting != 1)
2595 return; /* Already non-idle. */
2596 } else {
2597 /*
2598 * Allow for irq misnesting. Yes, it really is possible
2599 * to enter an irq handler then never leave it, and maybe
2600 * also vice versa. Handle both possibilities.
2601 */
2602 if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
2603 rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
2604 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2605 return; /* Already non-idle. */
2606 } else {
2607 rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
2608 }
2609 }
2610
2611 /* Record end of idle period. */
2612 smp_mb__before_atomic();
2613 atomic_inc(&rdtp->dynticks_idle);
2614 smp_mb__after_atomic();
2615 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
2616
2617 /*
2618 * If we are the timekeeping CPU, we are permitted to be non-idle
2619 * during a system-idle state. This must be the case, because
2620 * the timekeeping CPU has to take scheduling-clock interrupts
2621 * during the time that the system is transitioning to full
2622 * system-idle state. This means that the timekeeping CPU must
2623 * invoke rcu_sysidle_force_exit() directly if it does anything
2624 * more than take a scheduling-clock interrupt.
2625 */
2626 if (smp_processor_id() == tick_do_timer_cpu)
2627 return;
2628
2629 /* Update system-idle state: We are clearly no longer fully idle! */
2630 rcu_sysidle_force_exit();
2631}
2632
2633/*
2634 * Check to see if the current CPU is idle. Note that usermode execution
2635 * does not count as idle. The caller must have disabled interrupts,
2636 * and must be running on tick_do_timer_cpu.
2637 */
2638static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2639 unsigned long *maxj)
2640{
2641 int cur;
2642 unsigned long j;
2643 struct rcu_dynticks *rdtp = rdp->dynticks;
2644
2645 /* If there are no nohz_full= CPUs, don't check system-wide idleness. */
2646 if (!tick_nohz_full_enabled())
2647 return;
2648
2649 /*
2650 * If some other CPU has already reported non-idle, if this is
2651 * not the flavor of RCU that tracks sysidle state, or if this
2652 * is an offline or the timekeeping CPU, nothing to do.
2653 */
2654 if (!*isidle || rdp->rsp != rcu_state_p ||
2655 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2656 return;
2657 /* Verify affinity of current kthread. */
2658 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
2659
2660 /* Pick up current idle and NMI-nesting counter and check. */
2661 cur = atomic_read(&rdtp->dynticks_idle);
2662 if (cur & 0x1) {
2663 *isidle = false; /* We are not idle! */
2664 return;
2665 }
2666 smp_mb(); /* Read counters before timestamps. */
2667
2668 /* Pick up timestamps. */
2669 j = READ_ONCE(rdtp->dynticks_idle_jiffies);
2670 /* If this CPU entered idle more recently, update maxj timestamp. */
2671 if (ULONG_CMP_LT(*maxj, j))
2672 *maxj = j;
2673}
2674
2675/*
2676 * Is this the flavor of RCU that is handling full-system idle?
2677 */
2678static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2679{
2680 return rsp == rcu_state_p;
2681}
2682
2683/*
2684 * Return a delay in jiffies based on the number of CPUs, rcu_node
2685 * leaf fanout, and jiffies tick rate. The idea is to allow larger
2686 * systems more time to transition to full-idle state in order to
2687 * avoid the cache thrashing that otherwise occur on the state variable.
2688 * Really small systems (less than a couple of tens of CPUs) should
2689 * instead use a single global atomically incremented counter, and later
2690 * versions of this will automatically reconfigure themselves accordingly.
2691 */
2692static unsigned long rcu_sysidle_delay(void)
2693{
2694 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2695 return 0;
2696 return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
2697}
2698
2699/*
2700 * Advance the full-system-idle state. This is invoked when all of
2701 * the non-timekeeping CPUs are idle.
2702 */
2703static void rcu_sysidle(unsigned long j)
2704{
2705 /* Check the current state. */
2706 switch (READ_ONCE(full_sysidle_state)) {
2707 case RCU_SYSIDLE_NOT:
2708
2709 /* First time all are idle, so note a short idle period. */
2710 WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
2711 break;
2712
2713 case RCU_SYSIDLE_SHORT:
2714
2715 /*
2716 * Idle for a bit, time to advance to next state?
2717 * cmpxchg failure means race with non-idle, let them win.
2718 */
2719 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2720 (void)cmpxchg(&full_sysidle_state,
2721 RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
2722 break;
2723
2724 case RCU_SYSIDLE_LONG:
2725
2726 /*
2727 * Do an additional check pass before advancing to full.
2728 * cmpxchg failure means race with non-idle, let them win.
2729 */
2730 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2731 (void)cmpxchg(&full_sysidle_state,
2732 RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
2733 break;
2734
2735 default:
2736 break;
2737 }
2738}
2739
2740/*
2741 * Found a non-idle non-timekeeping CPU, so kick the system-idle state
2742 * back to the beginning.
2743 */
2744static void rcu_sysidle_cancel(void)
2745{
2746 smp_mb();
2747 if (full_sysidle_state > RCU_SYSIDLE_SHORT)
2748 WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
2749}
2750
2751/*
2752 * Update the sysidle state based on the results of a force-quiescent-state
2753 * scan of the CPUs' dyntick-idle state.
2754 */
2755static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
2756 unsigned long maxj, bool gpkt)
2757{
2758 if (rsp != rcu_state_p)
2759 return; /* Wrong flavor, ignore. */
2760 if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2761 return; /* Running state machine from timekeeping CPU. */
2762 if (isidle)
2763 rcu_sysidle(maxj); /* More idle! */
2764 else
2765 rcu_sysidle_cancel(); /* Idle is over. */
2766}
2767
2768/*
2769 * Wrapper for rcu_sysidle_report() when called from the grace-period
2770 * kthread's context.
2771 */
2772static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2773 unsigned long maxj)
2774{
2775 /* If there are no nohz_full= CPUs, no need to track this. */
2776 if (!tick_nohz_full_enabled())
2777 return;
2778
2779 rcu_sysidle_report(rsp, isidle, maxj, true);
2780}
2781
2782/* Callback and function for forcing an RCU grace period. */
2783struct rcu_sysidle_head {
2784 struct rcu_head rh;
2785 int inuse;
2786};
2787
2788static void rcu_sysidle_cb(struct rcu_head *rhp)
2789{
2790 struct rcu_sysidle_head *rshp;
2791
2792 /*
2793 * The following memory barrier is needed to replace the
2794 * memory barriers that would normally be in the memory
2795 * allocator.
2796 */
2797 smp_mb(); /* grace period precedes setting inuse. */
2798
2799 rshp = container_of(rhp, struct rcu_sysidle_head, rh);
2800 WRITE_ONCE(rshp->inuse, 0);
2801}
2802
2803/*
2804 * Check to see if the system is fully idle, other than the timekeeping CPU.
2805 * The caller must have disabled interrupts. This is not intended to be
2806 * called unless tick_nohz_full_enabled().
2807 */
2808bool rcu_sys_is_idle(void)
2809{
2810 static struct rcu_sysidle_head rsh;
2811 int rss = READ_ONCE(full_sysidle_state);
2812
2813 if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
2814 return false;
2815
2816 /* Handle small-system case by doing a full scan of CPUs. */
2817 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
2818 int oldrss = rss - 1;
2819
2820 /*
2821 * One pass to advance to each state up to _FULL.
2822 * Give up if any pass fails to advance the state.
2823 */
2824 while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
2825 int cpu;
2826 bool isidle = true;
2827 unsigned long maxj = jiffies - ULONG_MAX / 4;
2828 struct rcu_data *rdp;
2829
2830 /* Scan all the CPUs looking for nonidle CPUs. */
2831 for_each_possible_cpu(cpu) {
2832 rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
2833 rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
2834 if (!isidle)
2835 break;
2836 }
2837 rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
2838 oldrss = rss;
2839 rss = READ_ONCE(full_sysidle_state);
2840 }
2841 }
2842
2843 /* If this is the first observation of an idle period, record it. */
2844 if (rss == RCU_SYSIDLE_FULL) {
2845 rss = cmpxchg(&full_sysidle_state,
2846 RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
2847 return rss == RCU_SYSIDLE_FULL;
2848 }
2849
2850 smp_mb(); /* ensure rss load happens before later caller actions. */
2851
2852 /* If already fully idle, tell the caller (in case of races). */
2853 if (rss == RCU_SYSIDLE_FULL_NOTED)
2854 return true;
2855
2856 /*
2857 * If we aren't there yet, and a grace period is not in flight,
2858 * initiate a grace period. Either way, tell the caller that
2859 * we are not there yet. We use an xchg() rather than an assignment
2860 * to make up for the memory barriers that would otherwise be
2861 * provided by the memory allocator.
2862 */
2863 if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
2864 !rcu_gp_in_progress(rcu_state_p) &&
2865 !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
2866 call_rcu(&rsh.rh, rcu_sysidle_cb);
2867 return false;
2868}
2869
2870/*
2871 * Initialize dynticks sysidle state for CPUs coming online.
2872 */
2873static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2874{
2875 rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
2876}
2877
2878#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2879
2880static void rcu_sysidle_enter(int irq)
2881{
2882}
2883
2884static void rcu_sysidle_exit(int irq)
2885{
2886}
2887
2888static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2889 unsigned long *maxj)
2890{
2891}
2892
2893static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2894{
2895 return false;
2896}
2897
2898static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2899 unsigned long maxj)
2900{
2901}
2902
2903static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2904{
2905}
2906
2907#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2908
2909/* 2514/*
2910 * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the 2515 * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
2911 * grace-period kthread will do force_quiescent_state() processing? 2516 * grace-period kthread will do force_quiescent_state() processing?
@@ -2936,13 +2541,7 @@ static void rcu_bind_gp_kthread(void)
2936 2541
2937 if (!tick_nohz_full_enabled()) 2542 if (!tick_nohz_full_enabled())
2938 return; 2543 return;
2939#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
2940 cpu = tick_do_timer_cpu;
2941 if (cpu >= 0 && cpu < nr_cpu_ids)
2942 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2943#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2944 housekeeping_affine(current); 2544 housekeeping_affine(current);
2945#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2946} 2545}
2947 2546
2948/* Record the current task on dyntick-idle entry. */ 2547/* Record the current task on dyntick-idle entry. */
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
deleted file mode 100644
index 6cea17a1ea30..000000000000
--- a/kernel/rcu/tree_trace.c
+++ /dev/null
@@ -1,494 +0,0 @@
1/*
2 * Read-Copy Update tracing for hierarchical implementation.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright IBM Corporation, 2008
19 * Author: Paul E. McKenney
20 *
21 * Papers: http://www.rdrop.com/users/paulmck/RCU
22 *
23 * For detailed explanation of Read-Copy Update mechanism see -
24 * Documentation/RCU
25 *
26 */
27#include <linux/types.h>
28#include <linux/kernel.h>
29#include <linux/init.h>
30#include <linux/spinlock.h>
31#include <linux/smp.h>
32#include <linux/rcupdate.h>
33#include <linux/interrupt.h>
34#include <linux/sched.h>
35#include <linux/atomic.h>
36#include <linux/bitops.h>
37#include <linux/completion.h>
38#include <linux/percpu.h>
39#include <linux/notifier.h>
40#include <linux/cpu.h>
41#include <linux/mutex.h>
42#include <linux/debugfs.h>
43#include <linux/seq_file.h>
44#include <linux/prefetch.h>
45
46#define RCU_TREE_NONCORE
47#include "tree.h"
48#include "rcu.h"
49
50static int r_open(struct inode *inode, struct file *file,
51 const struct seq_operations *op)
52{
53 int ret = seq_open(file, op);
54 if (!ret) {
55 struct seq_file *m = (struct seq_file *)file->private_data;
56 m->private = inode->i_private;
57 }
58 return ret;
59}
60
61static void *r_start(struct seq_file *m, loff_t *pos)
62{
63 struct rcu_state *rsp = (struct rcu_state *)m->private;
64 *pos = cpumask_next(*pos - 1, cpu_possible_mask);
65 if ((*pos) < nr_cpu_ids)
66 return per_cpu_ptr(rsp->rda, *pos);
67 return NULL;
68}
69
70static void *r_next(struct seq_file *m, void *v, loff_t *pos)
71{
72 (*pos)++;
73 return r_start(m, pos);
74}
75
76static void r_stop(struct seq_file *m, void *v)
77{
78}
79
80static int show_rcubarrier(struct seq_file *m, void *v)
81{
82 struct rcu_state *rsp = (struct rcu_state *)m->private;
83 seq_printf(m, "bcc: %d bseq: %lu\n",
84 atomic_read(&rsp->barrier_cpu_count),
85 rsp->barrier_sequence);
86 return 0;
87}
88
89static int rcubarrier_open(struct inode *inode, struct file *file)
90{
91 return single_open(file, show_rcubarrier, inode->i_private);
92}
93
94static const struct file_operations rcubarrier_fops = {
95 .owner = THIS_MODULE,
96 .open = rcubarrier_open,
97 .read = seq_read,
98 .llseek = no_llseek,
99 .release = single_release,
100};
101
102#ifdef CONFIG_RCU_BOOST
103
104static char convert_kthread_status(unsigned int kthread_status)
105{
106 if (kthread_status > RCU_KTHREAD_MAX)
107 return '?';
108 return "SRWOY"[kthread_status];
109}
110
111#endif /* #ifdef CONFIG_RCU_BOOST */
112
113static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
114{
115 long ql, qll;
116
117 if (!rdp->beenonline)
118 return;
119 seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d",
120 rdp->cpu,
121 cpu_is_offline(rdp->cpu) ? '!' : ' ',
122 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
123 rdp->cpu_no_qs.b.norm,
124 rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu),
125 rdp->core_needs_qs);
126 seq_printf(m, " dt=%d/%llx/%d df=%lu",
127 rcu_dynticks_snap(rdp->dynticks),
128 rdp->dynticks->dynticks_nesting,
129 rdp->dynticks->dynticks_nmi_nesting,
130 rdp->dynticks_fqs);
131 seq_printf(m, " of=%lu", rdp->offline_fqs);
132 rcu_nocb_q_lengths(rdp, &ql, &qll);
133 qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist);
134 ql += rcu_segcblist_n_cbs(&rdp->cblist);
135 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
136 qll, ql,
137 ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)],
138 ".R"[!rcu_segcblist_segempty(&rdp->cblist,
139 RCU_NEXT_READY_TAIL)],
140 ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)],
141 ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]);
142#ifdef CONFIG_RCU_BOOST
143 seq_printf(m, " kt=%d/%c ktl=%x",
144 per_cpu(rcu_cpu_has_work, rdp->cpu),
145 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
146 rdp->cpu)),
147 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
148#endif /* #ifdef CONFIG_RCU_BOOST */
149 seq_printf(m, " b=%ld", rdp->blimit);
150 seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
151 rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
152 rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
153}
154
155static int show_rcudata(struct seq_file *m, void *v)
156{
157 print_one_rcu_data(m, (struct rcu_data *)v);
158 return 0;
159}
160
161static const struct seq_operations rcudate_op = {
162 .start = r_start,
163 .next = r_next,
164 .stop = r_stop,
165 .show = show_rcudata,
166};
167
168static int rcudata_open(struct inode *inode, struct file *file)
169{
170 return r_open(inode, file, &rcudate_op);
171}
172
173static const struct file_operations rcudata_fops = {
174 .owner = THIS_MODULE,
175 .open = rcudata_open,
176 .read = seq_read,
177 .llseek = no_llseek,
178 .release = seq_release,
179};
180
181static int show_rcuexp(struct seq_file *m, void *v)
182{
183 int cpu;
184 struct rcu_state *rsp = (struct rcu_state *)m->private;
185 struct rcu_data *rdp;
186 unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
187
188 for_each_possible_cpu(cpu) {
189 rdp = per_cpu_ptr(rsp->rda, cpu);
190 s0 += atomic_long_read(&rdp->exp_workdone0);
191 s1 += atomic_long_read(&rdp->exp_workdone1);
192 s2 += atomic_long_read(&rdp->exp_workdone2);
193 s3 += atomic_long_read(&rdp->exp_workdone3);
194 }
195 seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n",
196 rsp->expedited_sequence, s0, s1, s2, s3,
197 atomic_read(&rsp->expedited_need_qs),
198 rsp->expedited_sequence / 2);
199 return 0;
200}
201
202static int rcuexp_open(struct inode *inode, struct file *file)
203{
204 return single_open(file, show_rcuexp, inode->i_private);
205}
206
207static const struct file_operations rcuexp_fops = {
208 .owner = THIS_MODULE,
209 .open = rcuexp_open,
210 .read = seq_read,
211 .llseek = no_llseek,
212 .release = single_release,
213};
214
215#ifdef CONFIG_RCU_BOOST
216
217static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
218{
219 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
220 rnp->grplo, rnp->grphi,
221 "T."[list_empty(&rnp->blkd_tasks)],
222 "N."[!rnp->gp_tasks],
223 "E."[!rnp->exp_tasks],
224 "B."[!rnp->boost_tasks],
225 convert_kthread_status(rnp->boost_kthread_status),
226 rnp->n_tasks_boosted, rnp->n_exp_boosts,
227 rnp->n_normal_boosts);
228 seq_printf(m, "j=%04x bt=%04x\n",
229 (int)(jiffies & 0xffff),
230 (int)(rnp->boost_time & 0xffff));
231 seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
232 rnp->n_balk_blkd_tasks,
233 rnp->n_balk_exp_gp_tasks,
234 rnp->n_balk_boost_tasks,
235 rnp->n_balk_notblocked,
236 rnp->n_balk_notyet,
237 rnp->n_balk_nos);
238}
239
240static int show_rcu_node_boost(struct seq_file *m, void *unused)
241{
242 struct rcu_node *rnp;
243
244 rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
245 print_one_rcu_node_boost(m, rnp);
246 return 0;
247}
248
249static int rcu_node_boost_open(struct inode *inode, struct file *file)
250{
251 return single_open(file, show_rcu_node_boost, NULL);
252}
253
254static const struct file_operations rcu_node_boost_fops = {
255 .owner = THIS_MODULE,
256 .open = rcu_node_boost_open,
257 .read = seq_read,
258 .llseek = no_llseek,
259 .release = single_release,
260};
261
262#endif /* #ifdef CONFIG_RCU_BOOST */
263
264static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
265{
266 unsigned long gpnum;
267 int level = 0;
268 struct rcu_node *rnp;
269
270 gpnum = rsp->gpnum;
271 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
272 ulong2long(rsp->completed), ulong2long(gpnum),
273 rsp->gp_state,
274 (long)(rsp->jiffies_force_qs - jiffies),
275 (int)(jiffies & 0xffff));
276 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
277 rsp->n_force_qs, rsp->n_force_qs_ngp,
278 rsp->n_force_qs - rsp->n_force_qs_ngp,
279 READ_ONCE(rsp->n_force_qs_lh),
280 rsp->orphan_done.len_lazy,
281 rsp->orphan_done.len);
282 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
283 if (rnp->level != level) {
284 seq_puts(m, "\n");
285 level = rnp->level;
286 }
287 seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ",
288 rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
289 ".G"[rnp->gp_tasks != NULL],
290 ".E"[rnp->exp_tasks != NULL],
291 ".T"[!list_empty(&rnp->blkd_tasks)],
292 rnp->grplo, rnp->grphi, rnp->grpnum);
293 }
294 seq_puts(m, "\n");
295}
296
297static int show_rcuhier(struct seq_file *m, void *v)
298{
299 struct rcu_state *rsp = (struct rcu_state *)m->private;
300 print_one_rcu_state(m, rsp);
301 return 0;
302}
303
304static int rcuhier_open(struct inode *inode, struct file *file)
305{
306 return single_open(file, show_rcuhier, inode->i_private);
307}
308
309static const struct file_operations rcuhier_fops = {
310 .owner = THIS_MODULE,
311 .open = rcuhier_open,
312 .read = seq_read,
313 .llseek = no_llseek,
314 .release = single_release,
315};
316
317static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
318{
319 unsigned long flags;
320 unsigned long completed;
321 unsigned long gpnum;
322 unsigned long gpage;
323 unsigned long gpmax;
324 struct rcu_node *rnp = &rsp->node[0];
325
326 raw_spin_lock_irqsave_rcu_node(rnp, flags);
327 completed = READ_ONCE(rsp->completed);
328 gpnum = READ_ONCE(rsp->gpnum);
329 if (completed == gpnum)
330 gpage = 0;
331 else
332 gpage = jiffies - rsp->gp_start;
333 gpmax = rsp->gp_max;
334 raw_spin_unlock_irqrestore(&rnp->lock, flags);
335 seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
336 ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
337}
338
339static int show_rcugp(struct seq_file *m, void *v)
340{
341 struct rcu_state *rsp = (struct rcu_state *)m->private;
342 show_one_rcugp(m, rsp);
343 return 0;
344}
345
346static int rcugp_open(struct inode *inode, struct file *file)
347{
348 return single_open(file, show_rcugp, inode->i_private);
349}
350
351static const struct file_operations rcugp_fops = {
352 .owner = THIS_MODULE,
353 .open = rcugp_open,
354 .read = seq_read,
355 .llseek = no_llseek,
356 .release = single_release,
357};
358
359static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
360{
361 if (!rdp->beenonline)
362 return;
363 seq_printf(m, "%3d%cnp=%ld ",
364 rdp->cpu,
365 cpu_is_offline(rdp->cpu) ? '!' : ' ',
366 rdp->n_rcu_pending);
367 seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
368 rdp->n_rp_core_needs_qs,
369 rdp->n_rp_report_qs,
370 rdp->n_rp_cb_ready,
371 rdp->n_rp_cpu_needs_gp);
372 seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n",
373 rdp->n_rp_gp_completed,
374 rdp->n_rp_gp_started,
375 rdp->n_rp_nocb_defer_wakeup,
376 rdp->n_rp_need_nothing);
377}
378
379static int show_rcu_pending(struct seq_file *m, void *v)
380{
381 print_one_rcu_pending(m, (struct rcu_data *)v);
382 return 0;
383}
384
385static const struct seq_operations rcu_pending_op = {
386 .start = r_start,
387 .next = r_next,
388 .stop = r_stop,
389 .show = show_rcu_pending,
390};
391
392static int rcu_pending_open(struct inode *inode, struct file *file)
393{
394 return r_open(inode, file, &rcu_pending_op);
395}
396
397static const struct file_operations rcu_pending_fops = {
398 .owner = THIS_MODULE,
399 .open = rcu_pending_open,
400 .read = seq_read,
401 .llseek = no_llseek,
402 .release = seq_release,
403};
404
405static int show_rcutorture(struct seq_file *m, void *unused)
406{
407 seq_printf(m, "rcutorture test sequence: %lu %s\n",
408 rcutorture_testseq >> 1,
409 (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
410 seq_printf(m, "rcutorture update version number: %lu\n",
411 rcutorture_vernum);
412 return 0;
413}
414
415static int rcutorture_open(struct inode *inode, struct file *file)
416{
417 return single_open(file, show_rcutorture, NULL);
418}
419
420static const struct file_operations rcutorture_fops = {
421 .owner = THIS_MODULE,
422 .open = rcutorture_open,
423 .read = seq_read,
424 .llseek = seq_lseek,
425 .release = single_release,
426};
427
428static struct dentry *rcudir;
429
430static int __init rcutree_trace_init(void)
431{
432 struct rcu_state *rsp;
433 struct dentry *retval;
434 struct dentry *rspdir;
435
436 rcudir = debugfs_create_dir("rcu", NULL);
437 if (!rcudir)
438 goto free_out;
439
440 for_each_rcu_flavor(rsp) {
441 rspdir = debugfs_create_dir(rsp->name, rcudir);
442 if (!rspdir)
443 goto free_out;
444
445 retval = debugfs_create_file("rcudata", 0444,
446 rspdir, rsp, &rcudata_fops);
447 if (!retval)
448 goto free_out;
449
450 retval = debugfs_create_file("rcuexp", 0444,
451 rspdir, rsp, &rcuexp_fops);
452 if (!retval)
453 goto free_out;
454
455 retval = debugfs_create_file("rcu_pending", 0444,
456 rspdir, rsp, &rcu_pending_fops);
457 if (!retval)
458 goto free_out;
459
460 retval = debugfs_create_file("rcubarrier", 0444,
461 rspdir, rsp, &rcubarrier_fops);
462 if (!retval)
463 goto free_out;
464
465#ifdef CONFIG_RCU_BOOST
466 if (rsp == &rcu_preempt_state) {
467 retval = debugfs_create_file("rcuboost", 0444,
468 rspdir, NULL, &rcu_node_boost_fops);
469 if (!retval)
470 goto free_out;
471 }
472#endif
473
474 retval = debugfs_create_file("rcugp", 0444,
475 rspdir, rsp, &rcugp_fops);
476 if (!retval)
477 goto free_out;
478
479 retval = debugfs_create_file("rcuhier", 0444,
480 rspdir, rsp, &rcuhier_fops);
481 if (!retval)
482 goto free_out;
483 }
484
485 retval = debugfs_create_file("rcutorture", 0444, rcudir,
486 NULL, &rcutorture_fops);
487 if (!retval)
488 goto free_out;
489 return 0;
490free_out:
491 debugfs_remove_recursive(rcudir);
492 return 1;
493}
494device_initcall(rcutree_trace_init);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 273e869ca21d..00e77c470017 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,7 +62,9 @@
62#define MODULE_PARAM_PREFIX "rcupdate." 62#define MODULE_PARAM_PREFIX "rcupdate."
63 63
64#ifndef CONFIG_TINY_RCU 64#ifndef CONFIG_TINY_RCU
65extern int rcu_expedited; /* from sysctl */
65module_param(rcu_expedited, int, 0); 66module_param(rcu_expedited, int, 0);
67extern int rcu_normal; /* from sysctl */
66module_param(rcu_normal, int, 0); 68module_param(rcu_normal, int, 0);
67static int rcu_normal_after_boot; 69static int rcu_normal_after_boot;
68module_param(rcu_normal_after_boot, int, 0); 70module_param(rcu_normal_after_boot, int, 0);
@@ -379,6 +381,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
379 struct rcu_synchronize *rs_array) 381 struct rcu_synchronize *rs_array)
380{ 382{
381 int i; 383 int i;
384 int j;
382 385
383 /* Initialize and register callbacks for each flavor specified. */ 386 /* Initialize and register callbacks for each flavor specified. */
384 for (i = 0; i < n; i++) { 387 for (i = 0; i < n; i++) {
@@ -390,7 +393,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
390 } 393 }
391 init_rcu_head_on_stack(&rs_array[i].head); 394 init_rcu_head_on_stack(&rs_array[i].head);
392 init_completion(&rs_array[i].completion); 395 init_completion(&rs_array[i].completion);
393 (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); 396 for (j = 0; j < i; j++)
397 if (crcu_array[j] == crcu_array[i])
398 break;
399 if (j == i)
400 (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
394 } 401 }
395 402
396 /* Wait for all callbacks to be invoked. */ 403 /* Wait for all callbacks to be invoked. */
@@ -399,7 +406,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
399 (crcu_array[i] == call_rcu || 406 (crcu_array[i] == call_rcu ||
400 crcu_array[i] == call_rcu_bh)) 407 crcu_array[i] == call_rcu_bh))
401 continue; 408 continue;
402 wait_for_completion(&rs_array[i].completion); 409 for (j = 0; j < i; j++)
410 if (crcu_array[j] == crcu_array[i])
411 break;
412 if (j == i)
413 wait_for_completion(&rs_array[i].completion);
403 destroy_rcu_head_on_stack(&rs_array[i].head); 414 destroy_rcu_head_on_stack(&rs_array[i].head);
404 } 415 }
405} 416}
@@ -560,15 +571,30 @@ static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
560DEFINE_SRCU(tasks_rcu_exit_srcu); 571DEFINE_SRCU(tasks_rcu_exit_srcu);
561 572
562/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ 573/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
563static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; 574#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
575static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
564module_param(rcu_task_stall_timeout, int, 0644); 576module_param(rcu_task_stall_timeout, int, 0644);
565 577
566static void rcu_spawn_tasks_kthread(void); 578static void rcu_spawn_tasks_kthread(void);
567static struct task_struct *rcu_tasks_kthread_ptr; 579static struct task_struct *rcu_tasks_kthread_ptr;
568 580
569/* 581/**
570 * Post an RCU-tasks callback. First call must be from process context 582 * call_rcu_tasks() - Queue an RCU for invocation task-based grace period
571 * after the scheduler if fully operational. 583 * @rhp: structure to be used for queueing the RCU updates.
584 * @func: actual callback function to be invoked after the grace period
585 *
586 * The callback function will be invoked some time after a full grace
587 * period elapses, in other words after all currently executing RCU
588 * read-side critical sections have completed. call_rcu_tasks() assumes
589 * that the read-side critical sections end at a voluntary context
590 * switch (not a preemption!), entry into idle, or transition to usermode
591 * execution. As such, there are no read-side primitives analogous to
592 * rcu_read_lock() and rcu_read_unlock() because this primitive is intended
593 * to determine that all tasks have passed through a safe state, not so
 594 * much for data-structure synchronization.
595 *
596 * See the description of call_rcu() for more detailed information on
597 * memory ordering guarantees.
572 */ 598 */
573void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) 599void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
574{ 600{
@@ -851,6 +877,23 @@ static void rcu_spawn_tasks_kthread(void)
851 877
852#endif /* #ifdef CONFIG_TASKS_RCU */ 878#endif /* #ifdef CONFIG_TASKS_RCU */
853 879
880#ifndef CONFIG_TINY_RCU
881
882/*
883 * Print any non-default Tasks RCU settings.
884 */
885static void __init rcu_tasks_bootup_oddness(void)
886{
887#ifdef CONFIG_TASKS_RCU
888 if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT)
889 pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout);
890 else
891 pr_info("\tTasks RCU enabled.\n");
892#endif /* #ifdef CONFIG_TASKS_RCU */
893}
894
895#endif /* #ifndef CONFIG_TINY_RCU */
896
854#ifdef CONFIG_PROVE_RCU 897#ifdef CONFIG_PROVE_RCU
855 898
856/* 899/*
@@ -935,3 +978,25 @@ late_initcall(rcu_verify_early_boot_tests);
935#else 978#else
936void rcu_early_boot_tests(void) {} 979void rcu_early_boot_tests(void) {}
937#endif /* CONFIG_PROVE_RCU */ 980#endif /* CONFIG_PROVE_RCU */
981
982#ifndef CONFIG_TINY_RCU
983
984/*
985 * Print any significant non-default boot-time settings.
986 */
987void __init rcupdate_announce_bootup_oddness(void)
988{
989 if (rcu_normal)
990 pr_info("\tNo expedited grace period (rcu_normal).\n");
991 else if (rcu_normal_after_boot)
992 pr_info("\tNo expedited grace period (rcu_normal_after_boot).\n");
993 else if (rcu_expedited)
994 pr_info("\tAll grace periods are expedited (rcu_expedited).\n");
995 if (rcu_cpu_stall_suppress)
996 pr_info("\tRCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).\n");
997 if (rcu_cpu_stall_timeout != CONFIG_RCU_CPU_STALL_TIMEOUT)
998 pr_info("\tRCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).\n", rcu_cpu_stall_timeout);
999 rcu_tasks_bootup_oddness();
1000}
1001
1002#endif /* #ifndef CONFIG_TINY_RCU */
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 89ab6758667b..53f0164ed362 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,9 +16,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
16endif 16endif
17 17
18obj-y += core.o loadavg.o clock.o cputime.o 18obj-y += core.o loadavg.o clock.o cputime.o
19obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o 19obj-y += idle_task.o fair.o rt.o deadline.o
20obj-y += wait.o swait.o completion.o idle.o 20obj-y += wait.o wait_bit.o swait.o completion.o idle.o
21obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o 21obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
22obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o 22obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
23obj-$(CONFIG_SCHEDSTATS) += stats.o 23obj-$(CONFIG_SCHEDSTATS) += stats.o
24obj-$(CONFIG_SCHED_DEBUG) += debug.o 24obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 00a45c45beca..ca0f8fc945c6 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -64,6 +64,7 @@
64#include <linux/workqueue.h> 64#include <linux/workqueue.h>
65#include <linux/compiler.h> 65#include <linux/compiler.h>
66#include <linux/tick.h> 66#include <linux/tick.h>
67#include <linux/init.h>
67 68
68/* 69/*
69 * Scheduler clock - returns current time in nanosec units. 70 * Scheduler clock - returns current time in nanosec units.
@@ -124,14 +125,27 @@ int sched_clock_stable(void)
124 return static_branch_likely(&__sched_clock_stable); 125 return static_branch_likely(&__sched_clock_stable);
125} 126}
126 127
128static void __scd_stamp(struct sched_clock_data *scd)
129{
130 scd->tick_gtod = ktime_get_ns();
131 scd->tick_raw = sched_clock();
132}
133
127static void __set_sched_clock_stable(void) 134static void __set_sched_clock_stable(void)
128{ 135{
129 struct sched_clock_data *scd = this_scd(); 136 struct sched_clock_data *scd;
130 137
131 /* 138 /*
139 * Since we're still unstable and the tick is already running, we have
140 * to disable IRQs in order to get a consistent scd->tick* reading.
141 */
142 local_irq_disable();
143 scd = this_scd();
144 /*
132 * Attempt to make the (initial) unstable->stable transition continuous. 145 * Attempt to make the (initial) unstable->stable transition continuous.
133 */ 146 */
134 __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw); 147 __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
148 local_irq_enable();
135 149
136 printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", 150 printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
137 scd->tick_gtod, __gtod_offset, 151 scd->tick_gtod, __gtod_offset,
@@ -141,8 +155,38 @@ static void __set_sched_clock_stable(void)
141 tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); 155 tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
142} 156}
143 157
158/*
159 * If we ever get here, we're screwed, because we found out -- typically after
160 * the fact -- that TSC wasn't good. This means all our clocksources (including
161 * ktime) could have reported wrong values.
162 *
163 * What we do here is an attempt to fix up and continue sort of where we left
164 * off in a coherent manner.
165 *
166 * The only way to fully avoid random clock jumps is to boot with:
167 * "tsc=unstable".
168 */
144static void __sched_clock_work(struct work_struct *work) 169static void __sched_clock_work(struct work_struct *work)
145{ 170{
171 struct sched_clock_data *scd;
172 int cpu;
173
174 /* take a current timestamp and set 'now' */
175 preempt_disable();
176 scd = this_scd();
177 __scd_stamp(scd);
178 scd->clock = scd->tick_gtod + __gtod_offset;
179 preempt_enable();
180
181 /* clone to all CPUs */
182 for_each_possible_cpu(cpu)
183 per_cpu(sched_clock_data, cpu) = *scd;
184
185 printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
186 printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
187 scd->tick_gtod, __gtod_offset,
188 scd->tick_raw, __sched_clock_offset);
189
146 static_branch_disable(&__sched_clock_stable); 190 static_branch_disable(&__sched_clock_stable);
147} 191}
148 192
@@ -150,27 +194,11 @@ static DECLARE_WORK(sched_clock_work, __sched_clock_work);
150 194
151static void __clear_sched_clock_stable(void) 195static void __clear_sched_clock_stable(void)
152{ 196{
153 struct sched_clock_data *scd = this_scd(); 197 if (!sched_clock_stable())
154 198 return;
155 /*
156 * Attempt to make the stable->unstable transition continuous.
157 *
158 * Trouble is, this is typically called from the TSC watchdog
159 * timer, which is late per definition. This means the tick
160 * values can already be screwy.
161 *
162 * Still do what we can.
163 */
164 __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod);
165
166 printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
167 scd->tick_gtod, __gtod_offset,
168 scd->tick_raw, __sched_clock_offset);
169 199
170 tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); 200 tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
171 201 schedule_work(&sched_clock_work);
172 if (sched_clock_stable())
173 schedule_work(&sched_clock_work);
174} 202}
175 203
176void clear_sched_clock_stable(void) 204void clear_sched_clock_stable(void)
@@ -183,7 +211,11 @@ void clear_sched_clock_stable(void)
183 __clear_sched_clock_stable(); 211 __clear_sched_clock_stable();
184} 212}
185 213
186void sched_clock_init_late(void) 214/*
215 * We run this as late_initcall() such that it runs after all built-in drivers,
216 * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
217 */
218static int __init sched_clock_init_late(void)
187{ 219{
188 sched_clock_running = 2; 220 sched_clock_running = 2;
189 /* 221 /*
@@ -197,7 +229,10 @@ void sched_clock_init_late(void)
197 229
198 if (__sched_clock_stable_early) 230 if (__sched_clock_stable_early)
199 __set_sched_clock_stable(); 231 __set_sched_clock_stable();
232
233 return 0;
200} 234}
235late_initcall(sched_clock_init_late);
201 236
202/* 237/*
203 * min, max except they take wrapping into account 238 * min, max except they take wrapping into account
@@ -347,21 +382,38 @@ void sched_clock_tick(void)
347{ 382{
348 struct sched_clock_data *scd; 383 struct sched_clock_data *scd;
349 384
385 if (sched_clock_stable())
386 return;
387
388 if (unlikely(!sched_clock_running))
389 return;
390
350 WARN_ON_ONCE(!irqs_disabled()); 391 WARN_ON_ONCE(!irqs_disabled());
351 392
393 scd = this_scd();
394 __scd_stamp(scd);
395 sched_clock_local(scd);
396}
397
398void sched_clock_tick_stable(void)
399{
400 u64 gtod, clock;
401
402 if (!sched_clock_stable())
403 return;
404
352 /* 405 /*
353 * Update these values even if sched_clock_stable(), because it can 406 * Called under watchdog_lock.
354 * become unstable at any point in time at which point we need some
355 * values to fall back on.
356 * 407 *
357 * XXX arguably we can skip this if we expose tsc_clocksource_reliable 408 * The watchdog just found this TSC to (still) be stable, so now is a
409 * good moment to update our __gtod_offset. Because once we find the
410 * TSC to be unstable, any computation will be computing crap.
358 */ 411 */
359 scd = this_scd(); 412 local_irq_disable();
360 scd->tick_raw = sched_clock(); 413 gtod = ktime_get_ns();
361 scd->tick_gtod = ktime_get_ns(); 414 clock = sched_clock();
362 415 __gtod_offset = (clock + __sched_clock_offset) - gtod;
363 if (!sched_clock_stable() && likely(sched_clock_running)) 416 local_irq_enable();
364 sched_clock_local(scd);
365} 417}
366 418
367/* 419/*
@@ -374,15 +426,21 @@ void sched_clock_idle_sleep_event(void)
374EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); 426EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
375 427
376/* 428/*
377 * We just idled delta nanoseconds (called with irqs disabled): 429 * We just idled; resync with ktime.
378 */ 430 */
379void sched_clock_idle_wakeup_event(u64 delta_ns) 431void sched_clock_idle_wakeup_event(void)
380{ 432{
381 if (timekeeping_suspended) 433 unsigned long flags;
434
435 if (sched_clock_stable())
436 return;
437
438 if (unlikely(timekeeping_suspended))
382 return; 439 return;
383 440
441 local_irq_save(flags);
384 sched_clock_tick(); 442 sched_clock_tick();
385 touch_softlockup_watchdog_sched(); 443 local_irq_restore(flags);
386} 444}
387EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 445EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
388 446
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 53f9558fa925..13fc5ae9bf2f 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -66,7 +66,7 @@ do_wait_for_common(struct completion *x,
66 if (!x->done) { 66 if (!x->done) {
67 DECLARE_WAITQUEUE(wait, current); 67 DECLARE_WAITQUEUE(wait, current);
68 68
69 __add_wait_queue_tail_exclusive(&x->wait, &wait); 69 __add_wait_queue_entry_tail_exclusive(&x->wait, &wait);
70 do { 70 do {
71 if (signal_pending_state(state, current)) { 71 if (signal_pending_state(state, current)) {
72 timeout = -ERESTARTSYS; 72 timeout = -ERESTARTSYS;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 803c3bc274c4..17c667b427b4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10,6 +10,7 @@
10#include <uapi/linux/sched/types.h> 10#include <uapi/linux/sched/types.h>
11#include <linux/sched/loadavg.h> 11#include <linux/sched/loadavg.h>
12#include <linux/sched/hotplug.h> 12#include <linux/sched/hotplug.h>
13#include <linux/wait_bit.h>
13#include <linux/cpuset.h> 14#include <linux/cpuset.h>
14#include <linux/delayacct.h> 15#include <linux/delayacct.h>
15#include <linux/init_task.h> 16#include <linux/init_task.h>
@@ -788,36 +789,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
788 dequeue_task(rq, p, flags); 789 dequeue_task(rq, p, flags);
789} 790}
790 791
791void sched_set_stop_task(int cpu, struct task_struct *stop)
792{
793 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
794 struct task_struct *old_stop = cpu_rq(cpu)->stop;
795
796 if (stop) {
797 /*
798 * Make it appear like a SCHED_FIFO task, its something
799 * userspace knows about and won't get confused about.
800 *
801 * Also, it will make PI more or less work without too
802 * much confusion -- but then, stop work should not
803 * rely on PI working anyway.
804 */
805 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
806
807 stop->sched_class = &stop_sched_class;
808 }
809
810 cpu_rq(cpu)->stop = stop;
811
812 if (old_stop) {
813 /*
814 * Reset it back to a normal scheduling class so that
815 * it can die in pieces.
816 */
817 old_stop->sched_class = &rt_sched_class;
818 }
819}
820
821/* 792/*
822 * __normal_prio - return the priority that is based on the static prio 793 * __normal_prio - return the priority that is based on the static prio
823 */ 794 */
@@ -1588,6 +1559,36 @@ static void update_avg(u64 *avg, u64 sample)
1588 *avg += diff >> 3; 1559 *avg += diff >> 3;
1589} 1560}
1590 1561
1562void sched_set_stop_task(int cpu, struct task_struct *stop)
1563{
1564 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
1565 struct task_struct *old_stop = cpu_rq(cpu)->stop;
1566
1567 if (stop) {
1568 /*
1569 * Make it appear like a SCHED_FIFO task, its something
1570 * userspace knows about and won't get confused about.
1571 *
1572 * Also, it will make PI more or less work without too
1573 * much confusion -- but then, stop work should not
1574 * rely on PI working anyway.
1575 */
1576 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
1577
1578 stop->sched_class = &stop_sched_class;
1579 }
1580
1581 cpu_rq(cpu)->stop = stop;
1582
1583 if (old_stop) {
1584 /*
1585 * Reset it back to a normal scheduling class so that
1586 * it can die in pieces.
1587 */
1588 old_stop->sched_class = &rt_sched_class;
1589 }
1590}
1591
1591#else 1592#else
1592 1593
1593static inline int __set_cpus_allowed_ptr(struct task_struct *p, 1594static inline int __set_cpus_allowed_ptr(struct task_struct *p,
@@ -1731,7 +1732,7 @@ void sched_ttwu_pending(void)
1731{ 1732{
1732 struct rq *rq = this_rq(); 1733 struct rq *rq = this_rq();
1733 struct llist_node *llist = llist_del_all(&rq->wake_list); 1734 struct llist_node *llist = llist_del_all(&rq->wake_list);
1734 struct task_struct *p; 1735 struct task_struct *p, *t;
1735 struct rq_flags rf; 1736 struct rq_flags rf;
1736 1737
1737 if (!llist) 1738 if (!llist)
@@ -1740,17 +1741,8 @@ void sched_ttwu_pending(void)
1740 rq_lock_irqsave(rq, &rf); 1741 rq_lock_irqsave(rq, &rf);
1741 update_rq_clock(rq); 1742 update_rq_clock(rq);
1742 1743
1743 while (llist) { 1744 llist_for_each_entry_safe(p, t, llist, wake_entry)
1744 int wake_flags = 0; 1745 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
1745
1746 p = llist_entry(llist, struct task_struct, wake_entry);
1747 llist = llist_next(llist);
1748
1749 if (p->sched_remote_wakeup)
1750 wake_flags = WF_MIGRATED;
1751
1752 ttwu_do_activate(rq, p, wake_flags, &rf);
1753 }
1754 1746
1755 rq_unlock_irqrestore(rq, &rf); 1747 rq_unlock_irqrestore(rq, &rf);
1756} 1748}
@@ -2148,23 +2140,6 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2148} 2140}
2149 2141
2150/* 2142/*
2151 * This function clears the sched_dl_entity static params.
2152 */
2153void __dl_clear_params(struct task_struct *p)
2154{
2155 struct sched_dl_entity *dl_se = &p->dl;
2156
2157 dl_se->dl_runtime = 0;
2158 dl_se->dl_deadline = 0;
2159 dl_se->dl_period = 0;
2160 dl_se->flags = 0;
2161 dl_se->dl_bw = 0;
2162
2163 dl_se->dl_throttled = 0;
2164 dl_se->dl_yielded = 0;
2165}
2166
2167/*
2168 * Perform scheduler related setup for a newly forked process p. 2143 * Perform scheduler related setup for a newly forked process p.
2169 * p is forked by current. 2144 * p is forked by current.
2170 * 2145 *
@@ -2193,6 +2168,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2193 2168
2194 RB_CLEAR_NODE(&p->dl.rb_node); 2169 RB_CLEAR_NODE(&p->dl.rb_node);
2195 init_dl_task_timer(&p->dl); 2170 init_dl_task_timer(&p->dl);
2171 init_dl_inactive_task_timer(&p->dl);
2196 __dl_clear_params(p); 2172 __dl_clear_params(p);
2197 2173
2198 INIT_LIST_HEAD(&p->rt.run_list); 2174 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2430,7 +2406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2430unsigned long to_ratio(u64 period, u64 runtime) 2406unsigned long to_ratio(u64 period, u64 runtime)
2431{ 2407{
2432 if (runtime == RUNTIME_INF) 2408 if (runtime == RUNTIME_INF)
2433 return 1ULL << 20; 2409 return BW_UNIT;
2434 2410
2435 /* 2411 /*
2436 * Doing this here saves a lot of checks in all 2412 * Doing this here saves a lot of checks in all
@@ -2440,93 +2416,9 @@ unsigned long to_ratio(u64 period, u64 runtime)
2440 if (period == 0) 2416 if (period == 0)
2441 return 0; 2417 return 0;
2442 2418
2443 return div64_u64(runtime << 20, period); 2419 return div64_u64(runtime << BW_SHIFT, period);
2444}
2445
2446#ifdef CONFIG_SMP
2447inline struct dl_bw *dl_bw_of(int i)
2448{
2449 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2450 "sched RCU must be held");
2451 return &cpu_rq(i)->rd->dl_bw;
2452}
2453
2454static inline int dl_bw_cpus(int i)
2455{
2456 struct root_domain *rd = cpu_rq(i)->rd;
2457 int cpus = 0;
2458
2459 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2460 "sched RCU must be held");
2461 for_each_cpu_and(i, rd->span, cpu_active_mask)
2462 cpus++;
2463
2464 return cpus;
2465}
2466#else
2467inline struct dl_bw *dl_bw_of(int i)
2468{
2469 return &cpu_rq(i)->dl.dl_bw;
2470}
2471
2472static inline int dl_bw_cpus(int i)
2473{
2474 return 1;
2475}
2476#endif
2477
2478/*
2479 * We must be sure that accepting a new task (or allowing changing the
2480 * parameters of an existing one) is consistent with the bandwidth
2481 * constraints. If yes, this function also accordingly updates the currently
2482 * allocated bandwidth to reflect the new situation.
2483 *
2484 * This function is called while holding p's rq->lock.
2485 *
2486 * XXX we should delay bw change until the task's 0-lag point, see
2487 * __setparam_dl().
2488 */
2489static int dl_overflow(struct task_struct *p, int policy,
2490 const struct sched_attr *attr)
2491{
2492
2493 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
2494 u64 period = attr->sched_period ?: attr->sched_deadline;
2495 u64 runtime = attr->sched_runtime;
2496 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
2497 int cpus, err = -1;
2498
2499 /* !deadline task may carry old deadline bandwidth */
2500 if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
2501 return 0;
2502
2503 /*
2504 * Either if a task, enters, leave, or stays -deadline but changes
2505 * its parameters, we may need to update accordingly the total
2506 * allocated bandwidth of the container.
2507 */
2508 raw_spin_lock(&dl_b->lock);
2509 cpus = dl_bw_cpus(task_cpu(p));
2510 if (dl_policy(policy) && !task_has_dl_policy(p) &&
2511 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
2512 __dl_add(dl_b, new_bw);
2513 err = 0;
2514 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
2515 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
2516 __dl_clear(dl_b, p->dl.dl_bw);
2517 __dl_add(dl_b, new_bw);
2518 err = 0;
2519 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
2520 __dl_clear(dl_b, p->dl.dl_bw);
2521 err = 0;
2522 }
2523 raw_spin_unlock(&dl_b->lock);
2524
2525 return err;
2526} 2420}
2527 2421
2528extern void init_dl_bw(struct dl_bw *dl_b);
2529
2530/* 2422/*
2531 * wake_up_new_task - wake up a newly created task for the first time. 2423 * wake_up_new_task - wake up a newly created task for the first time.
2532 * 2424 *
@@ -3687,7 +3579,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
3687 exception_exit(prev_state); 3579 exception_exit(prev_state);
3688} 3580}
3689 3581
3690int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 3582int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
3691 void *key) 3583 void *key)
3692{ 3584{
3693 return try_to_wake_up(curr->private, mode, wake_flags); 3585 return try_to_wake_up(curr->private, mode, wake_flags);
@@ -4009,46 +3901,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4009} 3901}
4010 3902
4011/* 3903/*
4012 * This function initializes the sched_dl_entity of a newly becoming
4013 * SCHED_DEADLINE task.
4014 *
4015 * Only the static values are considered here, the actual runtime and the
4016 * absolute deadline will be properly calculated when the task is enqueued
4017 * for the first time with its new policy.
4018 */
4019static void
4020__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
4021{
4022 struct sched_dl_entity *dl_se = &p->dl;
4023
4024 dl_se->dl_runtime = attr->sched_runtime;
4025 dl_se->dl_deadline = attr->sched_deadline;
4026 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
4027 dl_se->flags = attr->sched_flags;
4028 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
4029
4030 /*
4031 * Changing the parameters of a task is 'tricky' and we're not doing
4032 * the correct thing -- also see task_dead_dl() and switched_from_dl().
4033 *
4034 * What we SHOULD do is delay the bandwidth release until the 0-lag
4035 * point. This would include retaining the task_struct until that time
4036 * and change dl_overflow() to not immediately decrement the current
4037 * amount.
4038 *
4039 * Instead we retain the current runtime/deadline and let the new
4040 * parameters take effect after the current reservation period lapses.
4041 * This is safe (albeit pessimistic) because the 0-lag point is always
4042 * before the current scheduling deadline.
4043 *
4044 * We can still have temporary overloads because we do not delay the
4045 * change in bandwidth until that time; so admission control is
4046 * not on the safe side. It does however guarantee tasks will never
4047 * consume more than promised.
4048 */
4049}
4050
4051/*
4052 * sched_setparam() passes in -1 for its policy, to let the functions 3904 * sched_setparam() passes in -1 for its policy, to let the functions
4053 * it calls know not to change it. 3905 * it calls know not to change it.
4054 */ 3906 */
@@ -4101,59 +3953,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
4101 p->sched_class = &fair_sched_class; 3953 p->sched_class = &fair_sched_class;
4102} 3954}
4103 3955
4104static void
4105__getparam_dl(struct task_struct *p, struct sched_attr *attr)
4106{
4107 struct sched_dl_entity *dl_se = &p->dl;
4108
4109 attr->sched_priority = p->rt_priority;
4110 attr->sched_runtime = dl_se->dl_runtime;
4111 attr->sched_deadline = dl_se->dl_deadline;
4112 attr->sched_period = dl_se->dl_period;
4113 attr->sched_flags = dl_se->flags;
4114}
4115
4116/*
4117 * This function validates the new parameters of a -deadline task.
4118 * We ask for the deadline not being zero, and greater or equal
4119 * than the runtime, as well as the period of being zero or
4120 * greater than deadline. Furthermore, we have to be sure that
4121 * user parameters are above the internal resolution of 1us (we
4122 * check sched_runtime only since it is always the smaller one) and
4123 * below 2^63 ns (we have to check both sched_deadline and
4124 * sched_period, as the latter can be zero).
4125 */
4126static bool
4127__checkparam_dl(const struct sched_attr *attr)
4128{
4129 /* deadline != 0 */
4130 if (attr->sched_deadline == 0)
4131 return false;
4132
4133 /*
4134 * Since we truncate DL_SCALE bits, make sure we're at least
4135 * that big.
4136 */
4137 if (attr->sched_runtime < (1ULL << DL_SCALE))
4138 return false;
4139
4140 /*
4141 * Since we use the MSB for wrap-around and sign issues, make
4142 * sure it's not set (mind that period can be equal to zero).
4143 */
4144 if (attr->sched_deadline & (1ULL << 63) ||
4145 attr->sched_period & (1ULL << 63))
4146 return false;
4147
4148 /* runtime <= deadline <= period (if period != 0) */
4149 if ((attr->sched_period != 0 &&
4150 attr->sched_period < attr->sched_deadline) ||
4151 attr->sched_deadline < attr->sched_runtime)
4152 return false;
4153
4154 return true;
4155}
4156
4157/* 3956/*
4158 * Check the target process has a UID that matches the current process's: 3957 * Check the target process has a UID that matches the current process's:
4159 */ 3958 */
@@ -4170,19 +3969,6 @@ static bool check_same_owner(struct task_struct *p)
4170 return match; 3969 return match;
4171} 3970}
4172 3971
4173static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
4174{
4175 struct sched_dl_entity *dl_se = &p->dl;
4176
4177 if (dl_se->dl_runtime != attr->sched_runtime ||
4178 dl_se->dl_deadline != attr->sched_deadline ||
4179 dl_se->dl_period != attr->sched_period ||
4180 dl_se->flags != attr->sched_flags)
4181 return true;
4182
4183 return false;
4184}
4185
4186static int __sched_setscheduler(struct task_struct *p, 3972static int __sched_setscheduler(struct task_struct *p,
4187 const struct sched_attr *attr, 3973 const struct sched_attr *attr,
4188 bool user, bool pi) 3974 bool user, bool pi)
@@ -4197,8 +3983,8 @@ static int __sched_setscheduler(struct task_struct *p,
4197 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 3983 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
4198 struct rq *rq; 3984 struct rq *rq;
4199 3985
4200 /* May grab non-irq protected spin_locks: */ 3986 /* The pi code expects interrupts enabled */
4201 BUG_ON(in_interrupt()); 3987 BUG_ON(pi && in_interrupt());
4202recheck: 3988recheck:
4203 /* Double check policy once rq lock held: */ 3989 /* Double check policy once rq lock held: */
4204 if (policy < 0) { 3990 if (policy < 0) {
@@ -4211,7 +3997,8 @@ recheck:
4211 return -EINVAL; 3997 return -EINVAL;
4212 } 3998 }
4213 3999
4214 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) 4000 if (attr->sched_flags &
4001 ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
4215 return -EINVAL; 4002 return -EINVAL;
4216 4003
4217 /* 4004 /*
@@ -4362,7 +4149,7 @@ change:
4362 * of a SCHED_DEADLINE task) we need to check if enough bandwidth 4149 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
4363 * is available. 4150 * is available.
4364 */ 4151 */
4365 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { 4152 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
4366 task_rq_unlock(rq, p, &rf); 4153 task_rq_unlock(rq, p, &rf);
4367 return -EBUSY; 4154 return -EBUSY;
4368 } 4155 }
@@ -5463,26 +5250,17 @@ void init_idle(struct task_struct *idle, int cpu)
5463#endif 5250#endif
5464} 5251}
5465 5252
5253#ifdef CONFIG_SMP
5254
5466int cpuset_cpumask_can_shrink(const struct cpumask *cur, 5255int cpuset_cpumask_can_shrink(const struct cpumask *cur,
5467 const struct cpumask *trial) 5256 const struct cpumask *trial)
5468{ 5257{
5469 int ret = 1, trial_cpus; 5258 int ret = 1;
5470 struct dl_bw *cur_dl_b;
5471 unsigned long flags;
5472 5259
5473 if (!cpumask_weight(cur)) 5260 if (!cpumask_weight(cur))
5474 return ret; 5261 return ret;
5475 5262
5476 rcu_read_lock_sched(); 5263 ret = dl_cpuset_cpumask_can_shrink(cur, trial);
5477 cur_dl_b = dl_bw_of(cpumask_any(cur));
5478 trial_cpus = cpumask_weight(trial);
5479
5480 raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
5481 if (cur_dl_b->bw != -1 &&
5482 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
5483 ret = 0;
5484 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
5485 rcu_read_unlock_sched();
5486 5264
5487 return ret; 5265 return ret;
5488} 5266}
@@ -5506,43 +5284,14 @@ int task_can_attach(struct task_struct *p,
5506 goto out; 5284 goto out;
5507 } 5285 }
5508 5286
5509#ifdef CONFIG_SMP
5510 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, 5287 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
5511 cs_cpus_allowed)) { 5288 cs_cpus_allowed))
5512 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, 5289 ret = dl_task_can_attach(p, cs_cpus_allowed);
5513 cs_cpus_allowed);
5514 struct dl_bw *dl_b;
5515 bool overflow;
5516 int cpus;
5517 unsigned long flags;
5518
5519 rcu_read_lock_sched();
5520 dl_b = dl_bw_of(dest_cpu);
5521 raw_spin_lock_irqsave(&dl_b->lock, flags);
5522 cpus = dl_bw_cpus(dest_cpu);
5523 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
5524 if (overflow)
5525 ret = -EBUSY;
5526 else {
5527 /*
5528 * We reserve space for this task in the destination
5529 * root_domain, as we can't fail after this point.
5530 * We will free resources in the source root_domain
5531 * later on (see set_cpus_allowed_dl()).
5532 */
5533 __dl_add(dl_b, p->dl.dl_bw);
5534 }
5535 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5536 rcu_read_unlock_sched();
5537 5290
5538 }
5539#endif
5540out: 5291out:
5541 return ret; 5292 return ret;
5542} 5293}
5543 5294
5544#ifdef CONFIG_SMP
5545
5546bool sched_smp_initialized __read_mostly; 5295bool sched_smp_initialized __read_mostly;
5547 5296
5548#ifdef CONFIG_NUMA_BALANCING 5297#ifdef CONFIG_NUMA_BALANCING
@@ -5605,7 +5354,7 @@ void idle_task_exit(void)
5605 BUG_ON(cpu_online(smp_processor_id())); 5354 BUG_ON(cpu_online(smp_processor_id()));
5606 5355
5607 if (mm != &init_mm) { 5356 if (mm != &init_mm) {
5608 switch_mm_irqs_off(mm, &init_mm, current); 5357 switch_mm(mm, &init_mm, current);
5609 finish_arch_post_lock_switch(); 5358 finish_arch_post_lock_switch();
5610 } 5359 }
5611 mmdrop(mm); 5360 mmdrop(mm);
@@ -5805,23 +5554,8 @@ static void cpuset_cpu_active(void)
5805 5554
5806static int cpuset_cpu_inactive(unsigned int cpu) 5555static int cpuset_cpu_inactive(unsigned int cpu)
5807{ 5556{
5808 unsigned long flags;
5809 struct dl_bw *dl_b;
5810 bool overflow;
5811 int cpus;
5812
5813 if (!cpuhp_tasks_frozen) { 5557 if (!cpuhp_tasks_frozen) {
5814 rcu_read_lock_sched(); 5558 if (dl_cpu_busy(cpu))
5815 dl_b = dl_bw_of(cpu);
5816
5817 raw_spin_lock_irqsave(&dl_b->lock, flags);
5818 cpus = dl_bw_cpus(cpu);
5819 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5820 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5821
5822 rcu_read_unlock_sched();
5823
5824 if (overflow)
5825 return -EBUSY; 5559 return -EBUSY;
5826 cpuset_update_active_cpus(); 5560 cpuset_update_active_cpus();
5827 } else { 5561 } else {
@@ -5874,15 +5608,9 @@ int sched_cpu_deactivate(unsigned int cpu)
5874 * users of this state to go away such that all new such users will 5608 * users of this state to go away such that all new such users will
5875 * observe it. 5609 * observe it.
5876 * 5610 *
5877 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
5878 * not imply sync_sched(), so wait for both.
5879 *
5880 * Do sync before park smpboot threads to take care the rcu boost case. 5611 * Do sync before park smpboot threads to take care the rcu boost case.
5881 */ 5612 */
5882 if (IS_ENABLED(CONFIG_PREEMPT)) 5613 synchronize_rcu_mult(call_rcu, call_rcu_sched);
5883 synchronize_rcu_mult(call_rcu, call_rcu_sched);
5884 else
5885 synchronize_rcu();
5886 5614
5887 if (!sched_smp_initialized) 5615 if (!sched_smp_initialized)
5888 return 0; 5616 return 0;
@@ -5958,7 +5686,6 @@ void __init sched_init_smp(void)
5958 cpumask_var_t non_isolated_cpus; 5686 cpumask_var_t non_isolated_cpus;
5959 5687
5960 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 5688 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
5961 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
5962 5689
5963 sched_init_numa(); 5690 sched_init_numa();
5964 5691
@@ -5968,7 +5695,7 @@ void __init sched_init_smp(void)
5968 * happen. 5695 * happen.
5969 */ 5696 */
5970 mutex_lock(&sched_domains_mutex); 5697 mutex_lock(&sched_domains_mutex);
5971 init_sched_domains(cpu_active_mask); 5698 sched_init_domains(cpu_active_mask);
5972 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 5699 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
5973 if (cpumask_empty(non_isolated_cpus)) 5700 if (cpumask_empty(non_isolated_cpus))
5974 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 5701 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -5984,7 +5711,6 @@ void __init sched_init_smp(void)
5984 init_sched_dl_class(); 5711 init_sched_dl_class();
5985 5712
5986 sched_init_smt(); 5713 sched_init_smt();
5987 sched_clock_init_late();
5988 5714
5989 sched_smp_initialized = true; 5715 sched_smp_initialized = true;
5990} 5716}
@@ -6000,7 +5726,6 @@ early_initcall(migration_init);
6000void __init sched_init_smp(void) 5726void __init sched_init_smp(void)
6001{ 5727{
6002 sched_init_granularity(); 5728 sched_init_granularity();
6003 sched_clock_init_late();
6004} 5729}
6005#endif /* CONFIG_SMP */ 5730#endif /* CONFIG_SMP */
6006 5731
@@ -6026,28 +5751,13 @@ static struct kmem_cache *task_group_cache __read_mostly;
6026DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 5751DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6027DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); 5752DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
6028 5753
6029#define WAIT_TABLE_BITS 8
6030#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
6031static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
6032
6033wait_queue_head_t *bit_waitqueue(void *word, int bit)
6034{
6035 const int shift = BITS_PER_LONG == 32 ? 5 : 6;
6036 unsigned long val = (unsigned long)word << shift | bit;
6037
6038 return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
6039}
6040EXPORT_SYMBOL(bit_waitqueue);
6041
6042void __init sched_init(void) 5754void __init sched_init(void)
6043{ 5755{
6044 int i, j; 5756 int i, j;
6045 unsigned long alloc_size = 0, ptr; 5757 unsigned long alloc_size = 0, ptr;
6046 5758
6047 sched_clock_init(); 5759 sched_clock_init();
6048 5760 wait_bit_init();
6049 for (i = 0; i < WAIT_TABLE_SIZE; i++)
6050 init_waitqueue_head(bit_wait_table + i);
6051 5761
6052#ifdef CONFIG_FAIR_GROUP_SCHED 5762#ifdef CONFIG_FAIR_GROUP_SCHED
6053 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 5763 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
@@ -6199,7 +5909,6 @@ void __init sched_init(void)
6199 calc_load_update = jiffies + LOAD_FREQ; 5909 calc_load_update = jiffies + LOAD_FREQ;
6200 5910
6201#ifdef CONFIG_SMP 5911#ifdef CONFIG_SMP
6202 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6203 /* May be allocated at isolcpus cmdline parse time */ 5912 /* May be allocated at isolcpus cmdline parse time */
6204 if (cpu_isolated_map == NULL) 5913 if (cpu_isolated_map == NULL)
6205 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 5914 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
@@ -6251,8 +5960,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
6251 5960
6252 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 5961 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6253 !is_idle_task(current)) || 5962 !is_idle_task(current)) ||
6254 system_state != SYSTEM_RUNNING || oops_in_progress) 5963 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
5964 oops_in_progress)
6255 return; 5965 return;
5966
6256 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 5967 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6257 return; 5968 return;
6258 prev_jiffy = jiffies; 5969 prev_jiffy = jiffies;
@@ -6507,385 +6218,6 @@ void sched_move_task(struct task_struct *tsk)
6507 6218
6508 task_rq_unlock(rq, tsk, &rf); 6219 task_rq_unlock(rq, tsk, &rf);
6509} 6220}
6510#endif /* CONFIG_CGROUP_SCHED */
6511
6512#ifdef CONFIG_RT_GROUP_SCHED
6513/*
6514 * Ensure that the real time constraints are schedulable.
6515 */
6516static DEFINE_MUTEX(rt_constraints_mutex);
6517
6518/* Must be called with tasklist_lock held */
6519static inline int tg_has_rt_tasks(struct task_group *tg)
6520{
6521 struct task_struct *g, *p;
6522
6523 /*
6524 * Autogroups do not have RT tasks; see autogroup_create().
6525 */
6526 if (task_group_is_autogroup(tg))
6527 return 0;
6528
6529 for_each_process_thread(g, p) {
6530 if (rt_task(p) && task_group(p) == tg)
6531 return 1;
6532 }
6533
6534 return 0;
6535}
6536
6537struct rt_schedulable_data {
6538 struct task_group *tg;
6539 u64 rt_period;
6540 u64 rt_runtime;
6541};
6542
6543static int tg_rt_schedulable(struct task_group *tg, void *data)
6544{
6545 struct rt_schedulable_data *d = data;
6546 struct task_group *child;
6547 unsigned long total, sum = 0;
6548 u64 period, runtime;
6549
6550 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
6551 runtime = tg->rt_bandwidth.rt_runtime;
6552
6553 if (tg == d->tg) {
6554 period = d->rt_period;
6555 runtime = d->rt_runtime;
6556 }
6557
6558 /*
6559 * Cannot have more runtime than the period.
6560 */
6561 if (runtime > period && runtime != RUNTIME_INF)
6562 return -EINVAL;
6563
6564 /*
6565 * Ensure we don't starve existing RT tasks.
6566 */
6567 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
6568 return -EBUSY;
6569
6570 total = to_ratio(period, runtime);
6571
6572 /*
6573 * Nobody can have more than the global setting allows.
6574 */
6575 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
6576 return -EINVAL;
6577
6578 /*
6579 * The sum of our children's runtime should not exceed our own.
6580 */
6581 list_for_each_entry_rcu(child, &tg->children, siblings) {
6582 period = ktime_to_ns(child->rt_bandwidth.rt_period);
6583 runtime = child->rt_bandwidth.rt_runtime;
6584
6585 if (child == d->tg) {
6586 period = d->rt_period;
6587 runtime = d->rt_runtime;
6588 }
6589
6590 sum += to_ratio(period, runtime);
6591 }
6592
6593 if (sum > total)
6594 return -EINVAL;
6595
6596 return 0;
6597}
6598
6599static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
6600{
6601 int ret;
6602
6603 struct rt_schedulable_data data = {
6604 .tg = tg,
6605 .rt_period = period,
6606 .rt_runtime = runtime,
6607 };
6608
6609 rcu_read_lock();
6610 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
6611 rcu_read_unlock();
6612
6613 return ret;
6614}
6615
6616static int tg_set_rt_bandwidth(struct task_group *tg,
6617 u64 rt_period, u64 rt_runtime)
6618{
6619 int i, err = 0;
6620
6621 /*
6622 * Disallowing the root group RT runtime is BAD, it would disallow the
6623 * kernel creating (and or operating) RT threads.
6624 */
6625 if (tg == &root_task_group && rt_runtime == 0)
6626 return -EINVAL;
6627
6628 /* No period doesn't make any sense. */
6629 if (rt_period == 0)
6630 return -EINVAL;
6631
6632 mutex_lock(&rt_constraints_mutex);
6633 read_lock(&tasklist_lock);
6634 err = __rt_schedulable(tg, rt_period, rt_runtime);
6635 if (err)
6636 goto unlock;
6637
6638 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
6639 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
6640 tg->rt_bandwidth.rt_runtime = rt_runtime;
6641
6642 for_each_possible_cpu(i) {
6643 struct rt_rq *rt_rq = tg->rt_rq[i];
6644
6645 raw_spin_lock(&rt_rq->rt_runtime_lock);
6646 rt_rq->rt_runtime = rt_runtime;
6647 raw_spin_unlock(&rt_rq->rt_runtime_lock);
6648 }
6649 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
6650unlock:
6651 read_unlock(&tasklist_lock);
6652 mutex_unlock(&rt_constraints_mutex);
6653
6654 return err;
6655}
6656
6657static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
6658{
6659 u64 rt_runtime, rt_period;
6660
6661 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
6662 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
6663 if (rt_runtime_us < 0)
6664 rt_runtime = RUNTIME_INF;
6665
6666 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
6667}
6668
6669static long sched_group_rt_runtime(struct task_group *tg)
6670{
6671 u64 rt_runtime_us;
6672
6673 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
6674 return -1;
6675
6676 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
6677 do_div(rt_runtime_us, NSEC_PER_USEC);
6678 return rt_runtime_us;
6679}
6680
6681static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
6682{
6683 u64 rt_runtime, rt_period;
6684
6685 rt_period = rt_period_us * NSEC_PER_USEC;
6686 rt_runtime = tg->rt_bandwidth.rt_runtime;
6687
6688 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
6689}
6690
6691static long sched_group_rt_period(struct task_group *tg)
6692{
6693 u64 rt_period_us;
6694
6695 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
6696 do_div(rt_period_us, NSEC_PER_USEC);
6697 return rt_period_us;
6698}
6699#endif /* CONFIG_RT_GROUP_SCHED */
6700
6701#ifdef CONFIG_RT_GROUP_SCHED
6702static int sched_rt_global_constraints(void)
6703{
6704 int ret = 0;
6705
6706 mutex_lock(&rt_constraints_mutex);
6707 read_lock(&tasklist_lock);
6708 ret = __rt_schedulable(NULL, 0, 0);
6709 read_unlock(&tasklist_lock);
6710 mutex_unlock(&rt_constraints_mutex);
6711
6712 return ret;
6713}
6714
6715static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
6716{
6717 /* Don't accept realtime tasks when there is no way for them to run */
6718 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
6719 return 0;
6720
6721 return 1;
6722}
6723
6724#else /* !CONFIG_RT_GROUP_SCHED */
6725static int sched_rt_global_constraints(void)
6726{
6727 unsigned long flags;
6728 int i;
6729
6730 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
6731 for_each_possible_cpu(i) {
6732 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
6733
6734 raw_spin_lock(&rt_rq->rt_runtime_lock);
6735 rt_rq->rt_runtime = global_rt_runtime();
6736 raw_spin_unlock(&rt_rq->rt_runtime_lock);
6737 }
6738 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
6739
6740 return 0;
6741}
6742#endif /* CONFIG_RT_GROUP_SCHED */
6743
6744static int sched_dl_global_validate(void)
6745{
6746 u64 runtime = global_rt_runtime();
6747 u64 period = global_rt_period();
6748 u64 new_bw = to_ratio(period, runtime);
6749 struct dl_bw *dl_b;
6750 int cpu, ret = 0;
6751 unsigned long flags;
6752
6753 /*
6754 * Here we want to check the bandwidth not being set to some
6755 * value smaller than the currently allocated bandwidth in
6756 * any of the root_domains.
6757 *
6758 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
6759 * cycling on root_domains... Discussion on different/better
6760 * solutions is welcome!
6761 */
6762 for_each_possible_cpu(cpu) {
6763 rcu_read_lock_sched();
6764 dl_b = dl_bw_of(cpu);
6765
6766 raw_spin_lock_irqsave(&dl_b->lock, flags);
6767 if (new_bw < dl_b->total_bw)
6768 ret = -EBUSY;
6769 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
6770
6771 rcu_read_unlock_sched();
6772
6773 if (ret)
6774 break;
6775 }
6776
6777 return ret;
6778}
6779
6780static void sched_dl_do_global(void)
6781{
6782 u64 new_bw = -1;
6783 struct dl_bw *dl_b;
6784 int cpu;
6785 unsigned long flags;
6786
6787 def_dl_bandwidth.dl_period = global_rt_period();
6788 def_dl_bandwidth.dl_runtime = global_rt_runtime();
6789
6790 if (global_rt_runtime() != RUNTIME_INF)
6791 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
6792
6793 /*
6794 * FIXME: As above...
6795 */
6796 for_each_possible_cpu(cpu) {
6797 rcu_read_lock_sched();
6798 dl_b = dl_bw_of(cpu);
6799
6800 raw_spin_lock_irqsave(&dl_b->lock, flags);
6801 dl_b->bw = new_bw;
6802 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
6803
6804 rcu_read_unlock_sched();
6805 }
6806}
6807
6808static int sched_rt_global_validate(void)
6809{
6810 if (sysctl_sched_rt_period <= 0)
6811 return -EINVAL;
6812
6813 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
6814 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
6815 return -EINVAL;
6816
6817 return 0;
6818}
6819
6820static void sched_rt_do_global(void)
6821{
6822 def_rt_bandwidth.rt_runtime = global_rt_runtime();
6823 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
6824}
6825
6826int sched_rt_handler(struct ctl_table *table, int write,
6827 void __user *buffer, size_t *lenp,
6828 loff_t *ppos)
6829{
6830 int old_period, old_runtime;
6831 static DEFINE_MUTEX(mutex);
6832 int ret;
6833
6834 mutex_lock(&mutex);
6835 old_period = sysctl_sched_rt_period;
6836 old_runtime = sysctl_sched_rt_runtime;
6837
6838 ret = proc_dointvec(table, write, buffer, lenp, ppos);
6839
6840 if (!ret && write) {
6841 ret = sched_rt_global_validate();
6842 if (ret)
6843 goto undo;
6844
6845 ret = sched_dl_global_validate();
6846 if (ret)
6847 goto undo;
6848
6849 ret = sched_rt_global_constraints();
6850 if (ret)
6851 goto undo;
6852
6853 sched_rt_do_global();
6854 sched_dl_do_global();
6855 }
6856 if (0) {
6857undo:
6858 sysctl_sched_rt_period = old_period;
6859 sysctl_sched_rt_runtime = old_runtime;
6860 }
6861 mutex_unlock(&mutex);
6862
6863 return ret;
6864}
6865
6866int sched_rr_handler(struct ctl_table *table, int write,
6867 void __user *buffer, size_t *lenp,
6868 loff_t *ppos)
6869{
6870 int ret;
6871 static DEFINE_MUTEX(mutex);
6872
6873 mutex_lock(&mutex);
6874 ret = proc_dointvec(table, write, buffer, lenp, ppos);
6875 /*
6876 * Make sure that internally we keep jiffies.
6877 * Also, writing zero resets the timeslice to default:
6878 */
6879 if (!ret && write) {
6880 sched_rr_timeslice =
6881 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
6882 msecs_to_jiffies(sysctl_sched_rr_timeslice);
6883 }
6884 mutex_unlock(&mutex);
6885 return ret;
6886}
6887
6888#ifdef CONFIG_CGROUP_SCHED
6889 6221
6890static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 6222static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
6891{ 6223{
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 76877a62b5fa..29a397067ffa 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -101,9 +101,6 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
101 if (sg_policy->next_freq == next_freq) 101 if (sg_policy->next_freq == next_freq)
102 return; 102 return;
103 103
104 if (sg_policy->next_freq > next_freq)
105 next_freq = (sg_policy->next_freq + next_freq) >> 1;
106
107 sg_policy->next_freq = next_freq; 104 sg_policy->next_freq = next_freq;
108 sg_policy->last_freq_update_time = time; 105 sg_policy->last_freq_update_time = time;
109 106
@@ -245,11 +242,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
245 sugov_update_commit(sg_policy, time, next_f); 242 sugov_update_commit(sg_policy, time, next_f);
246} 243}
247 244
248static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) 245static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
249{ 246{
250 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 247 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
251 struct cpufreq_policy *policy = sg_policy->policy; 248 struct cpufreq_policy *policy = sg_policy->policy;
252 u64 last_freq_update_time = sg_policy->last_freq_update_time;
253 unsigned long util = 0, max = 1; 249 unsigned long util = 0, max = 1;
254 unsigned int j; 250 unsigned int j;
255 251
@@ -265,7 +261,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu)
265 * enough, don't take the CPU into account as it probably is 261 * enough, don't take the CPU into account as it probably is
266 * idle now (and clear iowait_boost for it). 262 * idle now (and clear iowait_boost for it).
267 */ 263 */
268 delta_ns = last_freq_update_time - j_sg_cpu->last_update; 264 delta_ns = time - j_sg_cpu->last_update;
269 if (delta_ns > TICK_NSEC) { 265 if (delta_ns > TICK_NSEC) {
270 j_sg_cpu->iowait_boost = 0; 266 j_sg_cpu->iowait_boost = 0;
271 continue; 267 continue;
@@ -309,7 +305,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
309 if (flags & SCHED_CPUFREQ_RT_DL) 305 if (flags & SCHED_CPUFREQ_RT_DL)
310 next_f = sg_policy->policy->cpuinfo.max_freq; 306 next_f = sg_policy->policy->cpuinfo.max_freq;
311 else 307 else
312 next_f = sugov_next_freq_shared(sg_cpu); 308 next_f = sugov_next_freq_shared(sg_cpu, time);
313 309
314 sugov_update_commit(sg_policy, time, next_f); 310 sugov_update_commit(sg_policy, time, next_f);
315 } 311 }
@@ -614,6 +610,11 @@ static int sugov_start(struct cpufreq_policy *policy)
614 sg_cpu->sg_policy = sg_policy; 610 sg_cpu->sg_policy = sg_policy;
615 sg_cpu->flags = SCHED_CPUFREQ_RT; 611 sg_cpu->flags = SCHED_CPUFREQ_RT;
616 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; 612 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
613 }
614
615 for_each_cpu(cpu, policy->cpus) {
616 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
617
617 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, 618 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
618 policy_is_shared(policy) ? 619 policy_is_shared(policy) ?
619 sugov_update_shared : 620 sugov_update_shared :
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index aea3135c5d90..14d2dbf97c53 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -611,9 +611,9 @@ static void cputime_adjust(struct task_cputime *curr,
611 utime = curr->utime; 611 utime = curr->utime;
612 612
613 /* 613 /*
614 * If either stime or both stime and utime are 0, assume all runtime is 614 * If either stime or utime are 0, assume all runtime is userspace.
615 * userspace. Once a task gets some ticks, the monotonicy code at 615 * Once a task gets some ticks, the monotonicy code at 'update:'
616 * 'update' will ensure things converge to the observed ratio. 616 * will ensure things converge to the observed ratio.
617 */ 617 */
618 if (stime == 0) { 618 if (stime == 0) {
619 utime = rtime; 619 utime = rtime;
@@ -679,20 +679,21 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
679#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 679#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
680 680
681#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 681#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
682static u64 vtime_delta(struct task_struct *tsk) 682static u64 vtime_delta(struct vtime *vtime)
683{ 683{
684 unsigned long now = READ_ONCE(jiffies); 684 unsigned long long clock;
685 685
686 if (time_before(now, (unsigned long)tsk->vtime_snap)) 686 clock = sched_clock();
687 if (clock < vtime->starttime)
687 return 0; 688 return 0;
688 689
689 return jiffies_to_nsecs(now - tsk->vtime_snap); 690 return clock - vtime->starttime;
690} 691}
691 692
692static u64 get_vtime_delta(struct task_struct *tsk) 693static u64 get_vtime_delta(struct vtime *vtime)
693{ 694{
694 unsigned long now = READ_ONCE(jiffies); 695 u64 delta = vtime_delta(vtime);
695 u64 delta, other; 696 u64 other;
696 697
697 /* 698 /*
698 * Unlike tick based timing, vtime based timing never has lost 699 * Unlike tick based timing, vtime based timing never has lost
@@ -701,104 +702,138 @@ static u64 get_vtime_delta(struct task_struct *tsk)
701 * elapsed time. Limit account_other_time to prevent rounding 702 * elapsed time. Limit account_other_time to prevent rounding
702 * errors from causing elapsed vtime to go negative. 703 * errors from causing elapsed vtime to go negative.
703 */ 704 */
704 delta = jiffies_to_nsecs(now - tsk->vtime_snap);
705 other = account_other_time(delta); 705 other = account_other_time(delta);
706 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); 706 WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
707 tsk->vtime_snap = now; 707 vtime->starttime += delta;
708 708
709 return delta - other; 709 return delta - other;
710} 710}
711 711
712static void __vtime_account_system(struct task_struct *tsk) 712static void __vtime_account_system(struct task_struct *tsk,
713 struct vtime *vtime)
713{ 714{
714 account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); 715 vtime->stime += get_vtime_delta(vtime);
716 if (vtime->stime >= TICK_NSEC) {
717 account_system_time(tsk, irq_count(), vtime->stime);
718 vtime->stime = 0;
719 }
720}
721
722static void vtime_account_guest(struct task_struct *tsk,
723 struct vtime *vtime)
724{
725 vtime->gtime += get_vtime_delta(vtime);
726 if (vtime->gtime >= TICK_NSEC) {
727 account_guest_time(tsk, vtime->gtime);
728 vtime->gtime = 0;
729 }
715} 730}
716 731
717void vtime_account_system(struct task_struct *tsk) 732void vtime_account_system(struct task_struct *tsk)
718{ 733{
719 if (!vtime_delta(tsk)) 734 struct vtime *vtime = &tsk->vtime;
735
736 if (!vtime_delta(vtime))
720 return; 737 return;
721 738
722 write_seqcount_begin(&tsk->vtime_seqcount); 739 write_seqcount_begin(&vtime->seqcount);
723 __vtime_account_system(tsk); 740 /* We might have scheduled out from guest path */
724 write_seqcount_end(&tsk->vtime_seqcount); 741 if (current->flags & PF_VCPU)
742 vtime_account_guest(tsk, vtime);
743 else
744 __vtime_account_system(tsk, vtime);
745 write_seqcount_end(&vtime->seqcount);
725} 746}
726 747
727void vtime_account_user(struct task_struct *tsk) 748void vtime_user_enter(struct task_struct *tsk)
728{ 749{
729 write_seqcount_begin(&tsk->vtime_seqcount); 750 struct vtime *vtime = &tsk->vtime;
730 tsk->vtime_snap_whence = VTIME_SYS; 751
731 if (vtime_delta(tsk)) 752 write_seqcount_begin(&vtime->seqcount);
732 account_user_time(tsk, get_vtime_delta(tsk)); 753 __vtime_account_system(tsk, vtime);
733 write_seqcount_end(&tsk->vtime_seqcount); 754 vtime->state = VTIME_USER;
755 write_seqcount_end(&vtime->seqcount);
734} 756}
735 757
736void vtime_user_enter(struct task_struct *tsk) 758void vtime_user_exit(struct task_struct *tsk)
737{ 759{
738 write_seqcount_begin(&tsk->vtime_seqcount); 760 struct vtime *vtime = &tsk->vtime;
739 if (vtime_delta(tsk)) 761
740 __vtime_account_system(tsk); 762 write_seqcount_begin(&vtime->seqcount);
741 tsk->vtime_snap_whence = VTIME_USER; 763 vtime->utime += get_vtime_delta(vtime);
742 write_seqcount_end(&tsk->vtime_seqcount); 764 if (vtime->utime >= TICK_NSEC) {
765 account_user_time(tsk, vtime->utime);
766 vtime->utime = 0;
767 }
768 vtime->state = VTIME_SYS;
769 write_seqcount_end(&vtime->seqcount);
743} 770}
744 771
745void vtime_guest_enter(struct task_struct *tsk) 772void vtime_guest_enter(struct task_struct *tsk)
746{ 773{
774 struct vtime *vtime = &tsk->vtime;
747 /* 775 /*
748 * The flags must be updated under the lock with 776 * The flags must be updated under the lock with
749 * the vtime_snap flush and update. 777 * the vtime_starttime flush and update.
750 * That enforces a right ordering and update sequence 778 * That enforces a right ordering and update sequence
751 * synchronization against the reader (task_gtime()) 779 * synchronization against the reader (task_gtime())
752 * that can thus safely catch up with a tickless delta. 780 * that can thus safely catch up with a tickless delta.
753 */ 781 */
754 write_seqcount_begin(&tsk->vtime_seqcount); 782 write_seqcount_begin(&vtime->seqcount);
755 if (vtime_delta(tsk)) 783 __vtime_account_system(tsk, vtime);
756 __vtime_account_system(tsk);
757 current->flags |= PF_VCPU; 784 current->flags |= PF_VCPU;
758 write_seqcount_end(&tsk->vtime_seqcount); 785 write_seqcount_end(&vtime->seqcount);
759} 786}
760EXPORT_SYMBOL_GPL(vtime_guest_enter); 787EXPORT_SYMBOL_GPL(vtime_guest_enter);
761 788
762void vtime_guest_exit(struct task_struct *tsk) 789void vtime_guest_exit(struct task_struct *tsk)
763{ 790{
764 write_seqcount_begin(&tsk->vtime_seqcount); 791 struct vtime *vtime = &tsk->vtime;
765 __vtime_account_system(tsk); 792
793 write_seqcount_begin(&vtime->seqcount);
794 vtime_account_guest(tsk, vtime);
766 current->flags &= ~PF_VCPU; 795 current->flags &= ~PF_VCPU;
767 write_seqcount_end(&tsk->vtime_seqcount); 796 write_seqcount_end(&vtime->seqcount);
768} 797}
769EXPORT_SYMBOL_GPL(vtime_guest_exit); 798EXPORT_SYMBOL_GPL(vtime_guest_exit);
770 799
771void vtime_account_idle(struct task_struct *tsk) 800void vtime_account_idle(struct task_struct *tsk)
772{ 801{
773 account_idle_time(get_vtime_delta(tsk)); 802 account_idle_time(get_vtime_delta(&tsk->vtime));
774} 803}
775 804
776void arch_vtime_task_switch(struct task_struct *prev) 805void arch_vtime_task_switch(struct task_struct *prev)
777{ 806{
778 write_seqcount_begin(&prev->vtime_seqcount); 807 struct vtime *vtime = &prev->vtime;
779 prev->vtime_snap_whence = VTIME_INACTIVE;
780 write_seqcount_end(&prev->vtime_seqcount);
781 808
782 write_seqcount_begin(&current->vtime_seqcount); 809 write_seqcount_begin(&vtime->seqcount);
783 current->vtime_snap_whence = VTIME_SYS; 810 vtime->state = VTIME_INACTIVE;
784 current->vtime_snap = jiffies; 811 write_seqcount_end(&vtime->seqcount);
785 write_seqcount_end(&current->vtime_seqcount); 812
813 vtime = &current->vtime;
814
815 write_seqcount_begin(&vtime->seqcount);
816 vtime->state = VTIME_SYS;
817 vtime->starttime = sched_clock();
818 write_seqcount_end(&vtime->seqcount);
786} 819}
787 820
788void vtime_init_idle(struct task_struct *t, int cpu) 821void vtime_init_idle(struct task_struct *t, int cpu)
789{ 822{
823 struct vtime *vtime = &t->vtime;
790 unsigned long flags; 824 unsigned long flags;
791 825
792 local_irq_save(flags); 826 local_irq_save(flags);
793 write_seqcount_begin(&t->vtime_seqcount); 827 write_seqcount_begin(&vtime->seqcount);
794 t->vtime_snap_whence = VTIME_SYS; 828 vtime->state = VTIME_SYS;
795 t->vtime_snap = jiffies; 829 vtime->starttime = sched_clock();
796 write_seqcount_end(&t->vtime_seqcount); 830 write_seqcount_end(&vtime->seqcount);
797 local_irq_restore(flags); 831 local_irq_restore(flags);
798} 832}
799 833
800u64 task_gtime(struct task_struct *t) 834u64 task_gtime(struct task_struct *t)
801{ 835{
836 struct vtime *vtime = &t->vtime;
802 unsigned int seq; 837 unsigned int seq;
803 u64 gtime; 838 u64 gtime;
804 839
@@ -806,13 +841,13 @@ u64 task_gtime(struct task_struct *t)
806 return t->gtime; 841 return t->gtime;
807 842
808 do { 843 do {
809 seq = read_seqcount_begin(&t->vtime_seqcount); 844 seq = read_seqcount_begin(&vtime->seqcount);
810 845
811 gtime = t->gtime; 846 gtime = t->gtime;
812 if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) 847 if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
813 gtime += vtime_delta(t); 848 gtime += vtime->gtime + vtime_delta(vtime);
814 849
815 } while (read_seqcount_retry(&t->vtime_seqcount, seq)); 850 } while (read_seqcount_retry(&vtime->seqcount, seq));
816 851
817 return gtime; 852 return gtime;
818} 853}
@@ -824,8 +859,9 @@ u64 task_gtime(struct task_struct *t)
824 */ 859 */
825void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) 860void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
826{ 861{
827 u64 delta; 862 struct vtime *vtime = &t->vtime;
828 unsigned int seq; 863 unsigned int seq;
864 u64 delta;
829 865
830 if (!vtime_accounting_enabled()) { 866 if (!vtime_accounting_enabled()) {
831 *utime = t->utime; 867 *utime = t->utime;
@@ -834,25 +870,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
834 } 870 }
835 871
836 do { 872 do {
837 seq = read_seqcount_begin(&t->vtime_seqcount); 873 seq = read_seqcount_begin(&vtime->seqcount);
838 874
839 *utime = t->utime; 875 *utime = t->utime;
840 *stime = t->stime; 876 *stime = t->stime;
841 877
842 /* Task is sleeping, nothing to add */ 878 /* Task is sleeping, nothing to add */
843 if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) 879 if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
844 continue; 880 continue;
845 881
846 delta = vtime_delta(t); 882 delta = vtime_delta(vtime);
847 883
848 /* 884 /*
849 * Task runs either in user or kernel space, add pending nohz time to 885 * Task runs either in user or kernel space, add pending nohz time to
850 * the right place. 886 * the right place.
851 */ 887 */
852 if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) 888 if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
853 *utime += delta; 889 *utime += vtime->utime + delta;
854 else if (t->vtime_snap_whence == VTIME_SYS) 890 else if (vtime->state == VTIME_SYS)
855 *stime += delta; 891 *stime += vtime->stime + delta;
856 } while (read_seqcount_retry(&t->vtime_seqcount, seq)); 892 } while (read_seqcount_retry(&vtime->seqcount, seq));
857} 893}
858#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ 894#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a2ce59015642..755bd3f1a1a9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -17,6 +17,7 @@
17#include "sched.h" 17#include "sched.h"
18 18
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <uapi/linux/sched/types.h>
20 21
21struct dl_bandwidth def_dl_bandwidth; 22struct dl_bandwidth def_dl_bandwidth;
22 23
@@ -43,6 +44,254 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
43 return !RB_EMPTY_NODE(&dl_se->rb_node); 44 return !RB_EMPTY_NODE(&dl_se->rb_node);
44} 45}
45 46
47#ifdef CONFIG_SMP
48static inline struct dl_bw *dl_bw_of(int i)
49{
50 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
51 "sched RCU must be held");
52 return &cpu_rq(i)->rd->dl_bw;
53}
54
55static inline int dl_bw_cpus(int i)
56{
57 struct root_domain *rd = cpu_rq(i)->rd;
58 int cpus = 0;
59
60 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
61 "sched RCU must be held");
62 for_each_cpu_and(i, rd->span, cpu_active_mask)
63 cpus++;
64
65 return cpus;
66}
67#else
68static inline struct dl_bw *dl_bw_of(int i)
69{
70 return &cpu_rq(i)->dl.dl_bw;
71}
72
73static inline int dl_bw_cpus(int i)
74{
75 return 1;
76}
77#endif
78
79static inline
80void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
81{
82 u64 old = dl_rq->running_bw;
83
84 lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
85 dl_rq->running_bw += dl_bw;
86 SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
87 SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
88}
89
90static inline
91void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
92{
93 u64 old = dl_rq->running_bw;
94
95 lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
96 dl_rq->running_bw -= dl_bw;
97 SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
98 if (dl_rq->running_bw > old)
99 dl_rq->running_bw = 0;
100}
101
102static inline
103void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
104{
105 u64 old = dl_rq->this_bw;
106
107 lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
108 dl_rq->this_bw += dl_bw;
109 SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
110}
111
112static inline
113void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
114{
115 u64 old = dl_rq->this_bw;
116
117 lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
118 dl_rq->this_bw -= dl_bw;
119 SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
120 if (dl_rq->this_bw > old)
121 dl_rq->this_bw = 0;
122 SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
123}
124
125void dl_change_utilization(struct task_struct *p, u64 new_bw)
126{
127 struct rq *rq;
128
129 if (task_on_rq_queued(p))
130 return;
131
132 rq = task_rq(p);
133 if (p->dl.dl_non_contending) {
134 sub_running_bw(p->dl.dl_bw, &rq->dl);
135 p->dl.dl_non_contending = 0;
136 /*
137 * If the timer handler is currently running and the
138 * timer cannot be cancelled, inactive_task_timer()
139 * will see that dl_not_contending is not set, and
140 * will not touch the rq's active utilization,
141 * so we are still safe.
142 */
143 if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
144 put_task_struct(p);
145 }
146 sub_rq_bw(p->dl.dl_bw, &rq->dl);
147 add_rq_bw(new_bw, &rq->dl);
148}
149
150/*
151 * The utilization of a task cannot be immediately removed from
152 * the rq active utilization (running_bw) when the task blocks.
153 * Instead, we have to wait for the so called "0-lag time".
154 *
155 * If a task blocks before the "0-lag time", a timer (the inactive
156 * timer) is armed, and running_bw is decreased when the timer
157 * fires.
158 *
159 * If the task wakes up again before the inactive timer fires,
160 * the timer is cancelled, whereas if the task wakes up after the
161 * inactive timer fired (and running_bw has been decreased) the
162 * task's utilization has to be added to running_bw again.
163 * A flag in the deadline scheduling entity (dl_non_contending)
164 * is used to avoid race conditions between the inactive timer handler
165 * and task wakeups.
166 *
167 * The following diagram shows how running_bw is updated. A task is
168 * "ACTIVE" when its utilization contributes to running_bw; an
169 * "ACTIVE contending" task is in the TASK_RUNNING state, while an
170 * "ACTIVE non contending" task is a blocked task for which the "0-lag time"
171 * has not passed yet. An "INACTIVE" task is a task for which the "0-lag"
172 * time already passed, which does not contribute to running_bw anymore.
173 * +------------------+
174 * wakeup | ACTIVE |
175 * +------------------>+ contending |
176 * | add_running_bw | |
177 * | +----+------+------+
178 * | | ^
179 * | dequeue | |
180 * +--------+-------+ | |
181 * | | t >= 0-lag | | wakeup
182 * | INACTIVE |<---------------+ |
183 * | | sub_running_bw | |
184 * +--------+-------+ | |
185 * ^ | |
186 * | t < 0-lag | |
187 * | | |
188 * | V |
189 * | +----+------+------+
190 * | sub_running_bw | ACTIVE |
191 * +-------------------+ |
192 * inactive timer | non contending |
193 * fired +------------------+
194 *
195 * The task_non_contending() function is invoked when a task
196 * blocks, and checks if the 0-lag time already passed or
197 * not (in the first case, it directly updates running_bw;
198 * in the second case, it arms the inactive timer).
199 *
200 * The task_contending() function is invoked when a task wakes
201 * up, and checks if the task is still in the "ACTIVE non contending"
202 * state or not (in the second case, it updates running_bw).
203 */
204static void task_non_contending(struct task_struct *p)
205{
206 struct sched_dl_entity *dl_se = &p->dl;
207 struct hrtimer *timer = &dl_se->inactive_timer;
208 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
209 struct rq *rq = rq_of_dl_rq(dl_rq);
210 s64 zerolag_time;
211
212 /*
213 * If this is a non-deadline task that has been boosted,
214 * do nothing
215 */
216 if (dl_se->dl_runtime == 0)
217 return;
218
219 WARN_ON(hrtimer_active(&dl_se->inactive_timer));
220 WARN_ON(dl_se->dl_non_contending);
221
222 zerolag_time = dl_se->deadline -
223 div64_long((dl_se->runtime * dl_se->dl_period),
224 dl_se->dl_runtime);
225
226 /*
227 * Using relative times instead of the absolute "0-lag time"
228 * allows to simplify the code
229 */
230 zerolag_time -= rq_clock(rq);
231
232 /*
233 * If the "0-lag time" already passed, decrease the active
234 * utilization now, instead of starting a timer
235 */
236 if (zerolag_time < 0) {
237 if (dl_task(p))
238 sub_running_bw(dl_se->dl_bw, dl_rq);
239 if (!dl_task(p) || p->state == TASK_DEAD) {
240 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
241
242 if (p->state == TASK_DEAD)
243 sub_rq_bw(p->dl.dl_bw, &rq->dl);
244 raw_spin_lock(&dl_b->lock);
245 __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
246 __dl_clear_params(p);
247 raw_spin_unlock(&dl_b->lock);
248 }
249
250 return;
251 }
252
253 dl_se->dl_non_contending = 1;
254 get_task_struct(p);
255 hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
256}
257
258static void task_contending(struct sched_dl_entity *dl_se, int flags)
259{
260 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
261
262 /*
263 * If this is a non-deadline task that has been boosted,
264 * do nothing
265 */
266 if (dl_se->dl_runtime == 0)
267 return;
268
269 if (flags & ENQUEUE_MIGRATED)
270 add_rq_bw(dl_se->dl_bw, dl_rq);
271
272 if (dl_se->dl_non_contending) {
273 dl_se->dl_non_contending = 0;
274 /*
275 * If the timer handler is currently running and the
276 * timer cannot be cancelled, inactive_task_timer()
277 * will see that dl_not_contending is not set, and
278 * will not touch the rq's active utilization,
279 * so we are still safe.
280 */
281 if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
282 put_task_struct(dl_task_of(dl_se));
283 } else {
284 /*
285 * Since "dl_non_contending" is not set, the
286 * task's utilization has already been removed from
287 * active utilization (either when the task blocked,
288 * when the "inactive timer" fired).
289 * So, add it back.
290 */
291 add_running_bw(dl_se->dl_bw, dl_rq);
292 }
293}
294
46static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) 295static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
47{ 296{
48 struct sched_dl_entity *dl_se = &p->dl; 297 struct sched_dl_entity *dl_se = &p->dl;
@@ -83,6 +332,10 @@ void init_dl_rq(struct dl_rq *dl_rq)
83#else 332#else
84 init_dl_bw(&dl_rq->dl_bw); 333 init_dl_bw(&dl_rq->dl_bw);
85#endif 334#endif
335
336 dl_rq->running_bw = 0;
337 dl_rq->this_bw = 0;
338 init_dl_rq_bw_ratio(dl_rq);
86} 339}
87 340
88#ifdef CONFIG_SMP 341#ifdef CONFIG_SMP
@@ -484,13 +737,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
484} 737}
485 738
486/* 739/*
487 * When a -deadline entity is queued back on the runqueue, its runtime and 740 * Revised wakeup rule [1]: For self-suspending tasks, rather then
488 * deadline might need updating. 741 * re-initializing task's runtime and deadline, the revised wakeup
742 * rule adjusts the task's runtime to avoid the task to overrun its
743 * density.
744 *
745 * Reasoning: a task may overrun the density if:
746 * runtime / (deadline - t) > dl_runtime / dl_deadline
747 *
748 * Therefore, runtime can be adjusted to:
749 * runtime = (dl_runtime / dl_deadline) * (deadline - t)
750 *
751 * In such way that runtime will be equal to the maximum density
752 * the task can use without breaking any rule.
753 *
754 * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant
755 * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24.
756 */
757static void
758update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq)
759{
760 u64 laxity = dl_se->deadline - rq_clock(rq);
761
762 /*
763 * If the task has deadline < period, and the deadline is in the past,
764 * it should already be throttled before this check.
765 *
766 * See update_dl_entity() comments for further details.
767 */
768 WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq)));
769
770 dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT;
771}
772
773/*
774 * Regarding the deadline, a task with implicit deadline has a relative
775 * deadline == relative period. A task with constrained deadline has a
776 * relative deadline <= relative period.
777 *
778 * We support constrained deadline tasks. However, there are some restrictions
779 * applied only for tasks which do not have an implicit deadline. See
780 * update_dl_entity() to know more about such restrictions.
781 *
782 * The dl_is_implicit() returns true if the task has an implicit deadline.
783 */
784static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
785{
786 return dl_se->dl_deadline == dl_se->dl_period;
787}
788
789/*
790 * When a deadline entity is placed in the runqueue, its runtime and deadline
791 * might need to be updated. This is done by a CBS wake up rule. There are two
792 * different rules: 1) the original CBS; and 2) the Revisited CBS.
793 *
794 * When the task is starting a new period, the Original CBS is used. In this
795 * case, the runtime is replenished and a new absolute deadline is set.
796 *
797 * When a task is queued before the begin of the next period, using the
798 * remaining runtime and deadline could make the entity to overflow, see
799 * dl_entity_overflow() to find more about runtime overflow. When such case
800 * is detected, the runtime and deadline need to be updated.
801 *
802 * If the task has an implicit deadline, i.e., deadline == period, the Original
803 * CBS is applied. the runtime is replenished and a new absolute deadline is
804 * set, as in the previous cases.
805 *
806 * However, the Original CBS does not work properly for tasks with
807 * deadline < period, which are said to have a constrained deadline. By
808 * applying the Original CBS, a constrained deadline task would be able to run
809 * runtime/deadline in a period. With deadline < period, the task would
810 * overrun the runtime/period allowed bandwidth, breaking the admission test.
489 * 811 *
490 * The policy here is that we update the deadline of the entity only if: 812 * In order to prevent this misbehave, the Revisited CBS is used for
491 * - the current deadline is in the past, 813 * constrained deadline tasks when a runtime overflow is detected. In the
492 * - using the remaining runtime with the current deadline would make 814 * Revisited CBS, rather than replenishing & setting a new absolute deadline,
493 * the entity exceed its bandwidth. 815 * the remaining runtime of the task is reduced to avoid runtime overflow.
816 * Please refer to the comments update_dl_revised_wakeup() function to find
817 * more about the Revised CBS rule.
494 */ 818 */
495static void update_dl_entity(struct sched_dl_entity *dl_se, 819static void update_dl_entity(struct sched_dl_entity *dl_se,
496 struct sched_dl_entity *pi_se) 820 struct sched_dl_entity *pi_se)
@@ -500,6 +824,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
500 824
501 if (dl_time_before(dl_se->deadline, rq_clock(rq)) || 825 if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
502 dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { 826 dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
827
828 if (unlikely(!dl_is_implicit(dl_se) &&
829 !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
830 !dl_se->dl_boosted)){
831 update_dl_revised_wakeup(dl_se, rq);
832 return;
833 }
834
503 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 835 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
504 dl_se->runtime = pi_se->dl_runtime; 836 dl_se->runtime = pi_se->dl_runtime;
505 } 837 }
@@ -593,10 +925,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
593 * The task might have changed its scheduling policy to something 925 * The task might have changed its scheduling policy to something
594 * different than SCHED_DEADLINE (through switched_from_dl()). 926 * different than SCHED_DEADLINE (through switched_from_dl()).
595 */ 927 */
596 if (!dl_task(p)) { 928 if (!dl_task(p))
597 __dl_clear_params(p);
598 goto unlock; 929 goto unlock;
599 }
600 930
601 /* 931 /*
602 * The task might have been boosted by someone else and might be in the 932 * The task might have been boosted by someone else and might be in the
@@ -723,6 +1053,8 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
723 if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) 1053 if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
724 return; 1054 return;
725 dl_se->dl_throttled = 1; 1055 dl_se->dl_throttled = 1;
1056 if (dl_se->runtime > 0)
1057 dl_se->runtime = 0;
726 } 1058 }
727} 1059}
728 1060
@@ -735,6 +1067,47 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
735extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); 1067extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
736 1068
737/* 1069/*
1070 * This function implements the GRUB accounting rule:
1071 * according to the GRUB reclaiming algorithm, the runtime is
1072 * not decreased as "dq = -dt", but as
1073 * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
1074 * where u is the utilization of the task, Umax is the maximum reclaimable
1075 * utilization, Uinact is the (per-runqueue) inactive utilization, computed
1076 * as the difference between the "total runqueue utilization" and the
1077 * runqueue active utilization, and Uextra is the (per runqueue) extra
1078 * reclaimable utilization.
1079 * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
1080 * multiplied by 2^BW_SHIFT, the result has to be shifted right by
1081 * BW_SHIFT.
1082 * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT,
1083 * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
1084 * Since delta is a 64 bit variable, to have an overflow its value
1085 * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
1086 * So, overflow is not an issue here.
1087 */
1088u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
1089{
1090 u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
1091 u64 u_act;
1092 u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
1093
1094 /*
1095 * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
1096 * we compare u_inact + rq->dl.extra_bw with
1097 * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
1098 * u_inact + rq->dl.extra_bw can be larger than
1099 * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
1100 * leading to wrong results)
1101 */
1102 if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
1103 u_act = u_act_min;
1104 else
1105 u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
1106
1107 return (delta * u_act) >> BW_SHIFT;
1108}
1109
1110/*
738 * Update the current task's runtime statistics (provided it is still 1111 * Update the current task's runtime statistics (provided it is still
739 * a -deadline task and has not been removed from the dl_rq). 1112 * a -deadline task and has not been removed from the dl_rq).
740 */ 1113 */
@@ -776,6 +1149,8 @@ static void update_curr_dl(struct rq *rq)
776 1149
777 sched_rt_avg_update(rq, delta_exec); 1150 sched_rt_avg_update(rq, delta_exec);
778 1151
1152 if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
1153 delta_exec = grub_reclaim(delta_exec, rq, &curr->dl);
779 dl_se->runtime -= delta_exec; 1154 dl_se->runtime -= delta_exec;
780 1155
781throttle: 1156throttle:
@@ -815,6 +1190,56 @@ throttle:
815 } 1190 }
816} 1191}
817 1192
1193static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
1194{
1195 struct sched_dl_entity *dl_se = container_of(timer,
1196 struct sched_dl_entity,
1197 inactive_timer);
1198 struct task_struct *p = dl_task_of(dl_se);
1199 struct rq_flags rf;
1200 struct rq *rq;
1201
1202 rq = task_rq_lock(p, &rf);
1203
1204 if (!dl_task(p) || p->state == TASK_DEAD) {
1205 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1206
1207 if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
1208 sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
1209 sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
1210 dl_se->dl_non_contending = 0;
1211 }
1212
1213 raw_spin_lock(&dl_b->lock);
1214 __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
1215 raw_spin_unlock(&dl_b->lock);
1216 __dl_clear_params(p);
1217
1218 goto unlock;
1219 }
1220 if (dl_se->dl_non_contending == 0)
1221 goto unlock;
1222
1223 sched_clock_tick();
1224 update_rq_clock(rq);
1225
1226 sub_running_bw(dl_se->dl_bw, &rq->dl);
1227 dl_se->dl_non_contending = 0;
1228unlock:
1229 task_rq_unlock(rq, p, &rf);
1230 put_task_struct(p);
1231
1232 return HRTIMER_NORESTART;
1233}
1234
1235void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
1236{
1237 struct hrtimer *timer = &dl_se->inactive_timer;
1238
1239 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1240 timer->function = inactive_task_timer;
1241}
1242
818#ifdef CONFIG_SMP 1243#ifdef CONFIG_SMP
819 1244
820static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) 1245static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
@@ -946,10 +1371,12 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
946 * parameters of the task might need updating. Otherwise, 1371 * parameters of the task might need updating. Otherwise,
947 * we want a replenishment of its runtime. 1372 * we want a replenishment of its runtime.
948 */ 1373 */
949 if (flags & ENQUEUE_WAKEUP) 1374 if (flags & ENQUEUE_WAKEUP) {
1375 task_contending(dl_se, flags);
950 update_dl_entity(dl_se, pi_se); 1376 update_dl_entity(dl_se, pi_se);
951 else if (flags & ENQUEUE_REPLENISH) 1377 } else if (flags & ENQUEUE_REPLENISH) {
952 replenish_dl_entity(dl_se, pi_se); 1378 replenish_dl_entity(dl_se, pi_se);
1379 }
953 1380
954 __enqueue_dl_entity(dl_se); 1381 __enqueue_dl_entity(dl_se);
955} 1382}
@@ -959,28 +1386,25 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
959 __dequeue_dl_entity(dl_se); 1386 __dequeue_dl_entity(dl_se);
960} 1387}
961 1388
962static inline bool dl_is_constrained(struct sched_dl_entity *dl_se)
963{
964 return dl_se->dl_deadline < dl_se->dl_period;
965}
966
967static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) 1389static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
968{ 1390{
969 struct task_struct *pi_task = rt_mutex_get_top_task(p); 1391 struct task_struct *pi_task = rt_mutex_get_top_task(p);
970 struct sched_dl_entity *pi_se = &p->dl; 1392 struct sched_dl_entity *pi_se = &p->dl;
971 1393
972 /* 1394 /*
973 * Use the scheduling parameters of the top pi-waiter 1395 * Use the scheduling parameters of the top pi-waiter task if:
974 * task if we have one and its (absolute) deadline is 1396 * - we have a top pi-waiter which is a SCHED_DEADLINE task AND
975 * smaller than our one... OTW we keep our runtime and 1397 * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
976 * deadline. 1398 * smaller than our deadline OR we are a !SCHED_DEADLINE task getting
1399 * boosted due to a SCHED_DEADLINE pi-waiter).
1400 * Otherwise we keep our runtime and deadline.
977 */ 1401 */
978 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { 1402 if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
979 pi_se = &pi_task->dl; 1403 pi_se = &pi_task->dl;
980 } else if (!dl_prio(p->normal_prio)) { 1404 } else if (!dl_prio(p->normal_prio)) {
981 /* 1405 /*
982 * Special case in which we have a !SCHED_DEADLINE task 1406 * Special case in which we have a !SCHED_DEADLINE task
983 * that is going to be deboosted, but exceedes its 1407 * that is going to be deboosted, but exceeds its
984 * runtime while doing so. No point in replenishing 1408 * runtime while doing so. No point in replenishing
985 * it, as it's going to return back to its original 1409 * it, as it's going to return back to its original
986 * scheduling class after this. 1410 * scheduling class after this.
@@ -995,17 +1419,32 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
995 * If that is the case, the task will be throttled and 1419 * If that is the case, the task will be throttled and
996 * the replenishment timer will be set to the next period. 1420 * the replenishment timer will be set to the next period.
997 */ 1421 */
998 if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) 1422 if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
999 dl_check_constrained_dl(&p->dl); 1423 dl_check_constrained_dl(&p->dl);
1000 1424
1425 if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
1426 add_rq_bw(p->dl.dl_bw, &rq->dl);
1427 add_running_bw(p->dl.dl_bw, &rq->dl);
1428 }
1429
1001 /* 1430 /*
1002 * If p is throttled, we do nothing. In fact, if it exhausted 1431 * If p is throttled, we do not enqueue it. In fact, if it exhausted
1003 * its budget it needs a replenishment and, since it now is on 1432 * its budget it needs a replenishment and, since it now is on
1004 * its rq, the bandwidth timer callback (which clearly has not 1433 * its rq, the bandwidth timer callback (which clearly has not
1005 * run yet) will take care of this. 1434 * run yet) will take care of this.
1435 * However, the active utilization does not depend on the fact
1436 * that the task is on the runqueue or not (but depends on the
1437 * task's state - in GRUB parlance, "inactive" vs "active contending").
1438 * In other words, even if a task is throttled its utilization must
1439 * be counted in the active utilization; hence, we need to call
1440 * add_running_bw().
1006 */ 1441 */
1007 if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) 1442 if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
1443 if (flags & ENQUEUE_WAKEUP)
1444 task_contending(&p->dl, flags);
1445
1008 return; 1446 return;
1447 }
1009 1448
1010 enqueue_dl_entity(&p->dl, pi_se, flags); 1449 enqueue_dl_entity(&p->dl, pi_se, flags);
1011 1450
@@ -1023,6 +1462,23 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
1023{ 1462{
1024 update_curr_dl(rq); 1463 update_curr_dl(rq);
1025 __dequeue_task_dl(rq, p, flags); 1464 __dequeue_task_dl(rq, p, flags);
1465
1466 if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
1467 sub_running_bw(p->dl.dl_bw, &rq->dl);
1468 sub_rq_bw(p->dl.dl_bw, &rq->dl);
1469 }
1470
1471 /*
1472 * This check allows to start the inactive timer (or to immediately
1473 * decrease the active utilization, if needed) in two cases:
1474 * when the task blocks and when it is terminating
1475 * (p->state == TASK_DEAD). We can handle the two cases in the same
1476 * way, because from GRUB's point of view the same thing is happening
1477 * (the task moves from "active contending" to "active non contending"
1478 * or "inactive")
1479 */
1480 if (flags & DEQUEUE_SLEEP)
1481 task_non_contending(p);
1026} 1482}
1027 1483
1028/* 1484/*
@@ -1100,6 +1556,37 @@ out:
1100 return cpu; 1556 return cpu;
1101} 1557}
1102 1558
1559static void migrate_task_rq_dl(struct task_struct *p)
1560{
1561 struct rq *rq;
1562
1563 if (p->state != TASK_WAKING)
1564 return;
1565
1566 rq = task_rq(p);
1567 /*
1568 * Since p->state == TASK_WAKING, set_task_cpu() has been called
1569 * from try_to_wake_up(). Hence, p->pi_lock is locked, but
1570 * rq->lock is not... So, lock it
1571 */
1572 raw_spin_lock(&rq->lock);
1573 if (p->dl.dl_non_contending) {
1574 sub_running_bw(p->dl.dl_bw, &rq->dl);
1575 p->dl.dl_non_contending = 0;
1576 /*
1577 * If the timer handler is currently running and the
1578 * timer cannot be cancelled, inactive_task_timer()
1579 * will see that dl_not_contending is not set, and
1580 * will not touch the rq's active utilization,
1581 * so we are still safe.
1582 */
1583 if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
1584 put_task_struct(p);
1585 }
1586 sub_rq_bw(p->dl.dl_bw, &rq->dl);
1587 raw_spin_unlock(&rq->lock);
1588}
1589
1103static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) 1590static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
1104{ 1591{
1105 /* 1592 /*
@@ -1255,19 +1742,6 @@ static void task_fork_dl(struct task_struct *p)
1255 */ 1742 */
1256} 1743}
1257 1744
1258static void task_dead_dl(struct task_struct *p)
1259{
1260 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1261
1262 /*
1263 * Since we are TASK_DEAD we won't slip out of the domain!
1264 */
1265 raw_spin_lock_irq(&dl_b->lock);
1266 /* XXX we should retain the bw until 0-lag */
1267 dl_b->total_bw -= p->dl.dl_bw;
1268 raw_spin_unlock_irq(&dl_b->lock);
1269}
1270
1271static void set_curr_task_dl(struct rq *rq) 1745static void set_curr_task_dl(struct rq *rq)
1272{ 1746{
1273 struct task_struct *p = rq->curr; 1747 struct task_struct *p = rq->curr;
@@ -1533,7 +2007,7 @@ retry:
1533 * then possible that next_task has migrated. 2007 * then possible that next_task has migrated.
1534 */ 2008 */
1535 task = pick_next_pushable_dl_task(rq); 2009 task = pick_next_pushable_dl_task(rq);
1536 if (task_cpu(next_task) == rq->cpu && task == next_task) { 2010 if (task == next_task) {
1537 /* 2011 /*
1538 * The task is still there. We don't try 2012 * The task is still there. We don't try
1539 * again, some other cpu will pull it when ready. 2013 * again, some other cpu will pull it when ready.
@@ -1551,7 +2025,11 @@ retry:
1551 } 2025 }
1552 2026
1553 deactivate_task(rq, next_task, 0); 2027 deactivate_task(rq, next_task, 0);
2028 sub_running_bw(next_task->dl.dl_bw, &rq->dl);
2029 sub_rq_bw(next_task->dl.dl_bw, &rq->dl);
1554 set_task_cpu(next_task, later_rq->cpu); 2030 set_task_cpu(next_task, later_rq->cpu);
2031 add_rq_bw(next_task->dl.dl_bw, &later_rq->dl);
2032 add_running_bw(next_task->dl.dl_bw, &later_rq->dl);
1555 activate_task(later_rq, next_task, 0); 2033 activate_task(later_rq, next_task, 0);
1556 ret = 1; 2034 ret = 1;
1557 2035
@@ -1639,7 +2117,11 @@ static void pull_dl_task(struct rq *this_rq)
1639 resched = true; 2117 resched = true;
1640 2118
1641 deactivate_task(src_rq, p, 0); 2119 deactivate_task(src_rq, p, 0);
2120 sub_running_bw(p->dl.dl_bw, &src_rq->dl);
2121 sub_rq_bw(p->dl.dl_bw, &src_rq->dl);
1642 set_task_cpu(p, this_cpu); 2122 set_task_cpu(p, this_cpu);
2123 add_rq_bw(p->dl.dl_bw, &this_rq->dl);
2124 add_running_bw(p->dl.dl_bw, &this_rq->dl);
1643 activate_task(this_rq, p, 0); 2125 activate_task(this_rq, p, 0);
1644 dmin = p->dl.deadline; 2126 dmin = p->dl.deadline;
1645 2127
@@ -1695,7 +2177,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
1695 * until we complete the update. 2177 * until we complete the update.
1696 */ 2178 */
1697 raw_spin_lock(&src_dl_b->lock); 2179 raw_spin_lock(&src_dl_b->lock);
1698 __dl_clear(src_dl_b, p->dl.dl_bw); 2180 __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
1699 raw_spin_unlock(&src_dl_b->lock); 2181 raw_spin_unlock(&src_dl_b->lock);
1700 } 2182 }
1701 2183
@@ -1737,13 +2219,26 @@ void __init init_sched_dl_class(void)
1737static void switched_from_dl(struct rq *rq, struct task_struct *p) 2219static void switched_from_dl(struct rq *rq, struct task_struct *p)
1738{ 2220{
1739 /* 2221 /*
1740 * Start the deadline timer; if we switch back to dl before this we'll 2222 * task_non_contending() can start the "inactive timer" (if the 0-lag
1741 * continue consuming our current CBS slice. If we stay outside of 2223 * time is in the future). If the task switches back to dl before
1742 * SCHED_DEADLINE until the deadline passes, the timer will reset the 2224 * the "inactive timer" fires, it can continue to consume its current
1743 * task. 2225 * runtime using its current deadline. If it stays outside of
2226 * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer()
2227 * will reset the task parameters.
1744 */ 2228 */
1745 if (!start_dl_timer(p)) 2229 if (task_on_rq_queued(p) && p->dl.dl_runtime)
1746 __dl_clear_params(p); 2230 task_non_contending(p);
2231
2232 if (!task_on_rq_queued(p))
2233 sub_rq_bw(p->dl.dl_bw, &rq->dl);
2234
2235 /*
2236 * We cannot use inactive_task_timer() to invoke sub_running_bw()
2237 * at the 0-lag time, because the task could have been migrated
2238 * while SCHED_OTHER in the meanwhile.
2239 */
2240 if (p->dl.dl_non_contending)
2241 p->dl.dl_non_contending = 0;
1747 2242
1748 /* 2243 /*
1749 * Since this might be the only -deadline task on the rq, 2244 * Since this might be the only -deadline task on the rq,
@@ -1762,11 +2257,15 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
1762 */ 2257 */
1763static void switched_to_dl(struct rq *rq, struct task_struct *p) 2258static void switched_to_dl(struct rq *rq, struct task_struct *p)
1764{ 2259{
2260 if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
2261 put_task_struct(p);
1765 2262
1766 /* If p is not queued we will update its parameters at next wakeup. */ 2263 /* If p is not queued we will update its parameters at next wakeup. */
1767 if (!task_on_rq_queued(p)) 2264 if (!task_on_rq_queued(p)) {
1768 return; 2265 add_rq_bw(p->dl.dl_bw, &rq->dl);
1769 2266
2267 return;
2268 }
1770 /* 2269 /*
1771 * If p is boosted we already updated its params in 2270 * If p is boosted we already updated its params in
1772 * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), 2271 * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
@@ -1836,6 +2335,7 @@ const struct sched_class dl_sched_class = {
1836 2335
1837#ifdef CONFIG_SMP 2336#ifdef CONFIG_SMP
1838 .select_task_rq = select_task_rq_dl, 2337 .select_task_rq = select_task_rq_dl,
2338 .migrate_task_rq = migrate_task_rq_dl,
1839 .set_cpus_allowed = set_cpus_allowed_dl, 2339 .set_cpus_allowed = set_cpus_allowed_dl,
1840 .rq_online = rq_online_dl, 2340 .rq_online = rq_online_dl,
1841 .rq_offline = rq_offline_dl, 2341 .rq_offline = rq_offline_dl,
@@ -1845,7 +2345,6 @@ const struct sched_class dl_sched_class = {
1845 .set_curr_task = set_curr_task_dl, 2345 .set_curr_task = set_curr_task_dl,
1846 .task_tick = task_tick_dl, 2346 .task_tick = task_tick_dl,
1847 .task_fork = task_fork_dl, 2347 .task_fork = task_fork_dl,
1848 .task_dead = task_dead_dl,
1849 2348
1850 .prio_changed = prio_changed_dl, 2349 .prio_changed = prio_changed_dl,
1851 .switched_from = switched_from_dl, 2350 .switched_from = switched_from_dl,
@@ -1854,6 +2353,317 @@ const struct sched_class dl_sched_class = {
1854 .update_curr = update_curr_dl, 2353 .update_curr = update_curr_dl,
1855}; 2354};
1856 2355
2356int sched_dl_global_validate(void)
2357{
2358 u64 runtime = global_rt_runtime();
2359 u64 period = global_rt_period();
2360 u64 new_bw = to_ratio(period, runtime);
2361 struct dl_bw *dl_b;
2362 int cpu, ret = 0;
2363 unsigned long flags;
2364
2365 /*
2366 * Here we want to check the bandwidth not being set to some
2367 * value smaller than the currently allocated bandwidth in
2368 * any of the root_domains.
2369 *
2370 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
2371 * cycling on root_domains... Discussion on different/better
2372 * solutions is welcome!
2373 */
2374 for_each_possible_cpu(cpu) {
2375 rcu_read_lock_sched();
2376 dl_b = dl_bw_of(cpu);
2377
2378 raw_spin_lock_irqsave(&dl_b->lock, flags);
2379 if (new_bw < dl_b->total_bw)
2380 ret = -EBUSY;
2381 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2382
2383 rcu_read_unlock_sched();
2384
2385 if (ret)
2386 break;
2387 }
2388
2389 return ret;
2390}
2391
2392void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
2393{
2394 if (global_rt_runtime() == RUNTIME_INF) {
2395 dl_rq->bw_ratio = 1 << RATIO_SHIFT;
2396 dl_rq->extra_bw = 1 << BW_SHIFT;
2397 } else {
2398 dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
2399 global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
2400 dl_rq->extra_bw = to_ratio(global_rt_period(),
2401 global_rt_runtime());
2402 }
2403}
2404
2405void sched_dl_do_global(void)
2406{
2407 u64 new_bw = -1;
2408 struct dl_bw *dl_b;
2409 int cpu;
2410 unsigned long flags;
2411
2412 def_dl_bandwidth.dl_period = global_rt_period();
2413 def_dl_bandwidth.dl_runtime = global_rt_runtime();
2414
2415 if (global_rt_runtime() != RUNTIME_INF)
2416 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
2417
2418 /*
2419 * FIXME: As above...
2420 */
2421 for_each_possible_cpu(cpu) {
2422 rcu_read_lock_sched();
2423 dl_b = dl_bw_of(cpu);
2424
2425 raw_spin_lock_irqsave(&dl_b->lock, flags);
2426 dl_b->bw = new_bw;
2427 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2428
2429 rcu_read_unlock_sched();
2430 init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
2431 }
2432}
2433
2434/*
2435 * We must be sure that accepting a new task (or allowing changing the
2436 * parameters of an existing one) is consistent with the bandwidth
2437 * constraints. If yes, this function also accordingly updates the currently
2438 * allocated bandwidth to reflect the new situation.
2439 *
2440 * This function is called while holding p's rq->lock.
2441 */
2442int sched_dl_overflow(struct task_struct *p, int policy,
2443 const struct sched_attr *attr)
2444{
2445 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
2446 u64 period = attr->sched_period ?: attr->sched_deadline;
2447 u64 runtime = attr->sched_runtime;
2448 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
2449 int cpus, err = -1;
2450
2451 /* !deadline task may carry old deadline bandwidth */
2452 if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
2453 return 0;
2454
2455 /*
2456 * Either if a task, enters, leave, or stays -deadline but changes
2457 * its parameters, we may need to update accordingly the total
2458 * allocated bandwidth of the container.
2459 */
2460 raw_spin_lock(&dl_b->lock);
2461 cpus = dl_bw_cpus(task_cpu(p));
2462 if (dl_policy(policy) && !task_has_dl_policy(p) &&
2463 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
2464 if (hrtimer_active(&p->dl.inactive_timer))
2465 __dl_clear(dl_b, p->dl.dl_bw, cpus);
2466 __dl_add(dl_b, new_bw, cpus);
2467 err = 0;
2468 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
2469 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
2470 /*
2471 * XXX this is slightly incorrect: when the task
2472 * utilization decreases, we should delay the total
2473 * utilization change until the task's 0-lag point.
2474 * But this would require to set the task's "inactive
2475 * timer" when the task is not inactive.
2476 */
2477 __dl_clear(dl_b, p->dl.dl_bw, cpus);
2478 __dl_add(dl_b, new_bw, cpus);
2479 dl_change_utilization(p, new_bw);
2480 err = 0;
2481 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
2482 /*
2483 * Do not decrease the total deadline utilization here,
2484 * switched_from_dl() will take care to do it at the correct
2485 * (0-lag) time.
2486 */
2487 err = 0;
2488 }
2489 raw_spin_unlock(&dl_b->lock);
2490
2491 return err;
2492}
2493
2494/*
2495 * This function initializes the sched_dl_entity of a newly becoming
2496 * SCHED_DEADLINE task.
2497 *
2498 * Only the static values are considered here, the actual runtime and the
2499 * absolute deadline will be properly calculated when the task is enqueued
2500 * for the first time with its new policy.
2501 */
2502void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
2503{
2504 struct sched_dl_entity *dl_se = &p->dl;
2505
2506 dl_se->dl_runtime = attr->sched_runtime;
2507 dl_se->dl_deadline = attr->sched_deadline;
2508 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
2509 dl_se->flags = attr->sched_flags;
2510 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
2511 dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
2512}
2513
2514void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
2515{
2516 struct sched_dl_entity *dl_se = &p->dl;
2517
2518 attr->sched_priority = p->rt_priority;
2519 attr->sched_runtime = dl_se->dl_runtime;
2520 attr->sched_deadline = dl_se->dl_deadline;
2521 attr->sched_period = dl_se->dl_period;
2522 attr->sched_flags = dl_se->flags;
2523}
2524
2525/*
2526 * This function validates the new parameters of a -deadline task.
2527 * We ask for the deadline not being zero, and greater or equal
2528 * than the runtime, as well as the period of being zero or
2529 * greater than deadline. Furthermore, we have to be sure that
2530 * user parameters are above the internal resolution of 1us (we
2531 * check sched_runtime only since it is always the smaller one) and
2532 * below 2^63 ns (we have to check both sched_deadline and
2533 * sched_period, as the latter can be zero).
2534 */
2535bool __checkparam_dl(const struct sched_attr *attr)
2536{
2537 /* deadline != 0 */
2538 if (attr->sched_deadline == 0)
2539 return false;
2540
2541 /*
2542 * Since we truncate DL_SCALE bits, make sure we're at least
2543 * that big.
2544 */
2545 if (attr->sched_runtime < (1ULL << DL_SCALE))
2546 return false;
2547
2548 /*
2549 * Since we use the MSB for wrap-around and sign issues, make
2550 * sure it's not set (mind that period can be equal to zero).
2551 */
2552 if (attr->sched_deadline & (1ULL << 63) ||
2553 attr->sched_period & (1ULL << 63))
2554 return false;
2555
2556 /* runtime <= deadline <= period (if period != 0) */
2557 if ((attr->sched_period != 0 &&
2558 attr->sched_period < attr->sched_deadline) ||
2559 attr->sched_deadline < attr->sched_runtime)
2560 return false;
2561
2562 return true;
2563}
2564
2565/*
2566 * This function clears the sched_dl_entity static params.
2567 */
2568void __dl_clear_params(struct task_struct *p)
2569{
2570 struct sched_dl_entity *dl_se = &p->dl;
2571
2572 dl_se->dl_runtime = 0;
2573 dl_se->dl_deadline = 0;
2574 dl_se->dl_period = 0;
2575 dl_se->flags = 0;
2576 dl_se->dl_bw = 0;
2577 dl_se->dl_density = 0;
2578
2579 dl_se->dl_throttled = 0;
2580 dl_se->dl_yielded = 0;
2581 dl_se->dl_non_contending = 0;
2582}
2583
2584bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
2585{
2586 struct sched_dl_entity *dl_se = &p->dl;
2587
2588 if (dl_se->dl_runtime != attr->sched_runtime ||
2589 dl_se->dl_deadline != attr->sched_deadline ||
2590 dl_se->dl_period != attr->sched_period ||
2591 dl_se->flags != attr->sched_flags)
2592 return true;
2593
2594 return false;
2595}
2596
2597#ifdef CONFIG_SMP
2598int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
2599{
2600 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
2601 cs_cpus_allowed);
2602 struct dl_bw *dl_b;
2603 bool overflow;
2604 int cpus, ret;
2605 unsigned long flags;
2606
2607 rcu_read_lock_sched();
2608 dl_b = dl_bw_of(dest_cpu);
2609 raw_spin_lock_irqsave(&dl_b->lock, flags);
2610 cpus = dl_bw_cpus(dest_cpu);
2611 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
2612 if (overflow)
2613 ret = -EBUSY;
2614 else {
2615 /*
2616 * We reserve space for this task in the destination
2617 * root_domain, as we can't fail after this point.
2618 * We will free resources in the source root_domain
2619 * later on (see set_cpus_allowed_dl()).
2620 */
2621 __dl_add(dl_b, p->dl.dl_bw, cpus);
2622 ret = 0;
2623 }
2624 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2625 rcu_read_unlock_sched();
2626 return ret;
2627}
2628
2629int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
2630 const struct cpumask *trial)
2631{
2632 int ret = 1, trial_cpus;
2633 struct dl_bw *cur_dl_b;
2634 unsigned long flags;
2635
2636 rcu_read_lock_sched();
2637 cur_dl_b = dl_bw_of(cpumask_any(cur));
2638 trial_cpus = cpumask_weight(trial);
2639
2640 raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
2641 if (cur_dl_b->bw != -1 &&
2642 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
2643 ret = 0;
2644 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
2645 rcu_read_unlock_sched();
2646 return ret;
2647}
2648
2649bool dl_cpu_busy(unsigned int cpu)
2650{
2651 unsigned long flags;
2652 struct dl_bw *dl_b;
2653 bool overflow;
2654 int cpus;
2655
2656 rcu_read_lock_sched();
2657 dl_b = dl_bw_of(cpu);
2658 raw_spin_lock_irqsave(&dl_b->lock, flags);
2659 cpus = dl_bw_cpus(cpu);
2660 overflow = __dl_overflow(dl_b, cpus, 0, 0);
2661 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2662 rcu_read_unlock_sched();
2663 return overflow;
2664}
2665#endif
2666
1857#ifdef CONFIG_SCHED_DEBUG 2667#ifdef CONFIG_SCHED_DEBUG
1858extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); 2668extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
1859 2669
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 38f019324f1a..4fa66de52bd6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -552,15 +552,21 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
552 552
553#define P(x) \ 553#define P(x) \
554 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 554 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
555#define PU(x) \
556 SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(rt_rq->x))
555#define PN(x) \ 557#define PN(x) \
556 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) 558 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
557 559
558 P(rt_nr_running); 560 PU(rt_nr_running);
561#ifdef CONFIG_SMP
562 PU(rt_nr_migratory);
563#endif
559 P(rt_throttled); 564 P(rt_throttled);
560 PN(rt_time); 565 PN(rt_time);
561 PN(rt_runtime); 566 PN(rt_runtime);
562 567
563#undef PN 568#undef PN
569#undef PU
564#undef P 570#undef P
565} 571}
566 572
@@ -569,14 +575,21 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
569 struct dl_bw *dl_bw; 575 struct dl_bw *dl_bw;
570 576
571 SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); 577 SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
572 SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); 578
579#define PU(x) \
580 SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x))
581
582 PU(dl_nr_running);
573#ifdef CONFIG_SMP 583#ifdef CONFIG_SMP
584 PU(dl_nr_migratory);
574 dl_bw = &cpu_rq(cpu)->rd->dl_bw; 585 dl_bw = &cpu_rq(cpu)->rd->dl_bw;
575#else 586#else
576 dl_bw = &dl_rq->dl_bw; 587 dl_bw = &dl_rq->dl_bw;
577#endif 588#endif
578 SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); 589 SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
579 SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); 590 SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
591
592#undef PU
580} 593}
581 594
582extern __read_mostly int sched_clock_running; 595extern __read_mostly int sched_clock_running;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d71109321841..c95880e216f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
369} 369}
370 370
371/* Iterate thr' all leaf cfs_rq's on a runqueue */ 371/* Iterate thr' all leaf cfs_rq's on a runqueue */
372#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 372#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
373 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 373 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
374 leaf_cfs_rq_list)
374 375
375/* Do the two (enqueued) entities belong to the same group ? */ 376/* Do the two (enqueued) entities belong to the same group ? */
376static inline struct cfs_rq * 377static inline struct cfs_rq *
@@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
463{ 464{
464} 465}
465 466
466#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 467#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
467 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 468 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
468 469
469static inline struct sched_entity *parent_entity(struct sched_entity *se) 470static inline struct sched_entity *parent_entity(struct sched_entity *se)
470{ 471{
@@ -1381,7 +1382,6 @@ static unsigned long weighted_cpuload(const int cpu);
1381static unsigned long source_load(int cpu, int type); 1382static unsigned long source_load(int cpu, int type);
1382static unsigned long target_load(int cpu, int type); 1383static unsigned long target_load(int cpu, int type);
1383static unsigned long capacity_of(int cpu); 1384static unsigned long capacity_of(int cpu);
1384static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1385 1385
1386/* Cached statistics for all CPUs within a node */ 1386/* Cached statistics for all CPUs within a node */
1387struct numa_stats { 1387struct numa_stats {
@@ -2469,7 +2469,8 @@ void task_numa_work(struct callback_head *work)
2469 return; 2469 return;
2470 2470
2471 2471
2472 down_read(&mm->mmap_sem); 2472 if (!down_read_trylock(&mm->mmap_sem))
2473 return;
2473 vma = find_vma(mm, start); 2474 vma = find_vma(mm, start);
2474 if (!vma) { 2475 if (!vma) {
2475 reset_ptenuma_scan(p); 2476 reset_ptenuma_scan(p);
@@ -2584,6 +2585,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
2584 } 2585 }
2585 } 2586 }
2586} 2587}
2588
2589/*
2590 * Can a task be moved from prev_cpu to this_cpu without causing a load
2591 * imbalance that would trigger the load balancer?
2592 */
2593static inline bool numa_wake_affine(struct sched_domain *sd,
2594 struct task_struct *p, int this_cpu,
2595 int prev_cpu, int sync)
2596{
2597 struct numa_stats prev_load, this_load;
2598 s64 this_eff_load, prev_eff_load;
2599
2600 update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
2601 update_numa_stats(&this_load, cpu_to_node(this_cpu));
2602
2603 /*
2604 * If sync wakeup then subtract the (maximum possible)
2605 * effect of the currently running task from the load
2606 * of the current CPU:
2607 */
2608 if (sync) {
2609 unsigned long current_load = task_h_load(current);
2610
2611 if (this_load.load > current_load)
2612 this_load.load -= current_load;
2613 else
2614 this_load.load = 0;
2615 }
2616
2617 /*
2618 * In low-load situations, where this_cpu's node is idle due to the
2619 * sync cause above having dropped this_load.load to 0, move the task.
2620 * Moving to an idle socket will not create a bad imbalance.
2621 *
2622 * Otherwise check if the nodes are near enough in load to allow this
2623 * task to be woken on this_cpu's node.
2624 */
2625 if (this_load.load > 0) {
2626 unsigned long task_load = task_h_load(p);
2627
2628 this_eff_load = 100;
2629 this_eff_load *= prev_load.compute_capacity;
2630
2631 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
2632 prev_eff_load *= this_load.compute_capacity;
2633
2634 this_eff_load *= this_load.load + task_load;
2635 prev_eff_load *= prev_load.load - task_load;
2636
2637 return this_eff_load <= prev_eff_load;
2638 }
2639
2640 return true;
2641}
2587#else 2642#else
2588static void task_tick_numa(struct rq *rq, struct task_struct *curr) 2643static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2589{ 2644{
@@ -2596,6 +2651,15 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2596static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) 2651static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2597{ 2652{
2598} 2653}
2654
2655#ifdef CONFIG_SMP
2656static inline bool numa_wake_affine(struct sched_domain *sd,
2657 struct task_struct *p, int this_cpu,
2658 int prev_cpu, int sync)
2659{
2660 return true;
2661}
2662#endif /* !SMP */
2599#endif /* CONFIG_NUMA_BALANCING */ 2663#endif /* CONFIG_NUMA_BALANCING */
2600 2664
2601static void 2665static void
@@ -2916,12 +2980,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2916 /* 2980 /*
2917 * Step 2: update *_avg. 2981 * Step 2: update *_avg.
2918 */ 2982 */
2919 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); 2983 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
2920 if (cfs_rq) { 2984 if (cfs_rq) {
2921 cfs_rq->runnable_load_avg = 2985 cfs_rq->runnable_load_avg =
2922 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); 2986 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
2923 } 2987 }
2924 sa->util_avg = sa->util_sum / LOAD_AVG_MAX; 2988 sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
2925 2989
2926 return 1; 2990 return 1;
2927} 2991}
@@ -2982,8 +3046,7 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
2982 * differential update where we store the last value we propagated. This in 3046 * differential update where we store the last value we propagated. This in
2983 * turn allows skipping updates if the differential is 'small'. 3047 * turn allows skipping updates if the differential is 'small'.
2984 * 3048 *
2985 * Updating tg's load_avg is necessary before update_cfs_share() (which is 3049 * Updating tg's load_avg is necessary before update_cfs_share().
2986 * done) and effective_load() (which is not done because it is too costly).
2987 */ 3050 */
2988static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) 3051static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2989{ 3052{
@@ -3563,7 +3626,7 @@ static inline void check_schedstat_required(void)
3563 trace_sched_stat_runtime_enabled()) { 3626 trace_sched_stat_runtime_enabled()) {
3564 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " 3627 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3565 "stat_blocked and stat_runtime require the " 3628 "stat_blocked and stat_runtime require the "
3566 "kernel parameter schedstats=enabled or " 3629 "kernel parameter schedstats=enable or "
3567 "kernel.sched_schedstats=1\n"); 3630 "kernel.sched_schedstats=1\n");
3568 } 3631 }
3569#endif 3632#endif
@@ -4642,24 +4705,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4642 hrtimer_cancel(&cfs_b->slack_timer); 4705 hrtimer_cancel(&cfs_b->slack_timer);
4643} 4706}
4644 4707
4708/*
4709 * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
4710 *
4711 * The race is harmless, since modifying bandwidth settings of unhooked group
4712 * bits doesn't do much.
4713 */
4714
4715/* cpu online calback */
4645static void __maybe_unused update_runtime_enabled(struct rq *rq) 4716static void __maybe_unused update_runtime_enabled(struct rq *rq)
4646{ 4717{
4647 struct cfs_rq *cfs_rq; 4718 struct task_group *tg;
4719
4720 lockdep_assert_held(&rq->lock);
4648 4721
4649 for_each_leaf_cfs_rq(rq, cfs_rq) { 4722 rcu_read_lock();
4650 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; 4723 list_for_each_entry_rcu(tg, &task_groups, list) {
4724 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
4725 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4651 4726
4652 raw_spin_lock(&cfs_b->lock); 4727 raw_spin_lock(&cfs_b->lock);
4653 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; 4728 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4654 raw_spin_unlock(&cfs_b->lock); 4729 raw_spin_unlock(&cfs_b->lock);
4655 } 4730 }
4731 rcu_read_unlock();
4656} 4732}
4657 4733
4734/* cpu offline callback */
4658static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) 4735static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4659{ 4736{
4660 struct cfs_rq *cfs_rq; 4737 struct task_group *tg;
4738
4739 lockdep_assert_held(&rq->lock);
4740
4741 rcu_read_lock();
4742 list_for_each_entry_rcu(tg, &task_groups, list) {
4743 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4661 4744
4662 for_each_leaf_cfs_rq(rq, cfs_rq) {
4663 if (!cfs_rq->runtime_enabled) 4745 if (!cfs_rq->runtime_enabled)
4664 continue; 4746 continue;
4665 4747
@@ -4677,6 +4759,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4677 if (cfs_rq_throttled(cfs_rq)) 4759 if (cfs_rq_throttled(cfs_rq))
4678 unthrottle_cfs_rq(cfs_rq); 4760 unthrottle_cfs_rq(cfs_rq);
4679 } 4761 }
4762 rcu_read_unlock();
4680} 4763}
4681 4764
4682#else /* CONFIG_CFS_BANDWIDTH */ 4765#else /* CONFIG_CFS_BANDWIDTH */
@@ -5215,126 +5298,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
5215 return 0; 5298 return 0;
5216} 5299}
5217 5300
5218#ifdef CONFIG_FAIR_GROUP_SCHED
5219/*
5220 * effective_load() calculates the load change as seen from the root_task_group
5221 *
5222 * Adding load to a group doesn't make a group heavier, but can cause movement
5223 * of group shares between cpus. Assuming the shares were perfectly aligned one
5224 * can calculate the shift in shares.
5225 *
5226 * Calculate the effective load difference if @wl is added (subtracted) to @tg
5227 * on this @cpu and results in a total addition (subtraction) of @wg to the
5228 * total group weight.
5229 *
5230 * Given a runqueue weight distribution (rw_i) we can compute a shares
5231 * distribution (s_i) using:
5232 *
5233 * s_i = rw_i / \Sum rw_j (1)
5234 *
5235 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
5236 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
5237 * shares distribution (s_i):
5238 *
5239 * rw_i = { 2, 4, 1, 0 }
5240 * s_i = { 2/7, 4/7, 1/7, 0 }
5241 *
5242 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
5243 * task used to run on and the CPU the waker is running on), we need to
5244 * compute the effect of waking a task on either CPU and, in case of a sync
5245 * wakeup, compute the effect of the current task going to sleep.
5246 *
5247 * So for a change of @wl to the local @cpu with an overall group weight change
5248 * of @wl we can compute the new shares distribution (s'_i) using:
5249 *
5250 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
5251 *
5252 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
5253 * differences in waking a task to CPU 0. The additional task changes the
5254 * weight and shares distributions like:
5255 *
5256 * rw'_i = { 3, 4, 1, 0 }
5257 * s'_i = { 3/8, 4/8, 1/8, 0 }
5258 *
5259 * We can then compute the difference in effective weight by using:
5260 *
5261 * dw_i = S * (s'_i - s_i) (3)
5262 *
5263 * Where 'S' is the group weight as seen by its parent.
5264 *
5265 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
5266 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
5267 * 4/7) times the weight of the group.
5268 */
5269static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
5270{
5271 struct sched_entity *se = tg->se[cpu];
5272
5273 if (!tg->parent) /* the trivial, non-cgroup case */
5274 return wl;
5275
5276 for_each_sched_entity(se) {
5277 struct cfs_rq *cfs_rq = se->my_q;
5278 long W, w = cfs_rq_load_avg(cfs_rq);
5279
5280 tg = cfs_rq->tg;
5281
5282 /*
5283 * W = @wg + \Sum rw_j
5284 */
5285 W = wg + atomic_long_read(&tg->load_avg);
5286
5287 /* Ensure \Sum rw_j >= rw_i */
5288 W -= cfs_rq->tg_load_avg_contrib;
5289 W += w;
5290
5291 /*
5292 * w = rw_i + @wl
5293 */
5294 w += wl;
5295
5296 /*
5297 * wl = S * s'_i; see (2)
5298 */
5299 if (W > 0 && w < W)
5300 wl = (w * (long)scale_load_down(tg->shares)) / W;
5301 else
5302 wl = scale_load_down(tg->shares);
5303
5304 /*
5305 * Per the above, wl is the new se->load.weight value; since
5306 * those are clipped to [MIN_SHARES, ...) do so now. See
5307 * calc_cfs_shares().
5308 */
5309 if (wl < MIN_SHARES)
5310 wl = MIN_SHARES;
5311
5312 /*
5313 * wl = dw_i = S * (s'_i - s_i); see (3)
5314 */
5315 wl -= se->avg.load_avg;
5316
5317 /*
5318 * Recursively apply this logic to all parent groups to compute
5319 * the final effective load change on the root group. Since
5320 * only the @tg group gets extra weight, all parent groups can
5321 * only redistribute existing shares. @wl is the shift in shares
5322 * resulting from this level per the above.
5323 */
5324 wg = 0;
5325 }
5326
5327 return wl;
5328}
5329#else
5330
5331static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
5332{
5333 return wl;
5334}
5335
5336#endif
5337
5338static void record_wakee(struct task_struct *p) 5301static void record_wakee(struct task_struct *p)
5339{ 5302{
5340 /* 5303 /*
@@ -5385,67 +5348,25 @@ static int wake_wide(struct task_struct *p)
5385static int wake_affine(struct sched_domain *sd, struct task_struct *p, 5348static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5386 int prev_cpu, int sync) 5349 int prev_cpu, int sync)
5387{ 5350{
5388 s64 this_load, load; 5351 int this_cpu = smp_processor_id();
5389 s64 this_eff_load, prev_eff_load; 5352 bool affine = false;
5390 int idx, this_cpu;
5391 struct task_group *tg;
5392 unsigned long weight;
5393 int balanced;
5394
5395 idx = sd->wake_idx;
5396 this_cpu = smp_processor_id();
5397 load = source_load(prev_cpu, idx);
5398 this_load = target_load(this_cpu, idx);
5399 5353
5400 /* 5354 /*
5401 * If sync wakeup then subtract the (maximum possible) 5355 * Common case: CPUs are in the same socket, and select_idle_sibling()
5402 * effect of the currently running task from the load 5356 * will do its thing regardless of what we return:
5403 * of the current CPU:
5404 */
5405 if (sync) {
5406 tg = task_group(current);
5407 weight = current->se.avg.load_avg;
5408
5409 this_load += effective_load(tg, this_cpu, -weight, -weight);
5410 load += effective_load(tg, prev_cpu, 0, -weight);
5411 }
5412
5413 tg = task_group(p);
5414 weight = p->se.avg.load_avg;
5415
5416 /*
5417 * In low-load situations, where prev_cpu is idle and this_cpu is idle
5418 * due to the sync cause above having dropped this_load to 0, we'll
5419 * always have an imbalance, but there's really nothing you can do
5420 * about that, so that's good too.
5421 *
5422 * Otherwise check if either cpus are near enough in load to allow this
5423 * task to be woken on this_cpu.
5424 */ 5357 */
5425 this_eff_load = 100; 5358 if (cpus_share_cache(prev_cpu, this_cpu))
5426 this_eff_load *= capacity_of(prev_cpu); 5359 affine = true;
5427 5360 else
5428 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; 5361 affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
5429 prev_eff_load *= capacity_of(this_cpu);
5430
5431 if (this_load > 0) {
5432 this_eff_load *= this_load +
5433 effective_load(tg, this_cpu, weight, weight);
5434
5435 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
5436 }
5437
5438 balanced = this_eff_load <= prev_eff_load;
5439 5362
5440 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); 5363 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5364 if (affine) {
5365 schedstat_inc(sd->ttwu_move_affine);
5366 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5367 }
5441 5368
5442 if (!balanced) 5369 return affine;
5443 return 0;
5444
5445 schedstat_inc(sd->ttwu_move_affine);
5446 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5447
5448 return 1;
5449} 5370}
5450 5371
5451static inline int task_util(struct task_struct *p); 5372static inline int task_util(struct task_struct *p);
@@ -5484,12 +5405,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5484 int i; 5405 int i;
5485 5406
5486 /* Skip over this group if it has no CPUs allowed */ 5407 /* Skip over this group if it has no CPUs allowed */
5487 if (!cpumask_intersects(sched_group_cpus(group), 5408 if (!cpumask_intersects(sched_group_span(group),
5488 &p->cpus_allowed)) 5409 &p->cpus_allowed))
5489 continue; 5410 continue;
5490 5411
5491 local_group = cpumask_test_cpu(this_cpu, 5412 local_group = cpumask_test_cpu(this_cpu,
5492 sched_group_cpus(group)); 5413 sched_group_span(group));
5493 5414
5494 /* 5415 /*
5495 * Tally up the load of all CPUs in the group and find 5416 * Tally up the load of all CPUs in the group and find
@@ -5499,7 +5420,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5499 runnable_load = 0; 5420 runnable_load = 0;
5500 max_spare_cap = 0; 5421 max_spare_cap = 0;
5501 5422
5502 for_each_cpu(i, sched_group_cpus(group)) { 5423 for_each_cpu(i, sched_group_span(group)) {
5503 /* Bias balancing toward cpus of our domain */ 5424 /* Bias balancing toward cpus of our domain */
5504 if (local_group) 5425 if (local_group)
5505 load = source_load(i, load_idx); 5426 load = source_load(i, load_idx);
@@ -5602,10 +5523,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5602 5523
5603 /* Check if we have any choice: */ 5524 /* Check if we have any choice: */
5604 if (group->group_weight == 1) 5525 if (group->group_weight == 1)
5605 return cpumask_first(sched_group_cpus(group)); 5526 return cpumask_first(sched_group_span(group));
5606 5527
5607 /* Traverse only the allowed CPUs */ 5528 /* Traverse only the allowed CPUs */
5608 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { 5529 for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
5609 if (idle_cpu(i)) { 5530 if (idle_cpu(i)) {
5610 struct rq *rq = cpu_rq(i); 5531 struct rq *rq = cpu_rq(i);
5611 struct cpuidle_state *idle = idle_get_state(rq); 5532 struct cpuidle_state *idle = idle_get_state(rq);
@@ -5640,43 +5561,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5640 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; 5561 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
5641} 5562}
5642 5563
5643/*
5644 * Implement a for_each_cpu() variant that starts the scan at a given cpu
5645 * (@start), and wraps around.
5646 *
5647 * This is used to scan for idle CPUs; such that not all CPUs looking for an
5648 * idle CPU find the same CPU. The down-side is that tasks tend to cycle
5649 * through the LLC domain.
5650 *
5651 * Especially tbench is found sensitive to this.
5652 */
5653
5654static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
5655{
5656 int next;
5657
5658again:
5659 next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
5660
5661 if (*wrapped) {
5662 if (next >= start)
5663 return nr_cpumask_bits;
5664 } else {
5665 if (next >= nr_cpumask_bits) {
5666 *wrapped = 1;
5667 n = -1;
5668 goto again;
5669 }
5670 }
5671
5672 return next;
5673}
5674
5675#define for_each_cpu_wrap(cpu, mask, start, wrap) \
5676 for ((wrap) = 0, (cpu) = (start)-1; \
5677 (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
5678 (cpu) < nr_cpumask_bits; )
5679
5680#ifdef CONFIG_SCHED_SMT 5564#ifdef CONFIG_SCHED_SMT
5681 5565
5682static inline void set_idle_cores(int cpu, int val) 5566static inline void set_idle_cores(int cpu, int val)
@@ -5736,7 +5620,7 @@ unlock:
5736static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) 5620static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5737{ 5621{
5738 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); 5622 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5739 int core, cpu, wrap; 5623 int core, cpu;
5740 5624
5741 if (!static_branch_likely(&sched_smt_present)) 5625 if (!static_branch_likely(&sched_smt_present))
5742 return -1; 5626 return -1;
@@ -5746,7 +5630,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
5746 5630
5747 cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); 5631 cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
5748 5632
5749 for_each_cpu_wrap(core, cpus, target, wrap) { 5633 for_each_cpu_wrap(core, cpus, target) {
5750 bool idle = true; 5634 bool idle = true;
5751 5635
5752 for_each_cpu(cpu, cpu_smt_mask(core)) { 5636 for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -5809,27 +5693,38 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
5809static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) 5693static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
5810{ 5694{
5811 struct sched_domain *this_sd; 5695 struct sched_domain *this_sd;
5812 u64 avg_cost, avg_idle = this_rq()->avg_idle; 5696 u64 avg_cost, avg_idle;
5813 u64 time, cost; 5697 u64 time, cost;
5814 s64 delta; 5698 s64 delta;
5815 int cpu, wrap; 5699 int cpu, nr = INT_MAX;
5816 5700
5817 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); 5701 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
5818 if (!this_sd) 5702 if (!this_sd)
5819 return -1; 5703 return -1;
5820 5704
5821 avg_cost = this_sd->avg_scan_cost;
5822
5823 /* 5705 /*
5824 * Due to large variance we need a large fuzz factor; hackbench in 5706 * Due to large variance we need a large fuzz factor; hackbench in
5825 * particularly is sensitive here. 5707 * particularly is sensitive here.
5826 */ 5708 */
5827 if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost) 5709 avg_idle = this_rq()->avg_idle / 512;
5710 avg_cost = this_sd->avg_scan_cost + 1;
5711
5712 if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
5828 return -1; 5713 return -1;
5829 5714
5715 if (sched_feat(SIS_PROP)) {
5716 u64 span_avg = sd->span_weight * avg_idle;
5717 if (span_avg > 4*avg_cost)
5718 nr = div_u64(span_avg, avg_cost);
5719 else
5720 nr = 4;
5721 }
5722
5830 time = local_clock(); 5723 time = local_clock();
5831 5724
5832 for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { 5725 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
5726 if (!--nr)
5727 return -1;
5833 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 5728 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
5834 continue; 5729 continue;
5835 if (idle_cpu(cpu)) 5730 if (idle_cpu(cpu))
@@ -6011,11 +5906,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6011 5906
6012 if (affine_sd) { 5907 if (affine_sd) {
6013 sd = NULL; /* Prefer wake_affine over balance flags */ 5908 sd = NULL; /* Prefer wake_affine over balance flags */
6014 if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) 5909 if (cpu == prev_cpu)
5910 goto pick_cpu;
5911
5912 if (wake_affine(affine_sd, p, prev_cpu, sync))
6015 new_cpu = cpu; 5913 new_cpu = cpu;
6016 } 5914 }
6017 5915
6018 if (!sd) { 5916 if (!sd) {
5917 pick_cpu:
6019 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ 5918 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
6020 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); 5919 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6021 5920
@@ -6168,8 +6067,11 @@ static void set_last_buddy(struct sched_entity *se)
6168 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) 6067 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6169 return; 6068 return;
6170 6069
6171 for_each_sched_entity(se) 6070 for_each_sched_entity(se) {
6071 if (SCHED_WARN_ON(!se->on_rq))
6072 return;
6172 cfs_rq_of(se)->last = se; 6073 cfs_rq_of(se)->last = se;
6074 }
6173} 6075}
6174 6076
6175static void set_next_buddy(struct sched_entity *se) 6077static void set_next_buddy(struct sched_entity *se)
@@ -6177,8 +6079,11 @@ static void set_next_buddy(struct sched_entity *se)
6177 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) 6079 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6178 return; 6080 return;
6179 6081
6180 for_each_sched_entity(se) 6082 for_each_sched_entity(se) {
6083 if (SCHED_WARN_ON(!se->on_rq))
6084 return;
6181 cfs_rq_of(se)->next = se; 6085 cfs_rq_of(se)->next = se;
6086 }
6182} 6087}
6183 6088
6184static void set_skip_buddy(struct sched_entity *se) 6089static void set_skip_buddy(struct sched_entity *se)
@@ -6686,6 +6591,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
6686 if (dst_nid == p->numa_preferred_nid) 6591 if (dst_nid == p->numa_preferred_nid)
6687 return 0; 6592 return 0;
6688 6593
6594 /* Leaving a core idle is often worse than degrading locality. */
6595 if (env->idle != CPU_NOT_IDLE)
6596 return -1;
6597
6689 if (numa_group) { 6598 if (numa_group) {
6690 src_faults = group_faults(p, src_nid); 6599 src_faults = group_faults(p, src_nid);
6691 dst_faults = group_faults(p, dst_nid); 6600 dst_faults = group_faults(p, dst_nid);
@@ -6737,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
6737 * our sched_group. We may want to revisit it if we couldn't 6646 * our sched_group. We may want to revisit it if we couldn't
6738 * meet load balance goals by pulling other tasks on src_cpu. 6647 * meet load balance goals by pulling other tasks on src_cpu.
6739 * 6648 *
6740 * Also avoid computing new_dst_cpu if we have already computed 6649 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
6741 * one in current iteration. 6650 * already computed one in current iteration.
6742 */ 6651 */
6743 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) 6652 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
6744 return 0; 6653 return 0;
6745 6654
6746 /* Prevent to re-select dst_cpu via env's cpus */ 6655 /* Prevent to re-select dst_cpu via env's cpus */
@@ -6970,10 +6879,28 @@ static void attach_tasks(struct lb_env *env)
6970} 6879}
6971 6880
6972#ifdef CONFIG_FAIR_GROUP_SCHED 6881#ifdef CONFIG_FAIR_GROUP_SCHED
6882
6883static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
6884{
6885 if (cfs_rq->load.weight)
6886 return false;
6887
6888 if (cfs_rq->avg.load_sum)
6889 return false;
6890
6891 if (cfs_rq->avg.util_sum)
6892 return false;
6893
6894 if (cfs_rq->runnable_load_sum)
6895 return false;
6896
6897 return true;
6898}
6899
6973static void update_blocked_averages(int cpu) 6900static void update_blocked_averages(int cpu)
6974{ 6901{
6975 struct rq *rq = cpu_rq(cpu); 6902 struct rq *rq = cpu_rq(cpu);
6976 struct cfs_rq *cfs_rq; 6903 struct cfs_rq *cfs_rq, *pos;
6977 struct rq_flags rf; 6904 struct rq_flags rf;
6978 6905
6979 rq_lock_irqsave(rq, &rf); 6906 rq_lock_irqsave(rq, &rf);
@@ -6983,7 +6910,7 @@ static void update_blocked_averages(int cpu)
6983 * Iterates the task_group tree in a bottom up fashion, see 6910 * Iterates the task_group tree in a bottom up fashion, see
6984 * list_add_leaf_cfs_rq() for details. 6911 * list_add_leaf_cfs_rq() for details.
6985 */ 6912 */
6986 for_each_leaf_cfs_rq(rq, cfs_rq) { 6913 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
6987 struct sched_entity *se; 6914 struct sched_entity *se;
6988 6915
6989 /* throttled entities do not contribute to load */ 6916 /* throttled entities do not contribute to load */
@@ -6997,6 +6924,13 @@ static void update_blocked_averages(int cpu)
6997 se = cfs_rq->tg->se[cpu]; 6924 se = cfs_rq->tg->se[cpu];
6998 if (se && !skip_blocked_update(se)) 6925 if (se && !skip_blocked_update(se))
6999 update_load_avg(se, 0); 6926 update_load_avg(se, 0);
6927
6928 /*
6929 * There can be a lot of idle CPU cgroups. Don't let fully
6930 * decayed cfs_rqs linger on the list.
6931 */
6932 if (cfs_rq_is_decayed(cfs_rq))
6933 list_del_leaf_cfs_rq(cfs_rq);
7000 } 6934 }
7001 rq_unlock_irqrestore(rq, &rf); 6935 rq_unlock_irqrestore(rq, &rf);
7002} 6936}
@@ -7229,7 +7163,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
7229 * span the current group. 7163 * span the current group.
7230 */ 7164 */
7231 7165
7232 for_each_cpu(cpu, sched_group_cpus(sdg)) { 7166 for_each_cpu(cpu, sched_group_span(sdg)) {
7233 struct sched_group_capacity *sgc; 7167 struct sched_group_capacity *sgc;
7234 struct rq *rq = cpu_rq(cpu); 7168 struct rq *rq = cpu_rq(cpu);
7235 7169
@@ -7408,7 +7342,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
7408 7342
7409 memset(sgs, 0, sizeof(*sgs)); 7343 memset(sgs, 0, sizeof(*sgs));
7410 7344
7411 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 7345 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
7412 struct rq *rq = cpu_rq(i); 7346 struct rq *rq = cpu_rq(i);
7413 7347
7414 /* Bias balancing toward cpus of our domain */ 7348 /* Bias balancing toward cpus of our domain */
@@ -7572,7 +7506,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
7572 struct sg_lb_stats *sgs = &tmp_sgs; 7506 struct sg_lb_stats *sgs = &tmp_sgs;
7573 int local_group; 7507 int local_group;
7574 7508
7575 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); 7509 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
7576 if (local_group) { 7510 if (local_group) {
7577 sds->local = sg; 7511 sds->local = sg;
7578 sgs = local; 7512 sgs = local;
@@ -7927,7 +7861,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
7927 unsigned long busiest_load = 0, busiest_capacity = 1; 7861 unsigned long busiest_load = 0, busiest_capacity = 1;
7928 int i; 7862 int i;
7929 7863
7930 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 7864 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
7931 unsigned long capacity, wl; 7865 unsigned long capacity, wl;
7932 enum fbq_type rt; 7866 enum fbq_type rt;
7933 7867
@@ -8033,7 +7967,6 @@ static int active_load_balance_cpu_stop(void *data);
8033static int should_we_balance(struct lb_env *env) 7967static int should_we_balance(struct lb_env *env)
8034{ 7968{
8035 struct sched_group *sg = env->sd->groups; 7969 struct sched_group *sg = env->sd->groups;
8036 struct cpumask *sg_cpus, *sg_mask;
8037 int cpu, balance_cpu = -1; 7970 int cpu, balance_cpu = -1;
8038 7971
8039 /* 7972 /*
@@ -8043,11 +7976,9 @@ static int should_we_balance(struct lb_env *env)
8043 if (env->idle == CPU_NEWLY_IDLE) 7976 if (env->idle == CPU_NEWLY_IDLE)
8044 return 1; 7977 return 1;
8045 7978
8046 sg_cpus = sched_group_cpus(sg);
8047 sg_mask = sched_group_mask(sg);
8048 /* Try to find first idle cpu */ 7979 /* Try to find first idle cpu */
8049 for_each_cpu_and(cpu, sg_cpus, env->cpus) { 7980 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
8050 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) 7981 if (!idle_cpu(cpu))
8051 continue; 7982 continue;
8052 7983
8053 balance_cpu = cpu; 7984 balance_cpu = cpu;
@@ -8083,7 +8014,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
8083 .sd = sd, 8014 .sd = sd,
8084 .dst_cpu = this_cpu, 8015 .dst_cpu = this_cpu,
8085 .dst_rq = this_rq, 8016 .dst_rq = this_rq,
8086 .dst_grpmask = sched_group_cpus(sd->groups), 8017 .dst_grpmask = sched_group_span(sd->groups),
8087 .idle = idle, 8018 .idle = idle,
8088 .loop_break = sched_nr_migrate_break, 8019 .loop_break = sched_nr_migrate_break,
8089 .cpus = cpus, 8020 .cpus = cpus,
@@ -8091,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
8091 .tasks = LIST_HEAD_INIT(env.tasks), 8022 .tasks = LIST_HEAD_INIT(env.tasks),
8092 }; 8023 };
8093 8024
8094 /* 8025 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
8095 * For NEWLY_IDLE load_balancing, we don't need to consider
8096 * other cpus in our group
8097 */
8098 if (idle == CPU_NEWLY_IDLE)
8099 env.dst_grpmask = NULL;
8100
8101 cpumask_copy(cpus, cpu_active_mask);
8102 8026
8103 schedstat_inc(sd->lb_count[idle]); 8027 schedstat_inc(sd->lb_count[idle]);
8104 8028
@@ -8220,7 +8144,15 @@ more_balance:
8220 /* All tasks on this runqueue were pinned by CPU affinity */ 8144 /* All tasks on this runqueue were pinned by CPU affinity */
8221 if (unlikely(env.flags & LBF_ALL_PINNED)) { 8145 if (unlikely(env.flags & LBF_ALL_PINNED)) {
8222 cpumask_clear_cpu(cpu_of(busiest), cpus); 8146 cpumask_clear_cpu(cpu_of(busiest), cpus);
8223 if (!cpumask_empty(cpus)) { 8147 /*
8148 * Attempting to continue load balancing at the current
8149 * sched_domain level only makes sense if there are
8150 * active CPUs remaining as possible busiest CPUs to
8151 * pull load from which are not contained within the
8152 * destination group that is receiving any migrated
8153 * load.
8154 */
8155 if (!cpumask_subset(cpus, env.dst_grpmask)) {
8224 env.loop = 0; 8156 env.loop = 0;
8225 env.loop_break = sched_nr_migrate_break; 8157 env.loop_break = sched_nr_migrate_break;
8226 goto redo; 8158 goto redo;
@@ -8516,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data)
8516 .src_cpu = busiest_rq->cpu, 8448 .src_cpu = busiest_rq->cpu,
8517 .src_rq = busiest_rq, 8449 .src_rq = busiest_rq,
8518 .idle = CPU_IDLE, 8450 .idle = CPU_IDLE,
8451 /*
8452 * can_migrate_task() doesn't need to compute new_dst_cpu
8453 * for active balancing. Since we have CPU_IDLE, but no
8454 * @dst_grpmask we need to make that test go away with lying
8455 * about DST_PINNED.
8456 */
8457 .flags = LBF_DST_PINNED,
8519 }; 8458 };
8520 8459
8521 schedstat_inc(sd->alb_count); 8460 schedstat_inc(sd->alb_count);
@@ -8659,6 +8598,10 @@ void nohz_balance_enter_idle(int cpu)
8659 if (!cpu_active(cpu)) 8598 if (!cpu_active(cpu))
8660 return; 8599 return;
8661 8600
8601 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
8602 if (!is_housekeeping_cpu(cpu))
8603 return;
8604
8662 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 8605 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
8663 return; 8606 return;
8664 8607
@@ -9523,10 +9466,10 @@ const struct sched_class fair_sched_class = {
9523#ifdef CONFIG_SCHED_DEBUG 9466#ifdef CONFIG_SCHED_DEBUG
9524void print_cfs_stats(struct seq_file *m, int cpu) 9467void print_cfs_stats(struct seq_file *m, int cpu)
9525{ 9468{
9526 struct cfs_rq *cfs_rq; 9469 struct cfs_rq *cfs_rq, *pos;
9527 9470
9528 rcu_read_lock(); 9471 rcu_read_lock();
9529 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 9472 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
9530 print_cfs_rq(m, cpu, cfs_rq); 9473 print_cfs_rq(m, cpu, cfs_rq);
9531 rcu_read_unlock(); 9474 rcu_read_unlock();
9532} 9475}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 11192e0cb122..d3fb15555291 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -55,6 +55,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
55 * When doing wakeups, attempt to limit superfluous scans of the LLC domain. 55 * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
56 */ 56 */
57SCHED_FEAT(SIS_AVG_CPU, false) 57SCHED_FEAT(SIS_AVG_CPU, false)
58SCHED_FEAT(SIS_PROP, true)
58 59
59/* 60/*
60 * Issue a WARN when we do multiple update_rq_clock() calls 61 * Issue a WARN when we do multiple update_rq_clock() calls
@@ -76,7 +77,6 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
76SCHED_FEAT(RT_PUSH_IPI, true) 77SCHED_FEAT(RT_PUSH_IPI, true)
77#endif 78#endif
78 79
79SCHED_FEAT(FORCE_SD_OVERLAP, false)
80SCHED_FEAT(RT_RUNTIME_SHARE, true) 80SCHED_FEAT(RT_RUNTIME_SHARE, true)
81SCHED_FEAT(LB_MIN, false) 81SCHED_FEAT(LB_MIN, false)
82SCHED_FEAT(ATTACH_AGE_LOAD, true) 82SCHED_FEAT(ATTACH_AGE_LOAD, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index ef63adce0c9c..6c23e30c0e5c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -219,6 +219,7 @@ static void do_idle(void)
219 */ 219 */
220 220
221 __current_set_polling(); 221 __current_set_polling();
222 quiet_vmstat();
222 tick_nohz_idle_enter(); 223 tick_nohz_idle_enter();
223 224
224 while (!need_resched()) { 225 while (!need_resched()) {
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index f15fb2bdbc0d..f14716a3522f 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -117,7 +117,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
117 * load-average relies on per-cpu sampling from the tick, it is affected by 117 * load-average relies on per-cpu sampling from the tick, it is affected by
118 * NO_HZ. 118 * NO_HZ.
119 * 119 *
120 * The basic idea is to fold the nr_active delta into a global idle-delta upon 120 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
121 * entering NO_HZ state such that we can include this as an 'extra' cpu delta 121 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
122 * when we read the global state. 122 * when we read the global state.
123 * 123 *
@@ -126,7 +126,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
126 * - When we go NO_HZ idle during the window, we can negate our sample 126 * - When we go NO_HZ idle during the window, we can negate our sample
127 * contribution, causing under-accounting. 127 * contribution, causing under-accounting.
128 * 128 *
129 * We avoid this by keeping two idle-delta counters and flipping them 129 * We avoid this by keeping two NO_HZ-delta counters and flipping them
130 * when the window starts, thus separating old and new NO_HZ load. 130 * when the window starts, thus separating old and new NO_HZ load.
131 * 131 *
132 * The only trick is the slight shift in index flip for read vs write. 132 * The only trick is the slight shift in index flip for read vs write.
@@ -137,22 +137,22 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
137 * r:0 0 1 1 0 0 1 1 0 137 * r:0 0 1 1 0 0 1 1 0
138 * w:0 1 1 0 0 1 1 0 0 138 * w:0 1 1 0 0 1 1 0 0
139 * 139 *
140 * This ensures we'll fold the old idle contribution in this window while 140 * This ensures we'll fold the old NO_HZ contribution in this window while
141 * accumlating the new one. 141 * accumlating the new one.
142 * 142 *
143 * - When we wake up from NO_HZ idle during the window, we push up our 143 * - When we wake up from NO_HZ during the window, we push up our
144 * contribution, since we effectively move our sample point to a known 144 * contribution, since we effectively move our sample point to a known
145 * busy state. 145 * busy state.
146 * 146 *
147 * This is solved by pushing the window forward, and thus skipping the 147 * This is solved by pushing the window forward, and thus skipping the
148 * sample, for this cpu (effectively using the idle-delta for this cpu which 148 * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which
149 * was in effect at the time the window opened). This also solves the issue 149 * was in effect at the time the window opened). This also solves the issue
150 * of having to deal with a cpu having been in NOHZ idle for multiple 150 * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ
151 * LOAD_FREQ intervals. 151 * intervals.
152 * 152 *
153 * When making the ILB scale, we should try to pull this in as well. 153 * When making the ILB scale, we should try to pull this in as well.
154 */ 154 */
155static atomic_long_t calc_load_idle[2]; 155static atomic_long_t calc_load_nohz[2];
156static int calc_load_idx; 156static int calc_load_idx;
157 157
158static inline int calc_load_write_idx(void) 158static inline int calc_load_write_idx(void)
@@ -167,7 +167,7 @@ static inline int calc_load_write_idx(void)
167 167
168 /* 168 /*
169 * If the folding window started, make sure we start writing in the 169 * If the folding window started, make sure we start writing in the
170 * next idle-delta. 170 * next NO_HZ-delta.
171 */ 171 */
172 if (!time_before(jiffies, READ_ONCE(calc_load_update))) 172 if (!time_before(jiffies, READ_ONCE(calc_load_update)))
173 idx++; 173 idx++;
@@ -180,24 +180,24 @@ static inline int calc_load_read_idx(void)
180 return calc_load_idx & 1; 180 return calc_load_idx & 1;
181} 181}
182 182
183void calc_load_enter_idle(void) 183void calc_load_nohz_start(void)
184{ 184{
185 struct rq *this_rq = this_rq(); 185 struct rq *this_rq = this_rq();
186 long delta; 186 long delta;
187 187
188 /* 188 /*
189 * We're going into NOHZ mode, if there's any pending delta, fold it 189 * We're going into NO_HZ mode, if there's any pending delta, fold it
190 * into the pending idle delta. 190 * into the pending NO_HZ delta.
191 */ 191 */
192 delta = calc_load_fold_active(this_rq, 0); 192 delta = calc_load_fold_active(this_rq, 0);
193 if (delta) { 193 if (delta) {
194 int idx = calc_load_write_idx(); 194 int idx = calc_load_write_idx();
195 195
196 atomic_long_add(delta, &calc_load_idle[idx]); 196 atomic_long_add(delta, &calc_load_nohz[idx]);
197 } 197 }
198} 198}
199 199
200void calc_load_exit_idle(void) 200void calc_load_nohz_stop(void)
201{ 201{
202 struct rq *this_rq = this_rq(); 202 struct rq *this_rq = this_rq();
203 203
@@ -217,13 +217,13 @@ void calc_load_exit_idle(void)
217 this_rq->calc_load_update += LOAD_FREQ; 217 this_rq->calc_load_update += LOAD_FREQ;
218} 218}
219 219
220static long calc_load_fold_idle(void) 220static long calc_load_nohz_fold(void)
221{ 221{
222 int idx = calc_load_read_idx(); 222 int idx = calc_load_read_idx();
223 long delta = 0; 223 long delta = 0;
224 224
225 if (atomic_long_read(&calc_load_idle[idx])) 225 if (atomic_long_read(&calc_load_nohz[idx]))
226 delta = atomic_long_xchg(&calc_load_idle[idx], 0); 226 delta = atomic_long_xchg(&calc_load_nohz[idx], 0);
227 227
228 return delta; 228 return delta;
229} 229}
@@ -299,9 +299,9 @@ calc_load_n(unsigned long load, unsigned long exp,
299 299
300/* 300/*
301 * NO_HZ can leave us missing all per-cpu ticks calling 301 * NO_HZ can leave us missing all per-cpu ticks calling
302 * calc_load_account_active(), but since an idle CPU folds its delta into 302 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
303 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold 303 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
304 * in the pending idle delta if our idle period crossed a load cycle boundary. 304 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
305 * 305 *
306 * Once we've updated the global active value, we need to apply the exponential 306 * Once we've updated the global active value, we need to apply the exponential
307 * weights adjusted to the number of cycles missed. 307 * weights adjusted to the number of cycles missed.
@@ -330,7 +330,7 @@ static void calc_global_nohz(void)
330 } 330 }
331 331
332 /* 332 /*
333 * Flip the idle index... 333 * Flip the NO_HZ index...
334 * 334 *
335 * Make sure we first write the new time then flip the index, so that 335 * Make sure we first write the new time then flip the index, so that
336 * calc_load_write_idx() will see the new time when it reads the new 336 * calc_load_write_idx() will see the new time when it reads the new
@@ -341,7 +341,7 @@ static void calc_global_nohz(void)
341} 341}
342#else /* !CONFIG_NO_HZ_COMMON */ 342#else /* !CONFIG_NO_HZ_COMMON */
343 343
344static inline long calc_load_fold_idle(void) { return 0; } 344static inline long calc_load_nohz_fold(void) { return 0; }
345static inline void calc_global_nohz(void) { } 345static inline void calc_global_nohz(void) { }
346 346
347#endif /* CONFIG_NO_HZ_COMMON */ 347#endif /* CONFIG_NO_HZ_COMMON */
@@ -362,9 +362,9 @@ void calc_global_load(unsigned long ticks)
362 return; 362 return;
363 363
364 /* 364 /*
365 * Fold the 'old' idle-delta to include all NO_HZ cpus. 365 * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus.
366 */ 366 */
367 delta = calc_load_fold_idle(); 367 delta = calc_load_nohz_fold();
368 if (delta) 368 if (delta)
369 atomic_long_add(delta, &calc_load_tasks); 369 atomic_long_add(delta, &calc_load_tasks);
370 370
@@ -378,7 +378,8 @@ void calc_global_load(unsigned long ticks)
378 WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ); 378 WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
379 379
380 /* 380 /*
381 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. 381 * In case we went to NO_HZ for multiple LOAD_FREQ intervals
382 * catch up in bulk.
382 */ 383 */
383 calc_global_nohz(); 384 calc_global_nohz();
384} 385}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 979b7341008a..45caf937ef90 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -840,6 +840,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
840 int enqueue = 0; 840 int enqueue = 0;
841 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 841 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
842 struct rq *rq = rq_of_rt_rq(rt_rq); 842 struct rq *rq = rq_of_rt_rq(rt_rq);
843 int skip;
844
845 /*
846 * When span == cpu_online_mask, taking each rq->lock
847 * can be time-consuming. Try to avoid it when possible.
848 */
849 raw_spin_lock(&rt_rq->rt_runtime_lock);
850 skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
851 raw_spin_unlock(&rt_rq->rt_runtime_lock);
852 if (skip)
853 continue;
843 854
844 raw_spin_lock(&rq->lock); 855 raw_spin_lock(&rq->lock);
845 if (rt_rq->rt_time) { 856 if (rt_rq->rt_time) {
@@ -1819,7 +1830,7 @@ retry:
1819 * pushing. 1830 * pushing.
1820 */ 1831 */
1821 task = pick_next_pushable_task(rq); 1832 task = pick_next_pushable_task(rq);
1822 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1833 if (task == next_task) {
1823 /* 1834 /*
1824 * The task hasn't migrated, and is still the next 1835 * The task hasn't migrated, and is still the next
1825 * eligible task, but we failed to find a run-queue 1836 * eligible task, but we failed to find a run-queue
@@ -2438,6 +2449,316 @@ const struct sched_class rt_sched_class = {
2438 .update_curr = update_curr_rt, 2449 .update_curr = update_curr_rt,
2439}; 2450};
2440 2451
2452#ifdef CONFIG_RT_GROUP_SCHED
2453/*
2454 * Ensure that the real time constraints are schedulable.
2455 */
2456static DEFINE_MUTEX(rt_constraints_mutex);
2457
2458/* Must be called with tasklist_lock held */
2459static inline int tg_has_rt_tasks(struct task_group *tg)
2460{
2461 struct task_struct *g, *p;
2462
2463 /*
2464 * Autogroups do not have RT tasks; see autogroup_create().
2465 */
2466 if (task_group_is_autogroup(tg))
2467 return 0;
2468
2469 for_each_process_thread(g, p) {
2470 if (rt_task(p) && task_group(p) == tg)
2471 return 1;
2472 }
2473
2474 return 0;
2475}
2476
2477struct rt_schedulable_data {
2478 struct task_group *tg;
2479 u64 rt_period;
2480 u64 rt_runtime;
2481};
2482
2483static int tg_rt_schedulable(struct task_group *tg, void *data)
2484{
2485 struct rt_schedulable_data *d = data;
2486 struct task_group *child;
2487 unsigned long total, sum = 0;
2488 u64 period, runtime;
2489
2490 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2491 runtime = tg->rt_bandwidth.rt_runtime;
2492
2493 if (tg == d->tg) {
2494 period = d->rt_period;
2495 runtime = d->rt_runtime;
2496 }
2497
2498 /*
2499 * Cannot have more runtime than the period.
2500 */
2501 if (runtime > period && runtime != RUNTIME_INF)
2502 return -EINVAL;
2503
2504 /*
2505 * Ensure we don't starve existing RT tasks.
2506 */
2507 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
2508 return -EBUSY;
2509
2510 total = to_ratio(period, runtime);
2511
2512 /*
2513 * Nobody can have more than the global setting allows.
2514 */
2515 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2516 return -EINVAL;
2517
2518 /*
2519 * The sum of our children's runtime should not exceed our own.
2520 */
2521 list_for_each_entry_rcu(child, &tg->children, siblings) {
2522 period = ktime_to_ns(child->rt_bandwidth.rt_period);
2523 runtime = child->rt_bandwidth.rt_runtime;
2524
2525 if (child == d->tg) {
2526 period = d->rt_period;
2527 runtime = d->rt_runtime;
2528 }
2529
2530 sum += to_ratio(period, runtime);
2531 }
2532
2533 if (sum > total)
2534 return -EINVAL;
2535
2536 return 0;
2537}
2538
2539static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2540{
2541 int ret;
2542
2543 struct rt_schedulable_data data = {
2544 .tg = tg,
2545 .rt_period = period,
2546 .rt_runtime = runtime,
2547 };
2548
2549 rcu_read_lock();
2550 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2551 rcu_read_unlock();
2552
2553 return ret;
2554}
2555
2556static int tg_set_rt_bandwidth(struct task_group *tg,
2557 u64 rt_period, u64 rt_runtime)
2558{
2559 int i, err = 0;
2560
2561 /*
2562 * Disallowing the root group RT runtime is BAD, it would disallow the
2563 * kernel creating (and or operating) RT threads.
2564 */
2565 if (tg == &root_task_group && rt_runtime == 0)
2566 return -EINVAL;
2567
2568 /* No period doesn't make any sense. */
2569 if (rt_period == 0)
2570 return -EINVAL;
2571
2572 mutex_lock(&rt_constraints_mutex);
2573 read_lock(&tasklist_lock);
2574 err = __rt_schedulable(tg, rt_period, rt_runtime);
2575 if (err)
2576 goto unlock;
2577
2578 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2579 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2580 tg->rt_bandwidth.rt_runtime = rt_runtime;
2581
2582 for_each_possible_cpu(i) {
2583 struct rt_rq *rt_rq = tg->rt_rq[i];
2584
2585 raw_spin_lock(&rt_rq->rt_runtime_lock);
2586 rt_rq->rt_runtime = rt_runtime;
2587 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2588 }
2589 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2590unlock:
2591 read_unlock(&tasklist_lock);
2592 mutex_unlock(&rt_constraints_mutex);
2593
2594 return err;
2595}
2596
2597int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2598{
2599 u64 rt_runtime, rt_period;
2600
2601 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2602 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2603 if (rt_runtime_us < 0)
2604 rt_runtime = RUNTIME_INF;
2605
2606 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2607}
2608
2609long sched_group_rt_runtime(struct task_group *tg)
2610{
2611 u64 rt_runtime_us;
2612
2613 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2614 return -1;
2615
2616 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2617 do_div(rt_runtime_us, NSEC_PER_USEC);
2618 return rt_runtime_us;
2619}
2620
2621int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2622{
2623 u64 rt_runtime, rt_period;
2624
2625 rt_period = rt_period_us * NSEC_PER_USEC;
2626 rt_runtime = tg->rt_bandwidth.rt_runtime;
2627
2628 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2629}
2630
2631long sched_group_rt_period(struct task_group *tg)
2632{
2633 u64 rt_period_us;
2634
2635 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2636 do_div(rt_period_us, NSEC_PER_USEC);
2637 return rt_period_us;
2638}
2639
2640static int sched_rt_global_constraints(void)
2641{
2642 int ret = 0;
2643
2644 mutex_lock(&rt_constraints_mutex);
2645 read_lock(&tasklist_lock);
2646 ret = __rt_schedulable(NULL, 0, 0);
2647 read_unlock(&tasklist_lock);
2648 mutex_unlock(&rt_constraints_mutex);
2649
2650 return ret;
2651}
2652
2653int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2654{
2655 /* Don't accept realtime tasks when there is no way for them to run */
2656 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2657 return 0;
2658
2659 return 1;
2660}
2661
2662#else /* !CONFIG_RT_GROUP_SCHED */
2663static int sched_rt_global_constraints(void)
2664{
2665 unsigned long flags;
2666 int i;
2667
2668 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2669 for_each_possible_cpu(i) {
2670 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2671
2672 raw_spin_lock(&rt_rq->rt_runtime_lock);
2673 rt_rq->rt_runtime = global_rt_runtime();
2674 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2675 }
2676 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2677
2678 return 0;
2679}
2680#endif /* CONFIG_RT_GROUP_SCHED */
2681
2682static int sched_rt_global_validate(void)
2683{
2684 if (sysctl_sched_rt_period <= 0)
2685 return -EINVAL;
2686
2687 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2688 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
2689 return -EINVAL;
2690
2691 return 0;
2692}
2693
2694static void sched_rt_do_global(void)
2695{
2696 def_rt_bandwidth.rt_runtime = global_rt_runtime();
2697 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
2698}
2699
2700int sched_rt_handler(struct ctl_table *table, int write,
2701 void __user *buffer, size_t *lenp,
2702 loff_t *ppos)
2703{
2704 int old_period, old_runtime;
2705 static DEFINE_MUTEX(mutex);
2706 int ret;
2707
2708 mutex_lock(&mutex);
2709 old_period = sysctl_sched_rt_period;
2710 old_runtime = sysctl_sched_rt_runtime;
2711
2712 ret = proc_dointvec(table, write, buffer, lenp, ppos);
2713
2714 if (!ret && write) {
2715 ret = sched_rt_global_validate();
2716 if (ret)
2717 goto undo;
2718
2719 ret = sched_dl_global_validate();
2720 if (ret)
2721 goto undo;
2722
2723 ret = sched_rt_global_constraints();
2724 if (ret)
2725 goto undo;
2726
2727 sched_rt_do_global();
2728 sched_dl_do_global();
2729 }
2730 if (0) {
2731undo:
2732 sysctl_sched_rt_period = old_period;
2733 sysctl_sched_rt_runtime = old_runtime;
2734 }
2735 mutex_unlock(&mutex);
2736
2737 return ret;
2738}
2739
2740int sched_rr_handler(struct ctl_table *table, int write,
2741 void __user *buffer, size_t *lenp,
2742 loff_t *ppos)
2743{
2744 int ret;
2745 static DEFINE_MUTEX(mutex);
2746
2747 mutex_lock(&mutex);
2748 ret = proc_dointvec(table, write, buffer, lenp, ppos);
2749 /*
2750 * Make sure that internally we keep jiffies.
2751 * Also, writing zero resets the timeslice to default:
2752 */
2753 if (!ret && write) {
2754 sched_rr_timeslice =
2755 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
2756 msecs_to_jiffies(sysctl_sched_rr_timeslice);
2757 }
2758 mutex_unlock(&mutex);
2759 return ret;
2760}
2761
2441#ifdef CONFIG_SCHED_DEBUG 2762#ifdef CONFIG_SCHED_DEBUG
2442extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); 2763extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
2443 2764
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6dda2aab731e..eeef1a3086d1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -39,9 +39,9 @@
39#include "cpuacct.h" 39#include "cpuacct.h"
40 40
41#ifdef CONFIG_SCHED_DEBUG 41#ifdef CONFIG_SCHED_DEBUG
42#define SCHED_WARN_ON(x) WARN_ONCE(x, #x) 42# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
43#else 43#else
44#define SCHED_WARN_ON(x) ((void)(x)) 44# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
45#endif 45#endif
46 46
47struct rq; 47struct rq;
@@ -218,23 +218,25 @@ static inline int dl_bandwidth_enabled(void)
218 return sysctl_sched_rt_runtime >= 0; 218 return sysctl_sched_rt_runtime >= 0;
219} 219}
220 220
221extern struct dl_bw *dl_bw_of(int i);
222
223struct dl_bw { 221struct dl_bw {
224 raw_spinlock_t lock; 222 raw_spinlock_t lock;
225 u64 bw, total_bw; 223 u64 bw, total_bw;
226}; 224};
227 225
226static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
227
228static inline 228static inline
229void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) 229void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
230{ 230{
231 dl_b->total_bw -= tsk_bw; 231 dl_b->total_bw -= tsk_bw;
232 __dl_update(dl_b, (s32)tsk_bw / cpus);
232} 233}
233 234
234static inline 235static inline
235void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) 236void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
236{ 237{
237 dl_b->total_bw += tsk_bw; 238 dl_b->total_bw += tsk_bw;
239 __dl_update(dl_b, -((s32)tsk_bw / cpus));
238} 240}
239 241
240static inline 242static inline
@@ -244,7 +246,22 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
244 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; 246 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
245} 247}
246 248
249void dl_change_utilization(struct task_struct *p, u64 new_bw);
247extern void init_dl_bw(struct dl_bw *dl_b); 250extern void init_dl_bw(struct dl_bw *dl_b);
251extern int sched_dl_global_validate(void);
252extern void sched_dl_do_global(void);
253extern int sched_dl_overflow(struct task_struct *p, int policy,
254 const struct sched_attr *attr);
255extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
256extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
257extern bool __checkparam_dl(const struct sched_attr *attr);
258extern void __dl_clear_params(struct task_struct *p);
259extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
260extern int dl_task_can_attach(struct task_struct *p,
261 const struct cpumask *cs_cpus_allowed);
262extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
263 const struct cpumask *trial);
264extern bool dl_cpu_busy(unsigned int cpu);
248 265
249#ifdef CONFIG_CGROUP_SCHED 266#ifdef CONFIG_CGROUP_SCHED
250 267
@@ -366,6 +383,11 @@ extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent
366extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 383extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
367 struct sched_rt_entity *rt_se, int cpu, 384 struct sched_rt_entity *rt_se, int cpu,
368 struct sched_rt_entity *parent); 385 struct sched_rt_entity *parent);
386extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us);
387extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us);
388extern long sched_group_rt_runtime(struct task_group *tg);
389extern long sched_group_rt_period(struct task_group *tg);
390extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
369 391
370extern struct task_group *sched_create_group(struct task_group *parent); 392extern struct task_group *sched_create_group(struct task_group *parent);
371extern void sched_online_group(struct task_group *tg, 393extern void sched_online_group(struct task_group *tg,
@@ -558,6 +580,30 @@ struct dl_rq {
558#else 580#else
559 struct dl_bw dl_bw; 581 struct dl_bw dl_bw;
560#endif 582#endif
583 /*
584 * "Active utilization" for this runqueue: increased when a
585 * task wakes up (becomes TASK_RUNNING) and decreased when a
586 * task blocks
587 */
588 u64 running_bw;
589
590 /*
591 * Utilization of the tasks "assigned" to this runqueue (including
592 * the tasks that are in runqueue and the tasks that executed on this
593 * CPU and blocked). Increased when a task moves to this runqueue, and
594 * decreased when the task moves away (migrates, changes scheduling
595 * policy, or terminates).
596 * This is needed to compute the "inactive utilization" for the
597 * runqueue (inactive utilization = this_bw - running_bw).
598 */
599 u64 this_bw;
600 u64 extra_bw;
601
602 /*
603 * Inverse of the fraction of CPU utilization that can be reclaimed
604 * by the GRUB algorithm.
605 */
606 u64 bw_ratio;
561}; 607};
562 608
563#ifdef CONFIG_SMP 609#ifdef CONFIG_SMP
@@ -606,11 +652,9 @@ struct root_domain {
606 652
607extern struct root_domain def_root_domain; 653extern struct root_domain def_root_domain;
608extern struct mutex sched_domains_mutex; 654extern struct mutex sched_domains_mutex;
609extern cpumask_var_t fallback_doms;
610extern cpumask_var_t sched_domains_tmpmask;
611 655
612extern void init_defrootdomain(void); 656extern void init_defrootdomain(void);
613extern int init_sched_domains(const struct cpumask *cpu_map); 657extern int sched_init_domains(const struct cpumask *cpu_map);
614extern void rq_attach_root(struct rq *rq, struct root_domain *rd); 658extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
615 659
616#endif /* CONFIG_SMP */ 660#endif /* CONFIG_SMP */
@@ -1025,7 +1069,11 @@ struct sched_group_capacity {
1025 unsigned long next_update; 1069 unsigned long next_update;
1026 int imbalance; /* XXX unrelated to capacity but shared group state */ 1070 int imbalance; /* XXX unrelated to capacity but shared group state */
1027 1071
1028 unsigned long cpumask[0]; /* iteration mask */ 1072#ifdef CONFIG_SCHED_DEBUG
1073 int id;
1074#endif
1075
1076 unsigned long cpumask[0]; /* balance mask */
1029}; 1077};
1030 1078
1031struct sched_group { 1079struct sched_group {
@@ -1046,16 +1094,15 @@ struct sched_group {
1046 unsigned long cpumask[0]; 1094 unsigned long cpumask[0];
1047}; 1095};
1048 1096
1049static inline struct cpumask *sched_group_cpus(struct sched_group *sg) 1097static inline struct cpumask *sched_group_span(struct sched_group *sg)
1050{ 1098{
1051 return to_cpumask(sg->cpumask); 1099 return to_cpumask(sg->cpumask);
1052} 1100}
1053 1101
1054/* 1102/*
1055 * cpumask masking which cpus in the group are allowed to iterate up the domain 1103 * See build_balance_mask().
1056 * tree.
1057 */ 1104 */
1058static inline struct cpumask *sched_group_mask(struct sched_group *sg) 1105static inline struct cpumask *group_balance_mask(struct sched_group *sg)
1059{ 1106{
1060 return to_cpumask(sg->sgc->cpumask); 1107 return to_cpumask(sg->sgc->cpumask);
1061} 1108}
@@ -1066,7 +1113,7 @@ static inline struct cpumask *sched_group_mask(struct sched_group *sg)
1066 */ 1113 */
1067static inline unsigned int group_first_cpu(struct sched_group *group) 1114static inline unsigned int group_first_cpu(struct sched_group *group)
1068{ 1115{
1069 return cpumask_first(sched_group_cpus(group)); 1116 return cpumask_first(sched_group_span(group));
1070} 1117}
1071 1118
1072extern int group_balance_cpu(struct sched_group *sg); 1119extern int group_balance_cpu(struct sched_group *sg);
@@ -1422,7 +1469,11 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
1422 curr->sched_class->set_curr_task(rq); 1469 curr->sched_class->set_curr_task(rq);
1423} 1470}
1424 1471
1472#ifdef CONFIG_SMP
1425#define sched_class_highest (&stop_sched_class) 1473#define sched_class_highest (&stop_sched_class)
1474#else
1475#define sched_class_highest (&dl_sched_class)
1476#endif
1426#define for_each_class(class) \ 1477#define for_each_class(class) \
1427 for (class = sched_class_highest; class; class = class->next) 1478 for (class = sched_class_highest; class; class = class->next)
1428 1479
@@ -1486,7 +1537,12 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
1486extern struct dl_bandwidth def_dl_bandwidth; 1537extern struct dl_bandwidth def_dl_bandwidth;
1487extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); 1538extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
1488extern void init_dl_task_timer(struct sched_dl_entity *dl_se); 1539extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1540extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
1541extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
1489 1542
1543#define BW_SHIFT 20
1544#define BW_UNIT (1 << BW_SHIFT)
1545#define RATIO_SHIFT 8
1490unsigned long to_ratio(u64 period, u64 runtime); 1546unsigned long to_ratio(u64 period, u64 runtime);
1491 1547
1492extern void init_entity_runnable_average(struct sched_entity *se); 1548extern void init_entity_runnable_average(struct sched_entity *se);
@@ -1928,6 +1984,33 @@ extern void nohz_balance_exit_idle(unsigned int cpu);
1928static inline void nohz_balance_exit_idle(unsigned int cpu) { } 1984static inline void nohz_balance_exit_idle(unsigned int cpu) { }
1929#endif 1985#endif
1930 1986
1987
1988#ifdef CONFIG_SMP
1989static inline
1990void __dl_update(struct dl_bw *dl_b, s64 bw)
1991{
1992 struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
1993 int i;
1994
1995 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
1996 "sched RCU must be held");
1997 for_each_cpu_and(i, rd->span, cpu_active_mask) {
1998 struct rq *rq = cpu_rq(i);
1999
2000 rq->dl.extra_bw += bw;
2001 }
2002}
2003#else
2004static inline
2005void __dl_update(struct dl_bw *dl_b, s64 bw)
2006{
2007 struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
2008
2009 dl->extra_bw += bw;
2010}
2011#endif
2012
2013
1931#ifdef CONFIG_IRQ_TIME_ACCOUNTING 2014#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1932struct irqtime { 2015struct irqtime {
1933 u64 total; 2016 u64 total;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1b0b4fb12837..79895aec281e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -10,6 +10,7 @@ DEFINE_MUTEX(sched_domains_mutex);
10 10
11/* Protected by sched_domains_mutex: */ 11/* Protected by sched_domains_mutex: */
12cpumask_var_t sched_domains_tmpmask; 12cpumask_var_t sched_domains_tmpmask;
13cpumask_var_t sched_domains_tmpmask2;
13 14
14#ifdef CONFIG_SCHED_DEBUG 15#ifdef CONFIG_SCHED_DEBUG
15 16
@@ -35,7 +36,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
35 36
36 cpumask_clear(groupmask); 37 cpumask_clear(groupmask);
37 38
38 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 39 printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
39 40
40 if (!(sd->flags & SD_LOAD_BALANCE)) { 41 if (!(sd->flags & SD_LOAD_BALANCE)) {
41 printk("does not load-balance\n"); 42 printk("does not load-balance\n");
@@ -45,14 +46,14 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
45 return -1; 46 return -1;
46 } 47 }
47 48
48 printk(KERN_CONT "span %*pbl level %s\n", 49 printk(KERN_CONT "span=%*pbl level=%s\n",
49 cpumask_pr_args(sched_domain_span(sd)), sd->name); 50 cpumask_pr_args(sched_domain_span(sd)), sd->name);
50 51
51 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 52 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
52 printk(KERN_ERR "ERROR: domain->span does not contain " 53 printk(KERN_ERR "ERROR: domain->span does not contain "
53 "CPU%d\n", cpu); 54 "CPU%d\n", cpu);
54 } 55 }
55 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 56 if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
56 printk(KERN_ERR "ERROR: domain->groups does not contain" 57 printk(KERN_ERR "ERROR: domain->groups does not contain"
57 " CPU%d\n", cpu); 58 " CPU%d\n", cpu);
58 } 59 }
@@ -65,29 +66,47 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
65 break; 66 break;
66 } 67 }
67 68
68 if (!cpumask_weight(sched_group_cpus(group))) { 69 if (!cpumask_weight(sched_group_span(group))) {
69 printk(KERN_CONT "\n"); 70 printk(KERN_CONT "\n");
70 printk(KERN_ERR "ERROR: empty group\n"); 71 printk(KERN_ERR "ERROR: empty group\n");
71 break; 72 break;
72 } 73 }
73 74
74 if (!(sd->flags & SD_OVERLAP) && 75 if (!(sd->flags & SD_OVERLAP) &&
75 cpumask_intersects(groupmask, sched_group_cpus(group))) { 76 cpumask_intersects(groupmask, sched_group_span(group))) {
76 printk(KERN_CONT "\n"); 77 printk(KERN_CONT "\n");
77 printk(KERN_ERR "ERROR: repeated CPUs\n"); 78 printk(KERN_ERR "ERROR: repeated CPUs\n");
78 break; 79 break;
79 } 80 }
80 81
81 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 82 cpumask_or(groupmask, groupmask, sched_group_span(group));
82 83
83 printk(KERN_CONT " %*pbl", 84 printk(KERN_CONT " %d:{ span=%*pbl",
84 cpumask_pr_args(sched_group_cpus(group))); 85 group->sgc->id,
85 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { 86 cpumask_pr_args(sched_group_span(group)));
86 printk(KERN_CONT " (cpu_capacity = %lu)", 87
87 group->sgc->capacity); 88 if ((sd->flags & SD_OVERLAP) &&
89 !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
90 printk(KERN_CONT " mask=%*pbl",
91 cpumask_pr_args(group_balance_mask(group)));
92 }
93
94 if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
95 printk(KERN_CONT " cap=%lu", group->sgc->capacity);
96
97 if (group == sd->groups && sd->child &&
98 !cpumask_equal(sched_domain_span(sd->child),
99 sched_group_span(group))) {
100 printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
88 } 101 }
89 102
103 printk(KERN_CONT " }");
104
90 group = group->next; 105 group = group->next;
106
107 if (group != sd->groups)
108 printk(KERN_CONT ",");
109
91 } while (group != sd->groups); 110 } while (group != sd->groups);
92 printk(KERN_CONT "\n"); 111 printk(KERN_CONT "\n");
93 112
@@ -113,7 +132,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
113 return; 132 return;
114 } 133 }
115 134
116 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 135 printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
117 136
118 for (;;) { 137 for (;;) {
119 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 138 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
@@ -477,46 +496,214 @@ enum s_alloc {
477}; 496};
478 497
479/* 498/*
480 * Build an iteration mask that can exclude certain CPUs from the upwards 499 * Return the canonical balance CPU for this group, this is the first CPU
481 * domain traversal. 500 * of this group that's also in the balance mask.
482 * 501 *
483 * Asymmetric node setups can result in situations where the domain tree is of 502 * The balance mask are all those CPUs that could actually end up at this
484 * unequal depth, make sure to skip domains that already cover the entire 503 * group. See build_balance_mask().
485 * range.
486 * 504 *
487 * In that case build_sched_domains() will have terminated the iteration early 505 * Also see should_we_balance().
488 * and our sibling sd spans will be empty. Domains should always include the
489 * CPU they're built on, so check that.
490 */ 506 */
491static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) 507int group_balance_cpu(struct sched_group *sg)
492{ 508{
493 const struct cpumask *span = sched_domain_span(sd); 509 return cpumask_first(group_balance_mask(sg));
510}
511
512
513/*
514 * NUMA topology (first read the regular topology blurb below)
515 *
516 * Given a node-distance table, for example:
517 *
518 * node 0 1 2 3
519 * 0: 10 20 30 20
520 * 1: 20 10 20 30
521 * 2: 30 20 10 20
522 * 3: 20 30 20 10
523 *
524 * which represents a 4 node ring topology like:
525 *
526 * 0 ----- 1
527 * | |
528 * | |
529 * | |
530 * 3 ----- 2
531 *
532 * We want to construct domains and groups to represent this. The way we go
533 * about doing this is to build the domains on 'hops'. For each NUMA level we
534 * construct the mask of all nodes reachable in @level hops.
535 *
536 * For the above NUMA topology that gives 3 levels:
537 *
538 * NUMA-2 0-3 0-3 0-3 0-3
539 * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2}
540 *
541 * NUMA-1 0-1,3 0-2 1-3 0,2-3
542 * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3}
543 *
544 * NUMA-0 0 1 2 3
545 *
546 *
547 * As can be seen; things don't nicely line up as with the regular topology.
548 * When we iterate a domain in child domain chunks some nodes can be
549 * represented multiple times -- hence the "overlap" naming for this part of
550 * the topology.
551 *
552 * In order to minimize this overlap, we only build enough groups to cover the
553 * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
554 *
555 * Because:
556 *
557 * - the first group of each domain is its child domain; this
558 * gets us the first 0-1,3
559 * - the only uncovered node is 2, who's child domain is 1-3.
560 *
561 * However, because of the overlap, computing a unique CPU for each group is
562 * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
563 * groups include the CPUs of Node-0, while those CPUs would not in fact ever
564 * end up at those groups (they would end up in group: 0-1,3).
565 *
566 * To correct this we have to introduce the group balance mask. This mask
567 * will contain those CPUs in the group that can reach this group given the
568 * (child) domain tree.
569 *
570 * With this we can once again compute balance_cpu and sched_group_capacity
571 * relations.
572 *
573 * XXX include words on how balance_cpu is unique and therefore can be
574 * used for sched_group_capacity links.
575 *
576 *
577 * Another 'interesting' topology is:
578 *
579 * node 0 1 2 3
580 * 0: 10 20 20 30
581 * 1: 20 10 20 20
582 * 2: 20 20 10 20
583 * 3: 30 20 20 10
584 *
585 * Which looks a little like:
586 *
587 * 0 ----- 1
588 * | / |
589 * | / |
590 * | / |
591 * 2 ----- 3
592 *
593 * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
594 * are not.
595 *
596 * This leads to a few particularly weird cases where the sched_domain's are
597 * not of the same number for each cpu. Consider:
598 *
599 * NUMA-2 0-3 0-3
600 * groups: {0-2},{1-3} {1-3},{0-2}
601 *
602 * NUMA-1 0-2 0-3 0-3 1-3
603 *
604 * NUMA-0 0 1 2 3
605 *
606 */
607
608
609/*
610 * Build the balance mask; it contains only those CPUs that can arrive at this
611 * group and should be considered to continue balancing.
612 *
613 * We do this during the group creation pass, therefore the group information
614 * isn't complete yet, however since each group represents a (child) domain we
615 * can fully construct this using the sched_domain bits (which are already
616 * complete).
617 */
618static void
619build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
620{
621 const struct cpumask *sg_span = sched_group_span(sg);
494 struct sd_data *sdd = sd->private; 622 struct sd_data *sdd = sd->private;
495 struct sched_domain *sibling; 623 struct sched_domain *sibling;
496 int i; 624 int i;
497 625
498 for_each_cpu(i, span) { 626 cpumask_clear(mask);
627
628 for_each_cpu(i, sg_span) {
499 sibling = *per_cpu_ptr(sdd->sd, i); 629 sibling = *per_cpu_ptr(sdd->sd, i);
500 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 630
631 /*
632 * Can happen in the asymmetric case, where these siblings are
633 * unused. The mask will not be empty because those CPUs that
634 * do have the top domain _should_ span the domain.
635 */
636 if (!sibling->child)
501 continue; 637 continue;
502 638
503 cpumask_set_cpu(i, sched_group_mask(sg)); 639 /* If we would not end up here, we can't continue from here */
640 if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
641 continue;
642
643 cpumask_set_cpu(i, mask);
504 } 644 }
645
646 /* We must not have empty masks here */
647 WARN_ON_ONCE(cpumask_empty(mask));
505} 648}
506 649
507/* 650/*
508 * Return the canonical balance CPU for this group, this is the first CPU 651 * XXX: This creates per-node group entries; since the load-balancer will
509 * of this group that's also in the iteration mask. 652 * immediately access remote memory to construct this group's load-balance
653 * statistics having the groups node local is of dubious benefit.
510 */ 654 */
511int group_balance_cpu(struct sched_group *sg) 655static struct sched_group *
656build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
512{ 657{
513 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); 658 struct sched_group *sg;
659 struct cpumask *sg_span;
660
661 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
662 GFP_KERNEL, cpu_to_node(cpu));
663
664 if (!sg)
665 return NULL;
666
667 sg_span = sched_group_span(sg);
668 if (sd->child)
669 cpumask_copy(sg_span, sched_domain_span(sd->child));
670 else
671 cpumask_copy(sg_span, sched_domain_span(sd));
672
673 return sg;
674}
675
676static void init_overlap_sched_group(struct sched_domain *sd,
677 struct sched_group *sg)
678{
679 struct cpumask *mask = sched_domains_tmpmask2;
680 struct sd_data *sdd = sd->private;
681 struct cpumask *sg_span;
682 int cpu;
683
684 build_balance_mask(sd, sg, mask);
685 cpu = cpumask_first_and(sched_group_span(sg), mask);
686
687 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
688 if (atomic_inc_return(&sg->sgc->ref) == 1)
689 cpumask_copy(group_balance_mask(sg), mask);
690 else
691 WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
692
693 /*
694 * Initialize sgc->capacity such that even if we mess up the
695 * domains and no possible iteration will get us here, we won't
696 * die on a /0 trap.
697 */
698 sg_span = sched_group_span(sg);
699 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
700 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
514} 701}
515 702
516static int 703static int
517build_overlap_sched_groups(struct sched_domain *sd, int cpu) 704build_overlap_sched_groups(struct sched_domain *sd, int cpu)
518{ 705{
519 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 706 struct sched_group *first = NULL, *last = NULL, *sg;
520 const struct cpumask *span = sched_domain_span(sd); 707 const struct cpumask *span = sched_domain_span(sd);
521 struct cpumask *covered = sched_domains_tmpmask; 708 struct cpumask *covered = sched_domains_tmpmask;
522 struct sd_data *sdd = sd->private; 709 struct sd_data *sdd = sd->private;
@@ -525,7 +712,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
525 712
526 cpumask_clear(covered); 713 cpumask_clear(covered);
527 714
528 for_each_cpu(i, span) { 715 for_each_cpu_wrap(i, span, cpu) {
529 struct cpumask *sg_span; 716 struct cpumask *sg_span;
530 717
531 if (cpumask_test_cpu(i, covered)) 718 if (cpumask_test_cpu(i, covered))
@@ -533,44 +720,27 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
533 720
534 sibling = *per_cpu_ptr(sdd->sd, i); 721 sibling = *per_cpu_ptr(sdd->sd, i);
535 722
536 /* See the comment near build_group_mask(). */ 723 /*
724 * Asymmetric node setups can result in situations where the
725 * domain tree is of unequal depth, make sure to skip domains
726 * that already cover the entire range.
727 *
728 * In that case build_sched_domains() will have terminated the
729 * iteration early and our sibling sd spans will be empty.
730 * Domains should always include the CPU they're built on, so
731 * check that.
732 */
537 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 733 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
538 continue; 734 continue;
539 735
540 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 736 sg = build_group_from_child_sched_domain(sibling, cpu);
541 GFP_KERNEL, cpu_to_node(cpu));
542
543 if (!sg) 737 if (!sg)
544 goto fail; 738 goto fail;
545 739
546 sg_span = sched_group_cpus(sg); 740 sg_span = sched_group_span(sg);
547 if (sibling->child)
548 cpumask_copy(sg_span, sched_domain_span(sibling->child));
549 else
550 cpumask_set_cpu(i, sg_span);
551
552 cpumask_or(covered, covered, sg_span); 741 cpumask_or(covered, covered, sg_span);
553 742
554 sg->sgc = *per_cpu_ptr(sdd->sgc, i); 743 init_overlap_sched_group(sd, sg);
555 if (atomic_inc_return(&sg->sgc->ref) == 1)
556 build_group_mask(sd, sg);
557
558 /*
559 * Initialize sgc->capacity such that even if we mess up the
560 * domains and no possible iteration will get us here, we won't
561 * die on a /0 trap.
562 */
563 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
564 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
565
566 /*
567 * Make sure the first group of this domain contains the
568 * canonical balance CPU. Otherwise the sched_domain iteration
569 * breaks. See update_sg_lb_stats().
570 */
571 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
572 group_balance_cpu(sg) == cpu)
573 groups = sg;
574 744
575 if (!first) 745 if (!first)
576 first = sg; 746 first = sg;
@@ -579,7 +749,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
579 last = sg; 749 last = sg;
580 last->next = first; 750 last->next = first;
581 } 751 }
582 sd->groups = groups; 752 sd->groups = first;
583 753
584 return 0; 754 return 0;
585 755
@@ -589,23 +759,106 @@ fail:
589 return -ENOMEM; 759 return -ENOMEM;
590} 760}
591 761
592static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 762
763/*
764 * Package topology (also see the load-balance blurb in fair.c)
765 *
766 * The scheduler builds a tree structure to represent a number of important
767 * topology features. By default (default_topology[]) these include:
768 *
769 * - Simultaneous multithreading (SMT)
770 * - Multi-Core Cache (MC)
771 * - Package (DIE)
772 *
773 * Where the last one more or less denotes everything up to a NUMA node.
774 *
775 * The tree consists of 3 primary data structures:
776 *
777 * sched_domain -> sched_group -> sched_group_capacity
778 * ^ ^ ^ ^
779 * `-' `-'
780 *
781 * The sched_domains are per-cpu and have a two way link (parent & child) and
782 * denote the ever growing mask of CPUs belonging to that level of topology.
783 *
784 * Each sched_domain has a circular (double) linked list of sched_group's, each
785 * denoting the domains of the level below (or individual CPUs in case of the
786 * first domain level). The sched_group linked by a sched_domain includes the
787 * CPU of that sched_domain [*].
788 *
789 * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
790 *
791 * CPU 0 1 2 3 4 5 6 7
792 *
793 * DIE [ ]
794 * MC [ ] [ ]
795 * SMT [ ] [ ] [ ] [ ]
796 *
797 * - or -
798 *
799 * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
800 * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
801 * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
802 *
803 * CPU 0 1 2 3 4 5 6 7
804 *
805 * One way to think about it is: sched_domain moves you up and down among these
806 * topology levels, while sched_group moves you sideways through it, at child
807 * domain granularity.
808 *
809 * sched_group_capacity ensures each unique sched_group has shared storage.
810 *
811 * There are two related construction problems, both require a CPU that
812 * uniquely identify each group (for a given domain):
813 *
814 * - The first is the balance_cpu (see should_we_balance() and the
815 * load-balance blub in fair.c); for each group we only want 1 CPU to
816 * continue balancing at a higher domain.
817 *
818 * - The second is the sched_group_capacity; we want all identical groups
819 * to share a single sched_group_capacity.
820 *
821 * Since these topologies are exclusive by construction. That is, its
822 * impossible for an SMT thread to belong to multiple cores, and cores to
823 * be part of multiple caches. There is a very clear and unique location
824 * for each CPU in the hierarchy.
825 *
826 * Therefore computing a unique CPU for each group is trivial (the iteration
827 * mask is redundant and set all 1s; all CPUs in a group will end up at _that_
828 * group), we can simply pick the first CPU in each group.
829 *
830 *
831 * [*] in other words, the first group of each domain is its child domain.
832 */
833
834static struct sched_group *get_group(int cpu, struct sd_data *sdd)
593{ 835{
594 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 836 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
595 struct sched_domain *child = sd->child; 837 struct sched_domain *child = sd->child;
838 struct sched_group *sg;
596 839
597 if (child) 840 if (child)
598 cpu = cpumask_first(sched_domain_span(child)); 841 cpu = cpumask_first(sched_domain_span(child));
599 842
600 if (sg) { 843 sg = *per_cpu_ptr(sdd->sg, cpu);
601 *sg = *per_cpu_ptr(sdd->sg, cpu); 844 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
602 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); 845
846 /* For claim_allocations: */
847 atomic_inc(&sg->ref);
848 atomic_inc(&sg->sgc->ref);
603 849
604 /* For claim_allocations: */ 850 if (child) {
605 atomic_set(&(*sg)->sgc->ref, 1); 851 cpumask_copy(sched_group_span(sg), sched_domain_span(child));
852 cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
853 } else {
854 cpumask_set_cpu(cpu, sched_group_span(sg));
855 cpumask_set_cpu(cpu, group_balance_mask(sg));
606 } 856 }
607 857
608 return cpu; 858 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
859 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
860
861 return sg;
609} 862}
610 863
611/* 864/*
@@ -624,34 +877,20 @@ build_sched_groups(struct sched_domain *sd, int cpu)
624 struct cpumask *covered; 877 struct cpumask *covered;
625 int i; 878 int i;
626 879
627 get_group(cpu, sdd, &sd->groups);
628 atomic_inc(&sd->groups->ref);
629
630 if (cpu != cpumask_first(span))
631 return 0;
632
633 lockdep_assert_held(&sched_domains_mutex); 880 lockdep_assert_held(&sched_domains_mutex);
634 covered = sched_domains_tmpmask; 881 covered = sched_domains_tmpmask;
635 882
636 cpumask_clear(covered); 883 cpumask_clear(covered);
637 884
638 for_each_cpu(i, span) { 885 for_each_cpu_wrap(i, span, cpu) {
639 struct sched_group *sg; 886 struct sched_group *sg;
640 int group, j;
641 887
642 if (cpumask_test_cpu(i, covered)) 888 if (cpumask_test_cpu(i, covered))
643 continue; 889 continue;
644 890
645 group = get_group(i, sdd, &sg); 891 sg = get_group(i, sdd);
646 cpumask_setall(sched_group_mask(sg));
647 892
648 for_each_cpu(j, span) { 893 cpumask_or(covered, covered, sched_group_span(sg));
649 if (get_group(j, sdd, NULL) != group)
650 continue;
651
652 cpumask_set_cpu(j, covered);
653 cpumask_set_cpu(j, sched_group_cpus(sg));
654 }
655 894
656 if (!first) 895 if (!first)
657 first = sg; 896 first = sg;
@@ -660,6 +899,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
660 last = sg; 899 last = sg;
661 } 900 }
662 last->next = first; 901 last->next = first;
902 sd->groups = first;
663 903
664 return 0; 904 return 0;
665} 905}
@@ -683,12 +923,12 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
683 do { 923 do {
684 int cpu, max_cpu = -1; 924 int cpu, max_cpu = -1;
685 925
686 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 926 sg->group_weight = cpumask_weight(sched_group_span(sg));
687 927
688 if (!(sd->flags & SD_ASYM_PACKING)) 928 if (!(sd->flags & SD_ASYM_PACKING))
689 goto next; 929 goto next;
690 930
691 for_each_cpu(cpu, sched_group_cpus(sg)) { 931 for_each_cpu(cpu, sched_group_span(sg)) {
692 if (max_cpu < 0) 932 if (max_cpu < 0)
693 max_cpu = cpu; 933 max_cpu = cpu;
694 else if (sched_asym_prefer(cpu, max_cpu)) 934 else if (sched_asym_prefer(cpu, max_cpu))
@@ -1308,6 +1548,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
1308 if (!sgc) 1548 if (!sgc)
1309 return -ENOMEM; 1549 return -ENOMEM;
1310 1550
1551#ifdef CONFIG_SCHED_DEBUG
1552 sgc->id = j;
1553#endif
1554
1311 *per_cpu_ptr(sdd->sgc, j) = sgc; 1555 *per_cpu_ptr(sdd->sgc, j) = sgc;
1312 } 1556 }
1313 } 1557 }
@@ -1407,7 +1651,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
1407 sd = build_sched_domain(tl, cpu_map, attr, sd, i); 1651 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
1408 if (tl == sched_domain_topology) 1652 if (tl == sched_domain_topology)
1409 *per_cpu_ptr(d.sd, i) = sd; 1653 *per_cpu_ptr(d.sd, i) = sd;
1410 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 1654 if (tl->flags & SDTL_OVERLAP)
1411 sd->flags |= SD_OVERLAP; 1655 sd->flags |= SD_OVERLAP;
1412 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 1656 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
1413 break; 1657 break;
@@ -1478,7 +1722,7 @@ static struct sched_domain_attr *dattr_cur;
1478 * cpumask) fails, then fallback to a single sched domain, 1722 * cpumask) fails, then fallback to a single sched domain,
1479 * as determined by the single cpumask fallback_doms. 1723 * as determined by the single cpumask fallback_doms.
1480 */ 1724 */
1481cpumask_var_t fallback_doms; 1725static cpumask_var_t fallback_doms;
1482 1726
1483/* 1727/*
1484 * arch_update_cpu_topology lets virtualized architectures update the 1728 * arch_update_cpu_topology lets virtualized architectures update the
@@ -1520,10 +1764,14 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
1520 * For now this just excludes isolated CPUs, but could be used to 1764 * For now this just excludes isolated CPUs, but could be used to
1521 * exclude other special cases in the future. 1765 * exclude other special cases in the future.
1522 */ 1766 */
1523int init_sched_domains(const struct cpumask *cpu_map) 1767int sched_init_domains(const struct cpumask *cpu_map)
1524{ 1768{
1525 int err; 1769 int err;
1526 1770
1771 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
1772 zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
1773 zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
1774
1527 arch_update_cpu_topology(); 1775 arch_update_cpu_topology();
1528 ndoms_cur = 1; 1776 ndoms_cur = 1;
1529 doms_cur = alloc_sched_domains(ndoms_cur); 1777 doms_cur = alloc_sched_domains(ndoms_cur);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index b8c84c6dee64..17f11c6b0a9f 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -12,44 +12,44 @@
12#include <linux/hash.h> 12#include <linux/hash.h>
13#include <linux/kthread.h> 13#include <linux/kthread.h>
14 14
15void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) 15void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
16{ 16{
17 spin_lock_init(&q->lock); 17 spin_lock_init(&wq_head->lock);
18 lockdep_set_class_and_name(&q->lock, key, name); 18 lockdep_set_class_and_name(&wq_head->lock, key, name);
19 INIT_LIST_HEAD(&q->task_list); 19 INIT_LIST_HEAD(&wq_head->head);
20} 20}
21 21
22EXPORT_SYMBOL(__init_waitqueue_head); 22EXPORT_SYMBOL(__init_waitqueue_head);
23 23
24void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 24void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
25{ 25{
26 unsigned long flags; 26 unsigned long flags;
27 27
28 wait->flags &= ~WQ_FLAG_EXCLUSIVE; 28 wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
29 spin_lock_irqsave(&q->lock, flags); 29 spin_lock_irqsave(&wq_head->lock, flags);
30 __add_wait_queue(q, wait); 30 __add_wait_queue_entry_tail(wq_head, wq_entry);
31 spin_unlock_irqrestore(&q->lock, flags); 31 spin_unlock_irqrestore(&wq_head->lock, flags);
32} 32}
33EXPORT_SYMBOL(add_wait_queue); 33EXPORT_SYMBOL(add_wait_queue);
34 34
35void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) 35void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
36{ 36{
37 unsigned long flags; 37 unsigned long flags;
38 38
39 wait->flags |= WQ_FLAG_EXCLUSIVE; 39 wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
40 spin_lock_irqsave(&q->lock, flags); 40 spin_lock_irqsave(&wq_head->lock, flags);
41 __add_wait_queue_tail(q, wait); 41 __add_wait_queue_entry_tail(wq_head, wq_entry);
42 spin_unlock_irqrestore(&q->lock, flags); 42 spin_unlock_irqrestore(&wq_head->lock, flags);
43} 43}
44EXPORT_SYMBOL(add_wait_queue_exclusive); 44EXPORT_SYMBOL(add_wait_queue_exclusive);
45 45
46void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 46void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
47{ 47{
48 unsigned long flags; 48 unsigned long flags;
49 49
50 spin_lock_irqsave(&q->lock, flags); 50 spin_lock_irqsave(&wq_head->lock, flags);
51 __remove_wait_queue(q, wait); 51 __remove_wait_queue(wq_head, wq_entry);
52 spin_unlock_irqrestore(&q->lock, flags); 52 spin_unlock_irqrestore(&wq_head->lock, flags);
53} 53}
54EXPORT_SYMBOL(remove_wait_queue); 54EXPORT_SYMBOL(remove_wait_queue);
55 55
@@ -63,12 +63,12 @@ EXPORT_SYMBOL(remove_wait_queue);
63 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 63 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
64 * zero in this (rare) case, and we handle it by continuing to scan the queue. 64 * zero in this (rare) case, and we handle it by continuing to scan the queue.
65 */ 65 */
66static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 66static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
67 int nr_exclusive, int wake_flags, void *key) 67 int nr_exclusive, int wake_flags, void *key)
68{ 68{
69 wait_queue_t *curr, *next; 69 wait_queue_entry_t *curr, *next;
70 70
71 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 71 list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
72 unsigned flags = curr->flags; 72 unsigned flags = curr->flags;
73 73
74 if (curr->func(curr, mode, wake_flags, key) && 74 if (curr->func(curr, mode, wake_flags, key) &&
@@ -79,7 +79,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
79 79
80/** 80/**
81 * __wake_up - wake up threads blocked on a waitqueue. 81 * __wake_up - wake up threads blocked on a waitqueue.
82 * @q: the waitqueue 82 * @wq_head: the waitqueue
83 * @mode: which threads 83 * @mode: which threads
84 * @nr_exclusive: how many wake-one or wake-many threads to wake up 84 * @nr_exclusive: how many wake-one or wake-many threads to wake up
85 * @key: is directly passed to the wakeup function 85 * @key: is directly passed to the wakeup function
@@ -87,35 +87,35 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
87 * It may be assumed that this function implies a write memory barrier before 87 * It may be assumed that this function implies a write memory barrier before
88 * changing the task state if and only if any tasks are woken up. 88 * changing the task state if and only if any tasks are woken up.
89 */ 89 */
90void __wake_up(wait_queue_head_t *q, unsigned int mode, 90void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
91 int nr_exclusive, void *key) 91 int nr_exclusive, void *key)
92{ 92{
93 unsigned long flags; 93 unsigned long flags;
94 94
95 spin_lock_irqsave(&q->lock, flags); 95 spin_lock_irqsave(&wq_head->lock, flags);
96 __wake_up_common(q, mode, nr_exclusive, 0, key); 96 __wake_up_common(wq_head, mode, nr_exclusive, 0, key);
97 spin_unlock_irqrestore(&q->lock, flags); 97 spin_unlock_irqrestore(&wq_head->lock, flags);
98} 98}
99EXPORT_SYMBOL(__wake_up); 99EXPORT_SYMBOL(__wake_up);
100 100
101/* 101/*
102 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 102 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
103 */ 103 */
104void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) 104void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
105{ 105{
106 __wake_up_common(q, mode, nr, 0, NULL); 106 __wake_up_common(wq_head, mode, nr, 0, NULL);
107} 107}
108EXPORT_SYMBOL_GPL(__wake_up_locked); 108EXPORT_SYMBOL_GPL(__wake_up_locked);
109 109
110void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 110void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
111{ 111{
112 __wake_up_common(q, mode, 1, 0, key); 112 __wake_up_common(wq_head, mode, 1, 0, key);
113} 113}
114EXPORT_SYMBOL_GPL(__wake_up_locked_key); 114EXPORT_SYMBOL_GPL(__wake_up_locked_key);
115 115
116/** 116/**
117 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 117 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
118 * @q: the waitqueue 118 * @wq_head: the waitqueue
119 * @mode: which threads 119 * @mode: which threads
120 * @nr_exclusive: how many wake-one or wake-many threads to wake up 120 * @nr_exclusive: how many wake-one or wake-many threads to wake up
121 * @key: opaque value to be passed to wakeup targets 121 * @key: opaque value to be passed to wakeup targets
@@ -130,30 +130,30 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key);
130 * It may be assumed that this function implies a write memory barrier before 130 * It may be assumed that this function implies a write memory barrier before
131 * changing the task state if and only if any tasks are woken up. 131 * changing the task state if and only if any tasks are woken up.
132 */ 132 */
133void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 133void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
134 int nr_exclusive, void *key) 134 int nr_exclusive, void *key)
135{ 135{
136 unsigned long flags; 136 unsigned long flags;
137 int wake_flags = 1; /* XXX WF_SYNC */ 137 int wake_flags = 1; /* XXX WF_SYNC */
138 138
139 if (unlikely(!q)) 139 if (unlikely(!wq_head))
140 return; 140 return;
141 141
142 if (unlikely(nr_exclusive != 1)) 142 if (unlikely(nr_exclusive != 1))
143 wake_flags = 0; 143 wake_flags = 0;
144 144
145 spin_lock_irqsave(&q->lock, flags); 145 spin_lock_irqsave(&wq_head->lock, flags);
146 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 146 __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key);
147 spin_unlock_irqrestore(&q->lock, flags); 147 spin_unlock_irqrestore(&wq_head->lock, flags);
148} 148}
149EXPORT_SYMBOL_GPL(__wake_up_sync_key); 149EXPORT_SYMBOL_GPL(__wake_up_sync_key);
150 150
151/* 151/*
152 * __wake_up_sync - see __wake_up_sync_key() 152 * __wake_up_sync - see __wake_up_sync_key()
153 */ 153 */
154void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 154void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive)
155{ 155{
156 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 156 __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL);
157} 157}
158EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 158EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
159 159
@@ -170,48 +170,48 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
170 * loads to move into the critical region). 170 * loads to move into the critical region).
171 */ 171 */
172void 172void
173prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) 173prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
174{ 174{
175 unsigned long flags; 175 unsigned long flags;
176 176
177 wait->flags &= ~WQ_FLAG_EXCLUSIVE; 177 wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
178 spin_lock_irqsave(&q->lock, flags); 178 spin_lock_irqsave(&wq_head->lock, flags);
179 if (list_empty(&wait->task_list)) 179 if (list_empty(&wq_entry->entry))
180 __add_wait_queue(q, wait); 180 __add_wait_queue(wq_head, wq_entry);
181 set_current_state(state); 181 set_current_state(state);
182 spin_unlock_irqrestore(&q->lock, flags); 182 spin_unlock_irqrestore(&wq_head->lock, flags);
183} 183}
184EXPORT_SYMBOL(prepare_to_wait); 184EXPORT_SYMBOL(prepare_to_wait);
185 185
186void 186void
187prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) 187prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
188{ 188{
189 unsigned long flags; 189 unsigned long flags;
190 190
191 wait->flags |= WQ_FLAG_EXCLUSIVE; 191 wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
192 spin_lock_irqsave(&q->lock, flags); 192 spin_lock_irqsave(&wq_head->lock, flags);
193 if (list_empty(&wait->task_list)) 193 if (list_empty(&wq_entry->entry))
194 __add_wait_queue_tail(q, wait); 194 __add_wait_queue_entry_tail(wq_head, wq_entry);
195 set_current_state(state); 195 set_current_state(state);
196 spin_unlock_irqrestore(&q->lock, flags); 196 spin_unlock_irqrestore(&wq_head->lock, flags);
197} 197}
198EXPORT_SYMBOL(prepare_to_wait_exclusive); 198EXPORT_SYMBOL(prepare_to_wait_exclusive);
199 199
200void init_wait_entry(wait_queue_t *wait, int flags) 200void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
201{ 201{
202 wait->flags = flags; 202 wq_entry->flags = flags;
203 wait->private = current; 203 wq_entry->private = current;
204 wait->func = autoremove_wake_function; 204 wq_entry->func = autoremove_wake_function;
205 INIT_LIST_HEAD(&wait->task_list); 205 INIT_LIST_HEAD(&wq_entry->entry);
206} 206}
207EXPORT_SYMBOL(init_wait_entry); 207EXPORT_SYMBOL(init_wait_entry);
208 208
209long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) 209long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
210{ 210{
211 unsigned long flags; 211 unsigned long flags;
212 long ret = 0; 212 long ret = 0;
213 213
214 spin_lock_irqsave(&q->lock, flags); 214 spin_lock_irqsave(&wq_head->lock, flags);
215 if (unlikely(signal_pending_state(state, current))) { 215 if (unlikely(signal_pending_state(state, current))) {
216 /* 216 /*
217 * Exclusive waiter must not fail if it was selected by wakeup, 217 * Exclusive waiter must not fail if it was selected by wakeup,
@@ -219,24 +219,24 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
219 * 219 *
220 * The caller will recheck the condition and return success if 220 * The caller will recheck the condition and return success if
221 * we were already woken up, we can not miss the event because 221 * we were already woken up, we can not miss the event because
222 * wakeup locks/unlocks the same q->lock. 222 * wakeup locks/unlocks the same wq_head->lock.
223 * 223 *
224 * But we need to ensure that set-condition + wakeup after that 224 * But we need to ensure that set-condition + wakeup after that
225 * can't see us, it should wake up another exclusive waiter if 225 * can't see us, it should wake up another exclusive waiter if
226 * we fail. 226 * we fail.
227 */ 227 */
228 list_del_init(&wait->task_list); 228 list_del_init(&wq_entry->entry);
229 ret = -ERESTARTSYS; 229 ret = -ERESTARTSYS;
230 } else { 230 } else {
231 if (list_empty(&wait->task_list)) { 231 if (list_empty(&wq_entry->entry)) {
232 if (wait->flags & WQ_FLAG_EXCLUSIVE) 232 if (wq_entry->flags & WQ_FLAG_EXCLUSIVE)
233 __add_wait_queue_tail(q, wait); 233 __add_wait_queue_entry_tail(wq_head, wq_entry);
234 else 234 else
235 __add_wait_queue(q, wait); 235 __add_wait_queue(wq_head, wq_entry);
236 } 236 }
237 set_current_state(state); 237 set_current_state(state);
238 } 238 }
239 spin_unlock_irqrestore(&q->lock, flags); 239 spin_unlock_irqrestore(&wq_head->lock, flags);
240 240
241 return ret; 241 return ret;
242} 242}
@@ -249,10 +249,10 @@ EXPORT_SYMBOL(prepare_to_wait_event);
249 * condition in the caller before they add the wait 249 * condition in the caller before they add the wait
250 * entry to the wake queue. 250 * entry to the wake queue.
251 */ 251 */
252int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) 252int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
253{ 253{
254 if (likely(list_empty(&wait->task_list))) 254 if (likely(list_empty(&wait->entry)))
255 __add_wait_queue_tail(wq, wait); 255 __add_wait_queue_entry_tail(wq, wait);
256 256
257 set_current_state(TASK_INTERRUPTIBLE); 257 set_current_state(TASK_INTERRUPTIBLE);
258 if (signal_pending(current)) 258 if (signal_pending(current))
@@ -265,10 +265,10 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait)
265} 265}
266EXPORT_SYMBOL(do_wait_intr); 266EXPORT_SYMBOL(do_wait_intr);
267 267
268int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait) 268int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
269{ 269{
270 if (likely(list_empty(&wait->task_list))) 270 if (likely(list_empty(&wait->entry)))
271 __add_wait_queue_tail(wq, wait); 271 __add_wait_queue_entry_tail(wq, wait);
272 272
273 set_current_state(TASK_INTERRUPTIBLE); 273 set_current_state(TASK_INTERRUPTIBLE);
274 if (signal_pending(current)) 274 if (signal_pending(current))
@@ -283,14 +283,14 @@ EXPORT_SYMBOL(do_wait_intr_irq);
283 283
284/** 284/**
285 * finish_wait - clean up after waiting in a queue 285 * finish_wait - clean up after waiting in a queue
286 * @q: waitqueue waited on 286 * @wq_head: waitqueue waited on
287 * @wait: wait descriptor 287 * @wq_entry: wait descriptor
288 * 288 *
289 * Sets current thread back to running state and removes 289 * Sets current thread back to running state and removes
290 * the wait descriptor from the given waitqueue if still 290 * the wait descriptor from the given waitqueue if still
291 * queued. 291 * queued.
292 */ 292 */
293void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) 293void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
294{ 294{
295 unsigned long flags; 295 unsigned long flags;
296 296
@@ -308,20 +308,20 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
308 * have _one_ other CPU that looks at or modifies 308 * have _one_ other CPU that looks at or modifies
309 * the list). 309 * the list).
310 */ 310 */
311 if (!list_empty_careful(&wait->task_list)) { 311 if (!list_empty_careful(&wq_entry->entry)) {
312 spin_lock_irqsave(&q->lock, flags); 312 spin_lock_irqsave(&wq_head->lock, flags);
313 list_del_init(&wait->task_list); 313 list_del_init(&wq_entry->entry);
314 spin_unlock_irqrestore(&q->lock, flags); 314 spin_unlock_irqrestore(&wq_head->lock, flags);
315 } 315 }
316} 316}
317EXPORT_SYMBOL(finish_wait); 317EXPORT_SYMBOL(finish_wait);
318 318
319int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) 319int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
320{ 320{
321 int ret = default_wake_function(wait, mode, sync, key); 321 int ret = default_wake_function(wq_entry, mode, sync, key);
322 322
323 if (ret) 323 if (ret)
324 list_del_init(&wait->task_list); 324 list_del_init(&wq_entry->entry);
325 return ret; 325 return ret;
326} 326}
327EXPORT_SYMBOL(autoremove_wake_function); 327EXPORT_SYMBOL(autoremove_wake_function);
@@ -334,24 +334,24 @@ static inline bool is_kthread_should_stop(void)
334/* 334/*
335 * DEFINE_WAIT_FUNC(wait, woken_wake_func); 335 * DEFINE_WAIT_FUNC(wait, woken_wake_func);
336 * 336 *
337 * add_wait_queue(&wq, &wait); 337 * add_wait_queue(&wq_head, &wait);
338 * for (;;) { 338 * for (;;) {
339 * if (condition) 339 * if (condition)
340 * break; 340 * break;
341 * 341 *
342 * p->state = mode; condition = true; 342 * p->state = mode; condition = true;
343 * smp_mb(); // A smp_wmb(); // C 343 * smp_mb(); // A smp_wmb(); // C
344 * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; 344 * if (!wq_entry->flags & WQ_FLAG_WOKEN) wq_entry->flags |= WQ_FLAG_WOKEN;
345 * schedule() try_to_wake_up(); 345 * schedule() try_to_wake_up();
346 * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ 346 * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
347 * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; 347 * wq_entry->flags &= ~WQ_FLAG_WOKEN; condition = true;
348 * smp_mb() // B smp_wmb(); // C 348 * smp_mb() // B smp_wmb(); // C
349 * wait->flags |= WQ_FLAG_WOKEN; 349 * wq_entry->flags |= WQ_FLAG_WOKEN;
350 * } 350 * }
351 * remove_wait_queue(&wq, &wait); 351 * remove_wait_queue(&wq_head, &wait);
352 * 352 *
353 */ 353 */
354long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) 354long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
355{ 355{
356 set_current_state(mode); /* A */ 356 set_current_state(mode); /* A */
357 /* 357 /*
@@ -359,7 +359,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
359 * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must 359 * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
360 * also observe all state before the wakeup. 360 * also observe all state before the wakeup.
361 */ 361 */
362 if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) 362 if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
363 timeout = schedule_timeout(timeout); 363 timeout = schedule_timeout(timeout);
364 __set_current_state(TASK_RUNNING); 364 __set_current_state(TASK_RUNNING);
365 365
@@ -369,13 +369,13 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
369 * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss 369 * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
370 * an event. 370 * an event.
371 */ 371 */
372 smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ 372 smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */
373 373
374 return timeout; 374 return timeout;
375} 375}
376EXPORT_SYMBOL(wait_woken); 376EXPORT_SYMBOL(wait_woken);
377 377
378int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) 378int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
379{ 379{
380 /* 380 /*
381 * Although this function is called under waitqueue lock, LOCK 381 * Although this function is called under waitqueue lock, LOCK
@@ -385,267 +385,8 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
385 * and is paired with smp_store_mb() in wait_woken(). 385 * and is paired with smp_store_mb() in wait_woken().
386 */ 386 */
387 smp_wmb(); /* C */ 387 smp_wmb(); /* C */
388 wait->flags |= WQ_FLAG_WOKEN; 388 wq_entry->flags |= WQ_FLAG_WOKEN;
389 389
390 return default_wake_function(wait, mode, sync, key); 390 return default_wake_function(wq_entry, mode, sync, key);
391} 391}
392EXPORT_SYMBOL(woken_wake_function); 392EXPORT_SYMBOL(woken_wake_function);
393
394int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
395{
396 struct wait_bit_key *key = arg;
397 struct wait_bit_queue *wait_bit
398 = container_of(wait, struct wait_bit_queue, wait);
399
400 if (wait_bit->key.flags != key->flags ||
401 wait_bit->key.bit_nr != key->bit_nr ||
402 test_bit(key->bit_nr, key->flags))
403 return 0;
404 else
405 return autoremove_wake_function(wait, mode, sync, key);
406}
407EXPORT_SYMBOL(wake_bit_function);
408
409/*
410 * To allow interruptible waiting and asynchronous (i.e. nonblocking)
411 * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
412 * permitted return codes. Nonzero return codes halt waiting and return.
413 */
414int __sched
415__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
416 wait_bit_action_f *action, unsigned mode)
417{
418 int ret = 0;
419
420 do {
421 prepare_to_wait(wq, &q->wait, mode);
422 if (test_bit(q->key.bit_nr, q->key.flags))
423 ret = (*action)(&q->key, mode);
424 } while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
425 finish_wait(wq, &q->wait);
426 return ret;
427}
428EXPORT_SYMBOL(__wait_on_bit);
429
430int __sched out_of_line_wait_on_bit(void *word, int bit,
431 wait_bit_action_f *action, unsigned mode)
432{
433 wait_queue_head_t *wq = bit_waitqueue(word, bit);
434 DEFINE_WAIT_BIT(wait, word, bit);
435
436 return __wait_on_bit(wq, &wait, action, mode);
437}
438EXPORT_SYMBOL(out_of_line_wait_on_bit);
439
440int __sched out_of_line_wait_on_bit_timeout(
441 void *word, int bit, wait_bit_action_f *action,
442 unsigned mode, unsigned long timeout)
443{
444 wait_queue_head_t *wq = bit_waitqueue(word, bit);
445 DEFINE_WAIT_BIT(wait, word, bit);
446
447 wait.key.timeout = jiffies + timeout;
448 return __wait_on_bit(wq, &wait, action, mode);
449}
450EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
451
452int __sched
453__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
454 wait_bit_action_f *action, unsigned mode)
455{
456 int ret = 0;
457
458 for (;;) {
459 prepare_to_wait_exclusive(wq, &q->wait, mode);
460 if (test_bit(q->key.bit_nr, q->key.flags)) {
461 ret = action(&q->key, mode);
462 /*
463 * See the comment in prepare_to_wait_event().
464 * finish_wait() does not necessarily takes wq->lock,
465 * but test_and_set_bit() implies mb() which pairs with
466 * smp_mb__after_atomic() before wake_up_page().
467 */
468 if (ret)
469 finish_wait(wq, &q->wait);
470 }
471 if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
472 if (!ret)
473 finish_wait(wq, &q->wait);
474 return 0;
475 } else if (ret) {
476 return ret;
477 }
478 }
479}
480EXPORT_SYMBOL(__wait_on_bit_lock);
481
482int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
483 wait_bit_action_f *action, unsigned mode)
484{
485 wait_queue_head_t *wq = bit_waitqueue(word, bit);
486 DEFINE_WAIT_BIT(wait, word, bit);
487
488 return __wait_on_bit_lock(wq, &wait, action, mode);
489}
490EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
491
492void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
493{
494 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
495 if (waitqueue_active(wq))
496 __wake_up(wq, TASK_NORMAL, 1, &key);
497}
498EXPORT_SYMBOL(__wake_up_bit);
499
500/**
501 * wake_up_bit - wake up a waiter on a bit
502 * @word: the word being waited on, a kernel virtual address
503 * @bit: the bit of the word being waited on
504 *
505 * There is a standard hashed waitqueue table for generic use. This
506 * is the part of the hashtable's accessor API that wakes up waiters
507 * on a bit. For instance, if one were to have waiters on a bitflag,
508 * one would call wake_up_bit() after clearing the bit.
509 *
510 * In order for this to function properly, as it uses waitqueue_active()
511 * internally, some kind of memory barrier must be done prior to calling
512 * this. Typically, this will be smp_mb__after_atomic(), but in some
513 * cases where bitflags are manipulated non-atomically under a lock, one
514 * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
515 * because spin_unlock() does not guarantee a memory barrier.
516 */
517void wake_up_bit(void *word, int bit)
518{
519 __wake_up_bit(bit_waitqueue(word, bit), word, bit);
520}
521EXPORT_SYMBOL(wake_up_bit);
522
523/*
524 * Manipulate the atomic_t address to produce a better bit waitqueue table hash
525 * index (we're keying off bit -1, but that would produce a horrible hash
526 * value).
527 */
528static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
529{
530 if (BITS_PER_LONG == 64) {
531 unsigned long q = (unsigned long)p;
532 return bit_waitqueue((void *)(q & ~1), q & 1);
533 }
534 return bit_waitqueue(p, 0);
535}
536
537static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
538 void *arg)
539{
540 struct wait_bit_key *key = arg;
541 struct wait_bit_queue *wait_bit
542 = container_of(wait, struct wait_bit_queue, wait);
543 atomic_t *val = key->flags;
544
545 if (wait_bit->key.flags != key->flags ||
546 wait_bit->key.bit_nr != key->bit_nr ||
547 atomic_read(val) != 0)
548 return 0;
549 return autoremove_wake_function(wait, mode, sync, key);
550}
551
552/*
553 * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
554 * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
555 * return codes halt waiting and return.
556 */
557static __sched
558int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
559 int (*action)(atomic_t *), unsigned mode)
560{
561 atomic_t *val;
562 int ret = 0;
563
564 do {
565 prepare_to_wait(wq, &q->wait, mode);
566 val = q->key.flags;
567 if (atomic_read(val) == 0)
568 break;
569 ret = (*action)(val);
570 } while (!ret && atomic_read(val) != 0);
571 finish_wait(wq, &q->wait);
572 return ret;
573}
574
575#define DEFINE_WAIT_ATOMIC_T(name, p) \
576 struct wait_bit_queue name = { \
577 .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
578 .wait = { \
579 .private = current, \
580 .func = wake_atomic_t_function, \
581 .task_list = \
582 LIST_HEAD_INIT((name).wait.task_list), \
583 }, \
584 }
585
586__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
587 unsigned mode)
588{
589 wait_queue_head_t *wq = atomic_t_waitqueue(p);
590 DEFINE_WAIT_ATOMIC_T(wait, p);
591
592 return __wait_on_atomic_t(wq, &wait, action, mode);
593}
594EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
595
596/**
597 * wake_up_atomic_t - Wake up a waiter on a atomic_t
598 * @p: The atomic_t being waited on, a kernel virtual address
599 *
600 * Wake up anyone waiting for the atomic_t to go to zero.
601 *
602 * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
603 * check is done by the waiter's wake function, not the by the waker itself).
604 */
605void wake_up_atomic_t(atomic_t *p)
606{
607 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
608}
609EXPORT_SYMBOL(wake_up_atomic_t);
610
611__sched int bit_wait(struct wait_bit_key *word, int mode)
612{
613 schedule();
614 if (signal_pending_state(mode, current))
615 return -EINTR;
616 return 0;
617}
618EXPORT_SYMBOL(bit_wait);
619
620__sched int bit_wait_io(struct wait_bit_key *word, int mode)
621{
622 io_schedule();
623 if (signal_pending_state(mode, current))
624 return -EINTR;
625 return 0;
626}
627EXPORT_SYMBOL(bit_wait_io);
628
629__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
630{
631 unsigned long now = READ_ONCE(jiffies);
632 if (time_after_eq(now, word->timeout))
633 return -EAGAIN;
634 schedule_timeout(word->timeout - now);
635 if (signal_pending_state(mode, current))
636 return -EINTR;
637 return 0;
638}
639EXPORT_SYMBOL_GPL(bit_wait_timeout);
640
641__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
642{
643 unsigned long now = READ_ONCE(jiffies);
644 if (time_after_eq(now, word->timeout))
645 return -EAGAIN;
646 io_schedule_timeout(word->timeout - now);
647 if (signal_pending_state(mode, current))
648 return -EINTR;
649 return 0;
650}
651EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
new file mode 100644
index 000000000000..f8159698aa4d
--- /dev/null
+++ b/kernel/sched/wait_bit.c
@@ -0,0 +1,286 @@
1/*
2 * The implementation of the wait_bit*() and related waiting APIs:
3 */
4#include <linux/wait_bit.h>
5#include <linux/sched/signal.h>
6#include <linux/sched/debug.h>
7#include <linux/hash.h>
8
/*
 * Shared hashed table of wait-queue heads used by all bit-waiters and
 * atomic_t-waiters (256 queues).  A hash collision only costs a
 * spurious wakeup, never a lost one: the wake filter functions below
 * re-check the (word, bit) key before waking anybody.
 */
9#define WAIT_TABLE_BITS 8
10#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
11
12static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
13
14wait_queue_head_t *bit_waitqueue(void *word, int bit)
15{
16 const int shift = BITS_PER_LONG == 32 ? 5 : 6;
17 unsigned long val = (unsigned long)word << shift | bit;
18
19 return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
20}
21EXPORT_SYMBOL(bit_waitqueue);
22
23int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg)
24{
25 struct wait_bit_key *key = arg;
26 struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
27
28 if (wait_bit->key.flags != key->flags ||
29 wait_bit->key.bit_nr != key->bit_nr ||
30 test_bit(key->bit_nr, key->flags))
31 return 0;
32 else
33 return autoremove_wake_function(wq_entry, mode, sync, key);
34}
35EXPORT_SYMBOL(wake_bit_function);
36
37/*
38 * To allow interruptible waiting and asynchronous (i.e. nonblocking)
39 * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
40 * permitted return codes. Nonzero return codes halt waiting and return.
41 */
/*
 * Core wait loop: sleep (via @action) until the bit in
 * wbq_entry->key.flags is clear.  Returns 0 once the bit is observed
 * clear, or the nonzero value @action returned to abort the wait
 * (e.g. -EINTR / -EAGAIN).
 */
42int __sched
43__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
44	      wait_bit_action_f *action, unsigned mode)
45{
46	int ret = 0;
47
48	do {
		/* (Re)queue ourselves before re-testing the bit so a
		 * concurrent wake_up_bit() cannot be lost. */
49		prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
50		if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
51			ret = (*action)(&wbq_entry->key, mode);
		/* Loop while the bit stays set and @action wants to keep waiting. */
52	} while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
53	finish_wait(wq_head, &wbq_entry->wq_entry);
54	return ret;
55}
56EXPORT_SYMBOL(__wait_on_bit);
57
58int __sched out_of_line_wait_on_bit(void *word, int bit,
59 wait_bit_action_f *action, unsigned mode)
60{
61 struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
62 DEFINE_WAIT_BIT(wq_entry, word, bit);
63
64 return __wait_on_bit(wq_head, &wq_entry, action, mode);
65}
66EXPORT_SYMBOL(out_of_line_wait_on_bit);
67
68int __sched out_of_line_wait_on_bit_timeout(
69 void *word, int bit, wait_bit_action_f *action,
70 unsigned mode, unsigned long timeout)
71{
72 struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
73 DEFINE_WAIT_BIT(wq_entry, word, bit);
74
75 wq_entry.key.timeout = jiffies + timeout;
76 return __wait_on_bit(wq_head, &wq_entry, action, mode);
77}
78EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
79
/*
 * Wait for the bit to clear and then atomically set it, i.e. acquire a
 * bit-lock.  Waiters queue exclusively so only one is woken per
 * release.  Returns 0 with the bit owned, or @action's nonzero abort
 * code with the bit NOT taken.
 */
80int __sched
81__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
82			wait_bit_action_f *action, unsigned mode)
83{
84	int ret = 0;
85
86	for (;;) {
87		prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode);
88		if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
89			ret = action(&wbq_entry->key, mode);
90			/*
91			 * See the comment in prepare_to_wait_event().
92			 * finish_wait() does not necessarily take wq_head->lock,
93			 * but test_and_set_bit() implies mb() which pairs with
94			 * smp_mb__after_atomic() before wake_up_page().
95			 */
96			if (ret)
97				finish_wait(wq_head, &wbq_entry->wq_entry);
98		}
		/* Try to grab the bit even on an aborted wait; success wins. */
99		if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
100			if (!ret)
101				finish_wait(wq_head, &wbq_entry->wq_entry);
102			return 0;
103		} else if (ret) {
104			return ret;
105		}
106	}
107}
108EXPORT_SYMBOL(__wait_on_bit_lock);
109
110int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
111 wait_bit_action_f *action, unsigned mode)
112{
113 struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
114 DEFINE_WAIT_BIT(wq_entry, word, bit);
115
116 return __wait_on_bit_lock(wq_head, &wq_entry, action, mode);
117}
118EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
119
120void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
121{
122 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
123 if (waitqueue_active(wq_head))
124 __wake_up(wq_head, TASK_NORMAL, 1, &key);
125}
126EXPORT_SYMBOL(__wake_up_bit);
127
/**
 * wake_up_bit - wake up a waiter on a bit
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 *
 * Wakes up a task sleeping on (@word, @bit) via the shared hashed
 * waitqueue table; typically called right after clearing the bit.
 *
 * Because the wakeup path uses waitqueue_active() internally, the
 * caller must order its bit update against this call with a memory
 * barrier.  Typically that is smp_mb__after_atomic(); when the bit is
 * changed non-atomically under a lock a full smp_mb() may be needed
 * instead (as in fs/inode.c), since spin_unlock() does not guarantee
 * a full memory barrier.
 */
void wake_up_bit(void *word, int bit)
{
	__wake_up_bit(bit_waitqueue(word, bit), word, bit);
}
EXPORT_SYMBOL(wake_up_bit);
150
151/*
152 * Manipulate the atomic_t address to produce a better bit waitqueue table hash
153 * index (we're keying off bit -1, but that would produce a horrible hash
154 * value).
155 */
156static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
157{
158 if (BITS_PER_LONG == 64) {
159 unsigned long q = (unsigned long)p;
160 return bit_waitqueue((void *)(q & ~1), q & 1);
161 }
162 return bit_waitqueue(p, 0);
163}
164
165static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync,
166 void *arg)
167{
168 struct wait_bit_key *key = arg;
169 struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
170 atomic_t *val = key->flags;
171
172 if (wait_bit->key.flags != key->flags ||
173 wait_bit->key.bit_nr != key->bit_nr ||
174 atomic_read(val) != 0)
175 return 0;
176 return autoremove_wake_function(wq_entry, mode, sync, key);
177}
178
179/*
180 * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
181 * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
182 * return codes halt waiting and return.
183 */
/*
 * Core wait loop for atomic_t waiters: sleep (via @action) until the
 * atomic_t stored in wbq_entry->key.flags reads zero.  Returns 0 on
 * success, or @action's nonzero abort code.
 */
184static __sched
185int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
186		       int (*action)(atomic_t *), unsigned mode)
187{
188	atomic_t *val;
189	int ret = 0;
190
191	do {
		/* Queue before re-reading the counter to avoid lost wakeups. */
192		prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
193		val = wbq_entry->key.flags;
194		if (atomic_read(val) == 0)
195			break;
196		ret = (*action)(val);
197	} while (!ret && atomic_read(val) != 0);
198	finish_wait(wq_head, &wbq_entry->wq_entry);
199	return ret;
200}
201
/*
 * Declare an on-stack waiter entry keyed on atomic_t @p reaching zero;
 * wakeups are filtered through wake_atomic_t_function().
 */
202#define DEFINE_WAIT_ATOMIC_T(name, p)					\
203	struct wait_bit_queue_entry name = {				\
204		.key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p),		\
205		.wq_entry = {						\
206			.private	= current,			\
207			.func		= wake_atomic_t_function,	\
208			.entry		=				\
209				LIST_HEAD_INIT((name).wq_entry.entry),	\
210		},							\
211	}
212
213__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
214 unsigned mode)
215{
216 struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
217 DEFINE_WAIT_ATOMIC_T(wq_entry, p);
218
219 return __wait_on_atomic_t(wq_head, &wq_entry, action, mode);
220}
221EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
222
223/**
224 * wake_up_atomic_t - Wake up a waiter on a atomic_t
225 * @p: The atomic_t being waited on, a kernel virtual address
226 *
227 * Wake up anyone waiting for the atomic_t to go to zero.
228 *
229 * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
230 * check is done by the waiter's wake function, not the by the waker itself).
231 */
232void wake_up_atomic_t(atomic_t *p)
233{
234 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
235}
236EXPORT_SYMBOL(wake_up_atomic_t);
237
238__sched int bit_wait(struct wait_bit_key *word, int mode)
239{
240 schedule();
241 if (signal_pending_state(mode, current))
242 return -EINTR;
243 return 0;
244}
245EXPORT_SYMBOL(bit_wait);
246
247__sched int bit_wait_io(struct wait_bit_key *word, int mode)
248{
249 io_schedule();
250 if (signal_pending_state(mode, current))
251 return -EINTR;
252 return 0;
253}
254EXPORT_SYMBOL(bit_wait_io);
255
256__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
257{
258 unsigned long now = READ_ONCE(jiffies);
259 if (time_after_eq(now, word->timeout))
260 return -EAGAIN;
261 schedule_timeout(word->timeout - now);
262 if (signal_pending_state(mode, current))
263 return -EINTR;
264 return 0;
265}
266EXPORT_SYMBOL_GPL(bit_wait_timeout);
267
268__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
269{
270 unsigned long now = READ_ONCE(jiffies);
271 if (time_after_eq(now, word->timeout))
272 return -EAGAIN;
273 io_schedule_timeout(word->timeout - now);
274 if (signal_pending_state(mode, current))
275 return -EINTR;
276 return 0;
277}
278EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
279
280void __init wait_bit_init(void)
281{
282 int i;
283
284 for (i = 0; i < WAIT_TABLE_SIZE; i++)
285 init_waitqueue_head(bit_wait_table + i);
286}
diff --git a/kernel/signal.c b/kernel/signal.c
index ca92bcfeb322..caed9133ae52 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -39,6 +39,7 @@
39#include <linux/compat.h> 39#include <linux/compat.h>
40#include <linux/cn_proc.h> 40#include <linux/cn_proc.h>
41#include <linux/compiler.h> 41#include <linux/compiler.h>
42#include <linux/posix-timers.h>
42 43
43#define CREATE_TRACE_POINTS 44#define CREATE_TRACE_POINTS
44#include <trace/events/signal.h> 45#include <trace/events/signal.h>
@@ -510,7 +511,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
510 return !tsk->ptrace; 511 return !tsk->ptrace;
511} 512}
512 513
513static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) 514static void collect_signal(int sig, struct sigpending *list, siginfo_t *info,
515 bool *resched_timer)
514{ 516{
515 struct sigqueue *q, *first = NULL; 517 struct sigqueue *q, *first = NULL;
516 518
@@ -532,6 +534,12 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
532still_pending: 534still_pending:
533 list_del_init(&first->list); 535 list_del_init(&first->list);
534 copy_siginfo(info, &first->info); 536 copy_siginfo(info, &first->info);
537
538 *resched_timer =
539 (first->flags & SIGQUEUE_PREALLOC) &&
540 (info->si_code == SI_TIMER) &&
541 (info->si_sys_private);
542
535 __sigqueue_free(first); 543 __sigqueue_free(first);
536 } else { 544 } else {
537 /* 545 /*
@@ -548,12 +556,12 @@ still_pending:
548} 556}
549 557
550static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, 558static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
551 siginfo_t *info) 559 siginfo_t *info, bool *resched_timer)
552{ 560{
553 int sig = next_signal(pending, mask); 561 int sig = next_signal(pending, mask);
554 562
555 if (sig) 563 if (sig)
556 collect_signal(sig, pending, info); 564 collect_signal(sig, pending, info, resched_timer);
557 return sig; 565 return sig;
558} 566}
559 567
@@ -565,15 +573,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
565 */ 573 */
566int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 574int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
567{ 575{
576 bool resched_timer = false;
568 int signr; 577 int signr;
569 578
570 /* We only dequeue private signals from ourselves, we don't let 579 /* We only dequeue private signals from ourselves, we don't let
571 * signalfd steal them 580 * signalfd steal them
572 */ 581 */
573 signr = __dequeue_signal(&tsk->pending, mask, info); 582 signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
574 if (!signr) { 583 if (!signr) {
575 signr = __dequeue_signal(&tsk->signal->shared_pending, 584 signr = __dequeue_signal(&tsk->signal->shared_pending,
576 mask, info); 585 mask, info, &resched_timer);
577#ifdef CONFIG_POSIX_TIMERS 586#ifdef CONFIG_POSIX_TIMERS
578 /* 587 /*
579 * itimer signal ? 588 * itimer signal ?
@@ -621,7 +630,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
621 current->jobctl |= JOBCTL_STOP_DEQUEUED; 630 current->jobctl |= JOBCTL_STOP_DEQUEUED;
622 } 631 }
623#ifdef CONFIG_POSIX_TIMERS 632#ifdef CONFIG_POSIX_TIMERS
624 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 633 if (resched_timer) {
625 /* 634 /*
626 * Release the siglock to ensure proper locking order 635 * Release the siglock to ensure proper locking order
627 * of timer locks outside of siglocks. Note, we leave 636 * of timer locks outside of siglocks. Note, we leave
@@ -629,7 +638,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
629 * about to disable them again anyway. 638 * about to disable them again anyway.
630 */ 639 */
631 spin_unlock(&tsk->sighand->siglock); 640 spin_unlock(&tsk->sighand->siglock);
632 do_schedule_next_timer(info); 641 posixtimer_rearm(info);
633 spin_lock(&tsk->sighand->siglock); 642 spin_lock(&tsk->sighand->siglock);
634 } 643 }
635#endif 644#endif
@@ -1393,6 +1402,10 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1393 return ret; 1402 return ret;
1394 } 1403 }
1395 1404
1405 /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */
1406 if (pid == INT_MIN)
1407 return -ESRCH;
1408
1396 read_lock(&tasklist_lock); 1409 read_lock(&tasklist_lock);
1397 if (pid != -1) { 1410 if (pid != -1) {
1398 ret = __kill_pgrp_info(sig, info, 1411 ret = __kill_pgrp_info(sig, info,
@@ -2092,7 +2105,6 @@ static void do_jobctl_trap(void)
2092 2105
2093static int ptrace_signal(int signr, siginfo_t *info) 2106static int ptrace_signal(int signr, siginfo_t *info)
2094{ 2107{
2095 ptrace_signal_deliver();
2096 /* 2108 /*
2097 * We do not check sig_kernel_stop(signr) but set this marker 2109 * We do not check sig_kernel_stop(signr) but set this marker
2098 * unconditionally because we do not know whether debugger will 2110 * unconditionally because we do not know whether debugger will
@@ -2768,7 +2780,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2768 * @info: if non-null, the signal's siginfo is returned here 2780 * @info: if non-null, the signal's siginfo is returned here
2769 * @ts: upper bound on process time suspension 2781 * @ts: upper bound on process time suspension
2770 */ 2782 */
2771int do_sigtimedwait(const sigset_t *which, siginfo_t *info, 2783static int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2772 const struct timespec *ts) 2784 const struct timespec *ts)
2773{ 2785{
2774 ktime_t *to = NULL, timeout = KTIME_MAX; 2786 ktime_t *to = NULL, timeout = KTIME_MAX;
@@ -2857,6 +2869,40 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2857 return ret; 2869 return ret;
2858} 2870}
2859 2871
2872#ifdef CONFIG_COMPAT
2873COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
2874 struct compat_siginfo __user *, uinfo,
2875 struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
2876{
2877 compat_sigset_t s32;
2878 sigset_t s;
2879 struct timespec t;
2880 siginfo_t info;
2881 long ret;
2882
2883 if (sigsetsize != sizeof(sigset_t))
2884 return -EINVAL;
2885
2886 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
2887 return -EFAULT;
2888 sigset_from_compat(&s, &s32);
2889
2890 if (uts) {
2891 if (compat_get_timespec(&t, uts))
2892 return -EFAULT;
2893 }
2894
2895 ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
2896
2897 if (ret > 0 && uinfo) {
2898 if (copy_siginfo_to_user32(uinfo, &info))
2899 ret = -EFAULT;
2900 }
2901
2902 return ret;
2903}
2904#endif
2905
2860/** 2906/**
2861 * sys_kill - send a signal to a process 2907 * sys_kill - send a signal to a process
2862 * @pid: the PID of the process 2908 * @pid: the PID of the process
@@ -3113,78 +3159,68 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3113} 3159}
3114 3160
3115static int 3161static int
3116do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) 3162do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp)
3117{ 3163{
3118 stack_t oss; 3164 struct task_struct *t = current;
3119 int error;
3120 3165
3121 oss.ss_sp = (void __user *) current->sas_ss_sp; 3166 if (oss) {
3122 oss.ss_size = current->sas_ss_size; 3167 memset(oss, 0, sizeof(stack_t));
3123 oss.ss_flags = sas_ss_flags(sp) | 3168 oss->ss_sp = (void __user *) t->sas_ss_sp;
3124 (current->sas_ss_flags & SS_FLAG_BITS); 3169 oss->ss_size = t->sas_ss_size;
3170 oss->ss_flags = sas_ss_flags(sp) |
3171 (current->sas_ss_flags & SS_FLAG_BITS);
3172 }
3125 3173
3126 if (uss) { 3174 if (ss) {
3127 void __user *ss_sp; 3175 void __user *ss_sp = ss->ss_sp;
3128 size_t ss_size; 3176 size_t ss_size = ss->ss_size;
3129 unsigned ss_flags; 3177 unsigned ss_flags = ss->ss_flags;
3130 int ss_mode; 3178 int ss_mode;
3131 3179
3132 error = -EFAULT; 3180 if (unlikely(on_sig_stack(sp)))
3133 if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) 3181 return -EPERM;
3134 goto out;
3135 error = __get_user(ss_sp, &uss->ss_sp) |
3136 __get_user(ss_flags, &uss->ss_flags) |
3137 __get_user(ss_size, &uss->ss_size);
3138 if (error)
3139 goto out;
3140
3141 error = -EPERM;
3142 if (on_sig_stack(sp))
3143 goto out;
3144 3182
3145 ss_mode = ss_flags & ~SS_FLAG_BITS; 3183 ss_mode = ss_flags & ~SS_FLAG_BITS;
3146 error = -EINVAL; 3184 if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
3147 if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && 3185 ss_mode != 0))
3148 ss_mode != 0) 3186 return -EINVAL;
3149 goto out;
3150 3187
3151 if (ss_mode == SS_DISABLE) { 3188 if (ss_mode == SS_DISABLE) {
3152 ss_size = 0; 3189 ss_size = 0;
3153 ss_sp = NULL; 3190 ss_sp = NULL;
3154 } else { 3191 } else {
3155 error = -ENOMEM; 3192 if (unlikely(ss_size < MINSIGSTKSZ))
3156 if (ss_size < MINSIGSTKSZ) 3193 return -ENOMEM;
3157 goto out;
3158 } 3194 }
3159 3195
3160 current->sas_ss_sp = (unsigned long) ss_sp; 3196 t->sas_ss_sp = (unsigned long) ss_sp;
3161 current->sas_ss_size = ss_size; 3197 t->sas_ss_size = ss_size;
3162 current->sas_ss_flags = ss_flags; 3198 t->sas_ss_flags = ss_flags;
3163 } 3199 }
3164 3200 return 0;
3165 error = 0;
3166 if (uoss) {
3167 error = -EFAULT;
3168 if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
3169 goto out;
3170 error = __put_user(oss.ss_sp, &uoss->ss_sp) |
3171 __put_user(oss.ss_size, &uoss->ss_size) |
3172 __put_user(oss.ss_flags, &uoss->ss_flags);
3173 }
3174
3175out:
3176 return error;
3177} 3201}
3202
3178SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) 3203SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
3179{ 3204{
3180 return do_sigaltstack(uss, uoss, current_user_stack_pointer()); 3205 stack_t new, old;
3206 int err;
3207 if (uss && copy_from_user(&new, uss, sizeof(stack_t)))
3208 return -EFAULT;
3209 err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL,
3210 current_user_stack_pointer());
3211 if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t)))
3212 err = -EFAULT;
3213 return err;
3181} 3214}
3182 3215
3183int restore_altstack(const stack_t __user *uss) 3216int restore_altstack(const stack_t __user *uss)
3184{ 3217{
3185 int err = do_sigaltstack(uss, NULL, current_user_stack_pointer()); 3218 stack_t new;
3219 if (copy_from_user(&new, uss, sizeof(stack_t)))
3220 return -EFAULT;
3221 (void)do_sigaltstack(&new, NULL, current_user_stack_pointer());
3186 /* squash all but EFAULT for now */ 3222 /* squash all but EFAULT for now */
3187 return err == -EFAULT ? err : 0; 3223 return 0;
3188} 3224}
3189 3225
3190int __save_altstack(stack_t __user *uss, unsigned long sp) 3226int __save_altstack(stack_t __user *uss, unsigned long sp)
@@ -3207,29 +3243,24 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack,
3207{ 3243{
3208 stack_t uss, uoss; 3244 stack_t uss, uoss;
3209 int ret; 3245 int ret;
3210 mm_segment_t seg;
3211 3246
3212 if (uss_ptr) { 3247 if (uss_ptr) {
3213 compat_stack_t uss32; 3248 compat_stack_t uss32;
3214
3215 memset(&uss, 0, sizeof(stack_t));
3216 if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) 3249 if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
3217 return -EFAULT; 3250 return -EFAULT;
3218 uss.ss_sp = compat_ptr(uss32.ss_sp); 3251 uss.ss_sp = compat_ptr(uss32.ss_sp);
3219 uss.ss_flags = uss32.ss_flags; 3252 uss.ss_flags = uss32.ss_flags;
3220 uss.ss_size = uss32.ss_size; 3253 uss.ss_size = uss32.ss_size;
3221 } 3254 }
3222 seg = get_fs(); 3255 ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss,
3223 set_fs(KERNEL_DS);
3224 ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
3225 (stack_t __force __user *) &uoss,
3226 compat_user_stack_pointer()); 3256 compat_user_stack_pointer());
3227 set_fs(seg);
3228 if (ret >= 0 && uoss_ptr) { 3257 if (ret >= 0 && uoss_ptr) {
3229 if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) || 3258 compat_stack_t old;
3230 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || 3259 memset(&old, 0, sizeof(old));
3231 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || 3260 old.ss_sp = ptr_to_compat(uoss.ss_sp);
3232 __put_user(uoss.ss_size, &uoss_ptr->ss_size)) 3261 old.ss_flags = uoss.ss_flags;
3262 old.ss_size = uoss.ss_size;
3263 if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t)))
3233 ret = -EFAULT; 3264 ret = -EFAULT;
3234 } 3265 }
3235 return ret; 3266 return ret;
@@ -3269,6 +3300,18 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
3269 return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); 3300 return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
3270} 3301}
3271 3302
3303#ifdef CONFIG_COMPAT
3304COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
3305{
3306 sigset_t set;
3307 int err = do_sigpending(&set, sizeof(old_sigset_t));
3308 if (err == 0)
3309 if (copy_to_user(set32, &set, sizeof(old_sigset_t)))
3310 err = -EFAULT;
3311 return err;
3312}
3313#endif
3314
3272#endif 3315#endif
3273 3316
3274#ifdef __ARCH_WANT_SYS_SIGPROCMASK 3317#ifdef __ARCH_WANT_SYS_SIGPROCMASK
diff --git a/kernel/smp.c b/kernel/smp.c
index a817769b53c0..3061483cb3ad 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -30,6 +30,7 @@ enum {
30struct call_function_data { 30struct call_function_data {
31 struct call_single_data __percpu *csd; 31 struct call_single_data __percpu *csd;
32 cpumask_var_t cpumask; 32 cpumask_var_t cpumask;
33 cpumask_var_t cpumask_ipi;
33}; 34};
34 35
35static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); 36static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu)
45 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 46 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
46 cpu_to_node(cpu))) 47 cpu_to_node(cpu)))
47 return -ENOMEM; 48 return -ENOMEM;
49 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
50 cpu_to_node(cpu))) {
51 free_cpumask_var(cfd->cpumask);
52 return -ENOMEM;
53 }
48 cfd->csd = alloc_percpu(struct call_single_data); 54 cfd->csd = alloc_percpu(struct call_single_data);
49 if (!cfd->csd) { 55 if (!cfd->csd) {
50 free_cpumask_var(cfd->cpumask); 56 free_cpumask_var(cfd->cpumask);
57 free_cpumask_var(cfd->cpumask_ipi);
51 return -ENOMEM; 58 return -ENOMEM;
52 } 59 }
53 60
@@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu)
59 struct call_function_data *cfd = &per_cpu(cfd_data, cpu); 66 struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
60 67
61 free_cpumask_var(cfd->cpumask); 68 free_cpumask_var(cfd->cpumask);
69 free_cpumask_var(cfd->cpumask_ipi);
62 free_percpu(cfd->csd); 70 free_percpu(cfd->csd);
63 return 0; 71 return 0;
64} 72}
@@ -428,12 +436,13 @@ void smp_call_function_many(const struct cpumask *mask,
428 cfd = this_cpu_ptr(&cfd_data); 436 cfd = this_cpu_ptr(&cfd_data);
429 437
430 cpumask_and(cfd->cpumask, mask, cpu_online_mask); 438 cpumask_and(cfd->cpumask, mask, cpu_online_mask);
431 cpumask_clear_cpu(this_cpu, cfd->cpumask); 439 __cpumask_clear_cpu(this_cpu, cfd->cpumask);
432 440
433 /* Some callers race with other cpus changing the passed mask */ 441 /* Some callers race with other cpus changing the passed mask */
434 if (unlikely(!cpumask_weight(cfd->cpumask))) 442 if (unlikely(!cpumask_weight(cfd->cpumask)))
435 return; 443 return;
436 444
445 cpumask_clear(cfd->cpumask_ipi);
437 for_each_cpu(cpu, cfd->cpumask) { 446 for_each_cpu(cpu, cfd->cpumask) {
438 struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); 447 struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
439 448
@@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask,
442 csd->flags |= CSD_FLAG_SYNCHRONOUS; 451 csd->flags |= CSD_FLAG_SYNCHRONOUS;
443 csd->func = func; 452 csd->func = func;
444 csd->info = info; 453 csd->info = info;
445 llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); 454 if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
455 __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
446 } 456 }
447 457
448 /* Send a message to all CPUs in the map */ 458 /* Send a message to all CPUs in the map */
449 arch_send_call_function_ipi_mask(cfd->cpumask); 459 arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
450 460
451 if (wait) { 461 if (wait) {
452 for_each_cpu(cpu, cfd->cpumask) { 462 for_each_cpu(cpu, cfd->cpumask) {
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 1eb82661ecdb..b7591261652d 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -552,7 +552,8 @@ static int __init cpu_stop_init(void)
552} 552}
553early_initcall(cpu_stop_init); 553early_initcall(cpu_stop_init);
554 554
555static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) 555int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
556 const struct cpumask *cpus)
556{ 557{
557 struct multi_stop_data msdata = { 558 struct multi_stop_data msdata = {
558 .fn = fn, 559 .fn = fn,
@@ -561,6 +562,8 @@ static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cp
561 .active_cpus = cpus, 562 .active_cpus = cpus,
562 }; 563 };
563 564
565 lockdep_assert_cpus_held();
566
564 if (!stop_machine_initialized) { 567 if (!stop_machine_initialized) {
565 /* 568 /*
566 * Handle the case where stop_machine() is called 569 * Handle the case where stop_machine() is called
@@ -590,9 +593,9 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
590 int ret; 593 int ret;
591 594
592 /* No CPUs can come up or down during this. */ 595 /* No CPUs can come up or down during this. */
593 get_online_cpus(); 596 cpus_read_lock();
594 ret = __stop_machine(fn, data, cpus); 597 ret = stop_machine_cpuslocked(fn, data, cpus);
595 put_online_cpus(); 598 cpus_read_unlock();
596 return ret; 599 return ret;
597} 600}
598EXPORT_SYMBOL_GPL(stop_machine); 601EXPORT_SYMBOL_GPL(stop_machine);
diff --git a/kernel/sys.c b/kernel/sys.c
index 8a94b4eabcaa..2855ee73acd0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -886,7 +886,7 @@ SYSCALL_DEFINE0(getegid)
886 return from_kgid_munged(current_user_ns(), current_egid()); 886 return from_kgid_munged(current_user_ns(), current_egid());
887} 887}
888 888
889void do_sys_times(struct tms *tms) 889static void do_sys_times(struct tms *tms)
890{ 890{
891 u64 tgutime, tgstime, cutime, cstime; 891 u64 tgutime, tgstime, cutime, cstime;
892 892
@@ -912,6 +912,32 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
912 return (long) jiffies_64_to_clock_t(get_jiffies_64()); 912 return (long) jiffies_64_to_clock_t(get_jiffies_64());
913} 913}
914 914
915#ifdef CONFIG_COMPAT
916static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
917{
918 return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
919}
920
921COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
922{
923 if (tbuf) {
924 struct tms tms;
925 struct compat_tms tmp;
926
927 do_sys_times(&tms);
928 /* Convert our struct tms to the compat version. */
929 tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
930 tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
931 tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
932 tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
933 if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
934 return -EFAULT;
935 }
936 force_successful_syscall_return();
937 return compat_jiffies_to_clock_t(jiffies);
938}
939#endif
940
915/* 941/*
916 * This needs some heavy checking ... 942 * This needs some heavy checking ...
917 * I just haven't the stomach for it. I also don't fully 943 * I just haven't the stomach for it. I also don't fully
@@ -1306,6 +1332,54 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1306 return ret; 1332 return ret;
1307} 1333}
1308 1334
1335#ifdef CONFIG_COMPAT
1336
1337COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
1338 struct compat_rlimit __user *, rlim)
1339{
1340 struct rlimit r;
1341 struct compat_rlimit r32;
1342
1343 if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit)))
1344 return -EFAULT;
1345
1346 if (r32.rlim_cur == COMPAT_RLIM_INFINITY)
1347 r.rlim_cur = RLIM_INFINITY;
1348 else
1349 r.rlim_cur = r32.rlim_cur;
1350 if (r32.rlim_max == COMPAT_RLIM_INFINITY)
1351 r.rlim_max = RLIM_INFINITY;
1352 else
1353 r.rlim_max = r32.rlim_max;
1354 return do_prlimit(current, resource, &r, NULL);
1355}
1356
1357COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
1358 struct compat_rlimit __user *, rlim)
1359{
1360 struct rlimit r;
1361 int ret;
1362
1363 ret = do_prlimit(current, resource, NULL, &r);
1364 if (!ret) {
1365 struct compat_rlimit r32;
1366 if (r.rlim_cur > COMPAT_RLIM_INFINITY)
1367 r32.rlim_cur = COMPAT_RLIM_INFINITY;
1368 else
1369 r32.rlim_cur = r.rlim_cur;
1370 if (r.rlim_max > COMPAT_RLIM_INFINITY)
1371 r32.rlim_max = COMPAT_RLIM_INFINITY;
1372 else
1373 r32.rlim_max = r.rlim_max;
1374
1375 if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit)))
1376 return -EFAULT;
1377 }
1378 return ret;
1379}
1380
1381#endif
1382
1309#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT 1383#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
1310 1384
1311/* 1385/*
@@ -1328,6 +1402,30 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1328 return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; 1402 return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
1329} 1403}
1330 1404
1405#ifdef CONFIG_COMPAT
1406COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1407 struct compat_rlimit __user *, rlim)
1408{
1409 struct rlimit r;
1410
1411 if (resource >= RLIM_NLIMITS)
1412 return -EINVAL;
1413
1414 task_lock(current->group_leader);
1415 r = current->signal->rlim[resource];
1416 task_unlock(current->group_leader);
1417 if (r.rlim_cur > 0x7FFFFFFF)
1418 r.rlim_cur = 0x7FFFFFFF;
1419 if (r.rlim_max > 0x7FFFFFFF)
1420 r.rlim_max = 0x7FFFFFFF;
1421
1422 if (put_user(r.rlim_cur, &rlim->rlim_cur) ||
1423 put_user(r.rlim_max, &rlim->rlim_max))
1424 return -EFAULT;
1425 return 0;
1426}
1427#endif
1428
1331#endif 1429#endif
1332 1430
1333static inline bool rlim64_is_infinity(__u64 rlim64) 1431static inline bool rlim64_is_infinity(__u64 rlim64)
@@ -1552,7 +1650,7 @@ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
1552 r->ru_oublock += task_io_get_oublock(t); 1650 r->ru_oublock += task_io_get_oublock(t);
1553} 1651}
1554 1652
1555static void k_getrusage(struct task_struct *p, int who, struct rusage *r) 1653void getrusage(struct task_struct *p, int who, struct rusage *r)
1556{ 1654{
1557 struct task_struct *t; 1655 struct task_struct *t;
1558 unsigned long flags; 1656 unsigned long flags;
@@ -1626,20 +1724,16 @@ out:
1626 r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ 1724 r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
1627} 1725}
1628 1726
1629int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1727SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
1630{ 1728{
1631 struct rusage r; 1729 struct rusage r;
1632 1730
1633 k_getrusage(p, who, &r);
1634 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1635}
1636
1637SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
1638{
1639 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && 1731 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1640 who != RUSAGE_THREAD) 1732 who != RUSAGE_THREAD)
1641 return -EINVAL; 1733 return -EINVAL;
1642 return getrusage(current, who, ru); 1734
1735 getrusage(current, who, &r);
1736 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1643} 1737}
1644 1738
1645#ifdef CONFIG_COMPAT 1739#ifdef CONFIG_COMPAT
@@ -1651,7 +1745,7 @@ COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
1651 who != RUSAGE_THREAD) 1745 who != RUSAGE_THREAD)
1652 return -EINVAL; 1746 return -EINVAL;
1653 1747
1654 k_getrusage(current, who, &r); 1748 getrusage(current, who, &r);
1655 return put_compat_rusage(&r, ru); 1749 return put_compat_rusage(&r, ru);
1656} 1750}
1657#endif 1751#endif
@@ -2266,7 +2360,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2266 case PR_GET_THP_DISABLE: 2360 case PR_GET_THP_DISABLE:
2267 if (arg2 || arg3 || arg4 || arg5) 2361 if (arg2 || arg3 || arg4 || arg5)
2268 return -EINVAL; 2362 return -EINVAL;
2269 error = !!(me->mm->def_flags & VM_NOHUGEPAGE); 2363 error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
2270 break; 2364 break;
2271 case PR_SET_THP_DISABLE: 2365 case PR_SET_THP_DISABLE:
2272 if (arg3 || arg4 || arg5) 2366 if (arg3 || arg4 || arg5)
@@ -2274,9 +2368,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2274 if (down_write_killable(&me->mm->mmap_sem)) 2368 if (down_write_killable(&me->mm->mmap_sem))
2275 return -EINTR; 2369 return -EINTR;
2276 if (arg2) 2370 if (arg2)
2277 me->mm->def_flags |= VM_NOHUGEPAGE; 2371 set_bit(MMF_DISABLE_THP, &me->mm->flags);
2278 else 2372 else
2279 me->mm->def_flags &= ~VM_NOHUGEPAGE; 2373 clear_bit(MMF_DISABLE_THP, &me->mm->flags);
2280 up_write(&me->mm->mmap_sem); 2374 up_write(&me->mm->mmap_sem);
2281 break; 2375 break;
2282 case PR_MPX_ENABLE_MANAGEMENT: 2376 case PR_MPX_ENABLE_MANAGEMENT:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4dfba1a76cc3..6648fbbb8157 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -174,11 +174,32 @@ extern int no_unaligned_warning;
174 174
175#ifdef CONFIG_PROC_SYSCTL 175#ifdef CONFIG_PROC_SYSCTL
176 176
177#define SYSCTL_WRITES_LEGACY -1 177/**
178#define SYSCTL_WRITES_WARN 0 178 * enum sysctl_writes_mode - supported sysctl write modes
179#define SYSCTL_WRITES_STRICT 1 179 *
180 * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value
181 * to be written, and multiple writes on the same sysctl file descriptor
182 * will rewrite the sysctl value, regardless of file position. No warning
183 * is issued when the initial position is not 0.
184 * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is
185 * not 0.
186 * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at
187 * file position 0 and the value must be fully contained in the buffer
188 * sent to the write syscall. If dealing with strings respect the file
189 * position, but restrict this to the max length of the buffer, anything
190 * passed the max lenght will be ignored. Multiple writes will append
191 * to the buffer.
192 *
193 * These write modes control how current file position affects the behavior of
194 * updating sysctl values through the proc interface on each write.
195 */
196enum sysctl_writes_mode {
197 SYSCTL_WRITES_LEGACY = -1,
198 SYSCTL_WRITES_WARN = 0,
199 SYSCTL_WRITES_STRICT = 1,
200};
180 201
181static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; 202static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
182 203
183static int proc_do_cad_pid(struct ctl_table *table, int write, 204static int proc_do_cad_pid(struct ctl_table *table, int write,
184 void __user *buffer, size_t *lenp, loff_t *ppos); 205 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -880,6 +901,14 @@ static struct ctl_table kern_table[] = {
880#endif 901#endif
881 }, 902 },
882 { 903 {
904 .procname = "watchdog_cpumask",
905 .data = &watchdog_cpumask_bits,
906 .maxlen = NR_CPUS,
907 .mode = 0644,
908 .proc_handler = proc_watchdog_cpumask,
909 },
910#ifdef CONFIG_SOFTLOCKUP_DETECTOR
911 {
883 .procname = "soft_watchdog", 912 .procname = "soft_watchdog",
884 .data = &soft_watchdog_enabled, 913 .data = &soft_watchdog_enabled,
885 .maxlen = sizeof (int), 914 .maxlen = sizeof (int),
@@ -889,13 +918,6 @@ static struct ctl_table kern_table[] = {
889 .extra2 = &one, 918 .extra2 = &one,
890 }, 919 },
891 { 920 {
892 .procname = "watchdog_cpumask",
893 .data = &watchdog_cpumask_bits,
894 .maxlen = NR_CPUS,
895 .mode = 0644,
896 .proc_handler = proc_watchdog_cpumask,
897 },
898 {
899 .procname = "softlockup_panic", 921 .procname = "softlockup_panic",
900 .data = &softlockup_panic, 922 .data = &softlockup_panic,
901 .maxlen = sizeof(int), 923 .maxlen = sizeof(int),
@@ -904,27 +926,29 @@ static struct ctl_table kern_table[] = {
904 .extra1 = &zero, 926 .extra1 = &zero,
905 .extra2 = &one, 927 .extra2 = &one,
906 }, 928 },
907#ifdef CONFIG_HARDLOCKUP_DETECTOR 929#ifdef CONFIG_SMP
908 { 930 {
909 .procname = "hardlockup_panic", 931 .procname = "softlockup_all_cpu_backtrace",
910 .data = &hardlockup_panic, 932 .data = &sysctl_softlockup_all_cpu_backtrace,
911 .maxlen = sizeof(int), 933 .maxlen = sizeof(int),
912 .mode = 0644, 934 .mode = 0644,
913 .proc_handler = proc_dointvec_minmax, 935 .proc_handler = proc_dointvec_minmax,
914 .extra1 = &zero, 936 .extra1 = &zero,
915 .extra2 = &one, 937 .extra2 = &one,
916 }, 938 },
939#endif /* CONFIG_SMP */
917#endif 940#endif
918#ifdef CONFIG_SMP 941#ifdef CONFIG_HARDLOCKUP_DETECTOR
919 { 942 {
920 .procname = "softlockup_all_cpu_backtrace", 943 .procname = "hardlockup_panic",
921 .data = &sysctl_softlockup_all_cpu_backtrace, 944 .data = &hardlockup_panic,
922 .maxlen = sizeof(int), 945 .maxlen = sizeof(int),
923 .mode = 0644, 946 .mode = 0644,
924 .proc_handler = proc_dointvec_minmax, 947 .proc_handler = proc_dointvec_minmax,
925 .extra1 = &zero, 948 .extra1 = &zero,
926 .extra2 = &one, 949 .extra2 = &one,
927 }, 950 },
951#ifdef CONFIG_SMP
928 { 952 {
929 .procname = "hardlockup_all_cpu_backtrace", 953 .procname = "hardlockup_all_cpu_backtrace",
930 .data = &sysctl_hardlockup_all_cpu_backtrace, 954 .data = &sysctl_hardlockup_all_cpu_backtrace,
@@ -936,6 +960,8 @@ static struct ctl_table kern_table[] = {
936 }, 960 },
937#endif /* CONFIG_SMP */ 961#endif /* CONFIG_SMP */
938#endif 962#endif
963#endif
964
939#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 965#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
940 { 966 {
941 .procname = "unknown_nmi_panic", 967 .procname = "unknown_nmi_panic",
@@ -1950,6 +1976,32 @@ static void warn_sysctl_write(struct ctl_table *table)
1950} 1976}
1951 1977
1952/** 1978/**
1979 * proc_first_pos_non_zero_ignore - check if firs position is allowed
1980 * @ppos: file position
1981 * @table: the sysctl table
1982 *
1983 * Returns true if the first position is non-zero and the sysctl_writes_strict
1984 * mode indicates this is not allowed for numeric input types. String proc
1985 * hadlers can ignore the return value.
1986 */
1987static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
1988 struct ctl_table *table)
1989{
1990 if (!*ppos)
1991 return false;
1992
1993 switch (sysctl_writes_strict) {
1994 case SYSCTL_WRITES_STRICT:
1995 return true;
1996 case SYSCTL_WRITES_WARN:
1997 warn_sysctl_write(table);
1998 return false;
1999 default:
2000 return false;
2001 }
2002}
2003
2004/**
1953 * proc_dostring - read a string sysctl 2005 * proc_dostring - read a string sysctl
1954 * @table: the sysctl table 2006 * @table: the sysctl table
1955 * @write: %TRUE if this is a write to the sysctl file 2007 * @write: %TRUE if this is a write to the sysctl file
@@ -1969,8 +2021,8 @@ static void warn_sysctl_write(struct ctl_table *table)
1969int proc_dostring(struct ctl_table *table, int write, 2021int proc_dostring(struct ctl_table *table, int write,
1970 void __user *buffer, size_t *lenp, loff_t *ppos) 2022 void __user *buffer, size_t *lenp, loff_t *ppos)
1971{ 2023{
1972 if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) 2024 if (write)
1973 warn_sysctl_write(table); 2025 proc_first_pos_non_zero_ignore(ppos, table);
1974 2026
1975 return _proc_do_string((char *)(table->data), table->maxlen, write, 2027 return _proc_do_string((char *)(table->data), table->maxlen, write,
1976 (char __user *)buffer, lenp, ppos); 2028 (char __user *)buffer, lenp, ppos);
@@ -2128,19 +2180,18 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
2128 return 0; 2180 return 0;
2129} 2181}
2130 2182
2131static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, 2183static int do_proc_douintvec_conv(unsigned long *lvalp,
2132 int *valp, 2184 unsigned int *valp,
2133 int write, void *data) 2185 int write, void *data)
2134{ 2186{
2135 if (write) { 2187 if (write) {
2136 if (*negp) 2188 if (*lvalp > UINT_MAX)
2137 return -EINVAL; 2189 return -EINVAL;
2138 if (*lvalp > UINT_MAX) 2190 if (*lvalp > UINT_MAX)
2139 return -EINVAL; 2191 return -EINVAL;
2140 *valp = *lvalp; 2192 *valp = *lvalp;
2141 } else { 2193 } else {
2142 unsigned int val = *valp; 2194 unsigned int val = *valp;
2143 *negp = false;
2144 *lvalp = (unsigned long)val; 2195 *lvalp = (unsigned long)val;
2145 } 2196 }
2146 return 0; 2197 return 0;
@@ -2172,17 +2223,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2172 conv = do_proc_dointvec_conv; 2223 conv = do_proc_dointvec_conv;
2173 2224
2174 if (write) { 2225 if (write) {
2175 if (*ppos) { 2226 if (proc_first_pos_non_zero_ignore(ppos, table))
2176 switch (sysctl_writes_strict) { 2227 goto out;
2177 case SYSCTL_WRITES_STRICT:
2178 goto out;
2179 case SYSCTL_WRITES_WARN:
2180 warn_sysctl_write(table);
2181 break;
2182 default:
2183 break;
2184 }
2185 }
2186 2228
2187 if (left > PAGE_SIZE - 1) 2229 if (left > PAGE_SIZE - 1)
2188 left = PAGE_SIZE - 1; 2230 left = PAGE_SIZE - 1;
@@ -2249,6 +2291,146 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
2249 buffer, lenp, ppos, conv, data); 2291 buffer, lenp, ppos, conv, data);
2250} 2292}
2251 2293
2294static int do_proc_douintvec_w(unsigned int *tbl_data,
2295 struct ctl_table *table,
2296 void __user *buffer,
2297 size_t *lenp, loff_t *ppos,
2298 int (*conv)(unsigned long *lvalp,
2299 unsigned int *valp,
2300 int write, void *data),
2301 void *data)
2302{
2303 unsigned long lval;
2304 int err = 0;
2305 size_t left;
2306 bool neg;
2307 char *kbuf = NULL, *p;
2308
2309 left = *lenp;
2310
2311 if (proc_first_pos_non_zero_ignore(ppos, table))
2312 goto bail_early;
2313
2314 if (left > PAGE_SIZE - 1)
2315 left = PAGE_SIZE - 1;
2316
2317 p = kbuf = memdup_user_nul(buffer, left);
2318 if (IS_ERR(kbuf))
2319 return -EINVAL;
2320
2321 left -= proc_skip_spaces(&p);
2322 if (!left) {
2323 err = -EINVAL;
2324 goto out_free;
2325 }
2326
2327 err = proc_get_long(&p, &left, &lval, &neg,
2328 proc_wspace_sep,
2329 sizeof(proc_wspace_sep), NULL);
2330 if (err || neg) {
2331 err = -EINVAL;
2332 goto out_free;
2333 }
2334
2335 if (conv(&lval, tbl_data, 1, data)) {
2336 err = -EINVAL;
2337 goto out_free;
2338 }
2339
2340 if (!err && left)
2341 left -= proc_skip_spaces(&p);
2342
2343out_free:
2344 kfree(kbuf);
2345 if (err)
2346 return -EINVAL;
2347
2348 return 0;
2349
2350 /* This is in keeping with old __do_proc_dointvec() */
2351bail_early:
2352 *ppos += *lenp;
2353 return err;
2354}
2355
2356static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer,
2357 size_t *lenp, loff_t *ppos,
2358 int (*conv)(unsigned long *lvalp,
2359 unsigned int *valp,
2360 int write, void *data),
2361 void *data)
2362{
2363 unsigned long lval;
2364 int err = 0;
2365 size_t left;
2366
2367 left = *lenp;
2368
2369 if (conv(&lval, tbl_data, 0, data)) {
2370 err = -EINVAL;
2371 goto out;
2372 }
2373
2374 err = proc_put_long(&buffer, &left, lval, false);
2375 if (err || !left)
2376 goto out;
2377
2378 err = proc_put_char(&buffer, &left, '\n');
2379
2380out:
2381 *lenp -= left;
2382 *ppos += *lenp;
2383
2384 return err;
2385}
2386
2387static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
2388 int write, void __user *buffer,
2389 size_t *lenp, loff_t *ppos,
2390 int (*conv)(unsigned long *lvalp,
2391 unsigned int *valp,
2392 int write, void *data),
2393 void *data)
2394{
2395 unsigned int *i, vleft;
2396
2397 if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
2398 *lenp = 0;
2399 return 0;
2400 }
2401
2402 i = (unsigned int *) tbl_data;
2403 vleft = table->maxlen / sizeof(*i);
2404
2405 /*
2406 * Arrays are not supported, keep this simple. *Do not* add
2407 * support for them.
2408 */
2409 if (vleft != 1) {
2410 *lenp = 0;
2411 return -EINVAL;
2412 }
2413
2414 if (!conv)
2415 conv = do_proc_douintvec_conv;
2416
2417 if (write)
2418 return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
2419 conv, data);
2420 return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
2421}
2422
2423static int do_proc_douintvec(struct ctl_table *table, int write,
2424 void __user *buffer, size_t *lenp, loff_t *ppos,
2425 int (*conv)(unsigned long *lvalp,
2426 unsigned int *valp,
2427 int write, void *data),
2428 void *data)
2429{
2430 return __do_proc_douintvec(table->data, table, write,
2431 buffer, lenp, ppos, conv, data);
2432}
2433
2252/** 2434/**
2253 * proc_dointvec - read a vector of integers 2435 * proc_dointvec - read a vector of integers
2254 * @table: the sysctl table 2436 * @table: the sysctl table
@@ -2284,8 +2466,8 @@ int proc_dointvec(struct ctl_table *table, int write,
2284int proc_douintvec(struct ctl_table *table, int write, 2466int proc_douintvec(struct ctl_table *table, int write,
2285 void __user *buffer, size_t *lenp, loff_t *ppos) 2467 void __user *buffer, size_t *lenp, loff_t *ppos)
2286{ 2468{
2287 return do_proc_dointvec(table, write, buffer, lenp, ppos, 2469 return do_proc_douintvec(table, write, buffer, lenp, ppos,
2288 do_proc_douintvec_conv, NULL); 2470 do_proc_douintvec_conv, NULL);
2289} 2471}
2290 2472
2291/* 2473/*
@@ -2390,6 +2572,65 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2390 do_proc_dointvec_minmax_conv, &param); 2572 do_proc_dointvec_minmax_conv, &param);
2391} 2573}
2392 2574
2575struct do_proc_douintvec_minmax_conv_param {
2576 unsigned int *min;
2577 unsigned int *max;
2578};
2579
2580static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
2581 unsigned int *valp,
2582 int write, void *data)
2583{
2584 struct do_proc_douintvec_minmax_conv_param *param = data;
2585
2586 if (write) {
2587 unsigned int val = *lvalp;
2588
2589 if ((param->min && *param->min > val) ||
2590 (param->max && *param->max < val))
2591 return -ERANGE;
2592
2593 if (*lvalp > UINT_MAX)
2594 return -EINVAL;
2595 *valp = val;
2596 } else {
2597 unsigned int val = *valp;
2598 *lvalp = (unsigned long) val;
2599 }
2600
2601 return 0;
2602}
2603
2604/**
2605 * proc_douintvec_minmax - read a vector of unsigned ints with min/max values
2606 * @table: the sysctl table
2607 * @write: %TRUE if this is a write to the sysctl file
2608 * @buffer: the user buffer
2609 * @lenp: the size of the user buffer
2610 * @ppos: file position
2611 *
2612 * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
2613 * values from/to the user buffer, treated as an ASCII string. Negative
2614 * strings are not allowed.
2615 *
2616 * This routine will ensure the values are within the range specified by
2617 * table->extra1 (min) and table->extra2 (max). There is a final sanity
2618 * check for UINT_MAX to avoid having to support wrap around uses from
2619 * userspace.
2620 *
2621 * Returns 0 on success.
2622 */
2623int proc_douintvec_minmax(struct ctl_table *table, int write,
2624 void __user *buffer, size_t *lenp, loff_t *ppos)
2625{
2626 struct do_proc_douintvec_minmax_conv_param param = {
2627 .min = (unsigned int *) table->extra1,
2628 .max = (unsigned int *) table->extra2,
2629 };
2630 return do_proc_douintvec(table, write, buffer, lenp, ppos,
2631 do_proc_douintvec_minmax_conv, &param);
2632}
2633
2393static void validate_coredump_safety(void) 2634static void validate_coredump_safety(void)
2394{ 2635{
2395#ifdef CONFIG_COREDUMP 2636#ifdef CONFIG_COREDUMP
@@ -2447,17 +2688,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2447 left = *lenp; 2688 left = *lenp;
2448 2689
2449 if (write) { 2690 if (write) {
2450 if (*ppos) { 2691 if (proc_first_pos_non_zero_ignore(ppos, table))
2451 switch (sysctl_writes_strict) { 2692 goto out;
2452 case SYSCTL_WRITES_STRICT:
2453 goto out;
2454 case SYSCTL_WRITES_WARN:
2455 warn_sysctl_write(table);
2456 break;
2457 default:
2458 break;
2459 }
2460 }
2461 2693
2462 if (left > PAGE_SIZE - 1) 2694 if (left > PAGE_SIZE - 1)
2463 left = PAGE_SIZE - 1; 2695 left = PAGE_SIZE - 1;
@@ -2898,6 +3130,12 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2898 return -ENOSYS; 3130 return -ENOSYS;
2899} 3131}
2900 3132
3133int proc_douintvec_minmax(struct ctl_table *table, int write,
3134 void __user *buffer, size_t *lenp, loff_t *ppos)
3135{
3136 return -ENOSYS;
3137}
3138
2901int proc_dointvec_jiffies(struct ctl_table *table, int write, 3139int proc_dointvec_jiffies(struct ctl_table *table, int write,
2902 void __user *buffer, size_t *lenp, loff_t *ppos) 3140 void __user *buffer, size_t *lenp, loff_t *ppos)
2903{ 3141{
@@ -2940,6 +3178,7 @@ EXPORT_SYMBOL(proc_dointvec);
2940EXPORT_SYMBOL(proc_douintvec); 3178EXPORT_SYMBOL(proc_douintvec);
2941EXPORT_SYMBOL(proc_dointvec_jiffies); 3179EXPORT_SYMBOL(proc_dointvec_jiffies);
2942EXPORT_SYMBOL(proc_dointvec_minmax); 3180EXPORT_SYMBOL(proc_dointvec_minmax);
3181EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
2943EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); 3182EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
2944EXPORT_SYMBOL(proc_dointvec_ms_jiffies); 3183EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
2945EXPORT_SYMBOL(proc_dostring); 3184EXPORT_SYMBOL(proc_dostring);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index ece4b177052b..02e1859f2ca8 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1119,7 +1119,7 @@ static ssize_t bin_uuid(struct file *file,
1119 /* Only supports reads */ 1119 /* Only supports reads */
1120 if (oldval && oldlen) { 1120 if (oldval && oldlen) {
1121 char buf[UUID_STRING_LEN + 1]; 1121 char buf[UUID_STRING_LEN + 1];
1122 uuid_be uuid; 1122 uuid_t uuid;
1123 1123
1124 result = kernel_read(file, 0, buf, sizeof(buf) - 1); 1124 result = kernel_read(file, 0, buf, sizeof(buf) - 1);
1125 if (result < 0) 1125 if (result < 0)
@@ -1128,7 +1128,7 @@ static ssize_t bin_uuid(struct file *file,
1128 buf[result] = '\0'; 1128 buf[result] = '\0';
1129 1129
1130 result = -EIO; 1130 result = -EIO;
1131 if (uuid_be_to_bin(buf, &uuid)) 1131 if (uuid_parse(buf, &uuid))
1132 goto out; 1132 goto out;
1133 1133
1134 if (oldlen > 16) 1134 if (oldlen > 16)
@@ -1346,7 +1346,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
1346 * CTL_KERN/KERN_VERSION is used by older glibc and cannot 1346 * CTL_KERN/KERN_VERSION is used by older glibc and cannot
1347 * ever go away. 1347 * ever go away.
1348 */ 1348 */
1349 if (name[0] == CTL_KERN && name[1] == KERN_VERSION) 1349 if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION)
1350 return; 1350 return;
1351 1351
1352 if (printk_ratelimit()) { 1352 if (printk_ratelimit()) {
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 4008d9f95dd7..ac09bc29eb08 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -126,56 +126,6 @@ config NO_HZ_FULL_ALL
126 Note the boot CPU will still be kept outside the range to 126 Note the boot CPU will still be kept outside the range to
127 handle the timekeeping duty. 127 handle the timekeeping duty.
128 128
129config NO_HZ_FULL_SYSIDLE
130 bool "Detect full-system idle state for full dynticks system"
131 depends on NO_HZ_FULL
132 default n
133 help
134 At least one CPU must keep the scheduling-clock tick running for
135 timekeeping purposes whenever there is a non-idle CPU, where
136 "non-idle" also includes dynticks CPUs as long as they are
137 running non-idle tasks. Because the underlying adaptive-tick
138 support cannot distinguish between all CPUs being idle and
139 all CPUs each running a single task in dynticks mode, the
140 underlying support simply ensures that there is always a CPU
141 handling the scheduling-clock tick, whether or not all CPUs
142 are idle. This Kconfig option enables scalable detection of
143 the all-CPUs-idle state, thus allowing the scheduling-clock
144 tick to be disabled when all CPUs are idle. Note that scalable
145 detection of the all-CPUs-idle state means that larger systems
146 will be slower to declare the all-CPUs-idle state.
147
148 Say Y if you would like to help debug all-CPUs-idle detection.
149
150 Say N if you are unsure.
151
152config NO_HZ_FULL_SYSIDLE_SMALL
153 int "Number of CPUs above which large-system approach is used"
154 depends on NO_HZ_FULL_SYSIDLE
155 range 1 NR_CPUS
156 default 8
157 help
158 The full-system idle detection mechanism takes a lazy approach
159 on large systems, as is required to attain decent scalability.
160 However, on smaller systems, scalability is not anywhere near as
161 large a concern as is energy efficiency. The sysidle subsystem
162 therefore uses a fast but non-scalable algorithm for small
163 systems and a lazier but scalable algorithm for large systems.
164 This Kconfig parameter defines the number of CPUs in the largest
165 system that will be considered to be "small".
166
167 The default value will be fine in most cases. Battery-powered
168 systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
169 numbers of CPUs, and (3) are suffering from battery-lifetime
170 problems due to long sysidle latencies might wish to experiment
171 with larger values for this Kconfig parameter. On the other
172 hand, they might be even better served by disabling NO_HZ_FULL
173 entirely, given that NO_HZ_FULL is intended for HPC and
174 real-time workloads that at present do not tend to be run on
175 battery-powered systems.
176
177 Take the default if you are unsure.
178
179config NO_HZ 129config NO_HZ
180 bool "Old Idle dynticks config" 130 bool "Old Idle dynticks config"
181 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS 131 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 5cb5b0008d97..0b8ff7d257ea 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -27,6 +27,9 @@
27#include <linux/posix-timers.h> 27#include <linux/posix-timers.h>
28#include <linux/workqueue.h> 28#include <linux/workqueue.h>
29#include <linux/freezer.h> 29#include <linux/freezer.h>
30#include <linux/compat.h>
31
32#include "posix-timers.h"
30 33
31#define CREATE_TRACE_POINTS 34#define CREATE_TRACE_POINTS
32#include <trace/events/alarmtimer.h> 35#include <trace/events/alarmtimer.h>
@@ -45,11 +48,13 @@ static struct alarm_base {
45 clockid_t base_clockid; 48 clockid_t base_clockid;
46} alarm_bases[ALARM_NUMTYPE]; 49} alarm_bases[ALARM_NUMTYPE];
47 50
51#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS)
48/* freezer information to handle clock_nanosleep triggered wakeups */ 52/* freezer information to handle clock_nanosleep triggered wakeups */
49static enum alarmtimer_type freezer_alarmtype; 53static enum alarmtimer_type freezer_alarmtype;
50static ktime_t freezer_expires; 54static ktime_t freezer_expires;
51static ktime_t freezer_delta; 55static ktime_t freezer_delta;
52static DEFINE_SPINLOCK(freezer_delta_lock); 56static DEFINE_SPINLOCK(freezer_delta_lock);
57#endif
53 58
54static struct wakeup_source *ws; 59static struct wakeup_source *ws;
55 60
@@ -307,38 +312,6 @@ static int alarmtimer_resume(struct device *dev)
307} 312}
308#endif 313#endif
309 314
310static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
311{
312 struct alarm_base *base;
313 unsigned long flags;
314 ktime_t delta;
315
316 switch(type) {
317 case ALARM_REALTIME:
318 base = &alarm_bases[ALARM_REALTIME];
319 type = ALARM_REALTIME_FREEZER;
320 break;
321 case ALARM_BOOTTIME:
322 base = &alarm_bases[ALARM_BOOTTIME];
323 type = ALARM_BOOTTIME_FREEZER;
324 break;
325 default:
326 WARN_ONCE(1, "Invalid alarm type: %d\n", type);
327 return;
328 }
329
330 delta = ktime_sub(absexp, base->gettime());
331
332 spin_lock_irqsave(&freezer_delta_lock, flags);
333 if (!freezer_delta || (delta < freezer_delta)) {
334 freezer_delta = delta;
335 freezer_expires = absexp;
336 freezer_alarmtype = type;
337 }
338 spin_unlock_irqrestore(&freezer_delta_lock, flags);
339}
340
341
342/** 315/**
343 * alarm_init - Initialize an alarm structure 316 * alarm_init - Initialize an alarm structure
344 * @alarm: ptr to alarm to be initialized 317 * @alarm: ptr to alarm to be initialized
@@ -387,7 +360,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start)
387{ 360{
388 struct alarm_base *base = &alarm_bases[alarm->type]; 361 struct alarm_base *base = &alarm_bases[alarm->type];
389 362
390 start = ktime_add(start, base->gettime()); 363 start = ktime_add_safe(start, base->gettime());
391 alarm_start(alarm, start); 364 alarm_start(alarm, start);
392} 365}
393EXPORT_SYMBOL_GPL(alarm_start_relative); 366EXPORT_SYMBOL_GPL(alarm_start_relative);
@@ -475,7 +448,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
475 overrun++; 448 overrun++;
476 } 449 }
477 450
478 alarm->node.expires = ktime_add(alarm->node.expires, interval); 451 alarm->node.expires = ktime_add_safe(alarm->node.expires, interval);
479 return overrun; 452 return overrun;
480} 453}
481EXPORT_SYMBOL_GPL(alarm_forward); 454EXPORT_SYMBOL_GPL(alarm_forward);
@@ -488,6 +461,38 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
488} 461}
489EXPORT_SYMBOL_GPL(alarm_forward_now); 462EXPORT_SYMBOL_GPL(alarm_forward_now);
490 463
464#ifdef CONFIG_POSIX_TIMERS
465
466static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
467{
468 struct alarm_base *base;
469 unsigned long flags;
470 ktime_t delta;
471
472 switch(type) {
473 case ALARM_REALTIME:
474 base = &alarm_bases[ALARM_REALTIME];
475 type = ALARM_REALTIME_FREEZER;
476 break;
477 case ALARM_BOOTTIME:
478 base = &alarm_bases[ALARM_BOOTTIME];
479 type = ALARM_BOOTTIME_FREEZER;
480 break;
481 default:
482 WARN_ONCE(1, "Invalid alarm type: %d\n", type);
483 return;
484 }
485
486 delta = ktime_sub(absexp, base->gettime());
487
488 spin_lock_irqsave(&freezer_delta_lock, flags);
489 if (!freezer_delta || (delta < freezer_delta)) {
490 freezer_delta = delta;
491 freezer_expires = absexp;
492 freezer_alarmtype = type;
493 }
494 spin_unlock_irqrestore(&freezer_delta_lock, flags);
495}
491 496
492/** 497/**
493 * clock2alarm - helper that converts from clockid to alarmtypes 498 * clock2alarm - helper that converts from clockid to alarmtypes
@@ -511,22 +516,26 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
511static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, 516static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
512 ktime_t now) 517 ktime_t now)
513{ 518{
514 unsigned long flags;
515 struct k_itimer *ptr = container_of(alarm, struct k_itimer, 519 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
516 it.alarm.alarmtimer); 520 it.alarm.alarmtimer);
517 enum alarmtimer_restart result = ALARMTIMER_NORESTART; 521 enum alarmtimer_restart result = ALARMTIMER_NORESTART;
522 unsigned long flags;
523 int si_private = 0;
518 524
519 spin_lock_irqsave(&ptr->it_lock, flags); 525 spin_lock_irqsave(&ptr->it_lock, flags);
520 if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) {
521 if (IS_ENABLED(CONFIG_POSIX_TIMERS) &&
522 posix_timer_event(ptr, 0) != 0)
523 ptr->it_overrun++;
524 }
525 526
526 /* Re-add periodic timers */ 527 ptr->it_active = 0;
527 if (ptr->it.alarm.interval) { 528 if (ptr->it_interval)
528 ptr->it_overrun += alarm_forward(alarm, now, 529 si_private = ++ptr->it_requeue_pending;
529 ptr->it.alarm.interval); 530
531 if (posix_timer_event(ptr, si_private) && ptr->it_interval) {
532 /*
533 * Handle ignored signals and rearm the timer. This will go
534 * away once we handle ignored signals proper.
535 */
536 ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval);
537 ++ptr->it_requeue_pending;
538 ptr->it_active = 1;
530 result = ALARMTIMER_RESTART; 539 result = ALARMTIMER_RESTART;
531 } 540 }
532 spin_unlock_irqrestore(&ptr->it_lock, flags); 541 spin_unlock_irqrestore(&ptr->it_lock, flags);
@@ -535,6 +544,72 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
535} 544}
536 545
537/** 546/**
547 * alarm_timer_rearm - Posix timer callback for rearming timer
548 * @timr: Pointer to the posixtimer data struct
549 */
550static void alarm_timer_rearm(struct k_itimer *timr)
551{
552 struct alarm *alarm = &timr->it.alarm.alarmtimer;
553
554 timr->it_overrun += alarm_forward_now(alarm, timr->it_interval);
555 alarm_start(alarm, alarm->node.expires);
556}
557
558/**
559 * alarm_timer_forward - Posix timer callback for forwarding timer
560 * @timr: Pointer to the posixtimer data struct
561 * @now: Current time to forward the timer against
562 */
563static int alarm_timer_forward(struct k_itimer *timr, ktime_t now)
564{
565 struct alarm *alarm = &timr->it.alarm.alarmtimer;
566
567 return (int) alarm_forward(alarm, timr->it_interval, now);
568}
569
570/**
571 * alarm_timer_remaining - Posix timer callback to retrieve remaining time
572 * @timr: Pointer to the posixtimer data struct
573 * @now: Current time to calculate against
574 */
575static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now)
576{
577 struct alarm *alarm = &timr->it.alarm.alarmtimer;
578
579 return ktime_sub(now, alarm->node.expires);
580}
581
582/**
583 * alarm_timer_try_to_cancel - Posix timer callback to cancel a timer
584 * @timr: Pointer to the posixtimer data struct
585 */
586static int alarm_timer_try_to_cancel(struct k_itimer *timr)
587{
588 return alarm_try_to_cancel(&timr->it.alarm.alarmtimer);
589}
590
591/**
592 * alarm_timer_arm - Posix timer callback to arm a timer
593 * @timr: Pointer to the posixtimer data struct
594 * @expires: The new expiry time
595 * @absolute: Expiry value is absolute time
596 * @sigev_none: Posix timer does not deliver signals
597 */
598static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
599 bool absolute, bool sigev_none)
600{
601 struct alarm *alarm = &timr->it.alarm.alarmtimer;
602 struct alarm_base *base = &alarm_bases[alarm->type];
603
604 if (!absolute)
605 expires = ktime_add_safe(expires, base->gettime());
606 if (sigev_none)
607 alarm->node.expires = expires;
608 else
609 alarm_start(&timr->it.alarm.alarmtimer, expires);
610}
611
612/**
538 * alarm_clock_getres - posix getres interface 613 * alarm_clock_getres - posix getres interface
539 * @which_clock: clockid 614 * @which_clock: clockid
540 * @tp: timespec to fill 615 * @tp: timespec to fill
@@ -591,89 +666,6 @@ static int alarm_timer_create(struct k_itimer *new_timer)
591} 666}
592 667
593/** 668/**
594 * alarm_timer_get - posix timer_get interface
595 * @new_timer: k_itimer pointer
596 * @cur_setting: itimerspec data to fill
597 *
598 * Copies out the current itimerspec data
599 */
600static void alarm_timer_get(struct k_itimer *timr,
601 struct itimerspec64 *cur_setting)
602{
603 ktime_t relative_expiry_time =
604 alarm_expires_remaining(&(timr->it.alarm.alarmtimer));
605
606 if (ktime_to_ns(relative_expiry_time) > 0) {
607 cur_setting->it_value = ktime_to_timespec64(relative_expiry_time);
608 } else {
609 cur_setting->it_value.tv_sec = 0;
610 cur_setting->it_value.tv_nsec = 0;
611 }
612
613 cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval);
614}
615
616/**
617 * alarm_timer_del - posix timer_del interface
618 * @timr: k_itimer pointer to be deleted
619 *
620 * Cancels any programmed alarms for the given timer.
621 */
622static int alarm_timer_del(struct k_itimer *timr)
623{
624 if (!rtcdev)
625 return -ENOTSUPP;
626
627 if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
628 return TIMER_RETRY;
629
630 return 0;
631}
632
633/**
634 * alarm_timer_set - posix timer_set interface
635 * @timr: k_itimer pointer to be deleted
636 * @flags: timer flags
637 * @new_setting: itimerspec to be used
638 * @old_setting: itimerspec being replaced
639 *
640 * Sets the timer to new_setting, and starts the timer.
641 */
642static int alarm_timer_set(struct k_itimer *timr, int flags,
643 struct itimerspec64 *new_setting,
644 struct itimerspec64 *old_setting)
645{
646 ktime_t exp;
647
648 if (!rtcdev)
649 return -ENOTSUPP;
650
651 if (flags & ~TIMER_ABSTIME)
652 return -EINVAL;
653
654 if (old_setting)
655 alarm_timer_get(timr, old_setting);
656
657 /* If the timer was already set, cancel it */
658 if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
659 return TIMER_RETRY;
660
661 /* start the timer */
662 timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval);
663 exp = timespec64_to_ktime(new_setting->it_value);
664 /* Convert (if necessary) to absolute time */
665 if (flags != TIMER_ABSTIME) {
666 ktime_t now;
667
668 now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
669 exp = ktime_add(now, exp);
670 }
671
672 alarm_start(&timr->it.alarm.alarmtimer, exp);
673 return 0;
674}
675
676/**
677 * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep 669 * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
678 * @alarm: ptr to alarm that fired 670 * @alarm: ptr to alarm that fired
679 * 671 *
@@ -697,8 +689,10 @@ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
697 * 689 *
698 * Sets the alarm timer and sleeps until it is fired or interrupted. 690 * Sets the alarm timer and sleeps until it is fired or interrupted.
699 */ 691 */
700static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) 692static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
693 enum alarmtimer_type type)
701{ 694{
695 struct restart_block *restart;
702 alarm->data = (void *)current; 696 alarm->data = (void *)current;
703 do { 697 do {
704 set_current_state(TASK_INTERRUPTIBLE); 698 set_current_state(TASK_INTERRUPTIBLE);
@@ -711,36 +705,25 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
711 705
712 __set_current_state(TASK_RUNNING); 706 __set_current_state(TASK_RUNNING);
713 707
714 return (alarm->data == NULL); 708 if (!alarm->data)
715}
716
717
718/**
719 * update_rmtp - Update remaining timespec value
720 * @exp: expiration time
721 * @type: timer type
722 * @rmtp: user pointer to remaining timepsec value
723 *
724 * Helper function that fills in rmtp value with time between
725 * now and the exp value
726 */
727static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
728 struct timespec __user *rmtp)
729{
730 struct timespec rmt;
731 ktime_t rem;
732
733 rem = ktime_sub(exp, alarm_bases[type].gettime());
734
735 if (rem <= 0)
736 return 0; 709 return 0;
737 rmt = ktime_to_timespec(rem);
738 710
739 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) 711 if (freezing(current))
740 return -EFAULT; 712 alarmtimer_freezerset(absexp, type);
713 restart = &current->restart_block;
714 if (restart->nanosleep.type != TT_NONE) {
715 struct timespec64 rmt;
716 ktime_t rem;
717
718 rem = ktime_sub(absexp, alarm_bases[type].gettime());
741 719
742 return 1; 720 if (rem <= 0)
721 return 0;
722 rmt = ktime_to_timespec64(rem);
743 723
724 return nanosleep_copyout(restart, &rmt);
725 }
726 return -ERESTART_RESTARTBLOCK;
744} 727}
745 728
746/** 729/**
@@ -752,32 +735,12 @@ static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
752static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) 735static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
753{ 736{
754 enum alarmtimer_type type = restart->nanosleep.clockid; 737 enum alarmtimer_type type = restart->nanosleep.clockid;
755 ktime_t exp; 738 ktime_t exp = restart->nanosleep.expires;
756 struct timespec __user *rmtp;
757 struct alarm alarm; 739 struct alarm alarm;
758 int ret = 0;
759 740
760 exp = restart->nanosleep.expires;
761 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); 741 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
762 742
763 if (alarmtimer_do_nsleep(&alarm, exp)) 743 return alarmtimer_do_nsleep(&alarm, exp, type);
764 goto out;
765
766 if (freezing(current))
767 alarmtimer_freezerset(exp, type);
768
769 rmtp = restart->nanosleep.rmtp;
770 if (rmtp) {
771 ret = update_rmtp(exp, type, rmtp);
772 if (ret <= 0)
773 goto out;
774 }
775
776
777 /* The other values in restart are already filled in */
778 ret = -ERESTART_RESTARTBLOCK;
779out:
780 return ret;
781} 744}
782 745
783/** 746/**
@@ -790,11 +753,10 @@ out:
790 * Handles clock_nanosleep calls against _ALARM clockids 753 * Handles clock_nanosleep calls against _ALARM clockids
791 */ 754 */
792static int alarm_timer_nsleep(const clockid_t which_clock, int flags, 755static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
793 struct timespec64 *tsreq, 756 const struct timespec64 *tsreq)
794 struct timespec __user *rmtp)
795{ 757{
796 enum alarmtimer_type type = clock2alarm(which_clock); 758 enum alarmtimer_type type = clock2alarm(which_clock);
797 struct restart_block *restart; 759 struct restart_block *restart = &current->restart_block;
798 struct alarm alarm; 760 struct alarm alarm;
799 ktime_t exp; 761 ktime_t exp;
800 int ret = 0; 762 int ret = 0;
@@ -817,35 +779,36 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
817 exp = ktime_add(now, exp); 779 exp = ktime_add(now, exp);
818 } 780 }
819 781
820 if (alarmtimer_do_nsleep(&alarm, exp)) 782 ret = alarmtimer_do_nsleep(&alarm, exp, type);
821 goto out; 783 if (ret != -ERESTART_RESTARTBLOCK)
822 784 return ret;
823 if (freezing(current))
824 alarmtimer_freezerset(exp, type);
825 785
826 /* abs timers don't set remaining time or restart */ 786 /* abs timers don't set remaining time or restart */
827 if (flags == TIMER_ABSTIME) { 787 if (flags == TIMER_ABSTIME)
828 ret = -ERESTARTNOHAND; 788 return -ERESTARTNOHAND;
829 goto out;
830 }
831 789
832 if (rmtp) {
833 ret = update_rmtp(exp, type, rmtp);
834 if (ret <= 0)
835 goto out;
836 }
837
838 restart = &current->restart_block;
839 restart->fn = alarm_timer_nsleep_restart; 790 restart->fn = alarm_timer_nsleep_restart;
840 restart->nanosleep.clockid = type; 791 restart->nanosleep.clockid = type;
841 restart->nanosleep.expires = exp; 792 restart->nanosleep.expires = exp;
842 restart->nanosleep.rmtp = rmtp;
843 ret = -ERESTART_RESTARTBLOCK;
844
845out:
846 return ret; 793 return ret;
847} 794}
848 795
/*
 * Clock operations for CLOCK_REALTIME_ALARM / CLOCK_BOOTTIME_ALARM.
 * set/get/del use the generic common_timer_* implementations; only the
 * alarm-specific arm/rearm/cancel hooks and nsleep are provided here.
 */
const struct k_clock alarm_clock = {
	.clock_getres = alarm_clock_getres,
	.clock_get = alarm_clock_get,
	.timer_create = alarm_timer_create,
	.timer_set = common_timer_set,
	.timer_del = common_timer_del,
	.timer_get = common_timer_get,
	.timer_arm = alarm_timer_arm,
	.timer_rearm = alarm_timer_rearm,
	.timer_forward = alarm_timer_forward,
	.timer_remaining = alarm_timer_remaining,
	.timer_try_to_cancel = alarm_timer_try_to_cancel,
	.nsleep = alarm_timer_nsleep,
};
810#endif /* CONFIG_POSIX_TIMERS */
811
849 812
850/* Suspend hook structures */ 813/* Suspend hook structures */
851static const struct dev_pm_ops alarmtimer_pm_ops = { 814static const struct dev_pm_ops alarmtimer_pm_ops = {
@@ -871,23 +834,9 @@ static int __init alarmtimer_init(void)
871 struct platform_device *pdev; 834 struct platform_device *pdev;
872 int error = 0; 835 int error = 0;
873 int i; 836 int i;
874 struct k_clock alarm_clock = {
875 .clock_getres = alarm_clock_getres,
876 .clock_get = alarm_clock_get,
877 .timer_create = alarm_timer_create,
878 .timer_set = alarm_timer_set,
879 .timer_del = alarm_timer_del,
880 .timer_get = alarm_timer_get,
881 .nsleep = alarm_timer_nsleep,
882 };
883 837
884 alarmtimer_rtc_timer_init(); 838 alarmtimer_rtc_timer_init();
885 839
886 if (IS_ENABLED(CONFIG_POSIX_TIMERS)) {
887 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
888 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
889 }
890
891 /* Initialize alarm bases */ 840 /* Initialize alarm bases */
892 alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; 841 alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
893 alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; 842 alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 93621ae718d3..03918a19cf2d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data)
233 continue; 233 continue;
234 } 234 }
235 235
236 if (cs == curr_clocksource && cs->tick_stable)
237 cs->tick_stable(cs);
238
236 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && 239 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
237 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && 240 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
238 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { 241 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index ac053bb5296e..88f75f92ef36 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -51,6 +51,7 @@
51#include <linux/sched/debug.h> 51#include <linux/sched/debug.h>
52#include <linux/timer.h> 52#include <linux/timer.h>
53#include <linux/freezer.h> 53#include <linux/freezer.h>
54#include <linux/compat.h>
54 55
55#include <linux/uaccess.h> 56#include <linux/uaccess.h>
56 57
@@ -1439,8 +1440,29 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1439} 1440}
1440EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); 1441EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
1441 1442
1443int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
1444{
1445 switch(restart->nanosleep.type) {
1446#ifdef CONFIG_COMPAT
1447 case TT_COMPAT:
1448 if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp))
1449 return -EFAULT;
1450 break;
1451#endif
1452 case TT_NATIVE:
1453 if (put_timespec64(ts, restart->nanosleep.rmtp))
1454 return -EFAULT;
1455 break;
1456 default:
1457 BUG();
1458 }
1459 return -ERESTART_RESTARTBLOCK;
1460}
1461
1442static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1462static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
1443{ 1463{
1464 struct restart_block *restart;
1465
1444 hrtimer_init_sleeper(t, current); 1466 hrtimer_init_sleeper(t, current);
1445 1467
1446 do { 1468 do {
@@ -1457,53 +1479,38 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1457 1479
1458 __set_current_state(TASK_RUNNING); 1480 __set_current_state(TASK_RUNNING);
1459 1481
1460 return t->task == NULL; 1482 if (!t->task)
1461}
1462
1463static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
1464{
1465 struct timespec rmt;
1466 ktime_t rem;
1467
1468 rem = hrtimer_expires_remaining(timer);
1469 if (rem <= 0)
1470 return 0; 1483 return 0;
1471 rmt = ktime_to_timespec(rem);
1472 1484
1473 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) 1485 restart = &current->restart_block;
1474 return -EFAULT; 1486 if (restart->nanosleep.type != TT_NONE) {
1487 ktime_t rem = hrtimer_expires_remaining(&t->timer);
1488 struct timespec64 rmt;
1489
1490 if (rem <= 0)
1491 return 0;
1492 rmt = ktime_to_timespec64(rem);
1475 1493
1476 return 1; 1494 return nanosleep_copyout(restart, &rmt);
1495 }
1496 return -ERESTART_RESTARTBLOCK;
1477} 1497}
1478 1498
1479long __sched hrtimer_nanosleep_restart(struct restart_block *restart) 1499static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1480{ 1500{
1481 struct hrtimer_sleeper t; 1501 struct hrtimer_sleeper t;
1482 struct timespec __user *rmtp; 1502 int ret;
1483 int ret = 0;
1484 1503
1485 hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, 1504 hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
1486 HRTIMER_MODE_ABS); 1505 HRTIMER_MODE_ABS);
1487 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); 1506 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
1488 1507
1489 if (do_nanosleep(&t, HRTIMER_MODE_ABS)) 1508 ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
1490 goto out;
1491
1492 rmtp = restart->nanosleep.rmtp;
1493 if (rmtp) {
1494 ret = update_rmtp(&t.timer, rmtp);
1495 if (ret <= 0)
1496 goto out;
1497 }
1498
1499 /* The other values in restart are already filled in */
1500 ret = -ERESTART_RESTARTBLOCK;
1501out:
1502 destroy_hrtimer_on_stack(&t.timer); 1509 destroy_hrtimer_on_stack(&t.timer);
1503 return ret; 1510 return ret;
1504} 1511}
1505 1512
1506long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, 1513long hrtimer_nanosleep(const struct timespec64 *rqtp,
1507 const enum hrtimer_mode mode, const clockid_t clockid) 1514 const enum hrtimer_mode mode, const clockid_t clockid)
1508{ 1515{
1509 struct restart_block *restart; 1516 struct restart_block *restart;
@@ -1517,7 +1524,8 @@ long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp,
1517 1524
1518 hrtimer_init_on_stack(&t.timer, clockid, mode); 1525 hrtimer_init_on_stack(&t.timer, clockid, mode);
1519 hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); 1526 hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
1520 if (do_nanosleep(&t, mode)) 1527 ret = do_nanosleep(&t, mode);
1528 if (ret != -ERESTART_RESTARTBLOCK)
1521 goto out; 1529 goto out;
1522 1530
1523 /* Absolute timers do not update the rmtp value and restart: */ 1531 /* Absolute timers do not update the rmtp value and restart: */
@@ -1526,19 +1534,10 @@ long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp,
1526 goto out; 1534 goto out;
1527 } 1535 }
1528 1536
1529 if (rmtp) {
1530 ret = update_rmtp(&t.timer, rmtp);
1531 if (ret <= 0)
1532 goto out;
1533 }
1534
1535 restart = &current->restart_block; 1537 restart = &current->restart_block;
1536 restart->fn = hrtimer_nanosleep_restart; 1538 restart->fn = hrtimer_nanosleep_restart;
1537 restart->nanosleep.clockid = t.timer.base->clockid; 1539 restart->nanosleep.clockid = t.timer.base->clockid;
1538 restart->nanosleep.rmtp = rmtp;
1539 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); 1540 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
1540
1541 ret = -ERESTART_RESTARTBLOCK;
1542out: 1541out:
1543 destroy_hrtimer_on_stack(&t.timer); 1542 destroy_hrtimer_on_stack(&t.timer);
1544 return ret; 1543 return ret;
@@ -1547,18 +1546,37 @@ out:
1547SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, 1546SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1548 struct timespec __user *, rmtp) 1547 struct timespec __user *, rmtp)
1549{ 1548{
1550 struct timespec64 tu64; 1549 struct timespec64 tu;
1551 struct timespec tu; 1550
1551 if (get_timespec64(&tu, rqtp))
1552 return -EFAULT;
1553
1554 if (!timespec64_valid(&tu))
1555 return -EINVAL;
1556
1557 current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
1558 current->restart_block.nanosleep.rmtp = rmtp;
1559 return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
1560}
1561
1562#ifdef CONFIG_COMPAT
1563
1564COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
1565 struct compat_timespec __user *, rmtp)
1566{
1567 struct timespec64 tu;
1552 1568
1553 if (copy_from_user(&tu, rqtp, sizeof(tu))) 1569 if (compat_get_timespec64(&tu, rqtp))
1554 return -EFAULT; 1570 return -EFAULT;
1555 1571
1556 tu64 = timespec_to_timespec64(tu); 1572 if (!timespec64_valid(&tu))
1557 if (!timespec64_valid(&tu64))
1558 return -EINVAL; 1573 return -EINVAL;
1559 1574
1560 return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); 1575 current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
1576 current->restart_block.nanosleep.compat_rmtp = rmtp;
1577 return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
1561} 1578}
1579#endif
1562 1580
1563/* 1581/*
1564 * Functions related to boot-time initialization: 1582 * Functions related to boot-time initialization:
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 087d6a1279b8..2ef98a02376a 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -15,6 +15,7 @@
15#include <linux/posix-timers.h> 15#include <linux/posix-timers.h>
16#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
17#include <trace/events/timer.h> 17#include <trace/events/timer.h>
18#include <linux/compat.h>
18 19
19#include <linux/uaccess.h> 20#include <linux/uaccess.h>
20 21
@@ -116,6 +117,19 @@ SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value)
116 return error; 117 return error;
117} 118}
118 119
#ifdef CONFIG_COMPAT
/* 32-bit compat entry point: query an interval timer and copy it out. */
COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
		       struct compat_itimerval __user *, it)
{
	struct itimerval kit;
	int error;

	error = do_getitimer(which, &kit);
	if (error)
		return error;
	if (put_compat_itimerval(it, &kit))
		return -EFAULT;
	return 0;
}
#endif
132
119 133
120/* 134/*
121 * The timer is automagically restarted, when interval != 0 135 * The timer is automagically restarted, when interval != 0
@@ -138,8 +152,12 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
138 u64 oval, nval, ointerval, ninterval; 152 u64 oval, nval, ointerval, ninterval;
139 struct cpu_itimer *it = &tsk->signal->it[clock_id]; 153 struct cpu_itimer *it = &tsk->signal->it[clock_id];
140 154
141 nval = timeval_to_ns(&value->it_value); 155 /*
142 ninterval = timeval_to_ns(&value->it_interval); 156 * Use the to_ktime conversion because that clamps the maximum
157 * value to KTIME_MAX and avoid multiplication overflows.
158 */
159 nval = ktime_to_ns(timeval_to_ktime(value->it_value));
160 ninterval = ktime_to_ns(timeval_to_ktime(value->it_interval));
143 161
144 spin_lock_irq(&tsk->sighand->siglock); 162 spin_lock_irq(&tsk->sighand->siglock);
145 163
@@ -294,3 +312,27 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
294 return -EFAULT; 312 return -EFAULT;
295 return 0; 313 return 0;
296} 314}
315
#ifdef CONFIG_COMPAT
/* 32-bit compat entry point: arm/disarm an interval timer. */
COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
		       struct compat_itimerval __user *, in,
		       struct compat_itimerval __user *, out)
{
	struct itimerval kin = {}, kout;
	int error;

	/* A NULL @in means "disarm": keep the zero-initialized itimerval. */
	if (in && get_compat_itimerval(&kin, in))
		return -EFAULT;

	error = do_setitimer(which, &kin, out ? &kout : NULL);
	if (error || !out)
		return error;

	return put_compat_itimerval(out, &kout) ? -EFAULT : 0;
}
#endif
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 31d588d37a17..17cdc554c9fe 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -25,6 +25,8 @@
25#include <linux/syscalls.h> 25#include <linux/syscalls.h>
26#include <linux/uaccess.h> 26#include <linux/uaccess.h>
27 27
28#include "posix-timers.h"
29
28static void delete_clock(struct kref *kref); 30static void delete_clock(struct kref *kref);
29 31
30/* 32/*
@@ -82,38 +84,6 @@ static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
82 return result; 84 return result;
83} 85}
84 86
85static int posix_clock_fasync(int fd, struct file *fp, int on)
86{
87 struct posix_clock *clk = get_posix_clock(fp);
88 int err = 0;
89
90 if (!clk)
91 return -ENODEV;
92
93 if (clk->ops.fasync)
94 err = clk->ops.fasync(clk, fd, fp, on);
95
96 put_posix_clock(clk);
97
98 return err;
99}
100
101static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
102{
103 struct posix_clock *clk = get_posix_clock(fp);
104 int err = -ENODEV;
105
106 if (!clk)
107 return -ENODEV;
108
109 if (clk->ops.mmap)
110 err = clk->ops.mmap(clk, vma);
111
112 put_posix_clock(clk);
113
114 return err;
115}
116
117static long posix_clock_ioctl(struct file *fp, 87static long posix_clock_ioctl(struct file *fp,
118 unsigned int cmd, unsigned long arg) 88 unsigned int cmd, unsigned long arg)
119{ 89{
@@ -199,8 +169,6 @@ static const struct file_operations posix_clock_file_operations = {
199 .unlocked_ioctl = posix_clock_ioctl, 169 .unlocked_ioctl = posix_clock_ioctl,
200 .open = posix_clock_open, 170 .open = posix_clock_open,
201 .release = posix_clock_release, 171 .release = posix_clock_release,
202 .fasync = posix_clock_fasync,
203 .mmap = posix_clock_mmap,
204#ifdef CONFIG_COMPAT 172#ifdef CONFIG_COMPAT
205 .compat_ioctl = posix_clock_compat_ioctl, 173 .compat_ioctl = posix_clock_compat_ioctl,
206#endif 174#endif
@@ -359,88 +327,9 @@ out:
359 return err; 327 return err;
360} 328}
361 329
362static int pc_timer_create(struct k_itimer *kit) 330const struct k_clock clock_posix_dynamic = {
363{
364 clockid_t id = kit->it_clock;
365 struct posix_clock_desc cd;
366 int err;
367
368 err = get_clock_desc(id, &cd);
369 if (err)
370 return err;
371
372 if (cd.clk->ops.timer_create)
373 err = cd.clk->ops.timer_create(cd.clk, kit);
374 else
375 err = -EOPNOTSUPP;
376
377 put_clock_desc(&cd);
378
379 return err;
380}
381
382static int pc_timer_delete(struct k_itimer *kit)
383{
384 clockid_t id = kit->it_clock;
385 struct posix_clock_desc cd;
386 int err;
387
388 err = get_clock_desc(id, &cd);
389 if (err)
390 return err;
391
392 if (cd.clk->ops.timer_delete)
393 err = cd.clk->ops.timer_delete(cd.clk, kit);
394 else
395 err = -EOPNOTSUPP;
396
397 put_clock_desc(&cd);
398
399 return err;
400}
401
402static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts)
403{
404 clockid_t id = kit->it_clock;
405 struct posix_clock_desc cd;
406
407 if (get_clock_desc(id, &cd))
408 return;
409
410 if (cd.clk->ops.timer_gettime)
411 cd.clk->ops.timer_gettime(cd.clk, kit, ts);
412
413 put_clock_desc(&cd);
414}
415
416static int pc_timer_settime(struct k_itimer *kit, int flags,
417 struct itimerspec64 *ts, struct itimerspec64 *old)
418{
419 clockid_t id = kit->it_clock;
420 struct posix_clock_desc cd;
421 int err;
422
423 err = get_clock_desc(id, &cd);
424 if (err)
425 return err;
426
427 if (cd.clk->ops.timer_settime)
428 err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
429 else
430 err = -EOPNOTSUPP;
431
432 put_clock_desc(&cd);
433
434 return err;
435}
436
437struct k_clock clock_posix_dynamic = {
438 .clock_getres = pc_clock_getres, 331 .clock_getres = pc_clock_getres,
439 .clock_set = pc_clock_settime, 332 .clock_set = pc_clock_settime,
440 .clock_get = pc_clock_gettime, 333 .clock_get = pc_clock_gettime,
441 .clock_adj = pc_clock_adjtime, 334 .clock_adj = pc_clock_adjtime,
442 .timer_create = pc_timer_create,
443 .timer_set = pc_timer_settime,
444 .timer_del = pc_timer_delete,
445 .timer_get = pc_timer_gettime,
446}; 335};
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1370f067fb51..a3bd5dbe0dc4 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -12,6 +12,11 @@
12#include <trace/events/timer.h> 12#include <trace/events/timer.h>
13#include <linux/tick.h> 13#include <linux/tick.h>
14#include <linux/workqueue.h> 14#include <linux/workqueue.h>
15#include <linux/compat.h>
16
17#include "posix-timers.h"
18
19static void posix_cpu_timer_rearm(struct k_itimer *timer);
15 20
16/* 21/*
17 * Called after updating RLIMIT_CPU to run cpu timer and update 22 * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -322,6 +327,8 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
322 if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) 327 if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
323 return -EINVAL; 328 return -EINVAL;
324 329
330 new_timer->kclock = &clock_posix_cpu;
331
325 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 332 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
326 333
327 rcu_read_lock(); 334 rcu_read_lock();
@@ -524,7 +531,8 @@ static void cpu_timer_fire(struct k_itimer *timer)
524 * reload the timer. But we need to keep it 531 * reload the timer. But we need to keep it
525 * ticking in case the signal is deliverable next time. 532 * ticking in case the signal is deliverable next time.
526 */ 533 */
527 posix_cpu_timer_schedule(timer); 534 posix_cpu_timer_rearm(timer);
535 ++timer->it_requeue_pending;
528 } 536 }
529} 537}
530 538
@@ -572,7 +580,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
572 580
573 WARN_ON_ONCE(p == NULL); 581 WARN_ON_ONCE(p == NULL);
574 582
575 new_expires = timespec64_to_ns(&new->it_value); 583 /*
584 * Use the to_ktime conversion because that clamps the maximum
585 * value to KTIME_MAX and avoid multiplication overflows.
586 */
587 new_expires = ktime_to_ns(timespec64_to_ktime(new->it_value));
576 588
577 /* 589 /*
578 * Protect against sighand release/switch in exit/exec and p->cpu_timers 590 * Protect against sighand release/switch in exit/exec and p->cpu_timers
@@ -712,10 +724,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
712 */ 724 */
713 itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); 725 itp->it_interval = ns_to_timespec64(timer->it.cpu.incr);
714 726
715 if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ 727 if (!timer->it.cpu.expires)
716 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
717 return; 728 return;
718 }
719 729
720 /* 730 /*
721 * Sample the clock to take the difference with the expiry time. 731 * Sample the clock to take the difference with the expiry time.
@@ -739,7 +749,6 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
739 * Call the timer disarmed, nothing else to do. 749 * Call the timer disarmed, nothing else to do.
740 */ 750 */
741 timer->it.cpu.expires = 0; 751 timer->it.cpu.expires = 0;
742 itp->it_value = ns_to_timespec64(timer->it.cpu.expires);
743 return; 752 return;
744 } else { 753 } else {
745 cpu_timer_sample_group(timer->it_clock, p, &now); 754 cpu_timer_sample_group(timer->it_clock, p, &now);
@@ -825,8 +834,10 @@ static void check_thread_timers(struct task_struct *tsk,
825 * At the hard limit, we just die. 834 * At the hard limit, we just die.
826 * No need to calculate anything else now. 835 * No need to calculate anything else now.
827 */ 836 */
828 pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", 837 if (print_fatal_signals) {
829 tsk->comm, task_pid_nr(tsk)); 838 pr_info("CPU Watchdog Timeout (hard): %s[%d]\n",
839 tsk->comm, task_pid_nr(tsk));
840 }
830 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 841 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
831 return; 842 return;
832 } 843 }
@@ -838,8 +849,10 @@ static void check_thread_timers(struct task_struct *tsk,
838 soft += USEC_PER_SEC; 849 soft += USEC_PER_SEC;
839 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; 850 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
840 } 851 }
841 pr_info("RT Watchdog Timeout (soft): %s[%d]\n", 852 if (print_fatal_signals) {
842 tsk->comm, task_pid_nr(tsk)); 853 pr_info("RT Watchdog Timeout (soft): %s[%d]\n",
854 tsk->comm, task_pid_nr(tsk));
855 }
843 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 856 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
844 } 857 }
845 } 858 }
@@ -936,8 +949,10 @@ static void check_process_timers(struct task_struct *tsk,
936 * At the hard limit, we just die. 949 * At the hard limit, we just die.
937 * No need to calculate anything else now. 950 * No need to calculate anything else now.
938 */ 951 */
939 pr_info("RT Watchdog Timeout (hard): %s[%d]\n", 952 if (print_fatal_signals) {
940 tsk->comm, task_pid_nr(tsk)); 953 pr_info("RT Watchdog Timeout (hard): %s[%d]\n",
954 tsk->comm, task_pid_nr(tsk));
955 }
941 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 956 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
942 return; 957 return;
943 } 958 }
@@ -945,8 +960,10 @@ static void check_process_timers(struct task_struct *tsk,
945 /* 960 /*
946 * At the soft limit, send a SIGXCPU every second. 961 * At the soft limit, send a SIGXCPU every second.
947 */ 962 */
948 pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", 963 if (print_fatal_signals) {
949 tsk->comm, task_pid_nr(tsk)); 964 pr_info("CPU Watchdog Timeout (soft): %s[%d]\n",
965 tsk->comm, task_pid_nr(tsk));
966 }
950 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 967 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
951 if (soft < hard) { 968 if (soft < hard) {
952 soft++; 969 soft++;
@@ -968,10 +985,10 @@ static void check_process_timers(struct task_struct *tsk,
968} 985}
969 986
970/* 987/*
971 * This is called from the signal code (via do_schedule_next_timer) 988 * This is called from the signal code (via posixtimer_rearm)
972 * when the last timer signal was delivered and we have to reload the timer. 989 * when the last timer signal was delivered and we have to reload the timer.
973 */ 990 */
974void posix_cpu_timer_schedule(struct k_itimer *timer) 991static void posix_cpu_timer_rearm(struct k_itimer *timer)
975{ 992{
976 struct sighand_struct *sighand; 993 struct sighand_struct *sighand;
977 unsigned long flags; 994 unsigned long flags;
@@ -987,12 +1004,12 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
987 cpu_clock_sample(timer->it_clock, p, &now); 1004 cpu_clock_sample(timer->it_clock, p, &now);
988 bump_cpu_timer(timer, now); 1005 bump_cpu_timer(timer, now);
989 if (unlikely(p->exit_state)) 1006 if (unlikely(p->exit_state))
990 goto out; 1007 return;
991 1008
992 /* Protect timer list r/w in arm_timer() */ 1009 /* Protect timer list r/w in arm_timer() */
993 sighand = lock_task_sighand(p, &flags); 1010 sighand = lock_task_sighand(p, &flags);
994 if (!sighand) 1011 if (!sighand)
995 goto out; 1012 return;
996 } else { 1013 } else {
997 /* 1014 /*
998 * Protect arm_timer() and timer sampling in case of call to 1015 * Protect arm_timer() and timer sampling in case of call to
@@ -1005,11 +1022,10 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1005 * We can't even collect a sample any more. 1022 * We can't even collect a sample any more.
1006 */ 1023 */
1007 timer->it.cpu.expires = 0; 1024 timer->it.cpu.expires = 0;
1008 goto out; 1025 return;
1009 } else if (unlikely(p->exit_state) && thread_group_empty(p)) { 1026 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1010 unlock_task_sighand(p, &flags); 1027 /* If the process is dying, no need to rearm */
1011 /* Optimizations: if the process is dying, no need to rearm */ 1028 goto unlock;
1012 goto out;
1013 } 1029 }
1014 cpu_timer_sample_group(timer->it_clock, p, &now); 1030 cpu_timer_sample_group(timer->it_clock, p, &now);
1015 bump_cpu_timer(timer, now); 1031 bump_cpu_timer(timer, now);
@@ -1021,12 +1037,8 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1021 */ 1037 */
1022 WARN_ON_ONCE(!irqs_disabled()); 1038 WARN_ON_ONCE(!irqs_disabled());
1023 arm_timer(timer); 1039 arm_timer(timer);
1040unlock:
1024 unlock_task_sighand(p, &flags); 1041 unlock_task_sighand(p, &flags);
1025
1026out:
1027 timer->it_overrun_last = timer->it_overrun;
1028 timer->it_overrun = -1;
1029 ++timer->it_requeue_pending;
1030} 1042}
1031 1043
1032/** 1044/**
@@ -1219,9 +1231,11 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1219} 1231}
1220 1232
1221static int do_cpu_nanosleep(const clockid_t which_clock, int flags, 1233static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1222 struct timespec64 *rqtp, struct itimerspec64 *it) 1234 const struct timespec64 *rqtp)
1223{ 1235{
1236 struct itimerspec64 it;
1224 struct k_itimer timer; 1237 struct k_itimer timer;
1238 u64 expires;
1225 int error; 1239 int error;
1226 1240
1227 /* 1241 /*
@@ -1235,12 +1249,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1235 timer.it_process = current; 1249 timer.it_process = current;
1236 if (!error) { 1250 if (!error) {
1237 static struct itimerspec64 zero_it; 1251 static struct itimerspec64 zero_it;
1252 struct restart_block *restart;
1238 1253
1239 memset(it, 0, sizeof *it); 1254 memset(&it, 0, sizeof(it));
1240 it->it_value = *rqtp; 1255 it.it_value = *rqtp;
1241 1256
1242 spin_lock_irq(&timer.it_lock); 1257 spin_lock_irq(&timer.it_lock);
1243 error = posix_cpu_timer_set(&timer, flags, it, NULL); 1258 error = posix_cpu_timer_set(&timer, flags, &it, NULL);
1244 if (error) { 1259 if (error) {
1245 spin_unlock_irq(&timer.it_lock); 1260 spin_unlock_irq(&timer.it_lock);
1246 return error; 1261 return error;
@@ -1269,8 +1284,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1269 /* 1284 /*
1270 * We were interrupted by a signal. 1285 * We were interrupted by a signal.
1271 */ 1286 */
1272 *rqtp = ns_to_timespec64(timer.it.cpu.expires); 1287 expires = timer.it.cpu.expires;
1273 error = posix_cpu_timer_set(&timer, 0, &zero_it, it); 1288 error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
1274 if (!error) { 1289 if (!error) {
1275 /* 1290 /*
1276 * Timer is now unarmed, deletion can not fail. 1291 * Timer is now unarmed, deletion can not fail.
@@ -1290,7 +1305,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1290 spin_unlock_irq(&timer.it_lock); 1305 spin_unlock_irq(&timer.it_lock);
1291 } 1306 }
1292 1307
1293 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { 1308 if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
1294 /* 1309 /*
1295 * It actually did fire already. 1310 * It actually did fire already.
1296 */ 1311 */
@@ -1298,6 +1313,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1298 } 1313 }
1299 1314
1300 error = -ERESTART_RESTARTBLOCK; 1315 error = -ERESTART_RESTARTBLOCK;
1316 /*
1317 * Report back to the user the time still remaining.
1318 */
1319 restart = &current->restart_block;
1320 restart->nanosleep.expires = expires;
1321 if (restart->nanosleep.type != TT_NONE)
1322 error = nanosleep_copyout(restart, &it.it_value);
1301 } 1323 }
1302 1324
1303 return error; 1325 return error;
@@ -1306,11 +1328,9 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1306static long posix_cpu_nsleep_restart(struct restart_block *restart_block); 1328static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1307 1329
1308static int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1330static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1309 struct timespec64 *rqtp, struct timespec __user *rmtp) 1331 const struct timespec64 *rqtp)
1310{ 1332{
1311 struct restart_block *restart_block = &current->restart_block; 1333 struct restart_block *restart_block = &current->restart_block;
1312 struct itimerspec64 it;
1313 struct timespec ts;
1314 int error; 1334 int error;
1315 1335
1316 /* 1336 /*
@@ -1321,23 +1341,15 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1321 CPUCLOCK_PID(which_clock) == task_pid_vnr(current))) 1341 CPUCLOCK_PID(which_clock) == task_pid_vnr(current)))
1322 return -EINVAL; 1342 return -EINVAL;
1323 1343
1324 error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); 1344 error = do_cpu_nanosleep(which_clock, flags, rqtp);
1325 1345
1326 if (error == -ERESTART_RESTARTBLOCK) { 1346 if (error == -ERESTART_RESTARTBLOCK) {
1327 1347
1328 if (flags & TIMER_ABSTIME) 1348 if (flags & TIMER_ABSTIME)
1329 return -ERESTARTNOHAND; 1349 return -ERESTARTNOHAND;
1330 /*
1331 * Report back to the user the time still remaining.
1332 */
1333 ts = timespec64_to_timespec(it.it_value);
1334 if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp)))
1335 return -EFAULT;
1336 1350
1337 restart_block->fn = posix_cpu_nsleep_restart; 1351 restart_block->fn = posix_cpu_nsleep_restart;
1338 restart_block->nanosleep.clockid = which_clock; 1352 restart_block->nanosleep.clockid = which_clock;
1339 restart_block->nanosleep.rmtp = rmtp;
1340 restart_block->nanosleep.expires = timespec64_to_ns(rqtp);
1341 } 1353 }
1342 return error; 1354 return error;
1343} 1355}
@@ -1345,28 +1357,11 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1345static long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1357static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1346{ 1358{
1347 clockid_t which_clock = restart_block->nanosleep.clockid; 1359 clockid_t which_clock = restart_block->nanosleep.clockid;
1348 struct itimerspec64 it;
1349 struct timespec64 t; 1360 struct timespec64 t;
1350 struct timespec tmp;
1351 int error;
1352 1361
1353 t = ns_to_timespec64(restart_block->nanosleep.expires); 1362 t = ns_to_timespec64(restart_block->nanosleep.expires);
1354 1363
1355 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); 1364 return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
1356
1357 if (error == -ERESTART_RESTARTBLOCK) {
1358 struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
1359 /*
1360 * Report back to the user the time still remaining.
1361 */
1362 tmp = timespec64_to_timespec(it.it_value);
1363 if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp)))
1364 return -EFAULT;
1365
1366 restart_block->nanosleep.expires = timespec64_to_ns(&t);
1367 }
1368 return error;
1369
1370} 1365}
1371 1366
1372#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) 1367#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
@@ -1388,14 +1383,9 @@ static int process_cpu_timer_create(struct k_itimer *timer)
1388 return posix_cpu_timer_create(timer); 1383 return posix_cpu_timer_create(timer);
1389} 1384}
1390static int process_cpu_nsleep(const clockid_t which_clock, int flags, 1385static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1391 struct timespec64 *rqtp, 1386 const struct timespec64 *rqtp)
1392 struct timespec __user *rmtp)
1393{
1394 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
1395}
1396static long process_cpu_nsleep_restart(struct restart_block *restart_block)
1397{ 1387{
1398 return -EINVAL; 1388 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
1399} 1389}
1400static int thread_cpu_clock_getres(const clockid_t which_clock, 1390static int thread_cpu_clock_getres(const clockid_t which_clock,
1401 struct timespec64 *tp) 1391 struct timespec64 *tp)
@@ -1413,36 +1403,27 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
1413 return posix_cpu_timer_create(timer); 1403 return posix_cpu_timer_create(timer);
1414} 1404}
1415 1405
1416struct k_clock clock_posix_cpu = { 1406const struct k_clock clock_posix_cpu = {
1417 .clock_getres = posix_cpu_clock_getres, 1407 .clock_getres = posix_cpu_clock_getres,
1418 .clock_set = posix_cpu_clock_set, 1408 .clock_set = posix_cpu_clock_set,
1419 .clock_get = posix_cpu_clock_get, 1409 .clock_get = posix_cpu_clock_get,
1420 .timer_create = posix_cpu_timer_create, 1410 .timer_create = posix_cpu_timer_create,
1421 .nsleep = posix_cpu_nsleep, 1411 .nsleep = posix_cpu_nsleep,
1422 .nsleep_restart = posix_cpu_nsleep_restart,
1423 .timer_set = posix_cpu_timer_set, 1412 .timer_set = posix_cpu_timer_set,
1424 .timer_del = posix_cpu_timer_del, 1413 .timer_del = posix_cpu_timer_del,
1425 .timer_get = posix_cpu_timer_get, 1414 .timer_get = posix_cpu_timer_get,
1415 .timer_rearm = posix_cpu_timer_rearm,
1426}; 1416};
1427 1417
1428static __init int init_posix_cpu_timers(void) 1418const struct k_clock clock_process = {
1429{ 1419 .clock_getres = process_cpu_clock_getres,
1430 struct k_clock process = { 1420 .clock_get = process_cpu_clock_get,
1431 .clock_getres = process_cpu_clock_getres, 1421 .timer_create = process_cpu_timer_create,
1432 .clock_get = process_cpu_clock_get, 1422 .nsleep = process_cpu_nsleep,
1433 .timer_create = process_cpu_timer_create, 1423};
1434 .nsleep = process_cpu_nsleep,
1435 .nsleep_restart = process_cpu_nsleep_restart,
1436 };
1437 struct k_clock thread = {
1438 .clock_getres = thread_cpu_clock_getres,
1439 .clock_get = thread_cpu_clock_get,
1440 .timer_create = thread_cpu_timer_create,
1441 };
1442
1443 posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1444 posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1445 1424
1446 return 0; 1425const struct k_clock clock_thread = {
1447} 1426 .clock_getres = thread_cpu_clock_getres,
1448__initcall(init_posix_cpu_timers); 1427 .clock_get = thread_cpu_clock_get,
1428 .timer_create = thread_cpu_timer_create,
1429};
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index c0cd53eb018a..06f34feb635e 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -17,6 +17,7 @@
17#include <linux/ktime.h> 17#include <linux/ktime.h>
18#include <linux/timekeeping.h> 18#include <linux/timekeeping.h>
19#include <linux/posix-timers.h> 19#include <linux/posix-timers.h>
20#include <linux/compat.h>
20 21
21asmlinkage long sys_ni_posix_timers(void) 22asmlinkage long sys_ni_posix_timers(void)
22{ 23{
@@ -27,6 +28,7 @@ asmlinkage long sys_ni_posix_timers(void)
27} 28}
28 29
29#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) 30#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers)
31#define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers)
30 32
31SYS_NI(timer_create); 33SYS_NI(timer_create);
32SYS_NI(timer_gettime); 34SYS_NI(timer_gettime);
@@ -49,40 +51,52 @@ SYS_NI(alarm);
49SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 51SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
50 const struct timespec __user *, tp) 52 const struct timespec __user *, tp)
51{ 53{
52 struct timespec64 new_tp64; 54 struct timespec64 new_tp;
53 struct timespec new_tp;
54 55
55 if (which_clock != CLOCK_REALTIME) 56 if (which_clock != CLOCK_REALTIME)
56 return -EINVAL; 57 return -EINVAL;
57 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 58 if (get_timespec64(&new_tp, tp))
58 return -EFAULT; 59 return -EFAULT;
59 60
60 new_tp64 = timespec_to_timespec64(new_tp); 61 return do_sys_settimeofday64(&new_tp, NULL);
61 return do_sys_settimeofday64(&new_tp64, NULL);
62} 62}
63 63
64SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 64int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
65 struct timespec __user *,tp)
66{ 65{
67 struct timespec64 kernel_tp64;
68 struct timespec kernel_tp;
69
70 switch (which_clock) { 66 switch (which_clock) {
71 case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; 67 case CLOCK_REALTIME:
72 case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; 68 ktime_get_real_ts64(tp);
73 case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; 69 break;
74 default: return -EINVAL; 70 case CLOCK_MONOTONIC:
71 ktime_get_ts64(tp);
72 break;
73 case CLOCK_BOOTTIME:
74 get_monotonic_boottime64(tp);
75 break;
76 default:
77 return -EINVAL;
75 } 78 }
76 79
77 kernel_tp = timespec64_to_timespec(kernel_tp64); 80 return 0;
78 if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 81}
82SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
83 struct timespec __user *, tp)
84{
85 int ret;
86 struct timespec64 kernel_tp;
87
88 ret = do_clock_gettime(which_clock, &kernel_tp);
89 if (ret)
90 return ret;
91
92 if (put_timespec64(&kernel_tp, tp))
79 return -EFAULT; 93 return -EFAULT;
80 return 0; 94 return 0;
81} 95}
82 96
83SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) 97SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp)
84{ 98{
85 struct timespec rtn_tp = { 99 struct timespec64 rtn_tp = {
86 .tv_sec = 0, 100 .tv_sec = 0,
87 .tv_nsec = hrtimer_resolution, 101 .tv_nsec = hrtimer_resolution,
88 }; 102 };
@@ -91,7 +105,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us
91 case CLOCK_REALTIME: 105 case CLOCK_REALTIME:
92 case CLOCK_MONOTONIC: 106 case CLOCK_MONOTONIC:
93 case CLOCK_BOOTTIME: 107 case CLOCK_BOOTTIME:
94 if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp))) 108 if (put_timespec64(&rtn_tp, tp))
95 return -EFAULT; 109 return -EFAULT;
96 return 0; 110 return 0;
97 default: 111 default:
@@ -110,22 +124,108 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
110 case CLOCK_REALTIME: 124 case CLOCK_REALTIME:
111 case CLOCK_MONOTONIC: 125 case CLOCK_MONOTONIC:
112 case CLOCK_BOOTTIME: 126 case CLOCK_BOOTTIME:
113 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 127 break;
114 return -EFAULT;
115 t64 = timespec_to_timespec64(t);
116 if (!timespec64_valid(&t64))
117 return -EINVAL;
118 return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ?
119 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
120 which_clock);
121 default: 128 default:
122 return -EINVAL; 129 return -EINVAL;
123 } 130 }
131
132 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
133 return -EFAULT;
134 t64 = timespec_to_timespec64(t);
135 if (!timespec64_valid(&t64))
136 return -EINVAL;
137 if (flags & TIMER_ABSTIME)
138 rmtp = NULL;
139 current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
140 current->restart_block.nanosleep.rmtp = rmtp;
141 return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ?
142 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
143 which_clock);
124} 144}
125 145
126#ifdef CONFIG_COMPAT 146#ifdef CONFIG_COMPAT
127long clock_nanosleep_restart(struct restart_block *restart_block) 147COMPAT_SYS_NI(timer_create);
148COMPAT_SYS_NI(clock_adjtime);
149COMPAT_SYS_NI(timer_settime);
150COMPAT_SYS_NI(timer_gettime);
151COMPAT_SYS_NI(getitimer);
152COMPAT_SYS_NI(setitimer);
153
154COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
155 struct compat_timespec __user *, tp)
128{ 156{
129 return hrtimer_nanosleep_restart(restart_block); 157 struct timespec64 new_tp;
158
159 if (which_clock != CLOCK_REALTIME)
160 return -EINVAL;
161 if (compat_get_timespec64(&new_tp, tp))
162 return -EFAULT;
163
164 return do_sys_settimeofday64(&new_tp, NULL);
165}
166
167COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
168 struct compat_timespec __user *, tp)
169{
170 int ret;
171 struct timespec64 kernel_tp;
172
173 ret = do_clock_gettime(which_clock, &kernel_tp);
174 if (ret)
175 return ret;
176
177 if (compat_put_timespec64(&kernel_tp, tp))
178 return -EFAULT;
179 return 0;
180}
181
182COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
183 struct compat_timespec __user *, tp)
184{
185 struct timespec64 rtn_tp = {
186 .tv_sec = 0,
187 .tv_nsec = hrtimer_resolution,
188 };
189
190 switch (which_clock) {
191 case CLOCK_REALTIME:
192 case CLOCK_MONOTONIC:
193 case CLOCK_BOOTTIME:
194 if (compat_put_timespec64(&rtn_tp, tp))
195 return -EFAULT;
196 return 0;
197 default:
198 return -EINVAL;
199 }
200}
201
202COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
203 struct compat_timespec __user *, rqtp,
204 struct compat_timespec __user *, rmtp)
205{
206 struct timespec64 t64;
207 struct timespec t;
208
209 switch (which_clock) {
210 case CLOCK_REALTIME:
211 case CLOCK_MONOTONIC:
212 case CLOCK_BOOTTIME:
213 break;
214 default:
215 return -EINVAL;
216 }
217
218 if (compat_get_timespec(&t, rqtp))
219 return -EFAULT;
220 t64 = timespec_to_timespec64(t);
221 if (!timespec64_valid(&t64))
222 return -EINVAL;
223 if (flags & TIMER_ABSTIME)
224 rmtp = NULL;
225 current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
226 current->restart_block.nanosleep.compat_rmtp = rmtp;
227 return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ?
228 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
229 which_clock);
130} 230}
131#endif 231#endif
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 4d7b2ce09c27..13d6881f908b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -49,8 +49,10 @@
49#include <linux/workqueue.h> 49#include <linux/workqueue.h>
50#include <linux/export.h> 50#include <linux/export.h>
51#include <linux/hashtable.h> 51#include <linux/hashtable.h>
52#include <linux/compat.h>
52 53
53#include "timekeeping.h" 54#include "timekeeping.h"
55#include "posix-timers.h"
54 56
55/* 57/*
56 * Management arrays for POSIX timers. Timers are now kept in static hash table 58 * Management arrays for POSIX timers. Timers are now kept in static hash table
@@ -69,6 +71,10 @@ static struct kmem_cache *posix_timers_cache;
69static DEFINE_HASHTABLE(posix_timers_hashtable, 9); 71static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
70static DEFINE_SPINLOCK(hash_lock); 72static DEFINE_SPINLOCK(hash_lock);
71 73
74static const struct k_clock * const posix_clocks[];
75static const struct k_clock *clockid_to_kclock(const clockid_t id);
76static const struct k_clock clock_realtime, clock_monotonic;
77
72/* 78/*
73 * we assume that the new SIGEV_THREAD_ID shares no bits with the other 79 * we assume that the new SIGEV_THREAD_ID shares no bits with the other
74 * SIGEV values. Here we put out an error if this assumption fails. 80 * SIGEV values. Here we put out an error if this assumption fails.
@@ -124,22 +130,6 @@ static DEFINE_SPINLOCK(hash_lock);
124 * have is CLOCK_REALTIME and its high res counter part, both of 130 * have is CLOCK_REALTIME and its high res counter part, both of
125 * which we beg off on and pass to do_sys_settimeofday(). 131 * which we beg off on and pass to do_sys_settimeofday().
126 */ 132 */
127
128static struct k_clock posix_clocks[MAX_CLOCKS];
129
130/*
131 * These ones are defined below.
132 */
133static int common_nsleep(const clockid_t, int flags, struct timespec64 *t,
134 struct timespec __user *rmtp);
135static int common_timer_create(struct k_itimer *new_timer);
136static void common_timer_get(struct k_itimer *, struct itimerspec64 *);
137static int common_timer_set(struct k_itimer *, int,
138 struct itimerspec64 *, struct itimerspec64 *);
139static int common_timer_del(struct k_itimer *timer);
140
141static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
142
143static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); 133static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
144 134
145#define lock_timer(tid, flags) \ 135#define lock_timer(tid, flags) \
@@ -285,91 +275,23 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
285 */ 275 */
286static __init int init_posix_timers(void) 276static __init int init_posix_timers(void)
287{ 277{
288 struct k_clock clock_realtime = {
289 .clock_getres = posix_get_hrtimer_res,
290 .clock_get = posix_clock_realtime_get,
291 .clock_set = posix_clock_realtime_set,
292 .clock_adj = posix_clock_realtime_adj,
293 .nsleep = common_nsleep,
294 .nsleep_restart = hrtimer_nanosleep_restart,
295 .timer_create = common_timer_create,
296 .timer_set = common_timer_set,
297 .timer_get = common_timer_get,
298 .timer_del = common_timer_del,
299 };
300 struct k_clock clock_monotonic = {
301 .clock_getres = posix_get_hrtimer_res,
302 .clock_get = posix_ktime_get_ts,
303 .nsleep = common_nsleep,
304 .nsleep_restart = hrtimer_nanosleep_restart,
305 .timer_create = common_timer_create,
306 .timer_set = common_timer_set,
307 .timer_get = common_timer_get,
308 .timer_del = common_timer_del,
309 };
310 struct k_clock clock_monotonic_raw = {
311 .clock_getres = posix_get_hrtimer_res,
312 .clock_get = posix_get_monotonic_raw,
313 };
314 struct k_clock clock_realtime_coarse = {
315 .clock_getres = posix_get_coarse_res,
316 .clock_get = posix_get_realtime_coarse,
317 };
318 struct k_clock clock_monotonic_coarse = {
319 .clock_getres = posix_get_coarse_res,
320 .clock_get = posix_get_monotonic_coarse,
321 };
322 struct k_clock clock_tai = {
323 .clock_getres = posix_get_hrtimer_res,
324 .clock_get = posix_get_tai,
325 .nsleep = common_nsleep,
326 .nsleep_restart = hrtimer_nanosleep_restart,
327 .timer_create = common_timer_create,
328 .timer_set = common_timer_set,
329 .timer_get = common_timer_get,
330 .timer_del = common_timer_del,
331 };
332 struct k_clock clock_boottime = {
333 .clock_getres = posix_get_hrtimer_res,
334 .clock_get = posix_get_boottime,
335 .nsleep = common_nsleep,
336 .nsleep_restart = hrtimer_nanosleep_restart,
337 .timer_create = common_timer_create,
338 .timer_set = common_timer_set,
339 .timer_get = common_timer_get,
340 .timer_del = common_timer_del,
341 };
342
343 posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
344 posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
345 posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
346 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
347 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
348 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
349 posix_timers_register_clock(CLOCK_TAI, &clock_tai);
350
351 posix_timers_cache = kmem_cache_create("posix_timers_cache", 278 posix_timers_cache = kmem_cache_create("posix_timers_cache",
352 sizeof (struct k_itimer), 0, SLAB_PANIC, 279 sizeof (struct k_itimer), 0, SLAB_PANIC,
353 NULL); 280 NULL);
354 return 0; 281 return 0;
355} 282}
356
357__initcall(init_posix_timers); 283__initcall(init_posix_timers);
358 284
359static void schedule_next_timer(struct k_itimer *timr) 285static void common_hrtimer_rearm(struct k_itimer *timr)
360{ 286{
361 struct hrtimer *timer = &timr->it.real.timer; 287 struct hrtimer *timer = &timr->it.real.timer;
362 288
363 if (timr->it.real.interval == 0) 289 if (!timr->it_interval)
364 return; 290 return;
365 291
366 timr->it_overrun += (unsigned int) hrtimer_forward(timer, 292 timr->it_overrun += (unsigned int) hrtimer_forward(timer,
367 timer->base->get_time(), 293 timer->base->get_time(),
368 timr->it.real.interval); 294 timr->it_interval);
369
370 timr->it_overrun_last = timr->it_overrun;
371 timr->it_overrun = -1;
372 ++timr->it_requeue_pending;
373 hrtimer_restart(timer); 295 hrtimer_restart(timer);
374} 296}
375 297
@@ -384,24 +306,27 @@ static void schedule_next_timer(struct k_itimer *timr)
384 * To protect against the timer going away while the interrupt is queued, 306 * To protect against the timer going away while the interrupt is queued,
385 * we require that the it_requeue_pending flag be set. 307 * we require that the it_requeue_pending flag be set.
386 */ 308 */
387void do_schedule_next_timer(struct siginfo *info) 309void posixtimer_rearm(struct siginfo *info)
388{ 310{
389 struct k_itimer *timr; 311 struct k_itimer *timr;
390 unsigned long flags; 312 unsigned long flags;
391 313
392 timr = lock_timer(info->si_tid, &flags); 314 timr = lock_timer(info->si_tid, &flags);
315 if (!timr)
316 return;
317
318 if (timr->it_requeue_pending == info->si_sys_private) {
319 timr->kclock->timer_rearm(timr);
393 320
394 if (timr && timr->it_requeue_pending == info->si_sys_private) { 321 timr->it_active = 1;
395 if (timr->it_clock < 0) 322 timr->it_overrun_last = timr->it_overrun;
396 posix_cpu_timer_schedule(timr); 323 timr->it_overrun = -1;
397 else 324 ++timr->it_requeue_pending;
398 schedule_next_timer(timr);
399 325
400 info->si_overrun += timr->it_overrun_last; 326 info->si_overrun += timr->it_overrun_last;
401 } 327 }
402 328
403 if (timr) 329 unlock_timer(timr, flags);
404 unlock_timer(timr, flags);
405} 330}
406 331
407int posix_timer_event(struct k_itimer *timr, int si_private) 332int posix_timer_event(struct k_itimer *timr, int si_private)
@@ -410,12 +335,12 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
410 int shared, ret = -1; 335 int shared, ret = -1;
411 /* 336 /*
412 * FIXME: if ->sigq is queued we can race with 337 * FIXME: if ->sigq is queued we can race with
413 * dequeue_signal()->do_schedule_next_timer(). 338 * dequeue_signal()->posixtimer_rearm().
414 * 339 *
415 * If dequeue_signal() sees the "right" value of 340 * If dequeue_signal() sees the "right" value of
416 * si_sys_private it calls do_schedule_next_timer(). 341 * si_sys_private it calls posixtimer_rearm().
417 * We re-queue ->sigq and drop ->it_lock(). 342 * We re-queue ->sigq and drop ->it_lock().
418 * do_schedule_next_timer() locks the timer 343 * posixtimer_rearm() locks the timer
419 * and re-schedules it while ->sigq is pending. 344 * and re-schedules it while ->sigq is pending.
420 * Not really bad, but not that we want. 345 * Not really bad, but not that we want.
421 */ 346 */
@@ -431,7 +356,6 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
431 /* If we failed to send the signal the timer stops. */ 356 /* If we failed to send the signal the timer stops. */
432 return ret > 0; 357 return ret > 0;
433} 358}
434EXPORT_SYMBOL_GPL(posix_timer_event);
435 359
436/* 360/*
437 * This function gets called when a POSIX.1b interval timer expires. It 361 * This function gets called when a POSIX.1b interval timer expires. It
@@ -450,7 +374,8 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
450 timr = container_of(timer, struct k_itimer, it.real.timer); 374 timr = container_of(timer, struct k_itimer, it.real.timer);
451 spin_lock_irqsave(&timr->it_lock, flags); 375 spin_lock_irqsave(&timr->it_lock, flags);
452 376
453 if (timr->it.real.interval != 0) 377 timr->it_active = 0;
378 if (timr->it_interval != 0)
454 si_private = ++timr->it_requeue_pending; 379 si_private = ++timr->it_requeue_pending;
455 380
456 if (posix_timer_event(timr, si_private)) { 381 if (posix_timer_event(timr, si_private)) {
@@ -459,7 +384,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
459 * we will not get a call back to restart it AND 384 * we will not get a call back to restart it AND
460 * it should be restarted. 385 * it should be restarted.
461 */ 386 */
462 if (timr->it.real.interval != 0) { 387 if (timr->it_interval != 0) {
463 ktime_t now = hrtimer_cb_get_time(timer); 388 ktime_t now = hrtimer_cb_get_time(timer);
464 389
465 /* 390 /*
@@ -488,15 +413,16 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
488 { 413 {
489 ktime_t kj = NSEC_PER_SEC / HZ; 414 ktime_t kj = NSEC_PER_SEC / HZ;
490 415
491 if (timr->it.real.interval < kj) 416 if (timr->it_interval < kj)
492 now = ktime_add(now, kj); 417 now = ktime_add(now, kj);
493 } 418 }
494#endif 419#endif
495 timr->it_overrun += (unsigned int) 420 timr->it_overrun += (unsigned int)
496 hrtimer_forward(timer, now, 421 hrtimer_forward(timer, now,
497 timr->it.real.interval); 422 timr->it_interval);
498 ret = HRTIMER_RESTART; 423 ret = HRTIMER_RESTART;
499 ++timr->it_requeue_pending; 424 ++timr->it_requeue_pending;
425 timr->it_active = 1;
500 } 426 }
501 } 427 }
502 428
@@ -521,30 +447,6 @@ static struct pid *good_sigevent(sigevent_t * event)
521 return task_pid(rtn); 447 return task_pid(rtn);
522} 448}
523 449
524void posix_timers_register_clock(const clockid_t clock_id,
525 struct k_clock *new_clock)
526{
527 if ((unsigned) clock_id >= MAX_CLOCKS) {
528 printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
529 clock_id);
530 return;
531 }
532
533 if (!new_clock->clock_get) {
534 printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
535 clock_id);
536 return;
537 }
538 if (!new_clock->clock_getres) {
539 printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
540 clock_id);
541 return;
542 }
543
544 posix_clocks[clock_id] = *new_clock;
545}
546EXPORT_SYMBOL_GPL(posix_timers_register_clock);
547
548static struct k_itimer * alloc_posix_timer(void) 450static struct k_itimer * alloc_posix_timer(void)
549{ 451{
550 struct k_itimer *tmr; 452 struct k_itimer *tmr;
@@ -581,17 +483,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
581 call_rcu(&tmr->it.rcu, k_itimer_rcu_free); 483 call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
582} 484}
583 485
584static struct k_clock *clockid_to_kclock(const clockid_t id)
585{
586 if (id < 0)
587 return (id & CLOCKFD_MASK) == CLOCKFD ?
588 &clock_posix_dynamic : &clock_posix_cpu;
589
590 if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
591 return NULL;
592 return &posix_clocks[id];
593}
594
595static int common_timer_create(struct k_itimer *new_timer) 486static int common_timer_create(struct k_itimer *new_timer)
596{ 487{
597 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); 488 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
@@ -599,15 +490,12 @@ static int common_timer_create(struct k_itimer *new_timer)
599} 490}
600 491
601/* Create a POSIX.1b interval timer. */ 492/* Create a POSIX.1b interval timer. */
602 493static int do_timer_create(clockid_t which_clock, struct sigevent *event,
603SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, 494 timer_t __user *created_timer_id)
604 struct sigevent __user *, timer_event_spec,
605 timer_t __user *, created_timer_id)
606{ 495{
607 struct k_clock *kc = clockid_to_kclock(which_clock); 496 const struct k_clock *kc = clockid_to_kclock(which_clock);
608 struct k_itimer *new_timer; 497 struct k_itimer *new_timer;
609 int error, new_timer_id; 498 int error, new_timer_id;
610 sigevent_t event;
611 int it_id_set = IT_ID_NOT_SET; 499 int it_id_set = IT_ID_NOT_SET;
612 500
613 if (!kc) 501 if (!kc)
@@ -629,31 +517,28 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
629 it_id_set = IT_ID_SET; 517 it_id_set = IT_ID_SET;
630 new_timer->it_id = (timer_t) new_timer_id; 518 new_timer->it_id = (timer_t) new_timer_id;
631 new_timer->it_clock = which_clock; 519 new_timer->it_clock = which_clock;
520 new_timer->kclock = kc;
632 new_timer->it_overrun = -1; 521 new_timer->it_overrun = -1;
633 522
634 if (timer_event_spec) { 523 if (event) {
635 if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
636 error = -EFAULT;
637 goto out;
638 }
639 rcu_read_lock(); 524 rcu_read_lock();
640 new_timer->it_pid = get_pid(good_sigevent(&event)); 525 new_timer->it_pid = get_pid(good_sigevent(event));
641 rcu_read_unlock(); 526 rcu_read_unlock();
642 if (!new_timer->it_pid) { 527 if (!new_timer->it_pid) {
643 error = -EINVAL; 528 error = -EINVAL;
644 goto out; 529 goto out;
645 } 530 }
531 new_timer->it_sigev_notify = event->sigev_notify;
532 new_timer->sigq->info.si_signo = event->sigev_signo;
533 new_timer->sigq->info.si_value = event->sigev_value;
646 } else { 534 } else {
647 memset(&event.sigev_value, 0, sizeof(event.sigev_value)); 535 new_timer->it_sigev_notify = SIGEV_SIGNAL;
648 event.sigev_notify = SIGEV_SIGNAL; 536 new_timer->sigq->info.si_signo = SIGALRM;
649 event.sigev_signo = SIGALRM; 537 memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t));
650 event.sigev_value.sival_int = new_timer->it_id; 538 new_timer->sigq->info.si_value.sival_int = new_timer->it_id;
651 new_timer->it_pid = get_pid(task_tgid(current)); 539 new_timer->it_pid = get_pid(task_tgid(current));
652 } 540 }
653 541
654 new_timer->it_sigev_notify = event.sigev_notify;
655 new_timer->sigq->info.si_signo = event.sigev_signo;
656 new_timer->sigq->info.si_value = event.sigev_value;
657 new_timer->sigq->info.si_tid = new_timer->it_id; 542 new_timer->sigq->info.si_tid = new_timer->it_id;
658 new_timer->sigq->info.si_code = SI_TIMER; 543 new_timer->sigq->info.si_code = SI_TIMER;
659 544
@@ -684,6 +569,36 @@ out:
684 return error; 569 return error;
685} 570}
686 571
572SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
573 struct sigevent __user *, timer_event_spec,
574 timer_t __user *, created_timer_id)
575{
576 if (timer_event_spec) {
577 sigevent_t event;
578
579 if (copy_from_user(&event, timer_event_spec, sizeof (event)))
580 return -EFAULT;
581 return do_timer_create(which_clock, &event, created_timer_id);
582 }
583 return do_timer_create(which_clock, NULL, created_timer_id);
584}
585
586#ifdef CONFIG_COMPAT
587COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
588 struct compat_sigevent __user *, timer_event_spec,
589 timer_t __user *, created_timer_id)
590{
591 if (timer_event_spec) {
592 sigevent_t event;
593
594 if (get_compat_sigevent(&event, timer_event_spec))
595 return -EFAULT;
596 return do_timer_create(which_clock, &event, created_timer_id);
597 }
598 return do_timer_create(which_clock, NULL, created_timer_id);
599}
600#endif
601
687/* 602/*
688 * Locking issues: We need to protect the result of the id look up until 603 * Locking issues: We need to protect the result of the id look up until
689 * we get the timer locked down so it is not deleted under us. The 604 * we get the timer locked down so it is not deleted under us. The
@@ -717,6 +632,20 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
717 return NULL; 632 return NULL;
718} 633}
719 634
635static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now)
636{
637 struct hrtimer *timer = &timr->it.real.timer;
638
639 return __hrtimer_expires_remaining_adjusted(timer, now);
640}
641
642static int common_hrtimer_forward(struct k_itimer *timr, ktime_t now)
643{
644 struct hrtimer *timer = &timr->it.real.timer;
645
646 return (int)hrtimer_forward(timer, now, timr->it_interval);
647}
648
720/* 649/*
721 * Get the time remaining on a POSIX.1b interval timer. This function 650 * Get the time remaining on a POSIX.1b interval timer. This function
722 * is ALWAYS called with spin_lock_irq on the timer, thus it must not 651 * is ALWAYS called with spin_lock_irq on the timer, thus it must not
@@ -733,55 +662,61 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
733 * it is the same as a requeue pending timer WRT to what we should 662 * it is the same as a requeue pending timer WRT to what we should
734 * report. 663 * report.
735 */ 664 */
736static void 665void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
737common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
738{ 666{
667 const struct k_clock *kc = timr->kclock;
739 ktime_t now, remaining, iv; 668 ktime_t now, remaining, iv;
740 struct hrtimer *timer = &timr->it.real.timer; 669 struct timespec64 ts64;
741 670 bool sig_none;
742 memset(cur_setting, 0, sizeof(*cur_setting));
743 671
744 iv = timr->it.real.interval; 672 sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE;
673 iv = timr->it_interval;
745 674
746 /* interval timer ? */ 675 /* interval timer ? */
747 if (iv) 676 if (iv) {
748 cur_setting->it_interval = ktime_to_timespec64(iv); 677 cur_setting->it_interval = ktime_to_timespec64(iv);
749 else if (!hrtimer_active(timer) && 678 } else if (!timr->it_active) {
750 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) 679 /*
751 return; 680 * SIGEV_NONE oneshot timers are never queued. Check them
681 * below.
682 */
683 if (!sig_none)
684 return;
685 }
752 686
753 now = timer->base->get_time(); 687 /*
688 * The timespec64 based conversion is suboptimal, but it's not
689 * worth to implement yet another callback.
690 */
691 kc->clock_get(timr->it_clock, &ts64);
692 now = timespec64_to_ktime(ts64);
754 693
755 /* 694 /*
756 * When a requeue is pending or this is a SIGEV_NONE 695 * When a requeue is pending or this is a SIGEV_NONE timer move the
757 * timer move the expiry time forward by intervals, so 696 * expiry time forward by intervals, so expiry is > now.
758 * expiry is > now.
759 */ 697 */
760 if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || 698 if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none))
761 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) 699 timr->it_overrun += kc->timer_forward(timr, now);
762 timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
763 700
764 remaining = __hrtimer_expires_remaining_adjusted(timer, now); 701 remaining = kc->timer_remaining(timr, now);
765 /* Return 0 only, when the timer is expired and not pending */ 702 /* Return 0 only, when the timer is expired and not pending */
766 if (remaining <= 0) { 703 if (remaining <= 0) {
767 /* 704 /*
768 * A single shot SIGEV_NONE timer must return 0, when 705 * A single shot SIGEV_NONE timer must return 0, when
769 * it is expired ! 706 * it is expired !
770 */ 707 */
771 if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) 708 if (!sig_none)
772 cur_setting->it_value.tv_nsec = 1; 709 cur_setting->it_value.tv_nsec = 1;
773 } else 710 } else {
774 cur_setting->it_value = ktime_to_timespec64(remaining); 711 cur_setting->it_value = ktime_to_timespec64(remaining);
712 }
775} 713}
776 714
777/* Get the time remaining on a POSIX.1b interval timer. */ 715/* Get the time remaining on a POSIX.1b interval timer. */
778SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, 716static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
779 struct itimerspec __user *, setting)
780{ 717{
781 struct itimerspec64 cur_setting64;
782 struct itimerspec cur_setting;
783 struct k_itimer *timr; 718 struct k_itimer *timr;
784 struct k_clock *kc; 719 const struct k_clock *kc;
785 unsigned long flags; 720 unsigned long flags;
786 int ret = 0; 721 int ret = 0;
787 722
@@ -789,20 +724,45 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
789 if (!timr) 724 if (!timr)
790 return -EINVAL; 725 return -EINVAL;
791 726
792 kc = clockid_to_kclock(timr->it_clock); 727 memset(setting, 0, sizeof(*setting));
728 kc = timr->kclock;
793 if (WARN_ON_ONCE(!kc || !kc->timer_get)) 729 if (WARN_ON_ONCE(!kc || !kc->timer_get))
794 ret = -EINVAL; 730 ret = -EINVAL;
795 else 731 else
796 kc->timer_get(timr, &cur_setting64); 732 kc->timer_get(timr, setting);
797 733
798 unlock_timer(timr, flags); 734 unlock_timer(timr, flags);
735 return ret;
736}
799 737
800 cur_setting = itimerspec64_to_itimerspec(&cur_setting64); 738/* Get the time remaining on a POSIX.1b interval timer. */
801 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) 739SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
802 return -EFAULT; 740 struct itimerspec __user *, setting)
741{
742 struct itimerspec64 cur_setting;
743
744 int ret = do_timer_gettime(timer_id, &cur_setting);
745 if (!ret) {
746 if (put_itimerspec64(&cur_setting, setting))
747 ret = -EFAULT;
748 }
749 return ret;
750}
751
752#ifdef CONFIG_COMPAT
753COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
754 struct compat_itimerspec __user *, setting)
755{
756 struct itimerspec64 cur_setting;
803 757
758 int ret = do_timer_gettime(timer_id, &cur_setting);
759 if (!ret) {
760 if (put_compat_itimerspec64(&cur_setting, setting))
761 ret = -EFAULT;
762 }
804 return ret; 763 return ret;
805} 764}
765#endif
806 766
807/* 767/*
808 * Get the number of overruns of a POSIX.1b interval timer. This is to 768 * Get the number of overruns of a POSIX.1b interval timer. This is to
@@ -810,7 +770,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
810 * accumulating overruns on the next timer. The overrun is frozen when 770 * accumulating overruns on the next timer. The overrun is frozen when
811 * the signal is delivered, either at the notify time (if the info block 771 * the signal is delivered, either at the notify time (if the info block
812 * is not queued) or at the actual delivery time (as we are informed by 772 * is not queued) or at the actual delivery time (as we are informed by
813 * the call back to do_schedule_next_timer(). So all we need to do is 773 * the call back to posixtimer_rearm(). So all we need to do is
814 * to pick up the frozen overrun. 774 * to pick up the frozen overrun.
815 */ 775 */
816SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) 776SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
@@ -829,117 +789,175 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
829 return overrun; 789 return overrun;
830} 790}
831 791
832/* Set a POSIX.1b interval timer. */ 792static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
833/* timr->it_lock is taken. */ 793 bool absolute, bool sigev_none)
834static int
835common_timer_set(struct k_itimer *timr, int flags,
836 struct itimerspec64 *new_setting, struct itimerspec64 *old_setting)
837{ 794{
838 struct hrtimer *timer = &timr->it.real.timer; 795 struct hrtimer *timer = &timr->it.real.timer;
839 enum hrtimer_mode mode; 796 enum hrtimer_mode mode;
840 797
798 mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
799 /*
800 * Posix magic: Relative CLOCK_REALTIME timers are not affected by
801 * clock modifications, so they become CLOCK_MONOTONIC based under the
802 * hood. See hrtimer_init(). Update timr->kclock, so the generic
803 * functions which use timr->kclock->clock_get() work.
804 *
805 * Note: it_clock stays unmodified, because the next timer_set() might
806 * use ABSTIME, so it needs to switch back.
807 */
808 if (timr->it_clock == CLOCK_REALTIME)
809 timr->kclock = absolute ? &clock_realtime : &clock_monotonic;
810
811 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
812 timr->it.real.timer.function = posix_timer_fn;
813
814 if (!absolute)
815 expires = ktime_add_safe(expires, timer->base->get_time());
816 hrtimer_set_expires(timer, expires);
817
818 if (!sigev_none)
819 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
820}
821
822static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
823{
824 return hrtimer_try_to_cancel(&timr->it.real.timer);
825}
826
827/* Set a POSIX.1b interval timer. */
828int common_timer_set(struct k_itimer *timr, int flags,
829 struct itimerspec64 *new_setting,
830 struct itimerspec64 *old_setting)
831{
832 const struct k_clock *kc = timr->kclock;
833 bool sigev_none;
834 ktime_t expires;
835
841 if (old_setting) 836 if (old_setting)
842 common_timer_get(timr, old_setting); 837 common_timer_get(timr, old_setting);
843 838
844 /* disable the timer */ 839 /* Prevent rearming by clearing the interval */
845 timr->it.real.interval = 0; 840 timr->it_interval = 0;
846 /* 841 /*
847 * careful here. If smp we could be in the "fire" routine which will 842 * Careful here. On SMP systems the timer expiry function could be
848 * be spinning as we hold the lock. But this is ONLY an SMP issue. 843 * active and spinning on timr->it_lock.
849 */ 844 */
850 if (hrtimer_try_to_cancel(timer) < 0) 845 if (kc->timer_try_to_cancel(timr) < 0)
851 return TIMER_RETRY; 846 return TIMER_RETRY;
852 847
853 timr->it_requeue_pending = (timr->it_requeue_pending + 2) & 848 timr->it_active = 0;
849 timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
854 ~REQUEUE_PENDING; 850 ~REQUEUE_PENDING;
855 timr->it_overrun_last = 0; 851 timr->it_overrun_last = 0;
856 852
857 /* switch off the timer when it_value is zero */ 853 /* Switch off the timer when it_value is zero */
858 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) 854 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
859 return 0; 855 return 0;
860 856
861 mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; 857 timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
862 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 858 expires = timespec64_to_ktime(new_setting->it_value);
863 timr->it.real.timer.function = posix_timer_fn; 859 sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE;
864
865 hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value));
866
867 /* Convert interval */
868 timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval);
869
870 /* SIGEV_NONE timers are not queued ! See common_timer_get */
871 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
872 /* Setup correct expiry time for relative timers */
873 if (mode == HRTIMER_MODE_REL) {
874 hrtimer_add_expires(timer, timer->base->get_time());
875 }
876 return 0;
877 }
878 860
879 hrtimer_start_expires(timer, mode); 861 kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
862 timr->it_active = !sigev_none;
880 return 0; 863 return 0;
881} 864}
882 865
883/* Set a POSIX.1b interval timer */ 866static int do_timer_settime(timer_t timer_id, int flags,
884SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, 867 struct itimerspec64 *new_spec64,
885 const struct itimerspec __user *, new_setting, 868 struct itimerspec64 *old_spec64)
886 struct itimerspec __user *, old_setting)
887{ 869{
888 struct itimerspec64 new_spec64, old_spec64; 870 const struct k_clock *kc;
889 struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL;
890 struct itimerspec new_spec, old_spec;
891 struct k_itimer *timr; 871 struct k_itimer *timr;
892 unsigned long flag; 872 unsigned long flag;
893 struct k_clock *kc;
894 int error = 0; 873 int error = 0;
895 874
896 if (!new_setting) 875 if (!timespec64_valid(&new_spec64->it_interval) ||
876 !timespec64_valid(&new_spec64->it_value))
897 return -EINVAL; 877 return -EINVAL;
898 878
899 if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) 879 if (old_spec64)
900 return -EFAULT; 880 memset(old_spec64, 0, sizeof(*old_spec64));
901 new_spec64 = itimerspec_to_itimerspec64(&new_spec);
902
903 if (!timespec64_valid(&new_spec64.it_interval) ||
904 !timespec64_valid(&new_spec64.it_value))
905 return -EINVAL;
906retry: 881retry:
907 timr = lock_timer(timer_id, &flag); 882 timr = lock_timer(timer_id, &flag);
908 if (!timr) 883 if (!timr)
909 return -EINVAL; 884 return -EINVAL;
910 885
911 kc = clockid_to_kclock(timr->it_clock); 886 kc = timr->kclock;
912 if (WARN_ON_ONCE(!kc || !kc->timer_set)) 887 if (WARN_ON_ONCE(!kc || !kc->timer_set))
913 error = -EINVAL; 888 error = -EINVAL;
914 else 889 else
915 error = kc->timer_set(timr, flags, &new_spec64, rtn); 890 error = kc->timer_set(timr, flags, new_spec64, old_spec64);
916 891
917 unlock_timer(timr, flag); 892 unlock_timer(timr, flag);
918 if (error == TIMER_RETRY) { 893 if (error == TIMER_RETRY) {
919 rtn = NULL; // We already got the old time... 894 old_spec64 = NULL; // We already got the old time...
920 goto retry; 895 goto retry;
921 } 896 }
922 897
923 old_spec = itimerspec64_to_itimerspec(&old_spec64); 898 return error;
924 if (old_setting && !error && 899}
925 copy_to_user(old_setting, &old_spec, sizeof (old_spec))) 900
926 error = -EFAULT; 901/* Set a POSIX.1b interval timer */
902SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
903 const struct itimerspec __user *, new_setting,
904 struct itimerspec __user *, old_setting)
905{
906 struct itimerspec64 new_spec, old_spec;
907 struct itimerspec64 *rtn = old_setting ? &old_spec : NULL;
908 int error = 0;
909
910 if (!new_setting)
911 return -EINVAL;
927 912
913 if (get_itimerspec64(&new_spec, new_setting))
914 return -EFAULT;
915
916 error = do_timer_settime(timer_id, flags, &new_spec, rtn);
917 if (!error && old_setting) {
918 if (put_itimerspec64(&old_spec, old_setting))
919 error = -EFAULT;
920 }
921 return error;
922}
923
924#ifdef CONFIG_COMPAT
925COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
926 struct compat_itimerspec __user *, new,
927 struct compat_itimerspec __user *, old)
928{
929 struct itimerspec64 new_spec, old_spec;
930 struct itimerspec64 *rtn = old ? &old_spec : NULL;
931 int error = 0;
932
933 if (!new)
934 return -EINVAL;
935 if (get_compat_itimerspec64(&new_spec, new))
936 return -EFAULT;
937
938 error = do_timer_settime(timer_id, flags, &new_spec, rtn);
939 if (!error && old) {
940 if (put_compat_itimerspec64(&old_spec, old))
941 error = -EFAULT;
942 }
928 return error; 943 return error;
929} 944}
945#endif
930 946
931static int common_timer_del(struct k_itimer *timer) 947int common_timer_del(struct k_itimer *timer)
932{ 948{
933 timer->it.real.interval = 0; 949 const struct k_clock *kc = timer->kclock;
934 950
935 if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) 951 timer->it_interval = 0;
952 if (kc->timer_try_to_cancel(timer) < 0)
936 return TIMER_RETRY; 953 return TIMER_RETRY;
954 timer->it_active = 0;
937 return 0; 955 return 0;
938} 956}
939 957
940static inline int timer_delete_hook(struct k_itimer *timer) 958static inline int timer_delete_hook(struct k_itimer *timer)
941{ 959{
942 struct k_clock *kc = clockid_to_kclock(timer->it_clock); 960 const struct k_clock *kc = timer->kclock;
943 961
944 if (WARN_ON_ONCE(!kc || !kc->timer_del)) 962 if (WARN_ON_ONCE(!kc || !kc->timer_del))
945 return -EINVAL; 963 return -EINVAL;
@@ -1018,35 +1036,31 @@ void exit_itimers(struct signal_struct *sig)
1018SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 1036SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
1019 const struct timespec __user *, tp) 1037 const struct timespec __user *, tp)
1020{ 1038{
1021 struct k_clock *kc = clockid_to_kclock(which_clock); 1039 const struct k_clock *kc = clockid_to_kclock(which_clock);
1022 struct timespec64 new_tp64; 1040 struct timespec64 new_tp;
1023 struct timespec new_tp;
1024 1041
1025 if (!kc || !kc->clock_set) 1042 if (!kc || !kc->clock_set)
1026 return -EINVAL; 1043 return -EINVAL;
1027 1044
1028 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 1045 if (get_timespec64(&new_tp, tp))
1029 return -EFAULT; 1046 return -EFAULT;
1030 new_tp64 = timespec_to_timespec64(new_tp);
1031 1047
1032 return kc->clock_set(which_clock, &new_tp64); 1048 return kc->clock_set(which_clock, &new_tp);
1033} 1049}
1034 1050
1035SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 1051SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
1036 struct timespec __user *,tp) 1052 struct timespec __user *,tp)
1037{ 1053{
1038 struct k_clock *kc = clockid_to_kclock(which_clock); 1054 const struct k_clock *kc = clockid_to_kclock(which_clock);
1039 struct timespec64 kernel_tp64; 1055 struct timespec64 kernel_tp;
1040 struct timespec kernel_tp;
1041 int error; 1056 int error;
1042 1057
1043 if (!kc) 1058 if (!kc)
1044 return -EINVAL; 1059 return -EINVAL;
1045 1060
1046 error = kc->clock_get(which_clock, &kernel_tp64); 1061 error = kc->clock_get(which_clock, &kernel_tp);
1047 kernel_tp = timespec64_to_timespec(kernel_tp64);
1048 1062
1049 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 1063 if (!error && put_timespec64(&kernel_tp, tp))
1050 error = -EFAULT; 1064 error = -EFAULT;
1051 1065
1052 return error; 1066 return error;
@@ -1055,7 +1069,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
1055SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, 1069SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
1056 struct timex __user *, utx) 1070 struct timex __user *, utx)
1057{ 1071{
1058 struct k_clock *kc = clockid_to_kclock(which_clock); 1072 const struct k_clock *kc = clockid_to_kclock(which_clock);
1059 struct timex ktx; 1073 struct timex ktx;
1060 int err; 1074 int err;
1061 1075
@@ -1078,30 +1092,106 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
1078SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, 1092SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
1079 struct timespec __user *, tp) 1093 struct timespec __user *, tp)
1080{ 1094{
1081 struct k_clock *kc = clockid_to_kclock(which_clock); 1095 const struct k_clock *kc = clockid_to_kclock(which_clock);
1082 struct timespec64 rtn_tp64; 1096 struct timespec64 rtn_tp;
1083 struct timespec rtn_tp;
1084 int error; 1097 int error;
1085 1098
1086 if (!kc) 1099 if (!kc)
1087 return -EINVAL; 1100 return -EINVAL;
1088 1101
1089 error = kc->clock_getres(which_clock, &rtn_tp64); 1102 error = kc->clock_getres(which_clock, &rtn_tp);
1090 rtn_tp = timespec64_to_timespec(rtn_tp64);
1091 1103
1092 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) 1104 if (!error && tp && put_timespec64(&rtn_tp, tp))
1093 error = -EFAULT; 1105 error = -EFAULT;
1094 1106
1095 return error; 1107 return error;
1096} 1108}
1097 1109
1110#ifdef CONFIG_COMPAT
1111
1112COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
1113 struct compat_timespec __user *, tp)
1114{
1115 const struct k_clock *kc = clockid_to_kclock(which_clock);
1116 struct timespec64 ts;
1117
1118 if (!kc || !kc->clock_set)
1119 return -EINVAL;
1120
1121 if (compat_get_timespec64(&ts, tp))
1122 return -EFAULT;
1123
1124 return kc->clock_set(which_clock, &ts);
1125}
1126
1127COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
1128 struct compat_timespec __user *, tp)
1129{
1130 const struct k_clock *kc = clockid_to_kclock(which_clock);
1131 struct timespec64 ts;
1132 int err;
1133
1134 if (!kc)
1135 return -EINVAL;
1136
1137 err = kc->clock_get(which_clock, &ts);
1138
1139 if (!err && compat_put_timespec64(&ts, tp))
1140 err = -EFAULT;
1141
1142 return err;
1143}
1144
1145COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
1146 struct compat_timex __user *, utp)
1147{
1148 const struct k_clock *kc = clockid_to_kclock(which_clock);
1149 struct timex ktx;
1150 int err;
1151
1152 if (!kc)
1153 return -EINVAL;
1154 if (!kc->clock_adj)
1155 return -EOPNOTSUPP;
1156
1157 err = compat_get_timex(&ktx, utp);
1158 if (err)
1159 return err;
1160
1161 err = kc->clock_adj(which_clock, &ktx);
1162
1163 if (err >= 0)
1164 err = compat_put_timex(utp, &ktx);
1165
1166 return err;
1167}
1168
1169COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
1170 struct compat_timespec __user *, tp)
1171{
1172 const struct k_clock *kc = clockid_to_kclock(which_clock);
1173 struct timespec64 ts;
1174 int err;
1175
1176 if (!kc)
1177 return -EINVAL;
1178
1179 err = kc->clock_getres(which_clock, &ts);
1180 if (!err && tp && compat_put_timespec64(&ts, tp))
1181 return -EFAULT;
1182
1183 return err;
1184}
1185
1186#endif
1187
1098/* 1188/*
1099 * nanosleep for monotonic and realtime clocks 1189 * nanosleep for monotonic and realtime clocks
1100 */ 1190 */
1101static int common_nsleep(const clockid_t which_clock, int flags, 1191static int common_nsleep(const clockid_t which_clock, int flags,
1102 struct timespec64 *tsave, struct timespec __user *rmtp) 1192 const struct timespec64 *rqtp)
1103{ 1193{
1104 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? 1194 return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ?
1105 HRTIMER_MODE_ABS : HRTIMER_MODE_REL, 1195 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
1106 which_clock); 1196 which_clock);
1107} 1197}
@@ -1110,36 +1200,152 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1110 const struct timespec __user *, rqtp, 1200 const struct timespec __user *, rqtp,
1111 struct timespec __user *, rmtp) 1201 struct timespec __user *, rmtp)
1112{ 1202{
1113 struct k_clock *kc = clockid_to_kclock(which_clock); 1203 const struct k_clock *kc = clockid_to_kclock(which_clock);
1114 struct timespec64 t64; 1204 struct timespec64 t;
1115 struct timespec t;
1116 1205
1117 if (!kc) 1206 if (!kc)
1118 return -EINVAL; 1207 return -EINVAL;
1119 if (!kc->nsleep) 1208 if (!kc->nsleep)
1120 return -ENANOSLEEP_NOTSUP; 1209 return -ENANOSLEEP_NOTSUP;
1121 1210
1122 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 1211 if (get_timespec64(&t, rqtp))
1123 return -EFAULT; 1212 return -EFAULT;
1124 1213
1125 t64 = timespec_to_timespec64(t); 1214 if (!timespec64_valid(&t))
1126 if (!timespec64_valid(&t64))
1127 return -EINVAL; 1215 return -EINVAL;
1216 if (flags & TIMER_ABSTIME)
1217 rmtp = NULL;
1218 current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
1219 current->restart_block.nanosleep.rmtp = rmtp;
1128 1220
1129 return kc->nsleep(which_clock, flags, &t64, rmtp); 1221 return kc->nsleep(which_clock, flags, &t);
1130} 1222}
1131 1223
1132/* 1224#ifdef CONFIG_COMPAT
1133 * This will restart clock_nanosleep. This is required only by 1225COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
1134 * compat_clock_nanosleep_restart for now. 1226 struct compat_timespec __user *, rqtp,
1135 */ 1227 struct compat_timespec __user *, rmtp)
1136long clock_nanosleep_restart(struct restart_block *restart_block)
1137{ 1228{
1138 clockid_t which_clock = restart_block->nanosleep.clockid; 1229 const struct k_clock *kc = clockid_to_kclock(which_clock);
1139 struct k_clock *kc = clockid_to_kclock(which_clock); 1230 struct timespec64 t;
1231
1232 if (!kc)
1233 return -EINVAL;
1234 if (!kc->nsleep)
1235 return -ENANOSLEEP_NOTSUP;
1140 1236
1141 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) 1237 if (compat_get_timespec64(&t, rqtp))
1238 return -EFAULT;
1239
1240 if (!timespec64_valid(&t))
1142 return -EINVAL; 1241 return -EINVAL;
1242 if (flags & TIMER_ABSTIME)
1243 rmtp = NULL;
1244 current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
1245 current->restart_block.nanosleep.compat_rmtp = rmtp;
1246
1247 return kc->nsleep(which_clock, flags, &t);
1248}
1249#endif
1143 1250
1144 return kc->nsleep_restart(restart_block); 1251static const struct k_clock clock_realtime = {
1252 .clock_getres = posix_get_hrtimer_res,
1253 .clock_get = posix_clock_realtime_get,
1254 .clock_set = posix_clock_realtime_set,
1255 .clock_adj = posix_clock_realtime_adj,
1256 .nsleep = common_nsleep,
1257 .timer_create = common_timer_create,
1258 .timer_set = common_timer_set,
1259 .timer_get = common_timer_get,
1260 .timer_del = common_timer_del,
1261 .timer_rearm = common_hrtimer_rearm,
1262 .timer_forward = common_hrtimer_forward,
1263 .timer_remaining = common_hrtimer_remaining,
1264 .timer_try_to_cancel = common_hrtimer_try_to_cancel,
1265 .timer_arm = common_hrtimer_arm,
1266};
1267
1268static const struct k_clock clock_monotonic = {
1269 .clock_getres = posix_get_hrtimer_res,
1270 .clock_get = posix_ktime_get_ts,
1271 .nsleep = common_nsleep,
1272 .timer_create = common_timer_create,
1273 .timer_set = common_timer_set,
1274 .timer_get = common_timer_get,
1275 .timer_del = common_timer_del,
1276 .timer_rearm = common_hrtimer_rearm,
1277 .timer_forward = common_hrtimer_forward,
1278 .timer_remaining = common_hrtimer_remaining,
1279 .timer_try_to_cancel = common_hrtimer_try_to_cancel,
1280 .timer_arm = common_hrtimer_arm,
1281};
1282
1283static const struct k_clock clock_monotonic_raw = {
1284 .clock_getres = posix_get_hrtimer_res,
1285 .clock_get = posix_get_monotonic_raw,
1286};
1287
1288static const struct k_clock clock_realtime_coarse = {
1289 .clock_getres = posix_get_coarse_res,
1290 .clock_get = posix_get_realtime_coarse,
1291};
1292
1293static const struct k_clock clock_monotonic_coarse = {
1294 .clock_getres = posix_get_coarse_res,
1295 .clock_get = posix_get_monotonic_coarse,
1296};
1297
1298static const struct k_clock clock_tai = {
1299 .clock_getres = posix_get_hrtimer_res,
1300 .clock_get = posix_get_tai,
1301 .nsleep = common_nsleep,
1302 .timer_create = common_timer_create,
1303 .timer_set = common_timer_set,
1304 .timer_get = common_timer_get,
1305 .timer_del = common_timer_del,
1306 .timer_rearm = common_hrtimer_rearm,
1307 .timer_forward = common_hrtimer_forward,
1308 .timer_remaining = common_hrtimer_remaining,
1309 .timer_try_to_cancel = common_hrtimer_try_to_cancel,
1310 .timer_arm = common_hrtimer_arm,
1311};
1312
1313static const struct k_clock clock_boottime = {
1314 .clock_getres = posix_get_hrtimer_res,
1315 .clock_get = posix_get_boottime,
1316 .nsleep = common_nsleep,
1317 .timer_create = common_timer_create,
1318 .timer_set = common_timer_set,
1319 .timer_get = common_timer_get,
1320 .timer_del = common_timer_del,
1321 .timer_rearm = common_hrtimer_rearm,
1322 .timer_forward = common_hrtimer_forward,
1323 .timer_remaining = common_hrtimer_remaining,
1324 .timer_try_to_cancel = common_hrtimer_try_to_cancel,
1325 .timer_arm = common_hrtimer_arm,
1326};
1327
1328static const struct k_clock * const posix_clocks[] = {
1329 [CLOCK_REALTIME] = &clock_realtime,
1330 [CLOCK_MONOTONIC] = &clock_monotonic,
1331 [CLOCK_PROCESS_CPUTIME_ID] = &clock_process,
1332 [CLOCK_THREAD_CPUTIME_ID] = &clock_thread,
1333 [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw,
1334 [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse,
1335 [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse,
1336 [CLOCK_BOOTTIME] = &clock_boottime,
1337 [CLOCK_REALTIME_ALARM] = &alarm_clock,
1338 [CLOCK_BOOTTIME_ALARM] = &alarm_clock,
1339 [CLOCK_TAI] = &clock_tai,
1340};
1341
1342static const struct k_clock *clockid_to_kclock(const clockid_t id)
1343{
1344 if (id < 0)
1345 return (id & CLOCKFD_MASK) == CLOCKFD ?
1346 &clock_posix_dynamic : &clock_posix_cpu;
1347
1348 if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id])
1349 return NULL;
1350 return posix_clocks[id];
1145} 1351}
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
new file mode 100644
index 000000000000..fb303c3be4d3
--- /dev/null
+++ b/kernel/time/posix-timers.h
@@ -0,0 +1,40 @@
1#define TIMER_RETRY 1
2
3struct k_clock {
4 int (*clock_getres)(const clockid_t which_clock,
5 struct timespec64 *tp);
6 int (*clock_set)(const clockid_t which_clock,
7 const struct timespec64 *tp);
8 int (*clock_get)(const clockid_t which_clock,
9 struct timespec64 *tp);
10 int (*clock_adj)(const clockid_t which_clock, struct timex *tx);
11 int (*timer_create)(struct k_itimer *timer);
12 int (*nsleep)(const clockid_t which_clock, int flags,
13 const struct timespec64 *);
14 int (*timer_set)(struct k_itimer *timr, int flags,
15 struct itimerspec64 *new_setting,
16 struct itimerspec64 *old_setting);
17 int (*timer_del)(struct k_itimer *timr);
18 void (*timer_get)(struct k_itimer *timr,
19 struct itimerspec64 *cur_setting);
20 void (*timer_rearm)(struct k_itimer *timr);
21 int (*timer_forward)(struct k_itimer *timr, ktime_t now);
22 ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now);
23 int (*timer_try_to_cancel)(struct k_itimer *timr);
24 void (*timer_arm)(struct k_itimer *timr, ktime_t expires,
25 bool absolute, bool sigev_none);
26};
27
28extern const struct k_clock clock_posix_cpu;
29extern const struct k_clock clock_posix_dynamic;
30extern const struct k_clock clock_process;
31extern const struct k_clock clock_thread;
32extern const struct k_clock alarm_clock;
33
34int posix_timer_event(struct k_itimer *timr, int si_private);
35
36void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting);
37int common_timer_set(struct k_itimer *timr, int flags,
38 struct itimerspec64 *new_setting,
39 struct itimerspec64 *old_setting);
40int common_timer_del(struct k_itimer *timer);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 987e496bb51a..b398c2ea69b2 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -37,9 +37,11 @@ static int tick_broadcast_forced;
37static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); 37static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
38 38
39#ifdef CONFIG_TICK_ONESHOT 39#ifdef CONFIG_TICK_ONESHOT
40static void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
40static void tick_broadcast_clear_oneshot(int cpu); 41static void tick_broadcast_clear_oneshot(int cpu);
41static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); 42static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
42#else 43#else
44static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
43static inline void tick_broadcast_clear_oneshot(int cpu) { } 45static inline void tick_broadcast_clear_oneshot(int cpu) { }
44static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } 46static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
45#endif 47#endif
@@ -867,7 +869,7 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
867/** 869/**
868 * tick_broadcast_setup_oneshot - setup the broadcast device 870 * tick_broadcast_setup_oneshot - setup the broadcast device
869 */ 871 */
870void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 872static void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
871{ 873{
872 int cpu = smp_processor_id(); 874 int cpu = smp_processor_id();
873 875
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f738251000fe..be0ac01f2e12 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -126,7 +126,6 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
126 126
127/* Functions related to oneshot broadcasting */ 127/* Functions related to oneshot broadcasting */
128#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) 128#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
129extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
130extern void tick_broadcast_switch_to_oneshot(void); 129extern void tick_broadcast_switch_to_oneshot(void);
131extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); 130extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);
132extern int tick_broadcast_oneshot_active(void); 131extern int tick_broadcast_oneshot_active(void);
@@ -134,7 +133,6 @@ extern void tick_check_oneshot_broadcast_this_cpu(void);
134bool tick_broadcast_oneshot_available(void); 133bool tick_broadcast_oneshot_available(void);
135extern struct cpumask *tick_get_broadcast_oneshot_mask(void); 134extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
136#else /* !(BROADCAST && ONESHOT): */ 135#else /* !(BROADCAST && ONESHOT): */
137static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
138static inline void tick_broadcast_switch_to_oneshot(void) { } 136static inline void tick_broadcast_switch_to_oneshot(void) { }
139static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } 137static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }
140static inline int tick_broadcast_oneshot_active(void) { return 0; } 138static inline int tick_broadcast_oneshot_active(void) { return 0; }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 64c97fc130c4..c7a899c5ce64 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,6 +150,12 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
150 touch_softlockup_watchdog_sched(); 150 touch_softlockup_watchdog_sched();
151 if (is_idle_task(current)) 151 if (is_idle_task(current))
152 ts->idle_jiffies++; 152 ts->idle_jiffies++;
153 /*
154 * In case the current tick fired too early past its expected
155 * expiration, make sure we don't bypass the next clock reprogramming
156 * to the same deadline.
157 */
158 ts->next_tick = 0;
153 } 159 }
154#endif 160#endif
155 update_process_times(user_mode(regs)); 161 update_process_times(user_mode(regs));
@@ -554,7 +560,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
554 update_ts_time_stats(smp_processor_id(), ts, now, NULL); 560 update_ts_time_stats(smp_processor_id(), ts, now, NULL);
555 ts->idle_active = 0; 561 ts->idle_active = 0;
556 562
557 sched_clock_idle_wakeup_event(0); 563 sched_clock_idle_wakeup_event();
558} 564}
559 565
560static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 566static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -660,6 +666,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
660 hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); 666 hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
661 else 667 else
662 tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); 668 tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
669
670 /*
671 * Reset to make sure next tick stop doesn't get fooled by past
672 * cached clock deadline.
673 */
674 ts->next_tick = 0;
663} 675}
664 676
665static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, 677static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
@@ -701,8 +713,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
701 */ 713 */
702 delta = next_tick - basemono; 714 delta = next_tick - basemono;
703 if (delta <= (u64)TICK_NSEC) { 715 if (delta <= (u64)TICK_NSEC) {
704 tick = 0;
705
706 /* 716 /*
707 * Tell the timer code that the base is not idle, i.e. undo 717 * Tell the timer code that the base is not idle, i.e. undo
708 * the effect of get_next_timer_interrupt(): 718 * the effect of get_next_timer_interrupt():
@@ -712,23 +722,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
712 * We've not stopped the tick yet, and there's a timer in the 722 * We've not stopped the tick yet, and there's a timer in the
713 * next period, so no point in stopping it either, bail. 723 * next period, so no point in stopping it either, bail.
714 */ 724 */
715 if (!ts->tick_stopped) 725 if (!ts->tick_stopped) {
716 goto out; 726 tick = 0;
717
718 /*
719 * If, OTOH, we did stop it, but there's a pending (expired)
720 * timer reprogram the timer hardware to fire now.
721 *
722 * We will not restart the tick proper, just prod the timer
723 * hardware into firing an interrupt to process the pending
724 * timers. Just like tick_irq_exit() will not restart the tick
725 * for 'normal' interrupts.
726 *
727 * Only once we exit the idle loop will we re-enable the tick,
728 * see tick_nohz_idle_exit().
729 */
730 if (delta == 0) {
731 tick_nohz_restart(ts, now);
732 goto out; 727 goto out;
733 } 728 }
734 } 729 }
@@ -771,8 +766,16 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
771 tick = expires; 766 tick = expires;
772 767
773 /* Skip reprogram of event if its not changed */ 768 /* Skip reprogram of event if its not changed */
774 if (ts->tick_stopped && (expires == dev->next_event)) 769 if (ts->tick_stopped && (expires == ts->next_tick)) {
775 goto out; 770 /* Sanity check: make sure clockevent is actually programmed */
771 if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
772 goto out;
773
774 WARN_ON_ONCE(1);
775 printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
776 basemono, ts->next_tick, dev->next_event,
777 hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer));
778 }
776 779
777 /* 780 /*
778 * nohz_stop_sched_tick can be called several times before 781 * nohz_stop_sched_tick can be called several times before
@@ -782,8 +785,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
782 * the scheduler tick in nohz_restart_sched_tick. 785 * the scheduler tick in nohz_restart_sched_tick.
783 */ 786 */
784 if (!ts->tick_stopped) { 787 if (!ts->tick_stopped) {
785 nohz_balance_enter_idle(cpu); 788 calc_load_nohz_start();
786 calc_load_enter_idle();
787 cpu_load_update_nohz_start(); 789 cpu_load_update_nohz_start();
788 790
789 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 791 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -791,6 +793,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
791 trace_tick_stop(1, TICK_DEP_MASK_NONE); 793 trace_tick_stop(1, TICK_DEP_MASK_NONE);
792 } 794 }
793 795
796 ts->next_tick = tick;
797
794 /* 798 /*
795 * If the expiration time == KTIME_MAX, then we simply stop 799 * If the expiration time == KTIME_MAX, then we simply stop
796 * the tick timer. 800 * the tick timer.
@@ -801,12 +805,17 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
801 goto out; 805 goto out;
802 } 806 }
803 807
808 hrtimer_set_expires(&ts->sched_timer, tick);
809
804 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 810 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
805 hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); 811 hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
806 else 812 else
807 tick_program_event(tick, 1); 813 tick_program_event(tick, 1);
808out: 814out:
809 /* Update the estimated sleep length */ 815 /*
816 * Update the estimated sleep length until the next timer
817 * (not only the tick).
818 */
810 ts->sleep_length = ktime_sub(dev->next_event, now); 819 ts->sleep_length = ktime_sub(dev->next_event, now);
811 return tick; 820 return tick;
812} 821}
@@ -823,7 +832,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
823 */ 832 */
824 timer_clear_idle(); 833 timer_clear_idle();
825 834
826 calc_load_exit_idle(); 835 calc_load_nohz_stop();
827 touch_softlockup_watchdog_sched(); 836 touch_softlockup_watchdog_sched();
828 /* 837 /*
829 * Cancel the scheduled timer and restore the tick 838 * Cancel the scheduled timer and restore the tick
@@ -864,6 +873,11 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
864 if (unlikely(!cpu_online(cpu))) { 873 if (unlikely(!cpu_online(cpu))) {
865 if (cpu == tick_do_timer_cpu) 874 if (cpu == tick_do_timer_cpu)
866 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 875 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
876 /*
877 * Make sure the CPU doesn't get fooled by obsolete tick
878 * deadline if it comes back online later.
879 */
880 ts->next_tick = 0;
867 return false; 881 return false;
868 } 882 }
869 883
@@ -923,8 +937,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
923 ts->idle_expires = expires; 937 ts->idle_expires = expires;
924 } 938 }
925 939
926 if (!was_stopped && ts->tick_stopped) 940 if (!was_stopped && ts->tick_stopped) {
927 ts->idle_jiffies = ts->last_jiffies; 941 ts->idle_jiffies = ts->last_jiffies;
942 nohz_balance_enter_idle(cpu);
943 }
928 } 944 }
929} 945}
930 946
@@ -1172,6 +1188,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
1172 */ 1188 */
1173 if (regs) 1189 if (regs)
1174 tick_sched_handle(ts, regs); 1190 tick_sched_handle(ts, regs);
1191 else
1192 ts->next_tick = 0;
1175 1193
1176 /* No need to reprogram if we are in idle or full dynticks mode */ 1194 /* No need to reprogram if we are in idle or full dynticks mode */
1177 if (unlikely(ts->tick_stopped)) 1195 if (unlikely(ts->tick_stopped))
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index bf38226e5c17..075444e3d48e 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -27,6 +27,7 @@ enum tick_nohz_mode {
27 * timer is modified for nohz sleeps. This is necessary 27 * timer is modified for nohz sleeps. This is necessary
28 * to resume the tick timer operation in the timeline 28 * to resume the tick timer operation in the timeline
29 * when the CPU returns from nohz sleep. 29 * when the CPU returns from nohz sleep.
30 * @next_tick: Next tick to be fired when in dynticks mode.
30 * @tick_stopped: Indicator that the idle tick has been stopped 31 * @tick_stopped: Indicator that the idle tick has been stopped
31 * @idle_jiffies: jiffies at the entry to idle for idle time accounting 32 * @idle_jiffies: jiffies at the entry to idle for idle time accounting
32 * @idle_calls: Total number of idle calls 33 * @idle_calls: Total number of idle calls
@@ -44,6 +45,7 @@ struct tick_sched {
44 unsigned long check_clocks; 45 unsigned long check_clocks;
45 enum tick_nohz_mode nohz_mode; 46 enum tick_nohz_mode nohz_mode;
46 ktime_t last_tick; 47 ktime_t last_tick;
48 ktime_t next_tick;
47 int inidle; 49 int inidle;
48 int tick_stopped; 50 int tick_stopped;
49 unsigned long idle_jiffies; 51 unsigned long idle_jiffies;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 49c73c6ed648..44a8c1402133 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -39,6 +39,7 @@
39#include <linux/ptrace.h> 39#include <linux/ptrace.h>
40 40
41#include <linux/uaccess.h> 41#include <linux/uaccess.h>
42#include <linux/compat.h>
42#include <asm/unistd.h> 43#include <asm/unistd.h>
43 44
44#include <generated/timeconst.h> 45#include <generated/timeconst.h>
@@ -99,6 +100,47 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr)
99 100
100#endif /* __ARCH_WANT_SYS_TIME */ 101#endif /* __ARCH_WANT_SYS_TIME */
101 102
103#ifdef CONFIG_COMPAT
104#ifdef __ARCH_WANT_COMPAT_SYS_TIME
105
106/* compat_time_t is a 32 bit "long" and needs to get converted. */
107COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
108{
109 struct timeval tv;
110 compat_time_t i;
111
112 do_gettimeofday(&tv);
113 i = tv.tv_sec;
114
115 if (tloc) {
116 if (put_user(i,tloc))
117 return -EFAULT;
118 }
119 force_successful_syscall_return();
120 return i;
121}
122
123COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
124{
125 struct timespec tv;
126 int err;
127
128 if (get_user(tv.tv_sec, tptr))
129 return -EFAULT;
130
131 tv.tv_nsec = 0;
132
133 err = security_settime(&tv, NULL);
134 if (err)
135 return err;
136
137 do_settimeofday(&tv);
138 return 0;
139}
140
141#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
142#endif
143
102SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, 144SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
103 struct timezone __user *, tz) 145 struct timezone __user *, tz)
104{ 146{
@@ -215,6 +257,47 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
215 return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); 257 return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
216} 258}
217 259
260#ifdef CONFIG_COMPAT
261COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
262 struct timezone __user *, tz)
263{
264 if (tv) {
265 struct timeval ktv;
266
267 do_gettimeofday(&ktv);
268 if (compat_put_timeval(&ktv, tv))
269 return -EFAULT;
270 }
271 if (tz) {
272 if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
273 return -EFAULT;
274 }
275
276 return 0;
277}
278
279COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
280 struct timezone __user *, tz)
281{
282 struct timespec64 new_ts;
283 struct timeval user_tv;
284 struct timezone new_tz;
285
286 if (tv) {
287 if (compat_get_timeval(&user_tv, tv))
288 return -EFAULT;
289 new_ts.tv_sec = user_tv.tv_sec;
290 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
291 }
292 if (tz) {
293 if (copy_from_user(&new_tz, tz, sizeof(*tz)))
294 return -EFAULT;
295 }
296
297 return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
298}
299#endif
300
218SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) 301SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
219{ 302{
220 struct timex txc; /* Local copy of parameter */ 303 struct timex txc; /* Local copy of parameter */
@@ -224,12 +307,33 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
224 * structure. But bear in mind that the structures 307 * structure. But bear in mind that the structures
225 * may change 308 * may change
226 */ 309 */
227 if(copy_from_user(&txc, txc_p, sizeof(struct timex))) 310 if (copy_from_user(&txc, txc_p, sizeof(struct timex)))
228 return -EFAULT; 311 return -EFAULT;
229 ret = do_adjtimex(&txc); 312 ret = do_adjtimex(&txc);
230 return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; 313 return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
231} 314}
232 315
316#ifdef CONFIG_COMPAT
317
318COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
319{
320 struct timex txc;
321 int err, ret;
322
323 err = compat_get_timex(&txc, utp);
324 if (err)
325 return err;
326
327 ret = do_adjtimex(&txc);
328
329 err = compat_put_timex(utp, &txc);
330 if (err)
331 return err;
332
333 return ret;
334}
335#endif
336
233/* 337/*
234 * Convert jiffies to milliseconds and back. 338 * Convert jiffies to milliseconds and back.
235 * 339 *
@@ -786,3 +890,61 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
786 890
787 return res; 891 return res;
788} 892}
893
894int get_timespec64(struct timespec64 *ts,
895 const struct timespec __user *uts)
896{
897 struct timespec kts;
898 int ret;
899
900 ret = copy_from_user(&kts, uts, sizeof(kts));
901 if (ret)
902 return -EFAULT;
903
904 ts->tv_sec = kts.tv_sec;
905 ts->tv_nsec = kts.tv_nsec;
906
907 return 0;
908}
909EXPORT_SYMBOL_GPL(get_timespec64);
910
911int put_timespec64(const struct timespec64 *ts,
912 struct timespec __user *uts)
913{
914 struct timespec kts = {
915 .tv_sec = ts->tv_sec,
916 .tv_nsec = ts->tv_nsec
917 };
918 return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0;
919}
920EXPORT_SYMBOL_GPL(put_timespec64);
921
922int get_itimerspec64(struct itimerspec64 *it,
923 const struct itimerspec __user *uit)
924{
925 int ret;
926
927 ret = get_timespec64(&it->it_interval, &uit->it_interval);
928 if (ret)
929 return ret;
930
931 ret = get_timespec64(&it->it_value, &uit->it_value);
932
933 return ret;
934}
935EXPORT_SYMBOL_GPL(get_itimerspec64);
936
937int put_itimerspec64(const struct itimerspec64 *it,
938 struct itimerspec __user *uit)
939{
940 int ret;
941
942 ret = put_timespec64(&it->it_interval, &uit->it_interval);
943 if (ret)
944 return ret;
945
946 ret = put_timespec64(&it->it_value, &uit->it_value);
947
948 return ret;
949}
950EXPORT_SYMBOL_GPL(put_itimerspec64);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 9652bc57fd09..cedafa008de5 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -72,6 +72,10 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)
72 tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; 72 tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
73 tk->xtime_sec++; 73 tk->xtime_sec++;
74 } 74 }
75 while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
76 tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
77 tk->raw_sec++;
78 }
75} 79}
76 80
77static inline struct timespec64 tk_xtime(struct timekeeper *tk) 81static inline struct timespec64 tk_xtime(struct timekeeper *tk)
@@ -118,6 +122,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
118 tk->offs_boot = ktime_add(tk->offs_boot, delta); 122 tk->offs_boot = ktime_add(tk->offs_boot, delta);
119} 123}
120 124
125/*
126 * tk_clock_read - atomic clocksource read() helper
127 *
128 * This helper is necessary to use in the read paths because, while the
129 * seqlock ensures we don't return a bad value while structures are updated,
130 * it doesn't protect from potential crashes. There is the possibility that
131 * the tkr's clocksource may change between the read reference, and the
132 * clock reference passed to the read function. This can cause crashes if
133 * the wrong clocksource is passed to the wrong read function.
134 * This isn't necessary to use when holding the timekeeper_lock or doing
135 * a read of the fast-timekeeper tkrs (which is protected by its own locking
136 * and update logic).
137 */
138static inline u64 tk_clock_read(struct tk_read_base *tkr)
139{
140 struct clocksource *clock = READ_ONCE(tkr->clock);
141
142 return clock->read(clock);
143}
144
121#ifdef CONFIG_DEBUG_TIMEKEEPING 145#ifdef CONFIG_DEBUG_TIMEKEEPING
122#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ 146#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
123 147
@@ -175,7 +199,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
175 */ 199 */
176 do { 200 do {
177 seq = read_seqcount_begin(&tk_core.seq); 201 seq = read_seqcount_begin(&tk_core.seq);
178 now = tkr->read(tkr->clock); 202 now = tk_clock_read(tkr);
179 last = tkr->cycle_last; 203 last = tkr->cycle_last;
180 mask = tkr->mask; 204 mask = tkr->mask;
181 max = tkr->clock->max_cycles; 205 max = tkr->clock->max_cycles;
@@ -209,7 +233,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
209 u64 cycle_now, delta; 233 u64 cycle_now, delta;
210 234
211 /* read clocksource */ 235 /* read clocksource */
212 cycle_now = tkr->read(tkr->clock); 236 cycle_now = tk_clock_read(tkr);
213 237
214 /* calculate the delta since the last update_wall_time */ 238 /* calculate the delta since the last update_wall_time */
215 delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); 239 delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
@@ -238,12 +262,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
238 ++tk->cs_was_changed_seq; 262 ++tk->cs_was_changed_seq;
239 old_clock = tk->tkr_mono.clock; 263 old_clock = tk->tkr_mono.clock;
240 tk->tkr_mono.clock = clock; 264 tk->tkr_mono.clock = clock;
241 tk->tkr_mono.read = clock->read;
242 tk->tkr_mono.mask = clock->mask; 265 tk->tkr_mono.mask = clock->mask;
243 tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); 266 tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);
244 267
245 tk->tkr_raw.clock = clock; 268 tk->tkr_raw.clock = clock;
246 tk->tkr_raw.read = clock->read;
247 tk->tkr_raw.mask = clock->mask; 269 tk->tkr_raw.mask = clock->mask;
248 tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; 270 tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
249 271
@@ -262,17 +284,19 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
262 /* Go back from cycles -> shifted ns */ 284 /* Go back from cycles -> shifted ns */
263 tk->xtime_interval = interval * clock->mult; 285 tk->xtime_interval = interval * clock->mult;
264 tk->xtime_remainder = ntpinterval - tk->xtime_interval; 286 tk->xtime_remainder = ntpinterval - tk->xtime_interval;
265 tk->raw_interval = (interval * clock->mult) >> clock->shift; 287 tk->raw_interval = interval * clock->mult;
266 288
267 /* if changing clocks, convert xtime_nsec shift units */ 289 /* if changing clocks, convert xtime_nsec shift units */
268 if (old_clock) { 290 if (old_clock) {
269 int shift_change = clock->shift - old_clock->shift; 291 int shift_change = clock->shift - old_clock->shift;
270 if (shift_change < 0) 292 if (shift_change < 0) {
271 tk->tkr_mono.xtime_nsec >>= -shift_change; 293 tk->tkr_mono.xtime_nsec >>= -shift_change;
272 else 294 tk->tkr_raw.xtime_nsec >>= -shift_change;
295 } else {
273 tk->tkr_mono.xtime_nsec <<= shift_change; 296 tk->tkr_mono.xtime_nsec <<= shift_change;
297 tk->tkr_raw.xtime_nsec <<= shift_change;
298 }
274 } 299 }
275 tk->tkr_raw.xtime_nsec = 0;
276 300
277 tk->tkr_mono.shift = clock->shift; 301 tk->tkr_mono.shift = clock->shift;
278 tk->tkr_raw.shift = clock->shift; 302 tk->tkr_raw.shift = clock->shift;
@@ -404,7 +428,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
404 428
405 now += timekeeping_delta_to_ns(tkr, 429 now += timekeeping_delta_to_ns(tkr,
406 clocksource_delta( 430 clocksource_delta(
407 tkr->read(tkr->clock), 431 tk_clock_read(tkr),
408 tkr->cycle_last, 432 tkr->cycle_last,
409 tkr->mask)); 433 tkr->mask));
410 } while (read_seqcount_retry(&tkf->seq, seq)); 434 } while (read_seqcount_retry(&tkf->seq, seq));
@@ -461,6 +485,10 @@ static u64 dummy_clock_read(struct clocksource *cs)
461 return cycles_at_suspend; 485 return cycles_at_suspend;
462} 486}
463 487
488static struct clocksource dummy_clock = {
489 .read = dummy_clock_read,
490};
491
464/** 492/**
465 * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. 493 * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
466 * @tk: Timekeeper to snapshot. 494 * @tk: Timekeeper to snapshot.
@@ -477,17 +505,18 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
477 struct tk_read_base *tkr = &tk->tkr_mono; 505 struct tk_read_base *tkr = &tk->tkr_mono;
478 506
479 memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); 507 memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
480 cycles_at_suspend = tkr->read(tkr->clock); 508 cycles_at_suspend = tk_clock_read(tkr);
481 tkr_dummy.read = dummy_clock_read; 509 tkr_dummy.clock = &dummy_clock;
482 update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); 510 update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
483 511
484 tkr = &tk->tkr_raw; 512 tkr = &tk->tkr_raw;
485 memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); 513 memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
486 tkr_dummy.read = dummy_clock_read; 514 tkr_dummy.clock = &dummy_clock;
487 update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); 515 update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
488} 516}
489 517
490#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD 518#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
519#warning Please contact your maintainers, as GENERIC_TIME_VSYSCALL_OLD compatibity will disappear soon.
491 520
492static inline void update_vsyscall(struct timekeeper *tk) 521static inline void update_vsyscall(struct timekeeper *tk)
493{ 522{
@@ -597,9 +626,6 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
597 nsec = (u32) tk->wall_to_monotonic.tv_nsec; 626 nsec = (u32) tk->wall_to_monotonic.tv_nsec;
598 tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); 627 tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
599 628
600 /* Update the monotonic raw base */
601 tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
602
603 /* 629 /*
604 * The sum of the nanoseconds portions of xtime and 630 * The sum of the nanoseconds portions of xtime and
605 * wall_to_monotonic can be greater/equal one second. Take 631 * wall_to_monotonic can be greater/equal one second. Take
@@ -609,6 +635,11 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
609 if (nsec >= NSEC_PER_SEC) 635 if (nsec >= NSEC_PER_SEC)
610 seconds++; 636 seconds++;
611 tk->ktime_sec = seconds; 637 tk->ktime_sec = seconds;
638
639 /* Update the monotonic raw base */
640 seconds = tk->raw_sec;
641 nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift);
642 tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
612} 643}
613 644
614/* must hold timekeeper_lock */ 645/* must hold timekeeper_lock */
@@ -649,11 +680,9 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
649 */ 680 */
650static void timekeeping_forward_now(struct timekeeper *tk) 681static void timekeeping_forward_now(struct timekeeper *tk)
651{ 682{
652 struct clocksource *clock = tk->tkr_mono.clock;
653 u64 cycle_now, delta; 683 u64 cycle_now, delta;
654 u64 nsec;
655 684
656 cycle_now = tk->tkr_mono.read(clock); 685 cycle_now = tk_clock_read(&tk->tkr_mono);
657 delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); 686 delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
658 tk->tkr_mono.cycle_last = cycle_now; 687 tk->tkr_mono.cycle_last = cycle_now;
659 tk->tkr_raw.cycle_last = cycle_now; 688 tk->tkr_raw.cycle_last = cycle_now;
@@ -663,10 +692,13 @@ static void timekeeping_forward_now(struct timekeeper *tk)
663 /* If arch requires, add in get_arch_timeoffset() */ 692 /* If arch requires, add in get_arch_timeoffset() */
664 tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; 693 tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
665 694
666 tk_normalize_xtime(tk);
667 695
668 nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); 696 tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult;
669 timespec64_add_ns(&tk->raw_time, nsec); 697
698 /* If arch requires, add in get_arch_timeoffset() */
699 tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift;
700
701 tk_normalize_xtime(tk);
670} 702}
671 703
672/** 704/**
@@ -929,8 +961,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
929 961
930 do { 962 do {
931 seq = read_seqcount_begin(&tk_core.seq); 963 seq = read_seqcount_begin(&tk_core.seq);
932 964 now = tk_clock_read(&tk->tkr_mono);
933 now = tk->tkr_mono.read(tk->tkr_mono.clock);
934 systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; 965 systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
935 systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; 966 systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
936 base_real = ktime_add(tk->tkr_mono.base, 967 base_real = ktime_add(tk->tkr_mono.base,
@@ -1108,7 +1139,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
1108 * Check whether the system counter value provided by the 1139 * Check whether the system counter value provided by the
1109 * device driver is on the current timekeeping interval. 1140 * device driver is on the current timekeeping interval.
1110 */ 1141 */
1111 now = tk->tkr_mono.read(tk->tkr_mono.clock); 1142 now = tk_clock_read(&tk->tkr_mono);
1112 interval_start = tk->tkr_mono.cycle_last; 1143 interval_start = tk->tkr_mono.cycle_last;
1113 if (!cycle_between(interval_start, cycles, now)) { 1144 if (!cycle_between(interval_start, cycles, now)) {
1114 clock_was_set_seq = tk->clock_was_set_seq; 1145 clock_was_set_seq = tk->clock_was_set_seq;
@@ -1353,19 +1384,18 @@ int timekeeping_notify(struct clocksource *clock)
1353void getrawmonotonic64(struct timespec64 *ts) 1384void getrawmonotonic64(struct timespec64 *ts)
1354{ 1385{
1355 struct timekeeper *tk = &tk_core.timekeeper; 1386 struct timekeeper *tk = &tk_core.timekeeper;
1356 struct timespec64 ts64;
1357 unsigned long seq; 1387 unsigned long seq;
1358 u64 nsecs; 1388 u64 nsecs;
1359 1389
1360 do { 1390 do {
1361 seq = read_seqcount_begin(&tk_core.seq); 1391 seq = read_seqcount_begin(&tk_core.seq);
1392 ts->tv_sec = tk->raw_sec;
1362 nsecs = timekeeping_get_ns(&tk->tkr_raw); 1393 nsecs = timekeeping_get_ns(&tk->tkr_raw);
1363 ts64 = tk->raw_time;
1364 1394
1365 } while (read_seqcount_retry(&tk_core.seq, seq)); 1395 } while (read_seqcount_retry(&tk_core.seq, seq));
1366 1396
1367 timespec64_add_ns(&ts64, nsecs); 1397 ts->tv_nsec = 0;
1368 *ts = ts64; 1398 timespec64_add_ns(ts, nsecs);
1369} 1399}
1370EXPORT_SYMBOL(getrawmonotonic64); 1400EXPORT_SYMBOL(getrawmonotonic64);
1371 1401
@@ -1489,8 +1519,7 @@ void __init timekeeping_init(void)
1489 tk_setup_internals(tk, clock); 1519 tk_setup_internals(tk, clock);
1490 1520
1491 tk_set_xtime(tk, &now); 1521 tk_set_xtime(tk, &now);
1492 tk->raw_time.tv_sec = 0; 1522 tk->raw_sec = 0;
1493 tk->raw_time.tv_nsec = 0;
1494 if (boot.tv_sec == 0 && boot.tv_nsec == 0) 1523 if (boot.tv_sec == 0 && boot.tv_nsec == 0)
1495 boot = tk_xtime(tk); 1524 boot = tk_xtime(tk);
1496 1525
@@ -1629,7 +1658,7 @@ void timekeeping_resume(void)
1629 * The less preferred source will only be tried if there is no better 1658 * The less preferred source will only be tried if there is no better
1630 * usable source. The rtc part is handled separately in rtc core code. 1659 * usable source. The rtc part is handled separately in rtc core code.
1631 */ 1660 */
1632 cycle_now = tk->tkr_mono.read(clock); 1661 cycle_now = tk_clock_read(&tk->tkr_mono);
1633 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && 1662 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
1634 cycle_now > tk->tkr_mono.cycle_last) { 1663 cycle_now > tk->tkr_mono.cycle_last) {
1635 u64 nsec, cyc_delta; 1664 u64 nsec, cyc_delta;
@@ -1976,7 +2005,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
1976 u32 shift, unsigned int *clock_set) 2005 u32 shift, unsigned int *clock_set)
1977{ 2006{
1978 u64 interval = tk->cycle_interval << shift; 2007 u64 interval = tk->cycle_interval << shift;
1979 u64 raw_nsecs; 2008 u64 snsec_per_sec;
1980 2009
1981 /* If the offset is smaller than a shifted interval, do nothing */ 2010 /* If the offset is smaller than a shifted interval, do nothing */
1982 if (offset < interval) 2011 if (offset < interval)
@@ -1991,14 +2020,12 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
1991 *clock_set |= accumulate_nsecs_to_secs(tk); 2020 *clock_set |= accumulate_nsecs_to_secs(tk);
1992 2021
1993 /* Accumulate raw time */ 2022 /* Accumulate raw time */
1994 raw_nsecs = (u64)tk->raw_interval << shift; 2023 tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
1995 raw_nsecs += tk->raw_time.tv_nsec; 2024 snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
1996 if (raw_nsecs >= NSEC_PER_SEC) { 2025 while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
1997 u64 raw_secs = raw_nsecs; 2026 tk->tkr_raw.xtime_nsec -= snsec_per_sec;
1998 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); 2027 tk->raw_sec++;
1999 tk->raw_time.tv_sec += raw_secs;
2000 } 2028 }
2001 tk->raw_time.tv_nsec = raw_nsecs;
2002 2029
2003 /* Accumulate error between NTP and clock interval */ 2030 /* Accumulate error between NTP and clock interval */
2004 tk->ntp_error += tk->ntp_tick << shift; 2031 tk->ntp_error += tk->ntp_tick << shift;
@@ -2030,7 +2057,7 @@ void update_wall_time(void)
2030#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 2057#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
2031 offset = real_tk->cycle_interval; 2058 offset = real_tk->cycle_interval;
2032#else 2059#else
2033 offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), 2060 offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
2034 tk->tkr_mono.cycle_last, tk->tkr_mono.mask); 2061 tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
2035#endif 2062#endif
2036 2063
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 152a706ef8b8..71ce3f4eead3 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -195,7 +195,7 @@ EXPORT_SYMBOL(jiffies_64);
195#endif 195#endif
196 196
197struct timer_base { 197struct timer_base {
198 spinlock_t lock; 198 raw_spinlock_t lock;
199 struct timer_list *running_timer; 199 struct timer_list *running_timer;
200 unsigned long clk; 200 unsigned long clk;
201 unsigned long next_expiry; 201 unsigned long next_expiry;
@@ -913,10 +913,10 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
913 913
914 if (!(tf & TIMER_MIGRATING)) { 914 if (!(tf & TIMER_MIGRATING)) {
915 base = get_timer_base(tf); 915 base = get_timer_base(tf);
916 spin_lock_irqsave(&base->lock, *flags); 916 raw_spin_lock_irqsave(&base->lock, *flags);
917 if (timer->flags == tf) 917 if (timer->flags == tf)
918 return base; 918 return base;
919 spin_unlock_irqrestore(&base->lock, *flags); 919 raw_spin_unlock_irqrestore(&base->lock, *flags);
920 } 920 }
921 cpu_relax(); 921 cpu_relax();
922 } 922 }
@@ -986,9 +986,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
986 /* See the comment in lock_timer_base() */ 986 /* See the comment in lock_timer_base() */
987 timer->flags |= TIMER_MIGRATING; 987 timer->flags |= TIMER_MIGRATING;
988 988
989 spin_unlock(&base->lock); 989 raw_spin_unlock(&base->lock);
990 base = new_base; 990 base = new_base;
991 spin_lock(&base->lock); 991 raw_spin_lock(&base->lock);
992 WRITE_ONCE(timer->flags, 992 WRITE_ONCE(timer->flags,
993 (timer->flags & ~TIMER_BASEMASK) | base->cpu); 993 (timer->flags & ~TIMER_BASEMASK) | base->cpu);
994 } 994 }
@@ -1013,7 +1013,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
1013 } 1013 }
1014 1014
1015out_unlock: 1015out_unlock:
1016 spin_unlock_irqrestore(&base->lock, flags); 1016 raw_spin_unlock_irqrestore(&base->lock, flags);
1017 1017
1018 return ret; 1018 return ret;
1019} 1019}
@@ -1106,16 +1106,16 @@ void add_timer_on(struct timer_list *timer, int cpu)
1106 if (base != new_base) { 1106 if (base != new_base) {
1107 timer->flags |= TIMER_MIGRATING; 1107 timer->flags |= TIMER_MIGRATING;
1108 1108
1109 spin_unlock(&base->lock); 1109 raw_spin_unlock(&base->lock);
1110 base = new_base; 1110 base = new_base;
1111 spin_lock(&base->lock); 1111 raw_spin_lock(&base->lock);
1112 WRITE_ONCE(timer->flags, 1112 WRITE_ONCE(timer->flags,
1113 (timer->flags & ~TIMER_BASEMASK) | cpu); 1113 (timer->flags & ~TIMER_BASEMASK) | cpu);
1114 } 1114 }
1115 1115
1116 debug_activate(timer, timer->expires); 1116 debug_activate(timer, timer->expires);
1117 internal_add_timer(base, timer); 1117 internal_add_timer(base, timer);
1118 spin_unlock_irqrestore(&base->lock, flags); 1118 raw_spin_unlock_irqrestore(&base->lock, flags);
1119} 1119}
1120EXPORT_SYMBOL_GPL(add_timer_on); 1120EXPORT_SYMBOL_GPL(add_timer_on);
1121 1121
@@ -1141,7 +1141,7 @@ int del_timer(struct timer_list *timer)
1141 if (timer_pending(timer)) { 1141 if (timer_pending(timer)) {
1142 base = lock_timer_base(timer, &flags); 1142 base = lock_timer_base(timer, &flags);
1143 ret = detach_if_pending(timer, base, true); 1143 ret = detach_if_pending(timer, base, true);
1144 spin_unlock_irqrestore(&base->lock, flags); 1144 raw_spin_unlock_irqrestore(&base->lock, flags);
1145 } 1145 }
1146 1146
1147 return ret; 1147 return ret;
@@ -1150,7 +1150,7 @@ EXPORT_SYMBOL(del_timer);
1150 1150
1151/** 1151/**
1152 * try_to_del_timer_sync - Try to deactivate a timer 1152 * try_to_del_timer_sync - Try to deactivate a timer
1153 * @timer: timer do del 1153 * @timer: timer to delete
1154 * 1154 *
1155 * This function tries to deactivate a timer. Upon successful (ret >= 0) 1155 * This function tries to deactivate a timer. Upon successful (ret >= 0)
1156 * exit the timer is not queued and the handler is not running on any CPU. 1156 * exit the timer is not queued and the handler is not running on any CPU.
@@ -1168,7 +1168,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
1168 if (base->running_timer != timer) 1168 if (base->running_timer != timer)
1169 ret = detach_if_pending(timer, base, true); 1169 ret = detach_if_pending(timer, base, true);
1170 1170
1171 spin_unlock_irqrestore(&base->lock, flags); 1171 raw_spin_unlock_irqrestore(&base->lock, flags);
1172 1172
1173 return ret; 1173 return ret;
1174} 1174}
@@ -1299,13 +1299,13 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
1299 data = timer->data; 1299 data = timer->data;
1300 1300
1301 if (timer->flags & TIMER_IRQSAFE) { 1301 if (timer->flags & TIMER_IRQSAFE) {
1302 spin_unlock(&base->lock); 1302 raw_spin_unlock(&base->lock);
1303 call_timer_fn(timer, fn, data); 1303 call_timer_fn(timer, fn, data);
1304 spin_lock(&base->lock); 1304 raw_spin_lock(&base->lock);
1305 } else { 1305 } else {
1306 spin_unlock_irq(&base->lock); 1306 raw_spin_unlock_irq(&base->lock);
1307 call_timer_fn(timer, fn, data); 1307 call_timer_fn(timer, fn, data);
1308 spin_lock_irq(&base->lock); 1308 raw_spin_lock_irq(&base->lock);
1309 } 1309 }
1310 } 1310 }
1311} 1311}
@@ -1474,7 +1474,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
1474 if (cpu_is_offline(smp_processor_id())) 1474 if (cpu_is_offline(smp_processor_id()))
1475 return expires; 1475 return expires;
1476 1476
1477 spin_lock(&base->lock); 1477 raw_spin_lock(&base->lock);
1478 nextevt = __next_timer_interrupt(base); 1478 nextevt = __next_timer_interrupt(base);
1479 is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); 1479 is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
1480 base->next_expiry = nextevt; 1480 base->next_expiry = nextevt;
@@ -1502,7 +1502,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
1502 if ((expires - basem) > TICK_NSEC) 1502 if ((expires - basem) > TICK_NSEC)
1503 base->is_idle = true; 1503 base->is_idle = true;
1504 } 1504 }
1505 spin_unlock(&base->lock); 1505 raw_spin_unlock(&base->lock);
1506 1506
1507 return cmp_next_hrtimer_event(basem, expires); 1507 return cmp_next_hrtimer_event(basem, expires);
1508} 1508}
@@ -1590,7 +1590,7 @@ static inline void __run_timers(struct timer_base *base)
1590 if (!time_after_eq(jiffies, base->clk)) 1590 if (!time_after_eq(jiffies, base->clk))
1591 return; 1591 return;
1592 1592
1593 spin_lock_irq(&base->lock); 1593 raw_spin_lock_irq(&base->lock);
1594 1594
1595 while (time_after_eq(jiffies, base->clk)) { 1595 while (time_after_eq(jiffies, base->clk)) {
1596 1596
@@ -1601,7 +1601,7 @@ static inline void __run_timers(struct timer_base *base)
1601 expire_timers(base, heads + levels); 1601 expire_timers(base, heads + levels);
1602 } 1602 }
1603 base->running_timer = NULL; 1603 base->running_timer = NULL;
1604 spin_unlock_irq(&base->lock); 1604 raw_spin_unlock_irq(&base->lock);
1605} 1605}
1606 1606
1607/* 1607/*
@@ -1786,16 +1786,16 @@ int timers_dead_cpu(unsigned int cpu)
1786 * The caller is globally serialized and nobody else 1786 * The caller is globally serialized and nobody else
1787 * takes two locks at once, deadlock is not possible. 1787 * takes two locks at once, deadlock is not possible.
1788 */ 1788 */
1789 spin_lock_irq(&new_base->lock); 1789 raw_spin_lock_irq(&new_base->lock);
1790 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1790 raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1791 1791
1792 BUG_ON(old_base->running_timer); 1792 BUG_ON(old_base->running_timer);
1793 1793
1794 for (i = 0; i < WHEEL_SIZE; i++) 1794 for (i = 0; i < WHEEL_SIZE; i++)
1795 migrate_timer_list(new_base, old_base->vectors + i); 1795 migrate_timer_list(new_base, old_base->vectors + i);
1796 1796
1797 spin_unlock(&old_base->lock); 1797 raw_spin_unlock(&old_base->lock);
1798 spin_unlock_irq(&new_base->lock); 1798 raw_spin_unlock_irq(&new_base->lock);
1799 put_cpu_ptr(&timer_bases); 1799 put_cpu_ptr(&timer_bases);
1800 } 1800 }
1801 return 0; 1801 return 0;
@@ -1811,7 +1811,7 @@ static void __init init_timer_cpu(int cpu)
1811 for (i = 0; i < NR_BASES; i++) { 1811 for (i = 0; i < NR_BASES; i++) {
1812 base = per_cpu_ptr(&timer_bases[i], cpu); 1812 base = per_cpu_ptr(&timer_bases[i], cpu);
1813 base->cpu = cpu; 1813 base->cpu = cpu;
1814 spin_lock_init(&base->lock); 1814 raw_spin_lock_init(&base->lock);
1815 base->clk = jiffies; 1815 base->clk = jiffies;
1816 } 1816 }
1817} 1817}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 7e06f04e98fe..434c840e2d82 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -667,30 +667,30 @@ config RING_BUFFER_STARTUP_TEST
667 667
668 If unsure, say N 668 If unsure, say N
669 669
670config TRACE_ENUM_MAP_FILE 670config TRACE_EVAL_MAP_FILE
671 bool "Show enum mappings for trace events" 671 bool "Show eval mappings for trace events"
672 depends on TRACING 672 depends on TRACING
673 help 673 help
674 The "print fmt" of the trace events will show the enum names instead 674 The "print fmt" of the trace events will show the enum/sizeof names
675 of their values. This can cause problems for user space tools that 675 instead of their values. This can cause problems for user space tools
676 use this string to parse the raw data as user space does not know 676 that use this string to parse the raw data as user space does not know
677 how to convert the string to its value. 677 how to convert the string to its value.
678 678
679 To fix this, there's a special macro in the kernel that can be used 679 To fix this, there's a special macro in the kernel that can be used
680 to convert the enum into its value. If this macro is used, then the 680 to convert an enum/sizeof into its value. If this macro is used, then
681 print fmt strings will have the enums converted to their values. 681 the print fmt strings will be converted to their values.
682 682
683 If something does not get converted properly, this option can be 683 If something does not get converted properly, this option can be
684 used to show what enums the kernel tried to convert. 684 used to show what enums/sizeof the kernel tried to convert.
685 685
686 This option is for debugging the enum conversions. A file is created 686 This option is for debugging the conversions. A file is created
687 in the tracing directory called "enum_map" that will show the enum 687 in the tracing directory called "eval_map" that will show the
688 names matched with their values and what trace event system they 688 names matched with their values and what trace event system they
689 belong too. 689 belong too.
690 690
691 Normally, the mapping of the strings to values will be freed after 691 Normally, the mapping of the strings to values will be freed after
692 boot up or module load. With this option, they will not be freed, as 692 boot up or module load. With this option, they will not be freed, as
693 they are needed for the "enum_map" file. Enabling this option will 693 they are needed for the "eval_map" file. Enabling this option will
694 increase the memory footprint of the running kernel. 694 increase the memory footprint of the running kernel.
695 695
696 If unsure, say N 696 If unsure, say N
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 193c5f5e3f79..bc364f86100a 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -867,7 +867,7 @@ static void blk_add_trace_split(void *ignore,
867 867
868 __blk_add_trace(bt, bio->bi_iter.bi_sector, 868 __blk_add_trace(bt, bio->bi_iter.bi_sector,
869 bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, 869 bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
870 BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), 870 BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
871 &rpdu); 871 &rpdu);
872 } 872 }
873} 873}
@@ -900,7 +900,7 @@ static void blk_add_trace_bio_remap(void *ignore,
900 r.sector_from = cpu_to_be64(from); 900 r.sector_from = cpu_to_be64(from);
901 901
902 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, 902 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
903 bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error, 903 bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
904 sizeof(r), &r); 904 sizeof(r), &r);
905} 905}
906 906
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 460a031c77e5..37385193a608 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -122,8 +122,8 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
122} 122}
123 123
124/* 124/*
125 * limited trace_printk() 125 * Only limited trace_printk() conversion specifiers allowed:
126 * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed 126 * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s
127 */ 127 */
128BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, 128BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
129 u64, arg2, u64, arg3) 129 u64, arg2, u64, arg3)
@@ -198,7 +198,8 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
198 i++; 198 i++;
199 } 199 }
200 200
201 if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') 201 if (fmt[i] != 'i' && fmt[i] != 'd' &&
202 fmt[i] != 'u' && fmt[i] != 'x')
202 return -EINVAL; 203 return -EINVAL;
203 fmt_cnt++; 204 fmt_cnt++;
204 } 205 }
@@ -234,7 +235,8 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
234 unsigned int cpu = smp_processor_id(); 235 unsigned int cpu = smp_processor_id();
235 u64 index = flags & BPF_F_INDEX_MASK; 236 u64 index = flags & BPF_F_INDEX_MASK;
236 struct bpf_event_entry *ee; 237 struct bpf_event_entry *ee;
237 struct perf_event *event; 238 u64 value = 0;
239 int err;
238 240
239 if (unlikely(flags & ~(BPF_F_INDEX_MASK))) 241 if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
240 return -EINVAL; 242 return -EINVAL;
@@ -247,21 +249,14 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
247 if (!ee) 249 if (!ee)
248 return -ENOENT; 250 return -ENOENT;
249 251
250 event = ee->event; 252 err = perf_event_read_local(ee->event, &value);
251 if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
252 event->attr.type != PERF_TYPE_RAW))
253 return -EINVAL;
254
255 /* make sure event is local and doesn't have pmu::count */
256 if (unlikely(event->oncpu != cpu || event->pmu->count))
257 return -EINVAL;
258
259 /* 253 /*
260 * we don't know if the function is run successfully by the 254 * this api is ugly since we miss [-22..-2] range of valid
261 * return value. It can be judged in other places, such as 255 * counter values, but that's uapi
262 * eBPF programs.
263 */ 256 */
264 return perf_event_read_local(event); 257 if (err)
258 return err;
259 return value;
265} 260}
266 261
267static const struct bpf_func_proto bpf_perf_event_read_proto = { 262static const struct bpf_func_proto bpf_perf_event_read_proto = {
@@ -272,14 +267,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
272 .arg2_type = ARG_ANYTHING, 267 .arg2_type = ARG_ANYTHING,
273}; 268};
274 269
270static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
271
275static __always_inline u64 272static __always_inline u64
276__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, 273__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
277 u64 flags, struct perf_raw_record *raw) 274 u64 flags, struct perf_raw_record *raw)
278{ 275{
279 struct bpf_array *array = container_of(map, struct bpf_array, map); 276 struct bpf_array *array = container_of(map, struct bpf_array, map);
277 struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
280 unsigned int cpu = smp_processor_id(); 278 unsigned int cpu = smp_processor_id();
281 u64 index = flags & BPF_F_INDEX_MASK; 279 u64 index = flags & BPF_F_INDEX_MASK;
282 struct perf_sample_data sample_data;
283 struct bpf_event_entry *ee; 280 struct bpf_event_entry *ee;
284 struct perf_event *event; 281 struct perf_event *event;
285 282
@@ -300,9 +297,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
300 if (unlikely(event->oncpu != cpu)) 297 if (unlikely(event->oncpu != cpu))
301 return -EOPNOTSUPP; 298 return -EOPNOTSUPP;
302 299
303 perf_sample_data_init(&sample_data, 0, 0); 300 perf_sample_data_init(sd, 0, 0);
304 sample_data.raw = raw; 301 sd->raw = raw;
305 perf_event_output(event, &sample_data, regs); 302 perf_event_output(event, sd, regs);
306 return 0; 303 return 0;
307} 304}
308 305
@@ -483,7 +480,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
483 480
484/* bpf+kprobe programs can access fields of 'struct pt_regs' */ 481/* bpf+kprobe programs can access fields of 'struct pt_regs' */
485static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, 482static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
486 enum bpf_reg_type *reg_type) 483 struct bpf_insn_access_aux *info)
487{ 484{
488 if (off < 0 || off >= sizeof(struct pt_regs)) 485 if (off < 0 || off >= sizeof(struct pt_regs))
489 return false; 486 return false;
@@ -566,7 +563,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
566} 563}
567 564
568static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, 565static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
569 enum bpf_reg_type *reg_type) 566 struct bpf_insn_access_aux *info)
570{ 567{
571 if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) 568 if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
572 return false; 569 return false;
@@ -585,40 +582,47 @@ const struct bpf_verifier_ops tracepoint_prog_ops = {
585}; 582};
586 583
587static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, 584static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
588 enum bpf_reg_type *reg_type) 585 struct bpf_insn_access_aux *info)
589{ 586{
587 const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data,
588 sample_period);
589
590 if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) 590 if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
591 return false; 591 return false;
592 if (type != BPF_READ) 592 if (type != BPF_READ)
593 return false; 593 return false;
594 if (off % size != 0) 594 if (off % size != 0)
595 return false; 595 return false;
596 if (off == offsetof(struct bpf_perf_event_data, sample_period)) { 596
597 if (size != sizeof(u64)) 597 switch (off) {
598 case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
599 bpf_ctx_record_field_size(info, size_sp);
600 if (!bpf_ctx_narrow_access_ok(off, size, size_sp))
598 return false; 601 return false;
599 } else { 602 break;
603 default:
600 if (size != sizeof(long)) 604 if (size != sizeof(long))
601 return false; 605 return false;
602 } 606 }
607
603 return true; 608 return true;
604} 609}
605 610
606static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, 611static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
607 const struct bpf_insn *si, 612 const struct bpf_insn *si,
608 struct bpf_insn *insn_buf, 613 struct bpf_insn *insn_buf,
609 struct bpf_prog *prog) 614 struct bpf_prog *prog, u32 *target_size)
610{ 615{
611 struct bpf_insn *insn = insn_buf; 616 struct bpf_insn *insn = insn_buf;
612 617
613 switch (si->off) { 618 switch (si->off) {
614 case offsetof(struct bpf_perf_event_data, sample_period): 619 case offsetof(struct bpf_perf_event_data, sample_period):
615 BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
616
617 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, 620 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
618 data), si->dst_reg, si->src_reg, 621 data), si->dst_reg, si->src_reg,
619 offsetof(struct bpf_perf_event_data_kern, data)); 622 offsetof(struct bpf_perf_event_data_kern, data));
620 *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, 623 *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
621 offsetof(struct perf_sample_data, period)); 624 bpf_target_off(struct perf_sample_data, period, 8,
625 target_size));
622 break; 626 break;
623 default: 627 default:
624 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, 628 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 74fdfe9ed3db..02004ae91860 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -113,7 +113,7 @@ static int ftrace_disabled __read_mostly;
113 113
114static DEFINE_MUTEX(ftrace_lock); 114static DEFINE_MUTEX(ftrace_lock);
115 115
116static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 116static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
117ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 117ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
118static struct ftrace_ops global_ops; 118static struct ftrace_ops global_ops;
119 119
@@ -169,8 +169,11 @@ int ftrace_nr_registered_ops(void)
169 169
170 mutex_lock(&ftrace_lock); 170 mutex_lock(&ftrace_lock);
171 171
172 for (ops = ftrace_ops_list; 172 for (ops = rcu_dereference_protected(ftrace_ops_list,
173 ops != &ftrace_list_end; ops = ops->next) 173 lockdep_is_held(&ftrace_lock));
174 ops != &ftrace_list_end;
175 ops = rcu_dereference_protected(ops->next,
176 lockdep_is_held(&ftrace_lock)))
174 cnt++; 177 cnt++;
175 178
176 mutex_unlock(&ftrace_lock); 179 mutex_unlock(&ftrace_lock);
@@ -275,10 +278,11 @@ static void update_ftrace_function(void)
275 * If there's only one ftrace_ops registered, the ftrace_ops_list 278 * If there's only one ftrace_ops registered, the ftrace_ops_list
276 * will point to the ops we want. 279 * will point to the ops we want.
277 */ 280 */
278 set_function_trace_op = ftrace_ops_list; 281 set_function_trace_op = rcu_dereference_protected(ftrace_ops_list,
282 lockdep_is_held(&ftrace_lock));
279 283
280 /* If there's no ftrace_ops registered, just call the stub function */ 284 /* If there's no ftrace_ops registered, just call the stub function */
281 if (ftrace_ops_list == &ftrace_list_end) { 285 if (set_function_trace_op == &ftrace_list_end) {
282 func = ftrace_stub; 286 func = ftrace_stub;
283 287
284 /* 288 /*
@@ -286,7 +290,8 @@ static void update_ftrace_function(void)
286 * recursion safe and not dynamic and the arch supports passing ops, 290 * recursion safe and not dynamic and the arch supports passing ops,
287 * then have the mcount trampoline call the function directly. 291 * then have the mcount trampoline call the function directly.
288 */ 292 */
289 } else if (ftrace_ops_list->next == &ftrace_list_end) { 293 } else if (rcu_dereference_protected(ftrace_ops_list->next,
294 lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
290 func = ftrace_ops_get_list_func(ftrace_ops_list); 295 func = ftrace_ops_get_list_func(ftrace_ops_list);
291 296
292 } else { 297 } else {
@@ -348,9 +353,11 @@ int using_ftrace_ops_list_func(void)
348 return ftrace_trace_function == ftrace_ops_list_func; 353 return ftrace_trace_function == ftrace_ops_list_func;
349} 354}
350 355
351static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 356static void add_ftrace_ops(struct ftrace_ops __rcu **list,
357 struct ftrace_ops *ops)
352{ 358{
353 ops->next = *list; 359 rcu_assign_pointer(ops->next, *list);
360
354 /* 361 /*
355 * We are entering ops into the list but another 362 * We are entering ops into the list but another
356 * CPU might be walking that list. We need to make sure 363 * CPU might be walking that list. We need to make sure
@@ -360,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
360 rcu_assign_pointer(*list, ops); 367 rcu_assign_pointer(*list, ops);
361} 368}
362 369
363static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 370static int remove_ftrace_ops(struct ftrace_ops __rcu **list,
371 struct ftrace_ops *ops)
364{ 372{
365 struct ftrace_ops **p; 373 struct ftrace_ops **p;
366 374
@@ -368,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
368 * If we are removing the last function, then simply point 376 * If we are removing the last function, then simply point
369 * to the ftrace_stub. 377 * to the ftrace_stub.
370 */ 378 */
371 if (*list == ops && ops->next == &ftrace_list_end) { 379 if (rcu_dereference_protected(*list,
380 lockdep_is_held(&ftrace_lock)) == ops &&
381 rcu_dereference_protected(ops->next,
382 lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
372 *list = &ftrace_list_end; 383 *list = &ftrace_list_end;
373 return 0; 384 return 0;
374 } 385 }
@@ -1293,6 +1304,28 @@ static void ftrace_hash_clear(struct ftrace_hash *hash)
1293 FTRACE_WARN_ON(hash->count); 1304 FTRACE_WARN_ON(hash->count);
1294} 1305}
1295 1306
1307static void free_ftrace_mod(struct ftrace_mod_load *ftrace_mod)
1308{
1309 list_del(&ftrace_mod->list);
1310 kfree(ftrace_mod->module);
1311 kfree(ftrace_mod->func);
1312 kfree(ftrace_mod);
1313}
1314
1315static void clear_ftrace_mod_list(struct list_head *head)
1316{
1317 struct ftrace_mod_load *p, *n;
1318
1319 /* stack tracer isn't supported yet */
1320 if (!head)
1321 return;
1322
1323 mutex_lock(&ftrace_lock);
1324 list_for_each_entry_safe(p, n, head, list)
1325 free_ftrace_mod(p);
1326 mutex_unlock(&ftrace_lock);
1327}
1328
1296static void free_ftrace_hash(struct ftrace_hash *hash) 1329static void free_ftrace_hash(struct ftrace_hash *hash)
1297{ 1330{
1298 if (!hash || hash == EMPTY_HASH) 1331 if (!hash || hash == EMPTY_HASH)
@@ -1346,6 +1379,35 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1346 return hash; 1379 return hash;
1347} 1380}
1348 1381
1382
1383static int ftrace_add_mod(struct trace_array *tr,
1384 const char *func, const char *module,
1385 int enable)
1386{
1387 struct ftrace_mod_load *ftrace_mod;
1388 struct list_head *mod_head = enable ? &tr->mod_trace : &tr->mod_notrace;
1389
1390 ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL);
1391 if (!ftrace_mod)
1392 return -ENOMEM;
1393
1394 ftrace_mod->func = kstrdup(func, GFP_KERNEL);
1395 ftrace_mod->module = kstrdup(module, GFP_KERNEL);
1396 ftrace_mod->enable = enable;
1397
1398 if (!ftrace_mod->func || !ftrace_mod->module)
1399 goto out_free;
1400
1401 list_add(&ftrace_mod->list, mod_head);
1402
1403 return 0;
1404
1405 out_free:
1406 free_ftrace_mod(ftrace_mod);
1407
1408 return -ENOMEM;
1409}
1410
1349static struct ftrace_hash * 1411static struct ftrace_hash *
1350alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) 1412alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1351{ 1413{
@@ -1359,6 +1421,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1359 if (!new_hash) 1421 if (!new_hash)
1360 return NULL; 1422 return NULL;
1361 1423
1424 if (hash)
1425 new_hash->flags = hash->flags;
1426
1362 /* Empty hash? */ 1427 /* Empty hash? */
1363 if (ftrace_hash_empty(hash)) 1428 if (ftrace_hash_empty(hash))
1364 return new_hash; 1429 return new_hash;
@@ -1403,7 +1468,7 @@ __ftrace_hash_move(struct ftrace_hash *src)
1403 /* 1468 /*
1404 * If the new source is empty, just return the empty_hash. 1469 * If the new source is empty, just return the empty_hash.
1405 */ 1470 */
1406 if (!src->count) 1471 if (ftrace_hash_empty(src))
1407 return EMPTY_HASH; 1472 return EMPTY_HASH;
1408 1473
1409 /* 1474 /*
@@ -1420,6 +1485,8 @@ __ftrace_hash_move(struct ftrace_hash *src)
1420 if (!new_hash) 1485 if (!new_hash)
1421 return NULL; 1486 return NULL;
1422 1487
1488 new_hash->flags = src->flags;
1489
1423 size = 1 << src->size_bits; 1490 size = 1 << src->size_bits;
1424 for (i = 0; i < size; i++) { 1491 for (i = 0; i < size; i++) {
1425 hhd = &src->buckets[i]; 1492 hhd = &src->buckets[i];
@@ -1513,8 +1580,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
1513 return 0; 1580 return 0;
1514#endif 1581#endif
1515 1582
1516 hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); 1583 rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash);
1517 hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); 1584 rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash);
1518 1585
1519 if (hash_contains_ip(ip, &hash)) 1586 if (hash_contains_ip(ip, &hash))
1520 ret = 1; 1587 ret = 1;
@@ -1650,7 +1717,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
1650 struct dyn_ftrace *rec; 1717 struct dyn_ftrace *rec;
1651 bool update = false; 1718 bool update = false;
1652 int count = 0; 1719 int count = 0;
1653 int all = 0; 1720 int all = false;
1654 1721
1655 /* Only update if the ops has been registered */ 1722 /* Only update if the ops has been registered */
1656 if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) 1723 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
@@ -1671,7 +1738,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
1671 hash = ops->func_hash->filter_hash; 1738 hash = ops->func_hash->filter_hash;
1672 other_hash = ops->func_hash->notrace_hash; 1739 other_hash = ops->func_hash->notrace_hash;
1673 if (ftrace_hash_empty(hash)) 1740 if (ftrace_hash_empty(hash))
1674 all = 1; 1741 all = true;
1675 } else { 1742 } else {
1676 inc = !inc; 1743 inc = !inc;
1677 hash = ops->func_hash->notrace_hash; 1744 hash = ops->func_hash->notrace_hash;
@@ -2784,7 +2851,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2784 * If there's no more ops registered with ftrace, run a 2851 * If there's no more ops registered with ftrace, run a
2785 * sanity check to make sure all rec flags are cleared. 2852 * sanity check to make sure all rec flags are cleared.
2786 */ 2853 */
2787 if (ftrace_ops_list == &ftrace_list_end) { 2854 if (rcu_dereference_protected(ftrace_ops_list,
2855 lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
2788 struct ftrace_page *pg; 2856 struct ftrace_page *pg;
2789 struct dyn_ftrace *rec; 2857 struct dyn_ftrace *rec;
2790 2858
@@ -3061,6 +3129,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
3061struct ftrace_iterator { 3129struct ftrace_iterator {
3062 loff_t pos; 3130 loff_t pos;
3063 loff_t func_pos; 3131 loff_t func_pos;
3132 loff_t mod_pos;
3064 struct ftrace_page *pg; 3133 struct ftrace_page *pg;
3065 struct dyn_ftrace *func; 3134 struct dyn_ftrace *func;
3066 struct ftrace_func_probe *probe; 3135 struct ftrace_func_probe *probe;
@@ -3068,6 +3137,8 @@ struct ftrace_iterator {
3068 struct trace_parser parser; 3137 struct trace_parser parser;
3069 struct ftrace_hash *hash; 3138 struct ftrace_hash *hash;
3070 struct ftrace_ops *ops; 3139 struct ftrace_ops *ops;
3140 struct trace_array *tr;
3141 struct list_head *mod_list;
3071 int pidx; 3142 int pidx;
3072 int idx; 3143 int idx;
3073 unsigned flags; 3144 unsigned flags;
@@ -3152,13 +3223,13 @@ static void *t_probe_start(struct seq_file *m, loff_t *pos)
3152 if (!(iter->flags & FTRACE_ITER_DO_PROBES)) 3223 if (!(iter->flags & FTRACE_ITER_DO_PROBES))
3153 return NULL; 3224 return NULL;
3154 3225
3155 if (iter->func_pos > *pos) 3226 if (iter->mod_pos > *pos)
3156 return NULL; 3227 return NULL;
3157 3228
3158 iter->probe = NULL; 3229 iter->probe = NULL;
3159 iter->probe_entry = NULL; 3230 iter->probe_entry = NULL;
3160 iter->pidx = 0; 3231 iter->pidx = 0;
3161 for (l = 0; l <= (*pos - iter->func_pos); ) { 3232 for (l = 0; l <= (*pos - iter->mod_pos); ) {
3162 p = t_probe_next(m, &l); 3233 p = t_probe_next(m, &l);
3163 if (!p) 3234 if (!p)
3164 break; 3235 break;
@@ -3197,6 +3268,82 @@ t_probe_show(struct seq_file *m, struct ftrace_iterator *iter)
3197} 3268}
3198 3269
3199static void * 3270static void *
3271t_mod_next(struct seq_file *m, loff_t *pos)
3272{
3273 struct ftrace_iterator *iter = m->private;
3274 struct trace_array *tr = iter->tr;
3275
3276 (*pos)++;
3277 iter->pos = *pos;
3278
3279 iter->mod_list = iter->mod_list->next;
3280
3281 if (iter->mod_list == &tr->mod_trace ||
3282 iter->mod_list == &tr->mod_notrace) {
3283 iter->flags &= ~FTRACE_ITER_MOD;
3284 return NULL;
3285 }
3286
3287 iter->mod_pos = *pos;
3288
3289 return iter;
3290}
3291
3292static void *t_mod_start(struct seq_file *m, loff_t *pos)
3293{
3294 struct ftrace_iterator *iter = m->private;
3295 void *p = NULL;
3296 loff_t l;
3297
3298 if (iter->func_pos > *pos)
3299 return NULL;
3300
3301 iter->mod_pos = iter->func_pos;
3302
3303 /* probes are only available if tr is set */
3304 if (!iter->tr)
3305 return NULL;
3306
3307 for (l = 0; l <= (*pos - iter->func_pos); ) {
3308 p = t_mod_next(m, &l);
3309 if (!p)
3310 break;
3311 }
3312 if (!p) {
3313 iter->flags &= ~FTRACE_ITER_MOD;
3314 return t_probe_start(m, pos);
3315 }
3316
3317 /* Only set this if we have an item */
3318 iter->flags |= FTRACE_ITER_MOD;
3319
3320 return iter;
3321}
3322
3323static int
3324t_mod_show(struct seq_file *m, struct ftrace_iterator *iter)
3325{
3326 struct ftrace_mod_load *ftrace_mod;
3327 struct trace_array *tr = iter->tr;
3328
3329 if (WARN_ON_ONCE(!iter->mod_list) ||
3330 iter->mod_list == &tr->mod_trace ||
3331 iter->mod_list == &tr->mod_notrace)
3332 return -EIO;
3333
3334 ftrace_mod = list_entry(iter->mod_list, struct ftrace_mod_load, list);
3335
3336 if (ftrace_mod->func)
3337 seq_printf(m, "%s", ftrace_mod->func);
3338 else
3339 seq_putc(m, '*');
3340
3341 seq_printf(m, ":mod:%s\n", ftrace_mod->module);
3342
3343 return 0;
3344}
3345
3346static void *
3200t_func_next(struct seq_file *m, loff_t *pos) 3347t_func_next(struct seq_file *m, loff_t *pos)
3201{ 3348{
3202 struct ftrace_iterator *iter = m->private; 3349 struct ftrace_iterator *iter = m->private;
@@ -3237,7 +3384,7 @@ static void *
3237t_next(struct seq_file *m, void *v, loff_t *pos) 3384t_next(struct seq_file *m, void *v, loff_t *pos)
3238{ 3385{
3239 struct ftrace_iterator *iter = m->private; 3386 struct ftrace_iterator *iter = m->private;
3240 loff_t l = *pos; /* t_hash_start() must use original pos */ 3387 loff_t l = *pos; /* t_probe_start() must use original pos */
3241 void *ret; 3388 void *ret;
3242 3389
3243 if (unlikely(ftrace_disabled)) 3390 if (unlikely(ftrace_disabled))
@@ -3246,16 +3393,19 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
3246 if (iter->flags & FTRACE_ITER_PROBE) 3393 if (iter->flags & FTRACE_ITER_PROBE)
3247 return t_probe_next(m, pos); 3394 return t_probe_next(m, pos);
3248 3395
3396 if (iter->flags & FTRACE_ITER_MOD)
3397 return t_mod_next(m, pos);
3398
3249 if (iter->flags & FTRACE_ITER_PRINTALL) { 3399 if (iter->flags & FTRACE_ITER_PRINTALL) {
3250 /* next must increment pos, and t_probe_start does not */ 3400 /* next must increment pos, and t_probe_start does not */
3251 (*pos)++; 3401 (*pos)++;
3252 return t_probe_start(m, &l); 3402 return t_mod_start(m, &l);
3253 } 3403 }
3254 3404
3255 ret = t_func_next(m, pos); 3405 ret = t_func_next(m, pos);
3256 3406
3257 if (!ret) 3407 if (!ret)
3258 return t_probe_start(m, &l); 3408 return t_mod_start(m, &l);
3259 3409
3260 return ret; 3410 return ret;
3261} 3411}
@@ -3264,7 +3414,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
3264{ 3414{
3265 iter->pos = 0; 3415 iter->pos = 0;
3266 iter->func_pos = 0; 3416 iter->func_pos = 0;
3267 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE); 3417 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE | FTRACE_ITER_MOD);
3268} 3418}
3269 3419
3270static void *t_start(struct seq_file *m, loff_t *pos) 3420static void *t_start(struct seq_file *m, loff_t *pos)
@@ -3293,15 +3443,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
3293 ftrace_hash_empty(iter->hash)) { 3443 ftrace_hash_empty(iter->hash)) {
3294 iter->func_pos = 1; /* Account for the message */ 3444 iter->func_pos = 1; /* Account for the message */
3295 if (*pos > 0) 3445 if (*pos > 0)
3296 return t_probe_start(m, pos); 3446 return t_mod_start(m, pos);
3297 iter->flags |= FTRACE_ITER_PRINTALL; 3447 iter->flags |= FTRACE_ITER_PRINTALL;
3298 /* reset in case of seek/pread */ 3448 /* reset in case of seek/pread */
3299 iter->flags &= ~FTRACE_ITER_PROBE; 3449 iter->flags &= ~FTRACE_ITER_PROBE;
3300 return iter; 3450 return iter;
3301 } 3451 }
3302 3452
3303 if (iter->flags & FTRACE_ITER_PROBE) 3453 if (iter->flags & FTRACE_ITER_MOD)
3304 return t_probe_start(m, pos); 3454 return t_mod_start(m, pos);
3305 3455
3306 /* 3456 /*
3307 * Unfortunately, we need to restart at ftrace_pages_start 3457 * Unfortunately, we need to restart at ftrace_pages_start
@@ -3317,7 +3467,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
3317 } 3467 }
3318 3468
3319 if (!p) 3469 if (!p)
3320 return t_probe_start(m, pos); 3470 return t_mod_start(m, pos);
3321 3471
3322 return iter; 3472 return iter;
3323} 3473}
@@ -3351,6 +3501,9 @@ static int t_show(struct seq_file *m, void *v)
3351 if (iter->flags & FTRACE_ITER_PROBE) 3501 if (iter->flags & FTRACE_ITER_PROBE)
3352 return t_probe_show(m, iter); 3502 return t_probe_show(m, iter);
3353 3503
3504 if (iter->flags & FTRACE_ITER_MOD)
3505 return t_mod_show(m, iter);
3506
3354 if (iter->flags & FTRACE_ITER_PRINTALL) { 3507 if (iter->flags & FTRACE_ITER_PRINTALL) {
3355 if (iter->flags & FTRACE_ITER_NOTRACE) 3508 if (iter->flags & FTRACE_ITER_NOTRACE)
3356 seq_puts(m, "#### no functions disabled ####\n"); 3509 seq_puts(m, "#### no functions disabled ####\n");
@@ -3457,6 +3610,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
3457{ 3610{
3458 struct ftrace_iterator *iter; 3611 struct ftrace_iterator *iter;
3459 struct ftrace_hash *hash; 3612 struct ftrace_hash *hash;
3613 struct list_head *mod_head;
3614 struct trace_array *tr = ops->private;
3460 int ret = 0; 3615 int ret = 0;
3461 3616
3462 ftrace_ops_init(ops); 3617 ftrace_ops_init(ops);
@@ -3475,21 +3630,29 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
3475 3630
3476 iter->ops = ops; 3631 iter->ops = ops;
3477 iter->flags = flag; 3632 iter->flags = flag;
3633 iter->tr = tr;
3478 3634
3479 mutex_lock(&ops->func_hash->regex_lock); 3635 mutex_lock(&ops->func_hash->regex_lock);
3480 3636
3481 if (flag & FTRACE_ITER_NOTRACE) 3637 if (flag & FTRACE_ITER_NOTRACE) {
3482 hash = ops->func_hash->notrace_hash; 3638 hash = ops->func_hash->notrace_hash;
3483 else 3639 mod_head = tr ? &tr->mod_notrace : NULL;
3640 } else {
3484 hash = ops->func_hash->filter_hash; 3641 hash = ops->func_hash->filter_hash;
3642 mod_head = tr ? &tr->mod_trace : NULL;
3643 }
3644
3645 iter->mod_list = mod_head;
3485 3646
3486 if (file->f_mode & FMODE_WRITE) { 3647 if (file->f_mode & FMODE_WRITE) {
3487 const int size_bits = FTRACE_HASH_DEFAULT_BITS; 3648 const int size_bits = FTRACE_HASH_DEFAULT_BITS;
3488 3649
3489 if (file->f_flags & O_TRUNC) 3650 if (file->f_flags & O_TRUNC) {
3490 iter->hash = alloc_ftrace_hash(size_bits); 3651 iter->hash = alloc_ftrace_hash(size_bits);
3491 else 3652 clear_ftrace_mod_list(mod_head);
3653 } else {
3492 iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); 3654 iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
3655 }
3493 3656
3494 if (!iter->hash) { 3657 if (!iter->hash) {
3495 trace_parser_put(&iter->parser); 3658 trace_parser_put(&iter->parser);
@@ -3665,7 +3828,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
3665 int exclude_mod = 0; 3828 int exclude_mod = 0;
3666 int found = 0; 3829 int found = 0;
3667 int ret; 3830 int ret;
3668 int clear_filter; 3831 int clear_filter = 0;
3669 3832
3670 if (func) { 3833 if (func) {
3671 func_g.type = filter_parse_regex(func, len, &func_g.search, 3834 func_g.type = filter_parse_regex(func, len, &func_g.search,
@@ -3761,6 +3924,165 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
3761 return ret; 3924 return ret;
3762} 3925}
3763 3926
3927static bool module_exists(const char *module)
3928{
3929 /* All modules have the symbol __this_module */
3930 const char this_mod[] = "__this_module";
3931 const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1;
3932 char modname[modname_size + 1];
3933 unsigned long val;
3934 int n;
3935
3936 n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod);
3937
3938 if (n > modname_size)
3939 return false;
3940
3941 val = module_kallsyms_lookup_name(modname);
3942 return val != 0;
3943}
3944
3945static int cache_mod(struct trace_array *tr,
3946 const char *func, char *module, int enable)
3947{
3948 struct ftrace_mod_load *ftrace_mod, *n;
3949 struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace;
3950 int ret;
3951
3952 mutex_lock(&ftrace_lock);
3953
3954 /* We do not cache inverse filters */
3955 if (func[0] == '!') {
3956 func++;
3957 ret = -EINVAL;
3958
3959 /* Look to remove this hash */
3960 list_for_each_entry_safe(ftrace_mod, n, head, list) {
3961 if (strcmp(ftrace_mod->module, module) != 0)
3962 continue;
3963
3964 /* no func matches all */
3965 if (strcmp(func, "*") == 0 ||
3966 (ftrace_mod->func &&
3967 strcmp(ftrace_mod->func, func) == 0)) {
3968 ret = 0;
3969 free_ftrace_mod(ftrace_mod);
3970 continue;
3971 }
3972 }
3973 goto out;
3974 }
3975
3976 ret = -EINVAL;
3977 /* We only care about modules that have not been loaded yet */
3978 if (module_exists(module))
3979 goto out;
3980
3981 /* Save this string off, and execute it when the module is loaded */
3982 ret = ftrace_add_mod(tr, func, module, enable);
3983 out:
3984 mutex_unlock(&ftrace_lock);
3985
3986 return ret;
3987}
3988
3989static int
3990ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3991 int reset, int enable);
3992
3993#ifdef CONFIG_MODULES
3994static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
3995 char *mod, bool enable)
3996{
3997 struct ftrace_mod_load *ftrace_mod, *n;
3998 struct ftrace_hash **orig_hash, *new_hash;
3999 LIST_HEAD(process_mods);
4000 char *func;
4001 int ret;
4002
4003 mutex_lock(&ops->func_hash->regex_lock);
4004
4005 if (enable)
4006 orig_hash = &ops->func_hash->filter_hash;
4007 else
4008 orig_hash = &ops->func_hash->notrace_hash;
4009
4010 new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS,
4011 *orig_hash);
4012 if (!new_hash)
4013 goto out; /* warn? */
4014
4015 mutex_lock(&ftrace_lock);
4016
4017 list_for_each_entry_safe(ftrace_mod, n, head, list) {
4018
4019 if (strcmp(ftrace_mod->module, mod) != 0)
4020 continue;
4021
4022 if (ftrace_mod->func)
4023 func = kstrdup(ftrace_mod->func, GFP_KERNEL);
4024 else
4025 func = kstrdup("*", GFP_KERNEL);
4026
4027 if (!func) /* warn? */
4028 continue;
4029
4030 list_del(&ftrace_mod->list);
4031 list_add(&ftrace_mod->list, &process_mods);
4032
4033 /* Use the newly allocated func, as it may be "*" */
4034 kfree(ftrace_mod->func);
4035 ftrace_mod->func = func;
4036 }
4037
4038 mutex_unlock(&ftrace_lock);
4039
4040 list_for_each_entry_safe(ftrace_mod, n, &process_mods, list) {
4041
4042 func = ftrace_mod->func;
4043
4044 /* Grabs ftrace_lock, which is why we have this extra step */
4045 match_records(new_hash, func, strlen(func), mod);
4046 free_ftrace_mod(ftrace_mod);
4047 }
4048
4049 if (enable && list_empty(head))
4050 new_hash->flags &= ~FTRACE_HASH_FL_MOD;
4051
4052 mutex_lock(&ftrace_lock);
4053
4054 ret = ftrace_hash_move_and_update_ops(ops, orig_hash,
4055 new_hash, enable);
4056 mutex_unlock(&ftrace_lock);
4057
4058 out:
4059 mutex_unlock(&ops->func_hash->regex_lock);
4060
4061 free_ftrace_hash(new_hash);
4062}
4063
4064static void process_cached_mods(const char *mod_name)
4065{
4066 struct trace_array *tr;
4067 char *mod;
4068
4069 mod = kstrdup(mod_name, GFP_KERNEL);
4070 if (!mod)
4071 return;
4072
4073 mutex_lock(&trace_types_lock);
4074 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
4075 if (!list_empty(&tr->mod_trace))
4076 process_mod_list(&tr->mod_trace, tr->ops, mod, true);
4077 if (!list_empty(&tr->mod_notrace))
4078 process_mod_list(&tr->mod_notrace, tr->ops, mod, false);
4079 }
4080 mutex_unlock(&trace_types_lock);
4081
4082 kfree(mod);
4083}
4084#endif
4085
3764/* 4086/*
3765 * We register the module command as a template to show others how 4087 * We register the module command as a template to show others how
3766 * to register the a command as well. 4088 * to register the a command as well.
@@ -3768,10 +4090,16 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
3768 4090
3769static int 4091static int
3770ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, 4092ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
3771 char *func, char *cmd, char *module, int enable) 4093 char *func_orig, char *cmd, char *module, int enable)
3772{ 4094{
4095 char *func;
3773 int ret; 4096 int ret;
3774 4097
4098 /* match_records() modifies func, and we need the original */
4099 func = kstrdup(func_orig, GFP_KERNEL);
4100 if (!func)
4101 return -ENOMEM;
4102
3775 /* 4103 /*
3776 * cmd == 'mod' because we only registered this func 4104 * cmd == 'mod' because we only registered this func
3777 * for the 'mod' ftrace_func_command. 4105 * for the 'mod' ftrace_func_command.
@@ -3780,8 +4108,10 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
3780 * parameter. 4108 * parameter.
3781 */ 4109 */
3782 ret = match_records(hash, func, strlen(func), module); 4110 ret = match_records(hash, func, strlen(func), module);
4111 kfree(func);
4112
3783 if (!ret) 4113 if (!ret)
3784 return -EINVAL; 4114 return cache_mod(tr, func_orig, module, enable);
3785 if (ret < 0) 4115 if (ret < 0)
3786 return ret; 4116 return ret;
3787 return 0; 4117 return 0;
@@ -4337,9 +4667,6 @@ static int ftrace_process_regex(struct ftrace_iterator *iter,
4337 4667
4338 command = strsep(&next, ":"); 4668 command = strsep(&next, ":");
4339 4669
4340 if (WARN_ON_ONCE(!tr))
4341 return -EINVAL;
4342
4343 mutex_lock(&ftrace_cmd_mutex); 4670 mutex_lock(&ftrace_cmd_mutex);
4344 list_for_each_entry(p, &ftrace_commands, list) { 4671 list_for_each_entry(p, &ftrace_commands, list) {
4345 if (strcmp(p->name, command) == 0) { 4672 if (strcmp(p->name, command) == 0) {
@@ -4728,9 +5055,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
4728 if (file->f_mode & FMODE_WRITE) { 5055 if (file->f_mode & FMODE_WRITE) {
4729 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); 5056 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
4730 5057
4731 if (filter_hash) 5058 if (filter_hash) {
4732 orig_hash = &iter->ops->func_hash->filter_hash; 5059 orig_hash = &iter->ops->func_hash->filter_hash;
4733 else 5060 if (iter->tr && !list_empty(&iter->tr->mod_trace))
5061 iter->hash->flags |= FTRACE_HASH_FL_MOD;
5062 } else
4734 orig_hash = &iter->ops->func_hash->notrace_hash; 5063 orig_hash = &iter->ops->func_hash->notrace_hash;
4735 5064
4736 mutex_lock(&ftrace_lock); 5065 mutex_lock(&ftrace_lock);
@@ -5063,7 +5392,7 @@ ftrace_graph_release(struct inode *inode, struct file *file)
5063 } 5392 }
5064 5393
5065 out: 5394 out:
5066 kfree(fgd->new_hash); 5395 free_ftrace_hash(fgd->new_hash);
5067 kfree(fgd); 5396 kfree(fgd);
5068 5397
5069 return ret; 5398 return ret;
@@ -5388,6 +5717,7 @@ void ftrace_release_mod(struct module *mod)
5388 if (pg == ftrace_pages) 5717 if (pg == ftrace_pages)
5389 ftrace_pages = next_to_ftrace_page(last_pg); 5718 ftrace_pages = next_to_ftrace_page(last_pg);
5390 5719
5720 ftrace_update_tot_cnt -= pg->index;
5391 *last_pg = pg->next; 5721 *last_pg = pg->next;
5392 order = get_count_order(pg->size / ENTRIES_PER_PAGE); 5722 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
5393 free_pages((unsigned long)pg->records, order); 5723 free_pages((unsigned long)pg->records, order);
@@ -5466,6 +5796,8 @@ void ftrace_module_enable(struct module *mod)
5466 5796
5467 out_unlock: 5797 out_unlock:
5468 mutex_unlock(&ftrace_lock); 5798 mutex_unlock(&ftrace_lock);
5799
5800 process_cached_mods(mod->name);
5469} 5801}
5470 5802
5471void ftrace_module_init(struct module *mod) 5803void ftrace_module_init(struct module *mod)
@@ -5504,6 +5836,7 @@ void __init ftrace_free_init_mem(void)
5504 if (!rec) 5836 if (!rec)
5505 continue; 5837 continue;
5506 pg->index--; 5838 pg->index--;
5839 ftrace_update_tot_cnt--;
5507 if (!pg->index) { 5840 if (!pg->index) {
5508 *last_pg = pg->next; 5841 *last_pg = pg->next;
5509 order = get_count_order(pg->size / ENTRIES_PER_PAGE); 5842 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
@@ -5570,6 +5903,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
5570void ftrace_init_trace_array(struct trace_array *tr) 5903void ftrace_init_trace_array(struct trace_array *tr)
5571{ 5904{
5572 INIT_LIST_HEAD(&tr->func_probes); 5905 INIT_LIST_HEAD(&tr->func_probes);
5906 INIT_LIST_HEAD(&tr->mod_trace);
5907 INIT_LIST_HEAD(&tr->mod_notrace);
5573} 5908}
5574#else 5909#else
5575 5910
@@ -6130,7 +6465,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
6130 if (ftrace_enabled) { 6465 if (ftrace_enabled) {
6131 6466
6132 /* we are starting ftrace again */ 6467 /* we are starting ftrace again */
6133 if (ftrace_ops_list != &ftrace_list_end) 6468 if (rcu_dereference_protected(ftrace_ops_list,
6469 lockdep_is_held(&ftrace_lock)) != &ftrace_list_end)
6134 update_ftrace_function(); 6470 update_ftrace_function();
6135 6471
6136 ftrace_startup_sysctl(); 6472 ftrace_startup_sysctl();
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4ae268e687fe..529cc50d7243 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1136,12 +1136,12 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
1136 for (i = 0; i < nr_pages; i++) { 1136 for (i = 0; i < nr_pages; i++) {
1137 struct page *page; 1137 struct page *page;
1138 /* 1138 /*
1139 * __GFP_NORETRY flag makes sure that the allocation fails 1139 * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
1140 * gracefully without invoking oom-killer and the system is 1140 * gracefully without invoking oom-killer and the system is not
1141 * not destabilized. 1141 * destabilized.
1142 */ 1142 */
1143 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1143 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1144 GFP_KERNEL | __GFP_NORETRY, 1144 GFP_KERNEL | __GFP_RETRY_MAYFAIL,
1145 cpu_to_node(cpu)); 1145 cpu_to_node(cpu));
1146 if (!bpage) 1146 if (!bpage)
1147 goto free_pages; 1147 goto free_pages;
@@ -1149,7 +1149,7 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
1149 list_add(&bpage->list, pages); 1149 list_add(&bpage->list, pages);
1150 1150
1151 page = alloc_pages_node(cpu_to_node(cpu), 1151 page = alloc_pages_node(cpu_to_node(cpu),
1152 GFP_KERNEL | __GFP_NORETRY, 0); 1152 GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0);
1153 if (!page) 1153 if (!page)
1154 goto free_pages; 1154 goto free_pages;
1155 bpage->page = page_address(page); 1155 bpage->page = page_address(page);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1122f151466f..42b9355033d4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -87,7 +87,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
87 * tracing is active, only save the comm when a trace event 87 * tracing is active, only save the comm when a trace event
88 * occurred. 88 * occurred.
89 */ 89 */
90static DEFINE_PER_CPU(bool, trace_cmdline_save); 90static DEFINE_PER_CPU(bool, trace_taskinfo_save);
91 91
92/* 92/*
93 * Kill all tracing for good (never come back). 93 * Kill all tracing for good (never come back).
@@ -120,41 +120,41 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
120/* When set, tracing will stop when a WARN*() is hit */ 120/* When set, tracing will stop when a WARN*() is hit */
121int __disable_trace_on_warning; 121int __disable_trace_on_warning;
122 122
123#ifdef CONFIG_TRACE_ENUM_MAP_FILE 123#ifdef CONFIG_TRACE_EVAL_MAP_FILE
124/* Map of enums to their values, for "enum_map" file */ 124/* Map of enums to their values, for "eval_map" file */
125struct trace_enum_map_head { 125struct trace_eval_map_head {
126 struct module *mod; 126 struct module *mod;
127 unsigned long length; 127 unsigned long length;
128}; 128};
129 129
130union trace_enum_map_item; 130union trace_eval_map_item;
131 131
132struct trace_enum_map_tail { 132struct trace_eval_map_tail {
133 /* 133 /*
134 * "end" is first and points to NULL as it must be different 134 * "end" is first and points to NULL as it must be different
135 * than "mod" or "enum_string" 135 * than "mod" or "eval_string"
136 */ 136 */
137 union trace_enum_map_item *next; 137 union trace_eval_map_item *next;
138 const char *end; /* points to NULL */ 138 const char *end; /* points to NULL */
139}; 139};
140 140
141static DEFINE_MUTEX(trace_enum_mutex); 141static DEFINE_MUTEX(trace_eval_mutex);
142 142
143/* 143/*
144 * The trace_enum_maps are saved in an array with two extra elements, 144 * The trace_eval_maps are saved in an array with two extra elements,
145 * one at the beginning, and one at the end. The beginning item contains 145 * one at the beginning, and one at the end. The beginning item contains
146 * the count of the saved maps (head.length), and the module they 146 * the count of the saved maps (head.length), and the module they
147 * belong to if not built in (head.mod). The ending item contains a 147 * belong to if not built in (head.mod). The ending item contains a
148 * pointer to the next array of saved enum_map items. 148 * pointer to the next array of saved eval_map items.
149 */ 149 */
150union trace_enum_map_item { 150union trace_eval_map_item {
151 struct trace_enum_map map; 151 struct trace_eval_map map;
152 struct trace_enum_map_head head; 152 struct trace_eval_map_head head;
153 struct trace_enum_map_tail tail; 153 struct trace_eval_map_tail tail;
154}; 154};
155 155
156static union trace_enum_map_item *trace_enum_maps; 156static union trace_eval_map_item *trace_eval_maps;
157#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ 157#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
158 158
159static int tracing_set_tracer(struct trace_array *tr, const char *buf); 159static int tracing_set_tracer(struct trace_array *tr, const char *buf);
160 160
@@ -790,7 +790,7 @@ EXPORT_SYMBOL_GPL(tracing_on);
790static __always_inline void 790static __always_inline void
791__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) 791__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
792{ 792{
793 __this_cpu_write(trace_cmdline_save, true); 793 __this_cpu_write(trace_taskinfo_save, true);
794 794
795 /* If this is the temp buffer, we need to commit fully */ 795 /* If this is the temp buffer, we need to commit fully */
796 if (this_cpu_read(trace_buffered_event) == event) { 796 if (this_cpu_read(trace_buffered_event) == event) {
@@ -1141,9 +1141,9 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
1141 1141
1142/* 1142/*
1143 * TRACE_FLAGS is defined as a tuple matching bit masks with strings. 1143 * TRACE_FLAGS is defined as a tuple matching bit masks with strings.
1144 * It uses C(a, b) where 'a' is the enum name and 'b' is the string that 1144 * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that
1145 * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list 1145 * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list
1146 * of strings in the order that the enums were defined. 1146 * of strings in the order that the evals (enum) were defined.
1147 */ 1147 */
1148#undef C 1148#undef C
1149#define C(a, b) b 1149#define C(a, b) b
@@ -1709,6 +1709,8 @@ void tracing_reset_all_online_cpus(void)
1709 } 1709 }
1710} 1710}
1711 1711
1712static int *tgid_map;
1713
1712#define SAVED_CMDLINES_DEFAULT 128 1714#define SAVED_CMDLINES_DEFAULT 128
1713#define NO_CMDLINE_MAP UINT_MAX 1715#define NO_CMDLINE_MAP UINT_MAX
1714static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; 1716static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -1722,7 +1724,7 @@ struct saved_cmdlines_buffer {
1722static struct saved_cmdlines_buffer *savedcmd; 1724static struct saved_cmdlines_buffer *savedcmd;
1723 1725
1724/* temporary disable recording */ 1726/* temporary disable recording */
1725static atomic_t trace_record_cmdline_disabled __read_mostly; 1727static atomic_t trace_record_taskinfo_disabled __read_mostly;
1726 1728
1727static inline char *get_saved_cmdlines(int idx) 1729static inline char *get_saved_cmdlines(int idx)
1728{ 1730{
@@ -1910,13 +1912,15 @@ static void tracing_stop_tr(struct trace_array *tr)
1910 raw_spin_unlock_irqrestore(&tr->start_lock, flags); 1912 raw_spin_unlock_irqrestore(&tr->start_lock, flags);
1911} 1913}
1912 1914
1913void trace_stop_cmdline_recording(void);
1914
1915static int trace_save_cmdline(struct task_struct *tsk) 1915static int trace_save_cmdline(struct task_struct *tsk)
1916{ 1916{
1917 unsigned pid, idx; 1917 unsigned pid, idx;
1918 1918
1919 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) 1919 /* treat recording of idle task as a success */
1920 if (!tsk->pid)
1921 return 1;
1922
1923 if (unlikely(tsk->pid > PID_MAX_DEFAULT))
1920 return 0; 1924 return 0;
1921 1925
1922 /* 1926 /*
@@ -1992,16 +1996,107 @@ void trace_find_cmdline(int pid, char comm[])
1992 preempt_enable(); 1996 preempt_enable();
1993} 1997}
1994 1998
1995void tracing_record_cmdline(struct task_struct *tsk) 1999int trace_find_tgid(int pid)
2000{
2001 if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT))
2002 return 0;
2003
2004 return tgid_map[pid];
2005}
2006
2007static int trace_save_tgid(struct task_struct *tsk)
2008{
2009 /* treat recording of idle task as a success */
2010 if (!tsk->pid)
2011 return 1;
2012
2013 if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT))
2014 return 0;
2015
2016 tgid_map[tsk->pid] = tsk->tgid;
2017 return 1;
2018}
2019
2020static bool tracing_record_taskinfo_skip(int flags)
2021{
2022 if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
2023 return true;
2024 if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on())
2025 return true;
2026 if (!__this_cpu_read(trace_taskinfo_save))
2027 return true;
2028 return false;
2029}
2030
2031/**
2032 * tracing_record_taskinfo - record the task info of a task
2033 *
2034 * @task - task to record
2035 * @flags - TRACE_RECORD_CMDLINE for recording comm
2036 * - TRACE_RECORD_TGID for recording tgid
2037 */
2038void tracing_record_taskinfo(struct task_struct *task, int flags)
1996{ 2039{
1997 if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) 2040 bool done;
2041
2042 if (tracing_record_taskinfo_skip(flags))
1998 return; 2043 return;
1999 2044
2000 if (!__this_cpu_read(trace_cmdline_save)) 2045 /*
2046 * Record as much task information as possible. If some fail, continue
2047 * to try to record the others.
2048 */
2049 done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task);
2050 done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task);
2051
2052 /* If recording any information failed, retry again soon. */
2053 if (!done)
2001 return; 2054 return;
2002 2055
2003 if (trace_save_cmdline(tsk)) 2056 __this_cpu_write(trace_taskinfo_save, false);
2004 __this_cpu_write(trace_cmdline_save, false); 2057}
2058
2059/**
2060 * tracing_record_taskinfo_sched_switch - record task info for sched_switch
2061 *
2062 * @prev - previous task during sched_switch
2063 * @next - next task during sched_switch
2064 * @flags - TRACE_RECORD_CMDLINE for recording comm
2065 * TRACE_RECORD_TGID for recording tgid
2066 */
2067void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
2068 struct task_struct *next, int flags)
2069{
2070 bool done;
2071
2072 if (tracing_record_taskinfo_skip(flags))
2073 return;
2074
2075 /*
2076 * Record as much task information as possible. If some fail, continue
2077 * to try to record the others.
2078 */
2079 done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev);
2080 done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next);
2081 done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev);
2082 done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next);
2083
2084 /* If recording any information failed, retry again soon. */
2085 if (!done)
2086 return;
2087
2088 __this_cpu_write(trace_taskinfo_save, false);
2089}
2090
2091/* Helpers to record a specific task information */
2092void tracing_record_cmdline(struct task_struct *task)
2093{
2094 tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
2095}
2096
2097void tracing_record_tgid(struct task_struct *task)
2098{
2099 tracing_record_taskinfo(task, TRACE_RECORD_TGID);
2005} 2100}
2006 2101
2007/* 2102/*
@@ -3146,7 +3241,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
3146#endif 3241#endif
3147 3242
3148 if (!iter->snapshot) 3243 if (!iter->snapshot)
3149 atomic_inc(&trace_record_cmdline_disabled); 3244 atomic_inc(&trace_record_taskinfo_disabled);
3150 3245
3151 if (*pos != iter->pos) { 3246 if (*pos != iter->pos) {
3152 iter->ent = NULL; 3247 iter->ent = NULL;
@@ -3191,7 +3286,7 @@ static void s_stop(struct seq_file *m, void *p)
3191#endif 3286#endif
3192 3287
3193 if (!iter->snapshot) 3288 if (!iter->snapshot)
3194 atomic_dec(&trace_record_cmdline_disabled); 3289 atomic_dec(&trace_record_taskinfo_disabled);
3195 3290
3196 trace_access_unlock(iter->cpu_file); 3291 trace_access_unlock(iter->cpu_file);
3197 trace_event_read_unlock(); 3292 trace_event_read_unlock();
@@ -3248,23 +3343,38 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
3248 seq_puts(m, "#\n"); 3343 seq_puts(m, "#\n");
3249} 3344}
3250 3345
3251static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) 3346static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m,
3347 unsigned int flags)
3252{ 3348{
3349 bool tgid = flags & TRACE_ITER_RECORD_TGID;
3350
3253 print_event_info(buf, m); 3351 print_event_info(buf, m);
3254 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n" 3352
3255 "# | | | | |\n"); 3353 seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : "");
3354 seq_printf(m, "# | | | %s | |\n", tgid ? " | " : "");
3256} 3355}
3257 3356
3258static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) 3357static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m,
3358 unsigned int flags)
3259{ 3359{
3260 print_event_info(buf, m); 3360 bool tgid = flags & TRACE_ITER_RECORD_TGID;
3261 seq_puts(m, "# _-----=> irqs-off\n" 3361 const char tgid_space[] = " ";
3262 "# / _----=> need-resched\n" 3362 const char space[] = " ";
3263 "# | / _---=> hardirq/softirq\n" 3363
3264 "# || / _--=> preempt-depth\n" 3364 seq_printf(m, "# %s _-----=> irqs-off\n",
3265 "# ||| / delay\n" 3365 tgid ? tgid_space : space);
3266 "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n" 3366 seq_printf(m, "# %s / _----=> need-resched\n",
3267 "# | | | |||| | |\n"); 3367 tgid ? tgid_space : space);
3368 seq_printf(m, "# %s| / _---=> hardirq/softirq\n",
3369 tgid ? tgid_space : space);
3370 seq_printf(m, "# %s|| / _--=> preempt-depth\n",
3371 tgid ? tgid_space : space);
3372 seq_printf(m, "# %s||| / delay\n",
3373 tgid ? tgid_space : space);
3374 seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n",
3375 tgid ? " TGID " : space);
3376 seq_printf(m, "# | | | %s|||| | |\n",
3377 tgid ? " | " : space);
3268} 3378}
3269 3379
3270void 3380void
@@ -3580,9 +3690,11 @@ void trace_default_header(struct seq_file *m)
3580 } else { 3690 } else {
3581 if (!(trace_flags & TRACE_ITER_VERBOSE)) { 3691 if (!(trace_flags & TRACE_ITER_VERBOSE)) {
3582 if (trace_flags & TRACE_ITER_IRQ_INFO) 3692 if (trace_flags & TRACE_ITER_IRQ_INFO)
3583 print_func_help_header_irq(iter->trace_buffer, m); 3693 print_func_help_header_irq(iter->trace_buffer,
3694 m, trace_flags);
3584 else 3695 else
3585 print_func_help_header(iter->trace_buffer, m); 3696 print_func_help_header(iter->trace_buffer, m,
3697 trace_flags);
3586 } 3698 }
3587 } 3699 }
3588} 3700}
@@ -4238,6 +4350,18 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
4238 if (mask == TRACE_ITER_RECORD_CMD) 4350 if (mask == TRACE_ITER_RECORD_CMD)
4239 trace_event_enable_cmd_record(enabled); 4351 trace_event_enable_cmd_record(enabled);
4240 4352
4353 if (mask == TRACE_ITER_RECORD_TGID) {
4354 if (!tgid_map)
4355 tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map),
4356 GFP_KERNEL);
4357 if (!tgid_map) {
4358 tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
4359 return -ENOMEM;
4360 }
4361
4362 trace_event_enable_tgid_record(enabled);
4363 }
4364
4241 if (mask == TRACE_ITER_EVENT_FORK) 4365 if (mask == TRACE_ITER_EVENT_FORK)
4242 trace_event_follow_fork(tr, enabled); 4366 trace_event_follow_fork(tr, enabled);
4243 4367
@@ -4473,7 +4597,8 @@ static const char readme_msg[] =
4473#endif 4597#endif
4474#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) 4598#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
4475 "\t accepts: event-definitions (one definition per line)\n" 4599 "\t accepts: event-definitions (one definition per line)\n"
4476 "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n" 4600 "\t Format: p[:[<group>/]<event>] <place> [<args>]\n"
4601 "\t r[maxactive][:[<group>/]<event>] <place> [<args>]\n"
4477 "\t -:[<group>/]<event>\n" 4602 "\t -:[<group>/]<event>\n"
4478#ifdef CONFIG_KPROBE_EVENTS 4603#ifdef CONFIG_KPROBE_EVENTS
4479 "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" 4604 "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
@@ -4597,6 +4722,76 @@ static const struct file_operations tracing_readme_fops = {
4597 .llseek = generic_file_llseek, 4722 .llseek = generic_file_llseek,
4598}; 4723};
4599 4724
4725static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
4726{
4727 int *ptr = v;
4728
4729 if (*pos || m->count)
4730 ptr++;
4731
4732 (*pos)++;
4733
4734 for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) {
4735 if (trace_find_tgid(*ptr))
4736 return ptr;
4737 }
4738
4739 return NULL;
4740}
4741
4742static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
4743{
4744 void *v;
4745 loff_t l = 0;
4746
4747 if (!tgid_map)
4748 return NULL;
4749
4750 v = &tgid_map[0];
4751 while (l <= *pos) {
4752 v = saved_tgids_next(m, v, &l);
4753 if (!v)
4754 return NULL;
4755 }
4756
4757 return v;
4758}
4759
4760static void saved_tgids_stop(struct seq_file *m, void *v)
4761{
4762}
4763
4764static int saved_tgids_show(struct seq_file *m, void *v)
4765{
4766 int pid = (int *)v - tgid_map;
4767
4768 seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid));
4769 return 0;
4770}
4771
4772static const struct seq_operations tracing_saved_tgids_seq_ops = {
4773 .start = saved_tgids_start,
4774 .stop = saved_tgids_stop,
4775 .next = saved_tgids_next,
4776 .show = saved_tgids_show,
4777};
4778
4779static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
4780{
4781 if (tracing_disabled)
4782 return -ENODEV;
4783
4784 return seq_open(filp, &tracing_saved_tgids_seq_ops);
4785}
4786
4787
4788static const struct file_operations tracing_saved_tgids_fops = {
4789 .open = tracing_saved_tgids_open,
4790 .read = seq_read,
4791 .llseek = seq_lseek,
4792 .release = seq_release,
4793};
4794
4600static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) 4795static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
4601{ 4796{
4602 unsigned int *ptr = v; 4797 unsigned int *ptr = v;
@@ -4746,11 +4941,11 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
4746 .write = tracing_saved_cmdlines_size_write, 4941 .write = tracing_saved_cmdlines_size_write,
4747}; 4942};
4748 4943
4749#ifdef CONFIG_TRACE_ENUM_MAP_FILE 4944#ifdef CONFIG_TRACE_EVAL_MAP_FILE
4750static union trace_enum_map_item * 4945static union trace_eval_map_item *
4751update_enum_map(union trace_enum_map_item *ptr) 4946update_eval_map(union trace_eval_map_item *ptr)
4752{ 4947{
4753 if (!ptr->map.enum_string) { 4948 if (!ptr->map.eval_string) {
4754 if (ptr->tail.next) { 4949 if (ptr->tail.next) {
4755 ptr = ptr->tail.next; 4950 ptr = ptr->tail.next;
4756 /* Set ptr to the next real item (skip head) */ 4951 /* Set ptr to the next real item (skip head) */
@@ -4761,15 +4956,15 @@ update_enum_map(union trace_enum_map_item *ptr)
4761 return ptr; 4956 return ptr;
4762} 4957}
4763 4958
4764static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) 4959static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos)
4765{ 4960{
4766 union trace_enum_map_item *ptr = v; 4961 union trace_eval_map_item *ptr = v;
4767 4962
4768 /* 4963 /*
4769 * Paranoid! If ptr points to end, we don't want to increment past it. 4964 * Paranoid! If ptr points to end, we don't want to increment past it.
4770 * This really should never happen. 4965 * This really should never happen.
4771 */ 4966 */
4772 ptr = update_enum_map(ptr); 4967 ptr = update_eval_map(ptr);
4773 if (WARN_ON_ONCE(!ptr)) 4968 if (WARN_ON_ONCE(!ptr))
4774 return NULL; 4969 return NULL;
4775 4970
@@ -4777,104 +4972,104 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
4777 4972
4778 (*pos)++; 4973 (*pos)++;
4779 4974
4780 ptr = update_enum_map(ptr); 4975 ptr = update_eval_map(ptr);
4781 4976
4782 return ptr; 4977 return ptr;
4783} 4978}
4784 4979
4785static void *enum_map_start(struct seq_file *m, loff_t *pos) 4980static void *eval_map_start(struct seq_file *m, loff_t *pos)
4786{ 4981{
4787 union trace_enum_map_item *v; 4982 union trace_eval_map_item *v;
4788 loff_t l = 0; 4983 loff_t l = 0;
4789 4984
4790 mutex_lock(&trace_enum_mutex); 4985 mutex_lock(&trace_eval_mutex);
4791 4986
4792 v = trace_enum_maps; 4987 v = trace_eval_maps;
4793 if (v) 4988 if (v)
4794 v++; 4989 v++;
4795 4990
4796 while (v && l < *pos) { 4991 while (v && l < *pos) {
4797 v = enum_map_next(m, v, &l); 4992 v = eval_map_next(m, v, &l);
4798 } 4993 }
4799 4994
4800 return v; 4995 return v;
4801} 4996}
4802 4997
4803static void enum_map_stop(struct seq_file *m, void *v) 4998static void eval_map_stop(struct seq_file *m, void *v)
4804{ 4999{
4805 mutex_unlock(&trace_enum_mutex); 5000 mutex_unlock(&trace_eval_mutex);
4806} 5001}
4807 5002
4808static int enum_map_show(struct seq_file *m, void *v) 5003static int eval_map_show(struct seq_file *m, void *v)
4809{ 5004{
4810 union trace_enum_map_item *ptr = v; 5005 union trace_eval_map_item *ptr = v;
4811 5006
4812 seq_printf(m, "%s %ld (%s)\n", 5007 seq_printf(m, "%s %ld (%s)\n",
4813 ptr->map.enum_string, ptr->map.enum_value, 5008 ptr->map.eval_string, ptr->map.eval_value,
4814 ptr->map.system); 5009 ptr->map.system);
4815 5010
4816 return 0; 5011 return 0;
4817} 5012}
4818 5013
4819static const struct seq_operations tracing_enum_map_seq_ops = { 5014static const struct seq_operations tracing_eval_map_seq_ops = {
4820 .start = enum_map_start, 5015 .start = eval_map_start,
4821 .next = enum_map_next, 5016 .next = eval_map_next,
4822 .stop = enum_map_stop, 5017 .stop = eval_map_stop,
4823 .show = enum_map_show, 5018 .show = eval_map_show,
4824}; 5019};
4825 5020
4826static int tracing_enum_map_open(struct inode *inode, struct file *filp) 5021static int tracing_eval_map_open(struct inode *inode, struct file *filp)
4827{ 5022{
4828 if (tracing_disabled) 5023 if (tracing_disabled)
4829 return -ENODEV; 5024 return -ENODEV;
4830 5025
4831 return seq_open(filp, &tracing_enum_map_seq_ops); 5026 return seq_open(filp, &tracing_eval_map_seq_ops);
4832} 5027}
4833 5028
4834static const struct file_operations tracing_enum_map_fops = { 5029static const struct file_operations tracing_eval_map_fops = {
4835 .open = tracing_enum_map_open, 5030 .open = tracing_eval_map_open,
4836 .read = seq_read, 5031 .read = seq_read,
4837 .llseek = seq_lseek, 5032 .llseek = seq_lseek,
4838 .release = seq_release, 5033 .release = seq_release,
4839}; 5034};
4840 5035
4841static inline union trace_enum_map_item * 5036static inline union trace_eval_map_item *
4842trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) 5037trace_eval_jmp_to_tail(union trace_eval_map_item *ptr)
4843{ 5038{
4844 /* Return tail of array given the head */ 5039 /* Return tail of array given the head */
4845 return ptr + ptr->head.length + 1; 5040 return ptr + ptr->head.length + 1;
4846} 5041}
4847 5042
4848static void 5043static void
4849trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, 5044trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
4850 int len) 5045 int len)
4851{ 5046{
4852 struct trace_enum_map **stop; 5047 struct trace_eval_map **stop;
4853 struct trace_enum_map **map; 5048 struct trace_eval_map **map;
4854 union trace_enum_map_item *map_array; 5049 union trace_eval_map_item *map_array;
4855 union trace_enum_map_item *ptr; 5050 union trace_eval_map_item *ptr;
4856 5051
4857 stop = start + len; 5052 stop = start + len;
4858 5053
4859 /* 5054 /*
4860 * The trace_enum_maps contains the map plus a head and tail item, 5055 * The trace_eval_maps contains the map plus a head and tail item,
4861 * where the head holds the module and length of array, and the 5056 * where the head holds the module and length of array, and the
4862 * tail holds a pointer to the next list. 5057 * tail holds a pointer to the next list.
4863 */ 5058 */
4864 map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); 5059 map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL);
4865 if (!map_array) { 5060 if (!map_array) {
4866 pr_warn("Unable to allocate trace enum mapping\n"); 5061 pr_warn("Unable to allocate trace eval mapping\n");
4867 return; 5062 return;
4868 } 5063 }
4869 5064
4870 mutex_lock(&trace_enum_mutex); 5065 mutex_lock(&trace_eval_mutex);
4871 5066
4872 if (!trace_enum_maps) 5067 if (!trace_eval_maps)
4873 trace_enum_maps = map_array; 5068 trace_eval_maps = map_array;
4874 else { 5069 else {
4875 ptr = trace_enum_maps; 5070 ptr = trace_eval_maps;
4876 for (;;) { 5071 for (;;) {
4877 ptr = trace_enum_jmp_to_tail(ptr); 5072 ptr = trace_eval_jmp_to_tail(ptr);
4878 if (!ptr->tail.next) 5073 if (!ptr->tail.next)
4879 break; 5074 break;
4880 ptr = ptr->tail.next; 5075 ptr = ptr->tail.next;
@@ -4892,34 +5087,34 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
4892 } 5087 }
4893 memset(map_array, 0, sizeof(*map_array)); 5088 memset(map_array, 0, sizeof(*map_array));
4894 5089
4895 mutex_unlock(&trace_enum_mutex); 5090 mutex_unlock(&trace_eval_mutex);
4896} 5091}
4897 5092
4898static void trace_create_enum_file(struct dentry *d_tracer) 5093static void trace_create_eval_file(struct dentry *d_tracer)
4899{ 5094{
4900 trace_create_file("enum_map", 0444, d_tracer, 5095 trace_create_file("eval_map", 0444, d_tracer,
4901 NULL, &tracing_enum_map_fops); 5096 NULL, &tracing_eval_map_fops);
4902} 5097}
4903 5098
4904#else /* CONFIG_TRACE_ENUM_MAP_FILE */ 5099#else /* CONFIG_TRACE_EVAL_MAP_FILE */
4905static inline void trace_create_enum_file(struct dentry *d_tracer) { } 5100static inline void trace_create_eval_file(struct dentry *d_tracer) { }
4906static inline void trace_insert_enum_map_file(struct module *mod, 5101static inline void trace_insert_eval_map_file(struct module *mod,
4907 struct trace_enum_map **start, int len) { } 5102 struct trace_eval_map **start, int len) { }
4908#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ 5103#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */
4909 5104
4910static void trace_insert_enum_map(struct module *mod, 5105static void trace_insert_eval_map(struct module *mod,
4911 struct trace_enum_map **start, int len) 5106 struct trace_eval_map **start, int len)
4912{ 5107{
4913 struct trace_enum_map **map; 5108 struct trace_eval_map **map;
4914 5109
4915 if (len <= 0) 5110 if (len <= 0)
4916 return; 5111 return;
4917 5112
4918 map = start; 5113 map = start;
4919 5114
4920 trace_event_enum_update(map, len); 5115 trace_event_eval_update(map, len);
4921 5116
4922 trace_insert_enum_map_file(mod, start, len); 5117 trace_insert_eval_map_file(mod, start, len);
4923} 5118}
4924 5119
4925static ssize_t 5120static ssize_t
@@ -6739,33 +6934,18 @@ static const struct file_operations tracing_stats_fops = {
6739 6934
6740#ifdef CONFIG_DYNAMIC_FTRACE 6935#ifdef CONFIG_DYNAMIC_FTRACE
6741 6936
6742int __weak ftrace_arch_read_dyn_info(char *buf, int size)
6743{
6744 return 0;
6745}
6746
6747static ssize_t 6937static ssize_t
6748tracing_read_dyn_info(struct file *filp, char __user *ubuf, 6938tracing_read_dyn_info(struct file *filp, char __user *ubuf,
6749 size_t cnt, loff_t *ppos) 6939 size_t cnt, loff_t *ppos)
6750{ 6940{
6751 static char ftrace_dyn_info_buffer[1024];
6752 static DEFINE_MUTEX(dyn_info_mutex);
6753 unsigned long *p = filp->private_data; 6941 unsigned long *p = filp->private_data;
6754 char *buf = ftrace_dyn_info_buffer; 6942 char buf[64]; /* Not too big for a shallow stack */
6755 int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
6756 int r; 6943 int r;
6757 6944
6758 mutex_lock(&dyn_info_mutex); 6945 r = scnprintf(buf, 63, "%ld", *p);
6759 r = sprintf(buf, "%ld ", *p);
6760
6761 r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
6762 buf[r++] = '\n'; 6946 buf[r++] = '\n';
6763 6947
6764 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 6948 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
6765
6766 mutex_unlock(&dyn_info_mutex);
6767
6768 return r;
6769} 6949}
6770 6950
6771static const struct file_operations tracing_dyn_info_fops = { 6951static const struct file_operations tracing_dyn_info_fops = {
@@ -6881,6 +7061,9 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
6881 char *number; 7061 char *number;
6882 int ret; 7062 int ret;
6883 7063
7064 if (!tr)
7065 return -ENODEV;
7066
6884 /* hash funcs only work with set_ftrace_filter */ 7067 /* hash funcs only work with set_ftrace_filter */
6885 if (!enable) 7068 if (!enable)
6886 return -EINVAL; 7069 return -EINVAL;
@@ -7591,6 +7774,7 @@ static int instance_rmdir(const char *name)
7591 } 7774 }
7592 kfree(tr->topts); 7775 kfree(tr->topts);
7593 7776
7777 free_cpumask_var(tr->tracing_cpumask);
7594 kfree(tr->name); 7778 kfree(tr->name);
7595 kfree(tr); 7779 kfree(tr);
7596 7780
@@ -7734,21 +7918,21 @@ struct dentry *tracing_init_dentry(void)
7734 return NULL; 7918 return NULL;
7735} 7919}
7736 7920
7737extern struct trace_enum_map *__start_ftrace_enum_maps[]; 7921extern struct trace_eval_map *__start_ftrace_eval_maps[];
7738extern struct trace_enum_map *__stop_ftrace_enum_maps[]; 7922extern struct trace_eval_map *__stop_ftrace_eval_maps[];
7739 7923
7740static void __init trace_enum_init(void) 7924static void __init trace_eval_init(void)
7741{ 7925{
7742 int len; 7926 int len;
7743 7927
7744 len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; 7928 len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps;
7745 trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len); 7929 trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len);
7746} 7930}
7747 7931
7748#ifdef CONFIG_MODULES 7932#ifdef CONFIG_MODULES
7749static void trace_module_add_enums(struct module *mod) 7933static void trace_module_add_evals(struct module *mod)
7750{ 7934{
7751 if (!mod->num_trace_enums) 7935 if (!mod->num_trace_evals)
7752 return; 7936 return;
7753 7937
7754 /* 7938 /*
@@ -7758,40 +7942,40 @@ static void trace_module_add_enums(struct module *mod)
7758 if (trace_module_has_bad_taint(mod)) 7942 if (trace_module_has_bad_taint(mod))
7759 return; 7943 return;
7760 7944
7761 trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); 7945 trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
7762} 7946}
7763 7947
7764#ifdef CONFIG_TRACE_ENUM_MAP_FILE 7948#ifdef CONFIG_TRACE_EVAL_MAP_FILE
7765static void trace_module_remove_enums(struct module *mod) 7949static void trace_module_remove_evals(struct module *mod)
7766{ 7950{
7767 union trace_enum_map_item *map; 7951 union trace_eval_map_item *map;
7768 union trace_enum_map_item **last = &trace_enum_maps; 7952 union trace_eval_map_item **last = &trace_eval_maps;
7769 7953
7770 if (!mod->num_trace_enums) 7954 if (!mod->num_trace_evals)
7771 return; 7955 return;
7772 7956
7773 mutex_lock(&trace_enum_mutex); 7957 mutex_lock(&trace_eval_mutex);
7774 7958
7775 map = trace_enum_maps; 7959 map = trace_eval_maps;
7776 7960
7777 while (map) { 7961 while (map) {
7778 if (map->head.mod == mod) 7962 if (map->head.mod == mod)
7779 break; 7963 break;
7780 map = trace_enum_jmp_to_tail(map); 7964 map = trace_eval_jmp_to_tail(map);
7781 last = &map->tail.next; 7965 last = &map->tail.next;
7782 map = map->tail.next; 7966 map = map->tail.next;
7783 } 7967 }
7784 if (!map) 7968 if (!map)
7785 goto out; 7969 goto out;
7786 7970
7787 *last = trace_enum_jmp_to_tail(map)->tail.next; 7971 *last = trace_eval_jmp_to_tail(map)->tail.next;
7788 kfree(map); 7972 kfree(map);
7789 out: 7973 out:
7790 mutex_unlock(&trace_enum_mutex); 7974 mutex_unlock(&trace_eval_mutex);
7791} 7975}
7792#else 7976#else
7793static inline void trace_module_remove_enums(struct module *mod) { } 7977static inline void trace_module_remove_evals(struct module *mod) { }
7794#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ 7978#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
7795 7979
7796static int trace_module_notify(struct notifier_block *self, 7980static int trace_module_notify(struct notifier_block *self,
7797 unsigned long val, void *data) 7981 unsigned long val, void *data)
@@ -7800,10 +7984,10 @@ static int trace_module_notify(struct notifier_block *self,
7800 7984
7801 switch (val) { 7985 switch (val) {
7802 case MODULE_STATE_COMING: 7986 case MODULE_STATE_COMING:
7803 trace_module_add_enums(mod); 7987 trace_module_add_evals(mod);
7804 break; 7988 break;
7805 case MODULE_STATE_GOING: 7989 case MODULE_STATE_GOING:
7806 trace_module_remove_enums(mod); 7990 trace_module_remove_evals(mod);
7807 break; 7991 break;
7808 } 7992 }
7809 7993
@@ -7841,9 +8025,12 @@ static __init int tracer_init_tracefs(void)
7841 trace_create_file("saved_cmdlines_size", 0644, d_tracer, 8025 trace_create_file("saved_cmdlines_size", 0644, d_tracer,
7842 NULL, &tracing_saved_cmdlines_size_fops); 8026 NULL, &tracing_saved_cmdlines_size_fops);
7843 8027
7844 trace_enum_init(); 8028 trace_create_file("saved_tgids", 0444, d_tracer,
8029 NULL, &tracing_saved_tgids_fops);
8030
8031 trace_eval_init();
7845 8032
7846 trace_create_enum_file(d_tracer); 8033 trace_create_eval_file(d_tracer);
7847 8034
7848#ifdef CONFIG_MODULES 8035#ifdef CONFIG_MODULES
7849 register_module_notifier(&trace_module_nb); 8036 register_module_notifier(&trace_module_nb);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 39fd77330aab..490ba229931d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -263,7 +263,10 @@ struct trace_array {
263 struct ftrace_ops *ops; 263 struct ftrace_ops *ops;
264 struct trace_pid_list __rcu *function_pids; 264 struct trace_pid_list __rcu *function_pids;
265#ifdef CONFIG_DYNAMIC_FTRACE 265#ifdef CONFIG_DYNAMIC_FTRACE
266 /* All of these are protected by the ftrace_lock */
266 struct list_head func_probes; 267 struct list_head func_probes;
268 struct list_head mod_trace;
269 struct list_head mod_notrace;
267#endif 270#endif
268 /* function tracing enabled */ 271 /* function tracing enabled */
269 int function_enabled; 272 int function_enabled;
@@ -637,6 +640,9 @@ void set_graph_array(struct trace_array *tr);
637 640
638void tracing_start_cmdline_record(void); 641void tracing_start_cmdline_record(void);
639void tracing_stop_cmdline_record(void); 642void tracing_stop_cmdline_record(void);
643void tracing_start_tgid_record(void);
644void tracing_stop_tgid_record(void);
645
640int register_tracer(struct tracer *type); 646int register_tracer(struct tracer *type);
641int is_tracing_stopped(void); 647int is_tracing_stopped(void);
642 648
@@ -697,6 +703,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
697extern u64 ftrace_now(int cpu); 703extern u64 ftrace_now(int cpu);
698 704
699extern void trace_find_cmdline(int pid, char comm[]); 705extern void trace_find_cmdline(int pid, char comm[]);
706extern int trace_find_tgid(int pid);
700extern void trace_event_follow_fork(struct trace_array *tr, bool enable); 707extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
701 708
702#ifdef CONFIG_DYNAMIC_FTRACE 709#ifdef CONFIG_DYNAMIC_FTRACE
@@ -761,10 +768,24 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
761 768
762extern char trace_find_mark(unsigned long long duration); 769extern char trace_find_mark(unsigned long long duration);
763 770
771struct ftrace_hash;
772
773struct ftrace_mod_load {
774 struct list_head list;
775 char *func;
776 char *module;
777 int enable;
778};
779
780enum {
781 FTRACE_HASH_FL_MOD = (1 << 0),
782};
783
764struct ftrace_hash { 784struct ftrace_hash {
765 unsigned long size_bits; 785 unsigned long size_bits;
766 struct hlist_head *buckets; 786 struct hlist_head *buckets;
767 unsigned long count; 787 unsigned long count;
788 unsigned long flags;
768 struct rcu_head rcu; 789 struct rcu_head rcu;
769}; 790};
770 791
@@ -773,7 +794,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip);
773 794
774static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) 795static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
775{ 796{
776 return !hash || !hash->count; 797 return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD));
777} 798}
778 799
779/* Standard output formatting function used for function return traces */ 800/* Standard output formatting function used for function return traces */
@@ -1107,6 +1128,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
1107 C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ 1128 C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \
1108 C(LATENCY_FMT, "latency-format"), \ 1129 C(LATENCY_FMT, "latency-format"), \
1109 C(RECORD_CMD, "record-cmd"), \ 1130 C(RECORD_CMD, "record-cmd"), \
1131 C(RECORD_TGID, "record-tgid"), \
1110 C(OVERWRITE, "overwrite"), \ 1132 C(OVERWRITE, "overwrite"), \
1111 C(STOP_ON_FREE, "disable_on_free"), \ 1133 C(STOP_ON_FREE, "disable_on_free"), \
1112 C(IRQ_INFO, "irq-info"), \ 1134 C(IRQ_INFO, "irq-info"), \
@@ -1188,9 +1210,9 @@ struct ftrace_event_field {
1188struct event_filter { 1210struct event_filter {
1189 int n_preds; /* Number assigned */ 1211 int n_preds; /* Number assigned */
1190 int a_preds; /* allocated */ 1212 int a_preds; /* allocated */
1191 struct filter_pred *preds; 1213 struct filter_pred __rcu *preds;
1192 struct filter_pred *root; 1214 struct filter_pred __rcu *root;
1193 char *filter_string; 1215 char *filter_string;
1194}; 1216};
1195 1217
1196struct event_subsystem { 1218struct event_subsystem {
@@ -1423,6 +1445,8 @@ struct ftrace_event_field *
1423trace_find_event_field(struct trace_event_call *call, char *name); 1445trace_find_event_field(struct trace_event_call *call, char *name);
1424 1446
1425extern void trace_event_enable_cmd_record(bool enable); 1447extern void trace_event_enable_cmd_record(bool enable);
1448extern void trace_event_enable_tgid_record(bool enable);
1449
1426extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); 1450extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
1427extern int event_trace_del_tracer(struct trace_array *tr); 1451extern int event_trace_del_tracer(struct trace_array *tr);
1428 1452
@@ -1773,10 +1797,10 @@ static inline const char *get_syscall_name(int syscall)
1773 1797
1774#ifdef CONFIG_EVENT_TRACING 1798#ifdef CONFIG_EVENT_TRACING
1775void trace_event_init(void); 1799void trace_event_init(void);
1776void trace_event_enum_update(struct trace_enum_map **map, int len); 1800void trace_event_eval_update(struct trace_eval_map **map, int len);
1777#else 1801#else
1778static inline void __init trace_event_init(void) { } 1802static inline void __init trace_event_init(void) { }
1779static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } 1803static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
1780#endif 1804#endif
1781 1805
1782extern struct trace_iterator *tracepoint_print_iter; 1806extern struct trace_iterator *tracepoint_print_iter;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e7973e10398c..36132f9280e6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -343,6 +343,28 @@ void trace_event_enable_cmd_record(bool enable)
343 mutex_unlock(&event_mutex); 343 mutex_unlock(&event_mutex);
344} 344}
345 345
346void trace_event_enable_tgid_record(bool enable)
347{
348 struct trace_event_file *file;
349 struct trace_array *tr;
350
351 mutex_lock(&event_mutex);
352 do_for_each_event_file(tr, file) {
353 if (!(file->flags & EVENT_FILE_FL_ENABLED))
354 continue;
355
356 if (enable) {
357 tracing_start_tgid_record();
358 set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
359 } else {
360 tracing_stop_tgid_record();
361 clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT,
362 &file->flags);
363 }
364 } while_for_each_event_file();
365 mutex_unlock(&event_mutex);
366}
367
346static int __ftrace_event_enable_disable(struct trace_event_file *file, 368static int __ftrace_event_enable_disable(struct trace_event_file *file,
347 int enable, int soft_disable) 369 int enable, int soft_disable)
348{ 370{
@@ -381,6 +403,12 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
381 tracing_stop_cmdline_record(); 403 tracing_stop_cmdline_record();
382 clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); 404 clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
383 } 405 }
406
407 if (file->flags & EVENT_FILE_FL_RECORDED_TGID) {
408 tracing_stop_tgid_record();
409 clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
410 }
411
384 call->class->reg(call, TRACE_REG_UNREGISTER, file); 412 call->class->reg(call, TRACE_REG_UNREGISTER, file);
385 } 413 }
386 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ 414 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
@@ -407,18 +435,30 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
407 } 435 }
408 436
409 if (!(file->flags & EVENT_FILE_FL_ENABLED)) { 437 if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
438 bool cmd = false, tgid = false;
410 439
411 /* Keep the event disabled, when going to SOFT_MODE. */ 440 /* Keep the event disabled, when going to SOFT_MODE. */
412 if (soft_disable) 441 if (soft_disable)
413 set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); 442 set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
414 443
415 if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { 444 if (tr->trace_flags & TRACE_ITER_RECORD_CMD) {
445 cmd = true;
416 tracing_start_cmdline_record(); 446 tracing_start_cmdline_record();
417 set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); 447 set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
418 } 448 }
449
450 if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
451 tgid = true;
452 tracing_start_tgid_record();
453 set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
454 }
455
419 ret = call->class->reg(call, TRACE_REG_REGISTER, file); 456 ret = call->class->reg(call, TRACE_REG_REGISTER, file);
420 if (ret) { 457 if (ret) {
421 tracing_stop_cmdline_record(); 458 if (cmd)
459 tracing_stop_cmdline_record();
460 if (tgid)
461 tracing_stop_tgid_record();
422 pr_info("event trace: Could not enable event " 462 pr_info("event trace: Could not enable event "
423 "%s\n", trace_event_name(call)); 463 "%s\n", trace_event_name(call));
424 break; 464 break;
@@ -2067,18 +2107,18 @@ __register_event(struct trace_event_call *call, struct module *mod)
2067 return 0; 2107 return 0;
2068} 2108}
2069 2109
2070static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) 2110static char *eval_replace(char *ptr, struct trace_eval_map *map, int len)
2071{ 2111{
2072 int rlen; 2112 int rlen;
2073 int elen; 2113 int elen;
2074 2114
2075 /* Find the length of the enum value as a string */ 2115 /* Find the length of the eval value as a string */
2076 elen = snprintf(ptr, 0, "%ld", map->enum_value); 2116 elen = snprintf(ptr, 0, "%ld", map->eval_value);
2077 /* Make sure there's enough room to replace the string with the value */ 2117 /* Make sure there's enough room to replace the string with the value */
2078 if (len < elen) 2118 if (len < elen)
2079 return NULL; 2119 return NULL;
2080 2120
2081 snprintf(ptr, elen + 1, "%ld", map->enum_value); 2121 snprintf(ptr, elen + 1, "%ld", map->eval_value);
2082 2122
2083 /* Get the rest of the string of ptr */ 2123 /* Get the rest of the string of ptr */
2084 rlen = strlen(ptr + len); 2124 rlen = strlen(ptr + len);
@@ -2090,11 +2130,11 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
2090} 2130}
2091 2131
2092static void update_event_printk(struct trace_event_call *call, 2132static void update_event_printk(struct trace_event_call *call,
2093 struct trace_enum_map *map) 2133 struct trace_eval_map *map)
2094{ 2134{
2095 char *ptr; 2135 char *ptr;
2096 int quote = 0; 2136 int quote = 0;
2097 int len = strlen(map->enum_string); 2137 int len = strlen(map->eval_string);
2098 2138
2099 for (ptr = call->print_fmt; *ptr; ptr++) { 2139 for (ptr = call->print_fmt; *ptr; ptr++) {
2100 if (*ptr == '\\') { 2140 if (*ptr == '\\') {
@@ -2125,16 +2165,16 @@ static void update_event_printk(struct trace_event_call *call,
2125 continue; 2165 continue;
2126 } 2166 }
2127 if (isalpha(*ptr) || *ptr == '_') { 2167 if (isalpha(*ptr) || *ptr == '_') {
2128 if (strncmp(map->enum_string, ptr, len) == 0 && 2168 if (strncmp(map->eval_string, ptr, len) == 0 &&
2129 !isalnum(ptr[len]) && ptr[len] != '_') { 2169 !isalnum(ptr[len]) && ptr[len] != '_') {
2130 ptr = enum_replace(ptr, map, len); 2170 ptr = eval_replace(ptr, map, len);
2131 /* Hmm, enum string smaller than value */ 2171 /* enum/sizeof string smaller than value */
2132 if (WARN_ON_ONCE(!ptr)) 2172 if (WARN_ON_ONCE(!ptr))
2133 return; 2173 return;
2134 /* 2174 /*
2135 * No need to decrement here, as enum_replace() 2175 * No need to decrement here, as eval_replace()
2136 * returns the pointer to the character passed 2176 * returns the pointer to the character passed
2137 * the enum, and two enums can not be placed 2177 * the eval, and two evals can not be placed
2138 * back to back without something in between. 2178 * back to back without something in between.
2139 * We can skip that something in between. 2179 * We can skip that something in between.
2140 */ 2180 */
@@ -2165,7 +2205,7 @@ static void update_event_printk(struct trace_event_call *call,
2165 } 2205 }
2166} 2206}
2167 2207
2168void trace_event_enum_update(struct trace_enum_map **map, int len) 2208void trace_event_eval_update(struct trace_eval_map **map, int len)
2169{ 2209{
2170 struct trace_event_call *call, *p; 2210 struct trace_event_call *call, *p;
2171 const char *last_system = NULL; 2211 const char *last_system = NULL;
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index a3bddbfd0874..a0910c0cdf2e 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -654,6 +654,9 @@ ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash,
654{ 654{
655 struct ftrace_probe_ops *ops; 655 struct ftrace_probe_ops *ops;
656 656
657 if (!tr)
658 return -ENODEV;
659
657 /* we register both traceon and traceoff to this callback */ 660 /* we register both traceon and traceoff to this callback */
658 if (strcmp(cmd, "traceon") == 0) 661 if (strcmp(cmd, "traceon") == 0)
659 ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; 662 ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
@@ -670,6 +673,9 @@ ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash,
670{ 673{
671 struct ftrace_probe_ops *ops; 674 struct ftrace_probe_ops *ops;
672 675
676 if (!tr)
677 return -ENODEV;
678
673 ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; 679 ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
674 680
675 return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, 681 return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
@@ -682,6 +688,9 @@ ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash,
682{ 688{
683 struct ftrace_probe_ops *ops; 689 struct ftrace_probe_ops *ops;
684 690
691 if (!tr)
692 return -ENODEV;
693
685 ops = &dump_probe_ops; 694 ops = &dump_probe_ops;
686 695
687 /* Only dump once. */ 696 /* Only dump once. */
@@ -695,6 +704,9 @@ ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash,
695{ 704{
696 struct ftrace_probe_ops *ops; 705 struct ftrace_probe_ops *ops;
697 706
707 if (!tr)
708 return -ENODEV;
709
698 ops = &cpudump_probe_ops; 710 ops = &cpudump_probe_ops;
699 711
700 /* Only dump once. */ 712 /* Only dump once. */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c129fca6ec99..c9b5aa10fbf9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -598,6 +598,14 @@ static struct notifier_block trace_kprobe_module_nb = {
598 .priority = 1 /* Invoked after kprobe module callback */ 598 .priority = 1 /* Invoked after kprobe module callback */
599}; 599};
600 600
601/* Convert certain expected symbols into '_' when generating event names */
602static inline void sanitize_event_name(char *name)
603{
604 while (*name++ != '\0')
605 if (*name == ':' || *name == '.')
606 *name = '_';
607}
608
601static int create_trace_kprobe(int argc, char **argv) 609static int create_trace_kprobe(int argc, char **argv)
602{ 610{
603 /* 611 /*
@@ -707,24 +715,20 @@ static int create_trace_kprobe(int argc, char **argv)
707 pr_info("Probe point is not specified.\n"); 715 pr_info("Probe point is not specified.\n");
708 return -EINVAL; 716 return -EINVAL;
709 } 717 }
710 if (isdigit(argv[1][0])) { 718
711 /* an address specified */ 719 /* try to parse an address. if that fails, try to read the
712 ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); 720 * input as a symbol. */
713 if (ret) { 721 if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
714 pr_info("Failed to parse address.\n");
715 return ret;
716 }
717 } else {
718 /* a symbol specified */ 722 /* a symbol specified */
719 symbol = argv[1]; 723 symbol = argv[1];
720 /* TODO: support .init module functions */ 724 /* TODO: support .init module functions */
721 ret = traceprobe_split_symbol_offset(symbol, &offset); 725 ret = traceprobe_split_symbol_offset(symbol, &offset);
722 if (ret) { 726 if (ret) {
723 pr_info("Failed to parse symbol.\n"); 727 pr_info("Failed to parse either an address or a symbol.\n");
724 return ret; 728 return ret;
725 } 729 }
726 if (offset && is_return && 730 if (offset && is_return &&
727 !function_offset_within_entry(NULL, symbol, offset)) { 731 !kprobe_on_func_entry(NULL, symbol, offset)) {
728 pr_info("Given offset is not valid for return probe.\n"); 732 pr_info("Given offset is not valid for return probe.\n");
729 return -EINVAL; 733 return -EINVAL;
730 } 734 }
@@ -740,6 +744,7 @@ static int create_trace_kprobe(int argc, char **argv)
740 else 744 else
741 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", 745 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
742 is_return ? 'r' : 'p', addr); 746 is_return ? 'r' : 'p', addr);
747 sanitize_event_name(buf);
743 event = buf; 748 event = buf;
744 } 749 }
745 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, 750 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 08f9bab8089e..bac629af2285 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -340,31 +340,41 @@ static inline const char *kretprobed(const char *name)
340static void 340static void
341seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) 341seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
342{ 342{
343#ifdef CONFIG_KALLSYMS
344 char str[KSYM_SYMBOL_LEN]; 343 char str[KSYM_SYMBOL_LEN];
344#ifdef CONFIG_KALLSYMS
345 const char *name; 345 const char *name;
346 346
347 kallsyms_lookup(address, NULL, NULL, NULL, str); 347 kallsyms_lookup(address, NULL, NULL, NULL, str);
348 348
349 name = kretprobed(str); 349 name = kretprobed(str);
350 350
351 trace_seq_printf(s, fmt, name); 351 if (name && strlen(name)) {
352 trace_seq_printf(s, fmt, name);
353 return;
354 }
352#endif 355#endif
356 snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
357 trace_seq_printf(s, fmt, str);
353} 358}
354 359
355static void 360static void
356seq_print_sym_offset(struct trace_seq *s, const char *fmt, 361seq_print_sym_offset(struct trace_seq *s, const char *fmt,
357 unsigned long address) 362 unsigned long address)
358{ 363{
359#ifdef CONFIG_KALLSYMS
360 char str[KSYM_SYMBOL_LEN]; 364 char str[KSYM_SYMBOL_LEN];
365#ifdef CONFIG_KALLSYMS
361 const char *name; 366 const char *name;
362 367
363 sprint_symbol(str, address); 368 sprint_symbol(str, address);
364 name = kretprobed(str); 369 name = kretprobed(str);
365 370
366 trace_seq_printf(s, fmt, name); 371 if (name && strlen(name)) {
372 trace_seq_printf(s, fmt, name);
373 return;
374 }
367#endif 375#endif
376 snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
377 trace_seq_printf(s, fmt, str);
368} 378}
369 379
370#ifndef CONFIG_64BIT 380#ifndef CONFIG_64BIT
@@ -587,6 +597,15 @@ int trace_print_context(struct trace_iterator *iter)
587 trace_seq_printf(s, "%16s-%-5d [%03d] ", 597 trace_seq_printf(s, "%16s-%-5d [%03d] ",
588 comm, entry->pid, iter->cpu); 598 comm, entry->pid, iter->cpu);
589 599
600 if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
601 unsigned int tgid = trace_find_tgid(entry->pid);
602
603 if (!tgid)
604 trace_seq_printf(s, "(-----) ");
605 else
606 trace_seq_printf(s, "(%5d) ", tgid);
607 }
608
590 if (tr->trace_flags & TRACE_ITER_IRQ_INFO) 609 if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
591 trace_print_lat_fmt(s, entry); 610 trace_print_lat_fmt(s, entry);
592 611
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 4c896a0101bd..b341c02730be 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -12,27 +12,38 @@
12 12
13#include "trace.h" 13#include "trace.h"
14 14
15static int sched_ref; 15#define RECORD_CMDLINE 1
16#define RECORD_TGID 2
17
18static int sched_cmdline_ref;
19static int sched_tgid_ref;
16static DEFINE_MUTEX(sched_register_mutex); 20static DEFINE_MUTEX(sched_register_mutex);
17 21
18static void 22static void
19probe_sched_switch(void *ignore, bool preempt, 23probe_sched_switch(void *ignore, bool preempt,
20 struct task_struct *prev, struct task_struct *next) 24 struct task_struct *prev, struct task_struct *next)
21{ 25{
22 if (unlikely(!sched_ref)) 26 int flags;
23 return; 27
28 flags = (RECORD_TGID * !!sched_tgid_ref) +
29 (RECORD_CMDLINE * !!sched_cmdline_ref);
24 30
25 tracing_record_cmdline(prev); 31 if (!flags)
26 tracing_record_cmdline(next); 32 return;
33 tracing_record_taskinfo_sched_switch(prev, next, flags);
27} 34}
28 35
29static void 36static void
30probe_sched_wakeup(void *ignore, struct task_struct *wakee) 37probe_sched_wakeup(void *ignore, struct task_struct *wakee)
31{ 38{
32 if (unlikely(!sched_ref)) 39 int flags;
33 return; 40
41 flags = (RECORD_TGID * !!sched_tgid_ref) +
42 (RECORD_CMDLINE * !!sched_cmdline_ref);
34 43
35 tracing_record_cmdline(current); 44 if (!flags)
45 return;
46 tracing_record_taskinfo(current, flags);
36} 47}
37 48
38static int tracing_sched_register(void) 49static int tracing_sched_register(void)
@@ -75,28 +86,61 @@ static void tracing_sched_unregister(void)
75 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); 86 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
76} 87}
77 88
78static void tracing_start_sched_switch(void) 89static void tracing_start_sched_switch(int ops)
79{ 90{
91 bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
80 mutex_lock(&sched_register_mutex); 92 mutex_lock(&sched_register_mutex);
81 if (!(sched_ref++)) 93
94 switch (ops) {
95 case RECORD_CMDLINE:
96 sched_cmdline_ref++;
97 break;
98
99 case RECORD_TGID:
100 sched_tgid_ref++;
101 break;
102 }
103
104 if (sched_register && (sched_cmdline_ref || sched_tgid_ref))
82 tracing_sched_register(); 105 tracing_sched_register();
83 mutex_unlock(&sched_register_mutex); 106 mutex_unlock(&sched_register_mutex);
84} 107}
85 108
86static void tracing_stop_sched_switch(void) 109static void tracing_stop_sched_switch(int ops)
87{ 110{
88 mutex_lock(&sched_register_mutex); 111 mutex_lock(&sched_register_mutex);
89 if (!(--sched_ref)) 112
113 switch (ops) {
114 case RECORD_CMDLINE:
115 sched_cmdline_ref--;
116 break;
117
118 case RECORD_TGID:
119 sched_tgid_ref--;
120 break;
121 }
122
123 if (!sched_cmdline_ref && !sched_tgid_ref)
90 tracing_sched_unregister(); 124 tracing_sched_unregister();
91 mutex_unlock(&sched_register_mutex); 125 mutex_unlock(&sched_register_mutex);
92} 126}
93 127
94void tracing_start_cmdline_record(void) 128void tracing_start_cmdline_record(void)
95{ 129{
96 tracing_start_sched_switch(); 130 tracing_start_sched_switch(RECORD_CMDLINE);
97} 131}
98 132
99void tracing_stop_cmdline_record(void) 133void tracing_stop_cmdline_record(void)
100{ 134{
101 tracing_stop_sched_switch(); 135 tracing_stop_sched_switch(RECORD_CMDLINE);
136}
137
138void tracing_start_tgid_record(void)
139{
140 tracing_start_sched_switch(RECORD_TGID);
141}
142
143void tracing_stop_tgid_record(void)
144{
145 tracing_stop_sched_switch(RECORD_TGID);
102} 146}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 76aa04d4c925..a4df67cbc711 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -406,10 +406,14 @@ static const struct file_operations stack_trace_fops = {
406 .release = seq_release, 406 .release = seq_release,
407}; 407};
408 408
409#ifdef CONFIG_DYNAMIC_FTRACE
410
409static int 411static int
410stack_trace_filter_open(struct inode *inode, struct file *file) 412stack_trace_filter_open(struct inode *inode, struct file *file)
411{ 413{
412 return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, 414 struct ftrace_ops *ops = inode->i_private;
415
416 return ftrace_regex_open(ops, FTRACE_ITER_FILTER,
413 inode, file); 417 inode, file);
414} 418}
415 419
@@ -421,6 +425,8 @@ static const struct file_operations stack_trace_filter_fops = {
421 .release = ftrace_regex_release, 425 .release = ftrace_regex_release,
422}; 426};
423 427
428#endif /* CONFIG_DYNAMIC_FTRACE */
429
424int 430int
425stack_trace_sysctl(struct ctl_table *table, int write, 431stack_trace_sysctl(struct ctl_table *table, int write,
426 void __user *buffer, size_t *lenp, 432 void __user *buffer, size_t *lenp,
@@ -475,8 +481,10 @@ static __init int stack_trace_init(void)
475 trace_create_file("stack_trace", 0444, d_tracer, 481 trace_create_file("stack_trace", 0444, d_tracer,
476 NULL, &stack_trace_fops); 482 NULL, &stack_trace_fops);
477 483
484#ifdef CONFIG_DYNAMIC_FTRACE
478 trace_create_file("stack_trace_filter", 0444, d_tracer, 485 trace_create_file("stack_trace_filter", 0444, d_tracer,
479 NULL, &stack_trace_filter_fops); 486 &trace_ops, &stack_trace_filter_fops);
487#endif
480 488
481 if (stack_trace_filter_buf[0]) 489 if (stack_trace_filter_buf[0])
482 ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); 490 ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 03e0b69bb5bf..06d3389bca0d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -9,7 +9,7 @@
9 * to those contributors as well. 9 * to those contributors as well.
10 */ 10 */
11 11
12#define pr_fmt(fmt) "NMI watchdog: " fmt 12#define pr_fmt(fmt) "watchdog: " fmt
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
@@ -29,15 +29,58 @@
29#include <linux/kvm_para.h> 29#include <linux/kvm_para.h>
30#include <linux/kthread.h> 30#include <linux/kthread.h>
31 31
32/* Watchdog configuration */
32static DEFINE_MUTEX(watchdog_proc_mutex); 33static DEFINE_MUTEX(watchdog_proc_mutex);
33 34
34#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) 35int __read_mostly nmi_watchdog_enabled;
35unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; 36
37#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
38unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
39 NMI_WATCHDOG_ENABLED;
36#else 40#else
37unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; 41unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
38#endif 42#endif
39int __read_mostly nmi_watchdog_enabled; 43
44#ifdef CONFIG_HARDLOCKUP_DETECTOR
45/* boot commands */
46/*
47 * Should we panic when a soft-lockup or hard-lockup occurs:
48 */
49unsigned int __read_mostly hardlockup_panic =
50 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
51/*
52 * We may not want to enable hard lockup detection by default in all cases,
53 * for example when running the kernel as a guest on a hypervisor. In these
54 * cases this function can be called to disable hard lockup detection. This
55 * function should only be executed once by the boot processor before the
56 * kernel command line parameters are parsed, because otherwise it is not
57 * possible to override this in hardlockup_panic_setup().
58 */
59void hardlockup_detector_disable(void)
60{
61 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
62}
63
64static int __init hardlockup_panic_setup(char *str)
65{
66 if (!strncmp(str, "panic", 5))
67 hardlockup_panic = 1;
68 else if (!strncmp(str, "nopanic", 7))
69 hardlockup_panic = 0;
70 else if (!strncmp(str, "0", 1))
71 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
72 else if (!strncmp(str, "1", 1))
73 watchdog_enabled |= NMI_WATCHDOG_ENABLED;
74 return 1;
75}
76__setup("nmi_watchdog=", hardlockup_panic_setup);
77
78#endif
79
80#ifdef CONFIG_SOFTLOCKUP_DETECTOR
40int __read_mostly soft_watchdog_enabled; 81int __read_mostly soft_watchdog_enabled;
82#endif
83
41int __read_mostly watchdog_user_enabled; 84int __read_mostly watchdog_user_enabled;
42int __read_mostly watchdog_thresh = 10; 85int __read_mostly watchdog_thresh = 10;
43 86
@@ -45,15 +88,9 @@ int __read_mostly watchdog_thresh = 10;
45int __read_mostly sysctl_softlockup_all_cpu_backtrace; 88int __read_mostly sysctl_softlockup_all_cpu_backtrace;
46int __read_mostly sysctl_hardlockup_all_cpu_backtrace; 89int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
47#endif 90#endif
48static struct cpumask watchdog_cpumask __read_mostly; 91struct cpumask watchdog_cpumask __read_mostly;
49unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); 92unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
50 93
51/* Helper for online, unparked cpus. */
52#define for_each_watchdog_cpu(cpu) \
53 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
54
55atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
56
57/* 94/*
58 * The 'watchdog_running' variable is set to 1 when the watchdog threads 95 * The 'watchdog_running' variable is set to 1 when the watchdog threads
59 * are registered/started and is set to 0 when the watchdog threads are 96 * are registered/started and is set to 0 when the watchdog threads are
@@ -72,7 +109,47 @@ static int __read_mostly watchdog_running;
72 * of 'watchdog_running' cannot change while the watchdog is deactivated 109 * of 'watchdog_running' cannot change while the watchdog is deactivated
73 * temporarily (see related code in 'proc' handlers). 110 * temporarily (see related code in 'proc' handlers).
74 */ 111 */
75static int __read_mostly watchdog_suspended; 112int __read_mostly watchdog_suspended;
113
114/*
115 * These functions can be overridden if an architecture implements its
116 * own hardlockup detector.
117 *
118 * watchdog_nmi_enable/disable can be implemented to start and stop when
119 * softlockup watchdog threads start and stop. The arch must select the
120 * SOFTLOCKUP_DETECTOR Kconfig.
121 */
122int __weak watchdog_nmi_enable(unsigned int cpu)
123{
124 return 0;
125}
126void __weak watchdog_nmi_disable(unsigned int cpu)
127{
128}
129
130/*
131 * watchdog_nmi_reconfigure can be implemented to be notified after any
132 * watchdog configuration change. The arch hardlockup watchdog should
133 * respond to the following variables:
134 * - nmi_watchdog_enabled
135 * - watchdog_thresh
136 * - watchdog_cpumask
137 * - sysctl_hardlockup_all_cpu_backtrace
138 * - hardlockup_panic
139 * - watchdog_suspended
140 */
141void __weak watchdog_nmi_reconfigure(void)
142{
143}
144
145
146#ifdef CONFIG_SOFTLOCKUP_DETECTOR
147
148/* Helper for online, unparked cpus. */
149#define for_each_watchdog_cpu(cpu) \
150 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
151
152atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
76 153
77static u64 __read_mostly sample_period; 154static u64 __read_mostly sample_period;
78 155
@@ -120,6 +197,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
120 return 1; 197 return 1;
121} 198}
122__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); 199__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
200#ifdef CONFIG_HARDLOCKUP_DETECTOR
123static int __init hardlockup_all_cpu_backtrace_setup(char *str) 201static int __init hardlockup_all_cpu_backtrace_setup(char *str)
124{ 202{
125 sysctl_hardlockup_all_cpu_backtrace = 203 sysctl_hardlockup_all_cpu_backtrace =
@@ -128,6 +206,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str)
128} 206}
129__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); 207__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
130#endif 208#endif
209#endif
131 210
132/* 211/*
133 * Hard-lockup warnings should be triggered after just a few seconds. Soft- 212 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -213,18 +292,6 @@ void touch_softlockup_watchdog_sync(void)
213 __this_cpu_write(watchdog_touch_ts, 0); 292 __this_cpu_write(watchdog_touch_ts, 0);
214} 293}
215 294
216/* watchdog detector functions */
217bool is_hardlockup(void)
218{
219 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
220
221 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
222 return true;
223
224 __this_cpu_write(hrtimer_interrupts_saved, hrint);
225 return false;
226}
227
228static int is_softlockup(unsigned long touch_ts) 295static int is_softlockup(unsigned long touch_ts)
229{ 296{
230 unsigned long now = get_timestamp(); 297 unsigned long now = get_timestamp();
@@ -237,21 +304,21 @@ static int is_softlockup(unsigned long touch_ts)
237 return 0; 304 return 0;
238} 305}
239 306
240static void watchdog_interrupt_count(void) 307/* watchdog detector functions */
308bool is_hardlockup(void)
241{ 309{
242 __this_cpu_inc(hrtimer_interrupts); 310 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
243}
244 311
245/* 312 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
246 * These two functions are mostly architecture specific 313 return true;
247 * defining them as weak here. 314
248 */ 315 __this_cpu_write(hrtimer_interrupts_saved, hrint);
249int __weak watchdog_nmi_enable(unsigned int cpu) 316 return false;
250{
251 return 0;
252} 317}
253void __weak watchdog_nmi_disable(unsigned int cpu) 318
319static void watchdog_interrupt_count(void)
254{ 320{
321 __this_cpu_inc(hrtimer_interrupts);
255} 322}
256 323
257static int watchdog_enable_all_cpus(void); 324static int watchdog_enable_all_cpus(void);
@@ -502,57 +569,6 @@ static void watchdog_unpark_threads(void)
502 kthread_unpark(per_cpu(softlockup_watchdog, cpu)); 569 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
503} 570}
504 571
505/*
506 * Suspend the hard and soft lockup detector by parking the watchdog threads.
507 */
508int lockup_detector_suspend(void)
509{
510 int ret = 0;
511
512 get_online_cpus();
513 mutex_lock(&watchdog_proc_mutex);
514 /*
515 * Multiple suspend requests can be active in parallel (counted by
516 * the 'watchdog_suspended' variable). If the watchdog threads are
517 * running, the first caller takes care that they will be parked.
518 * The state of 'watchdog_running' cannot change while a suspend
519 * request is active (see related code in 'proc' handlers).
520 */
521 if (watchdog_running && !watchdog_suspended)
522 ret = watchdog_park_threads();
523
524 if (ret == 0)
525 watchdog_suspended++;
526 else {
527 watchdog_disable_all_cpus();
528 pr_err("Failed to suspend lockup detectors, disabled\n");
529 watchdog_enabled = 0;
530 }
531
532 mutex_unlock(&watchdog_proc_mutex);
533
534 return ret;
535}
536
537/*
538 * Resume the hard and soft lockup detector by unparking the watchdog threads.
539 */
540void lockup_detector_resume(void)
541{
542 mutex_lock(&watchdog_proc_mutex);
543
544 watchdog_suspended--;
545 /*
546 * The watchdog threads are unparked if they were previously running
547 * and if there is no more active suspend request.
548 */
549 if (watchdog_running && !watchdog_suspended)
550 watchdog_unpark_threads();
551
552 mutex_unlock(&watchdog_proc_mutex);
553 put_online_cpus();
554}
555
556static int update_watchdog_all_cpus(void) 572static int update_watchdog_all_cpus(void)
557{ 573{
558 int ret; 574 int ret;
@@ -605,6 +621,100 @@ static void watchdog_disable_all_cpus(void)
605} 621}
606 622
607#ifdef CONFIG_SYSCTL 623#ifdef CONFIG_SYSCTL
624static int watchdog_update_cpus(void)
625{
626 return smpboot_update_cpumask_percpu_thread(
627 &watchdog_threads, &watchdog_cpumask);
628}
629#endif
630
631#else /* SOFTLOCKUP */
632static int watchdog_park_threads(void)
633{
634 return 0;
635}
636
637static void watchdog_unpark_threads(void)
638{
639}
640
641static int watchdog_enable_all_cpus(void)
642{
643 return 0;
644}
645
646static void watchdog_disable_all_cpus(void)
647{
648}
649
650#ifdef CONFIG_SYSCTL
651static int watchdog_update_cpus(void)
652{
653 return 0;
654}
655#endif
656
657static void set_sample_period(void)
658{
659}
660#endif /* SOFTLOCKUP */
661
662/*
663 * Suspend the hard and soft lockup detector by parking the watchdog threads.
664 */
665int lockup_detector_suspend(void)
666{
667 int ret = 0;
668
669 get_online_cpus();
670 mutex_lock(&watchdog_proc_mutex);
671 /*
672 * Multiple suspend requests can be active in parallel (counted by
673 * the 'watchdog_suspended' variable). If the watchdog threads are
674 * running, the first caller takes care that they will be parked.
675 * The state of 'watchdog_running' cannot change while a suspend
676 * request is active (see related code in 'proc' handlers).
677 */
678 if (watchdog_running && !watchdog_suspended)
679 ret = watchdog_park_threads();
680
681 if (ret == 0)
682 watchdog_suspended++;
683 else {
684 watchdog_disable_all_cpus();
685 pr_err("Failed to suspend lockup detectors, disabled\n");
686 watchdog_enabled = 0;
687 }
688
689 watchdog_nmi_reconfigure();
690
691 mutex_unlock(&watchdog_proc_mutex);
692
693 return ret;
694}
695
696/*
697 * Resume the hard and soft lockup detector by unparking the watchdog threads.
698 */
699void lockup_detector_resume(void)
700{
701 mutex_lock(&watchdog_proc_mutex);
702
703 watchdog_suspended--;
704 /*
705 * The watchdog threads are unparked if they were previously running
706 * and if there is no more active suspend request.
707 */
708 if (watchdog_running && !watchdog_suspended)
709 watchdog_unpark_threads();
710
711 watchdog_nmi_reconfigure();
712
713 mutex_unlock(&watchdog_proc_mutex);
714 put_online_cpus();
715}
716
717#ifdef CONFIG_SYSCTL
608 718
609/* 719/*
610 * Update the run state of the lockup detectors. 720 * Update the run state of the lockup detectors.
@@ -625,6 +735,8 @@ static int proc_watchdog_update(void)
625 else 735 else
626 watchdog_disable_all_cpus(); 736 watchdog_disable_all_cpus();
627 737
738 watchdog_nmi_reconfigure();
739
628 return err; 740 return err;
629 741
630} 742}
@@ -810,10 +922,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
810 * a temporary cpumask, so we are likely not in a 922 * a temporary cpumask, so we are likely not in a
811 * position to do much else to make things better. 923 * position to do much else to make things better.
812 */ 924 */
813 if (smpboot_update_cpumask_percpu_thread( 925 if (watchdog_update_cpus() != 0)
814 &watchdog_threads, &watchdog_cpumask) != 0)
815 pr_err("cpumask update failed\n"); 926 pr_err("cpumask update failed\n");
816 } 927 }
928
929 watchdog_nmi_reconfigure();
817 } 930 }
818out: 931out:
819 mutex_unlock(&watchdog_proc_mutex); 932 mutex_unlock(&watchdog_proc_mutex);
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 54a427d1f344..295a0d84934c 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -22,41 +22,9 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn);
22static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 22static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
23static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 23static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
24 24
25/* boot commands */
26/*
27 * Should we panic when a soft-lockup or hard-lockup occurs:
28 */
29unsigned int __read_mostly hardlockup_panic =
30 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
31static unsigned long hardlockup_allcpu_dumped; 25static unsigned long hardlockup_allcpu_dumped;
32/*
33 * We may not want to enable hard lockup detection by default in all cases,
34 * for example when running the kernel as a guest on a hypervisor. In these
35 * cases this function can be called to disable hard lockup detection. This
36 * function should only be executed once by the boot processor before the
37 * kernel command line parameters are parsed, because otherwise it is not
38 * possible to override this in hardlockup_panic_setup().
39 */
40void hardlockup_detector_disable(void)
41{
42 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
43}
44
45static int __init hardlockup_panic_setup(char *str)
46{
47 if (!strncmp(str, "panic", 5))
48 hardlockup_panic = 1;
49 else if (!strncmp(str, "nopanic", 7))
50 hardlockup_panic = 0;
51 else if (!strncmp(str, "0", 1))
52 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
53 else if (!strncmp(str, "1", 1))
54 watchdog_enabled |= NMI_WATCHDOG_ENABLED;
55 return 1;
56}
57__setup("nmi_watchdog=", hardlockup_panic_setup);
58 26
59void touch_nmi_watchdog(void) 27void arch_touch_nmi_watchdog(void)
60{ 28{
61 /* 29 /*
62 * Using __raw here because some code paths have 30 * Using __raw here because some code paths have
@@ -66,9 +34,8 @@ void touch_nmi_watchdog(void)
66 * going off. 34 * going off.
67 */ 35 */
68 raw_cpu_write(watchdog_nmi_touch, true); 36 raw_cpu_write(watchdog_nmi_touch, true);
69 touch_softlockup_watchdog();
70} 37}
71EXPORT_SYMBOL(touch_nmi_watchdog); 38EXPORT_SYMBOL(arch_touch_nmi_watchdog);
72 39
73static struct perf_event_attr wd_hw_attr = { 40static struct perf_event_attr wd_hw_attr = {
74 .type = PERF_TYPE_HARDWARE, 41 .type = PERF_TYPE_HARDWARE,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c74bf39ef764..a86688fabc55 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2864,11 +2864,11 @@ bool flush_work(struct work_struct *work)
2864EXPORT_SYMBOL_GPL(flush_work); 2864EXPORT_SYMBOL_GPL(flush_work);
2865 2865
2866struct cwt_wait { 2866struct cwt_wait {
2867 wait_queue_t wait; 2867 wait_queue_entry_t wait;
2868 struct work_struct *work; 2868 struct work_struct *work;
2869}; 2869};
2870 2870
2871static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key) 2871static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
2872{ 2872{
2873 struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); 2873 struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
2874 2874