Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/audit.c | 43
-rw-r--r--  kernel/audit.h | 2
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/audit_watch.c | 2
-rw-r--r--  kernel/auditfilter.c | 10
-rw-r--r--  kernel/cgroup.c | 71
-rw-r--r--  kernel/compat.c | 100
-rw-r--r--  kernel/cpu/Makefile | 1
-rw-r--r--  kernel/cpuset.c | 10
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/events/core.c | 51
-rw-r--r--  kernel/extable.c | 2
-rw-r--r--  kernel/fork.c | 1
-rw-r--r--  kernel/futex.c | 90
-rw-r--r--  kernel/irq/irqdomain.c | 1
-rw-r--r--  kernel/irq/manage.c | 3
-rw-r--r--  kernel/irq_work.c | 6
-rw-r--r--  kernel/kexec.c | 8
-rw-r--r--  kernel/ksysfs.c | 2
-rw-r--r--  kernel/locking/Makefile | 3
-rw-r--r--  kernel/locking/lockdep.c | 23
-rw-r--r--  kernel/locking/locktorture.c | 452
-rw-r--r--  kernel/locking/mcs_spinlock.c | 178
-rw-r--r--  kernel/locking/mcs_spinlock.h | 129
-rw-r--r--  kernel/locking/mutex-debug.c | 6
-rw-r--r--  kernel/locking/mutex.c | 104
-rw-r--r--  kernel/locking/rtmutex.c | 12
-rw-r--r--  kernel/locking/rwsem-xadd.c | 4
-rw-r--r--  kernel/module.c | 6
-rw-r--r--  kernel/notifier.c | 2
-rw-r--r--  kernel/panic.c | 4
-rw-r--r--  kernel/power/console.c | 1
-rw-r--r--  kernel/printk/printk.c | 2
-rw-r--r--  kernel/profile.c | 4
-rw-r--r--  kernel/ptrace.c | 4
-rw-r--r--  kernel/rcu/Makefile | 2
-rw-r--r--  kernel/rcu/rcu.h | 7
-rw-r--r--  kernel/rcu/rcutorture.c (renamed from kernel/rcu/torture.c) | 1004
-rw-r--r--  kernel/rcu/srcu.c | 11
-rw-r--r--  kernel/rcu/tiny.c | 8
-rw-r--r--  kernel/rcu/tiny_plugin.h | 4
-rw-r--r--  kernel/rcu/tree.c | 80
-rw-r--r--  kernel/rcu/tree.h | 4
-rw-r--r--  kernel/rcu/tree_plugin.h | 19
-rw-r--r--  kernel/rcu/tree_trace.c | 6
-rw-r--r--  kernel/rcu/update.c | 5
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/auto_group.c | 2
-rw-r--r--  kernel/sched/clock.c | 4
-rw-r--r--  kernel/sched/core.c | 248
-rw-r--r--  kernel/sched/cpudeadline.c | 6
-rw-r--r--  kernel/sched/cputime.c | 4
-rw-r--r--  kernel/sched/deadline.c | 76
-rw-r--r--  kernel/sched/debug.c | 7
-rw-r--r--  kernel/sched/fair.c | 610
-rw-r--r--  kernel/sched/idle.c (renamed from kernel/cpu/idle.c) | 7
-rw-r--r--  kernel/sched/idle_task.c | 25
-rw-r--r--  kernel/sched/rt.c | 110
-rw-r--r--  kernel/sched/sched.h | 66
-rw-r--r--  kernel/sched/stop_task.c | 15
-rw-r--r--  kernel/stop_machine.c | 2
-rw-r--r--  kernel/sys.c | 8
-rw-r--r--  kernel/sysctl.c | 7
-rw-r--r--  kernel/time/sched_clock.c | 46
-rw-r--r--  kernel/time/timekeeping.c | 3
-rw-r--r--  kernel/timer.c | 2
-rw-r--r--  kernel/torture.c | 719
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 6
-rw-r--r--  kernel/trace/trace.c | 27
-rw-r--r--  kernel/trace/trace_event_perf.c | 22
-rw-r--r--  kernel/trace/trace_events.c | 16
-rw-r--r--  kernel/trace/trace_export.c | 7
-rw-r--r--  kernel/trace/trace_irqsoff.c | 4
-rw-r--r--  kernel/tracepoint.c | 7
-rw-r--r--  kernel/user_namespace.c | 2
-rw-r--r--  kernel/workqueue.c | 9
77 files changed, 3062 insertions, 1503 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index bc010ee272b6..f2a8b6246ce9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -18,11 +18,13 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_irq_work.o = -pg
 endif
 
+# cond_syscall is currently not LTO compatible
+CFLAGS_sys_ni.o = $(DISABLE_LTO)
+
 obj-y += sched/
 obj-y += locking/
 obj-y += power/
 obj-y += printk/
-obj-y += cpu/
 obj-y += irq/
 obj-y += rcu/
 
@@ -93,6 +95,7 @@ obj-$(CONFIG_PADATA) += padata.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
+obj-$(CONFIG_TORTURE_TEST) += torture.o
 
 $(obj)/configs.o: $(obj)/config_data.h
 
diff --git a/kernel/audit.c b/kernel/audit.c
index 34c5a2310fbf..95a20f3f52f1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -182,7 +182,7 @@ struct audit_buffer {
 
 struct audit_reply {
 	__u32 portid;
-	pid_t pid;
+	struct net *net;
 	struct sk_buff *skb;
 };
 
@@ -500,7 +500,7 @@ int audit_send_list(void *_dest)
 {
 	struct audit_netlink_list *dest = _dest;
 	struct sk_buff *skb;
-	struct net *net = get_net_ns_by_pid(dest->pid);
+	struct net *net = dest->net;
 	struct audit_net *aunet = net_generic(net, audit_net_id);
 
 	/* wait for parent to finish and send an ACK */
@@ -510,6 +510,7 @@ int audit_send_list(void *_dest)
 	while ((skb = __skb_dequeue(&dest->q)) != NULL)
 		netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
 
+	put_net(net);
 	kfree(dest);
 
 	return 0;
@@ -543,7 +544,7 @@ out_kfree_skb:
 static int audit_send_reply_thread(void *arg)
 {
 	struct audit_reply *reply = (struct audit_reply *)arg;
-	struct net *net = get_net_ns_by_pid(reply->pid);
+	struct net *net = reply->net;
 	struct audit_net *aunet = net_generic(net, audit_net_id);
 
 	mutex_lock(&audit_cmd_mutex);
@@ -552,12 +553,13 @@ static int audit_send_reply_thread(void *arg)
 	/* Ignore failure. It'll only happen if the sender goes away,
 	   because our timeout is set to infinite. */
 	netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
+	put_net(net);
 	kfree(reply);
 	return 0;
 }
 /**
  * audit_send_reply - send an audit reply message via netlink
- * @portid: netlink port to which to send reply
+ * @request_skb: skb of request we are replying to (used to target the reply)
  * @seq: sequence number
  * @type: audit message type
  * @done: done (last) flag
@@ -568,9 +570,11 @@ static int audit_send_reply_thread(void *arg)
  * Allocates an skb, builds the netlink message, and sends it to the port id.
  * No failure notifications.
  */
-static void audit_send_reply(__u32 portid, int seq, int type, int done,
+static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
 			     int multi, const void *payload, int size)
 {
+	u32 portid = NETLINK_CB(request_skb).portid;
+	struct net *net = sock_net(NETLINK_CB(request_skb).sk);
 	struct sk_buff *skb;
 	struct task_struct *tsk;
 	struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
@@ -583,8 +587,8 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done,
 	if (!skb)
 		goto out;
 
+	reply->net = get_net(net);
 	reply->portid = portid;
-	reply->pid = task_pid_vnr(current);
 	reply->skb = skb;
 
 	tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
@@ -604,9 +608,19 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 	int err = 0;
 
 	/* Only support the initial namespaces for now. */
+	/*
+	 * We return ECONNREFUSED because it tricks userspace into thinking
+	 * that audit was not configured into the kernel. Lots of users
+	 * configure their PAM stack (because that's what the distro does)
+	 * to reject login if unable to send messages to audit. If we return
+	 * ECONNREFUSED the PAM stack thinks the kernel does not have audit
+	 * configured in and will let login proceed. If we return EPERM
+	 * userspace will reject all logins. This should be removed when we
+	 * support non init namespaces!!
+	 */
 	if ((current_user_ns() != &init_user_ns) ||
 	    (task_active_pid_ns(current) != &init_pid_ns))
-		return -EPERM;
+		return -ECONNREFUSED;
 
 	switch (msg_type) {
 	case AUDIT_LIST:
@@ -673,8 +687,7 @@ static int audit_get_feature(struct sk_buff *skb)
 
 	seq = nlmsg_hdr(skb)->nlmsg_seq;
 
-	audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
-			 &af, sizeof(af));
+	audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af));
 
 	return 0;
 }
@@ -794,8 +807,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		s.backlog = skb_queue_len(&audit_skb_queue);
 		s.version = AUDIT_VERSION_LATEST;
 		s.backlog_wait_time = audit_backlog_wait_time;
-		audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
-				 &s, sizeof(s));
+		audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
 		break;
 	}
 	case AUDIT_SET: {
@@ -905,7 +917,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				   seq, data, nlmsg_len(nlh));
 		break;
 	case AUDIT_LIST_RULES:
-		err = audit_list_rules_send(NETLINK_CB(skb).portid, seq);
+		err = audit_list_rules_send(skb, seq);
 		break;
 	case AUDIT_TRIM:
 		audit_trim_trees();
@@ -970,8 +982,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			memcpy(sig_data->ctx, ctx, len);
 			security_release_secctx(ctx, len);
 		}
-		audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO,
-				 0, 0, sig_data, sizeof(*sig_data) + len);
+		audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
+				 sig_data, sizeof(*sig_data) + len);
 		kfree(sig_data);
 		break;
 	case AUDIT_TTY_GET: {
@@ -983,8 +995,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		s.log_passwd = tsk->signal->audit_tty_log_passwd;
 		spin_unlock(&tsk->sighand->siglock);
 
-		audit_send_reply(NETLINK_CB(skb).portid, seq,
-				 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
+		audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
 		break;
 	}
 	case AUDIT_TTY_SET: {
diff --git a/kernel/audit.h b/kernel/audit.h
index 57cc64d67718..8df132214606 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -247,7 +247,7 @@ extern void audit_panic(const char *message);
 
 struct audit_netlink_list {
 	__u32 portid;
-	pid_t pid;
+	struct net *net;
 	struct sk_buff_head q;
 };
 
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 67ccf0e7cca9..135944a7b28a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -916,7 +916,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group,
 				   struct fsnotify_mark *inode_mark,
 				   struct fsnotify_mark *vfsmount_mark,
 				   u32 mask, void *data, int data_type,
-				   const unsigned char *file_name)
+				   const unsigned char *file_name, u32 cookie)
 {
 	return 0;
 }
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 2596fac5dcb4..70b4554d2fbe 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -471,7 +471,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
 				    struct fsnotify_mark *inode_mark,
 				    struct fsnotify_mark *vfsmount_mark,
 				    u32 mask, void *data, int data_type,
-				    const unsigned char *dname)
+				    const unsigned char *dname, u32 cookie)
 {
 	struct inode *inode;
 	struct audit_parent *parent;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 14a78cca384e..92062fd6cc8c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -29,6 +29,8 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/security.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
 #include "audit.h"
 
 /*
@@ -1065,11 +1067,13 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
 
 /**
 * audit_list_rules_send - list the audit rules
- * @portid: target portid for netlink audit messages
+ * @request_skb: skb of request we are replying to (used to target the reply)
 * @seq: netlink audit message sequence (serial) number
 */
-int audit_list_rules_send(__u32 portid, int seq)
+int audit_list_rules_send(struct sk_buff *request_skb, int seq)
 {
+	u32 portid = NETLINK_CB(request_skb).portid;
+	struct net *net = sock_net(NETLINK_CB(request_skb).sk);
 	struct task_struct *tsk;
 	struct audit_netlink_list *dest;
 	int err = 0;
@@ -1083,8 +1087,8 @@ int audit_list_rules_send(__u32 portid, int seq)
 	dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
 	if (!dest)
 		return -ENOMEM;
+	dest->net = get_net(net);
 	dest->portid = portid;
-	dest->pid = task_pid_vnr(current);
 	skb_queue_head_init(&dest->q);
 
 	mutex_lock(&audit_filter_mutex);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e2f46ba37f72..0c753ddd223b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -886,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	 * per-subsystem and moved to css->id so that lookups are
 	 * successful until the target css is released.
 	 */
+	mutex_lock(&cgroup_mutex);
 	idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+	mutex_unlock(&cgroup_mutex);
 	cgrp->id = -1;
 
 	call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
@@ -1566,10 +1568,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
 
-	root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
-				  0, 1, GFP_KERNEL);
-	if (root_cgrp->id < 0)
+	ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
+	if (ret < 0)
 		goto unlock_drop;
+	root_cgrp->id = ret;
 
 	/* Check for name clashes with existing mounts */
 	ret = -EBUSY;
@@ -2763,10 +2765,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
 	 */
 	update_before = cgroup_serial_nr_next;
 
-	mutex_unlock(&cgroup_mutex);
-
 	/* add/rm files for all cgroups created before */
-	rcu_read_lock();
 	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
 		struct cgroup *cgrp = css->cgroup;
 
@@ -2775,23 +2774,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
 
 		inode = cgrp->dentry->d_inode;
 		dget(cgrp->dentry);
-		rcu_read_unlock();
-
 		dput(prev);
 		prev = cgrp->dentry;
 
+		mutex_unlock(&cgroup_mutex);
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
 		if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
 			ret = cgroup_addrm_files(cgrp, cfts, is_add);
-		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
-
-		rcu_read_lock();
 		if (ret)
 			break;
 	}
-	rcu_read_unlock();
+	mutex_unlock(&cgroup_mutex);
 	dput(prev);
 	deactivate_super(sb);
 	return ret;
@@ -2910,9 +2905,14 @@ static void cgroup_enable_task_cg_lists(void)
 		 * We should check if the process is exiting, otherwise
 		 * it will race with cgroup_exit() in that the list
 		 * entry won't be deleted though the process has exited.
+		 * Do it while holding siglock so that we don't end up
+		 * racing against cgroup_exit().
 		 */
+		spin_lock_irq(&p->sighand->siglock);
 		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
 			list_add(&p->cg_list, &task_css_set(p)->tasks);
+		spin_unlock_irq(&p->sighand->siglock);
+
 		task_unlock(p);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
@@ -4112,17 +4112,17 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
 
 	err = percpu_ref_init(&css->refcnt, css_release);
 	if (err)
-		goto err_free;
+		goto err_free_css;
 
 	init_css(css, ss, cgrp);
 
 	err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
 	if (err)
-		goto err_free;
+		goto err_free_percpu_ref;
 
 	err = online_css(css);
 	if (err)
-		goto err_free;
+		goto err_clear_dir;
 
 	dget(cgrp->dentry);
 	css_get(css->parent);
@@ -4138,8 +4138,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
 
 	return 0;
 
-err_free:
+err_clear_dir:
+	cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
+err_free_percpu_ref:
 	percpu_ref_cancel_init(&css->refcnt);
+err_free_css:
 	ss->css_free(css);
 	return err;
 }
@@ -4158,7 +4161,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	struct cgroup *cgrp;
 	struct cgroup_name *name;
 	struct cgroupfs_root *root = parent->root;
-	int ssid, err = 0;
+	int ssid, err;
 	struct cgroup_subsys *ss;
 	struct super_block *sb = root->sb;
 
@@ -4168,19 +4171,13 @@
 		return -ENOMEM;
 
 	name = cgroup_alloc_name(dentry);
-	if (!name)
+	if (!name) {
+		err = -ENOMEM;
 		goto err_free_cgrp;
+	}
 	rcu_assign_pointer(cgrp->name, name);
 
 	/*
-	 * Temporarily set the pointer to NULL, so idr_find() won't return
-	 * a half-baked cgroup.
-	 */
-	cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
-	if (cgrp->id < 0)
-		goto err_free_name;
-
-	/*
 	 * Only live parents can have children. Note that the liveliness
 	 * check isn't strictly necessary because cgroup_mkdir() and
 	 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
@@ -4189,7 +4186,17 @@
 	 */
 	if (!cgroup_lock_live_group(parent)) {
 		err = -ENODEV;
-		goto err_free_id;
+		goto err_free_name;
+	}
+
+	/*
+	 * Temporarily set the pointer to NULL, so idr_find() won't return
+	 * a half-baked cgroup.
+	 */
+	cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
+	if (cgrp->id < 0) {
+		err = -ENOMEM;
+		goto err_unlock;
 	}
 
 	/* Grab a reference on the superblock so the hierarchy doesn't
@@ -4221,7 +4228,7 @@
 	 */
 	err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
 	if (err < 0)
-		goto err_unlock;
+		goto err_free_id;
 	lockdep_assert_held(&dentry->d_inode->i_mutex);
 
 	cgrp->serial_nr = cgroup_serial_nr_next++;
@@ -4257,12 +4264,12 @@
 
 	return 0;
 
-err_unlock:
-	mutex_unlock(&cgroup_mutex);
-	/* Release the reference count that we took on the superblock */
-	deactivate_super(sb);
 err_free_id:
 	idr_remove(&root->cgroup_idr, cgrp->id);
+	/* Release the reference count that we took on the superblock */
+	deactivate_super(sb);
+err_unlock:
+	mutex_unlock(&cgroup_mutex);
 err_free_name:
 	kfree(rcu_dereference_raw(cgrp->name));
 err_free_cgrp:
diff --git a/kernel/compat.c b/kernel/compat.c
index 0a09e481b70b..488ff8c4cf48 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -110,8 +110,8 @@ static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
 	return 0;
 }
 
-asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
-		struct timezone __user *tz)
+COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
+		       struct timezone __user *, tz)
 {
 	if (tv) {
 		struct timeval ktv;
@@ -127,8 +127,8 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
 	return 0;
 }
 
-asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
-		struct timezone __user *tz)
+COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
+		       struct timezone __user *, tz)
 {
 	struct timespec kts;
 	struct timezone ktz;
@@ -236,8 +236,8 @@ static long compat_nanosleep_restart(struct restart_block *restart)
 	return ret;
 }
 
-asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
-		struct compat_timespec __user *rmtp)
+COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
+		       struct compat_timespec __user *, rmtp)
 {
 	struct timespec tu, rmt;
 	mm_segment_t oldfs;
@@ -328,7 +328,7 @@ static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
 	return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
 }
 
-asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
+COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
 {
 	if (tbuf) {
 		struct tms tms;
@@ -354,7 +354,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
 * types that can be passed to put_user()/get_user().
 */
 
-asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
+COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
 {
 	old_sigset_t s;
 	long ret;
@@ -424,8 +424,8 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
 
 #endif
 
-asmlinkage long compat_sys_setrlimit(unsigned int resource,
-		struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
+		       struct compat_rlimit __user *, rlim)
 {
 	struct rlimit r;
 
@@ -443,8 +443,8 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
 
 #ifdef COMPAT_RLIM_OLD_INFINITY
 
-asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
-		struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+		       struct compat_rlimit __user *, rlim)
 {
 	struct rlimit r;
 	int ret;
@@ -470,8 +470,8 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
 
 #endif
 
-asmlinkage long compat_sys_getrlimit(unsigned int resource,
-		struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
+		       struct compat_rlimit __user *, rlim)
 {
 	struct rlimit r;
 	int ret;
@@ -596,9 +596,9 @@ static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
 	return compat_get_bitmap(k, user_mask_ptr, len * 8);
 }
 
-asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
-					     unsigned int len,
-					     compat_ulong_t __user *user_mask_ptr)
+COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid,
+		       unsigned int, len,
+		       compat_ulong_t __user *, user_mask_ptr)
 {
 	cpumask_var_t new_mask;
 	int retval;
@@ -616,8 +616,8 @@ out:
 	return retval;
 }
 
-asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
-					     compat_ulong_t __user *user_mask_ptr)
+COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
+		       compat_ulong_t __user *, user_mask_ptr)
 {
 	int ret;
 	cpumask_var_t mask;
@@ -662,9 +662,9 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst,
 	return 0;
 }
 
-long compat_sys_timer_create(clockid_t which_clock,
-			struct compat_sigevent __user *timer_event_spec,
-			timer_t __user *created_timer_id)
+COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
+		       struct compat_sigevent __user *, timer_event_spec,
+		       timer_t __user *, created_timer_id)
 {
 	struct sigevent __user *event = NULL;
 
@@ -680,9 +680,9 @@ long compat_sys_timer_create(clockid_t which_clock,
 	return sys_timer_create(which_clock, event, created_timer_id);
 }
 
-long compat_sys_timer_settime(timer_t timer_id, int flags,
-			struct compat_itimerspec __user *new,
-			struct compat_itimerspec __user *old)
+COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
+		       struct compat_itimerspec __user *, new,
+		       struct compat_itimerspec __user *, old)
 {
 	long err;
 	mm_segment_t oldfs;
@@ -703,8 +703,8 @@ long compat_sys_timer_settime(timer_t timer_id, int flags,
 	return err;
 }
 
-long compat_sys_timer_gettime(timer_t timer_id,
-		struct compat_itimerspec __user *setting)
+COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
+		       struct compat_itimerspec __user *, setting)
 {
 	long err;
 	mm_segment_t oldfs;
@@ -720,8 +720,8 @@ long compat_sys_timer_gettime(timer_t timer_id,
 	return err;
 }
 
-long compat_sys_clock_settime(clockid_t which_clock,
-		struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
+		       struct compat_timespec __user *, tp)
 {
 	long err;
 	mm_segment_t oldfs;
@@ -737,8 +737,8 @@ long compat_sys_clock_settime(clockid_t which_clock,
 	return err;
 }
 
-long compat_sys_clock_gettime(clockid_t which_clock,
-		struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
+		       struct compat_timespec __user *, tp)
 {
 	long err;
 	mm_segment_t oldfs;
@@ -754,8 +754,8 @@ long compat_sys_clock_gettime(clockid_t which_clock,
 	return err;
 }
 
-long compat_sys_clock_adjtime(clockid_t which_clock,
-		struct compat_timex __user *utp)
+COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
+		       struct compat_timex __user *, utp)
 {
 	struct timex txc;
 	mm_segment_t oldfs;
@@ -777,8 +777,8 @@ long compat_sys_clock_adjtime(clockid_t which_clock,
 	return ret;
 }
 
-long compat_sys_clock_getres(clockid_t which_clock,
-		struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
+		       struct compat_timespec __user *, tp)
 {
 	long err;
 	mm_segment_t oldfs;
@@ -818,9 +818,9 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
 	return err;
 }
 
-long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
-			struct compat_timespec __user *rqtp,
-			struct compat_timespec __user *rmtp)
+COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
+		       struct compat_timespec __user *, rqtp,
+		       struct compat_timespec __user *, rmtp)
 {
 	long err;
 	mm_segment_t oldfs;
@@ -1010,7 +1010,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 
 /* compat_time_t is a 32 bit "long" and needs to get converted. */
 
-asmlinkage long compat_sys_time(compat_time_t __user * tloc)
+COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
 {
 	compat_time_t i;
 	struct timeval tv;
@@ -1026,7 +1026,7 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc)
 	return i;
 }
 
-asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
+COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
 {
 	struct timespec tv;
 	int err;
@@ -1046,7 +1046,7 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
 
 #endif /* __ARCH_WANT_COMPAT_SYS_TIME */
 
-asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
+COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
 {
 	struct timex txc;
 	int err, ret;
@@ -1065,11 +1065,11 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
 }
 
 #ifdef CONFIG_NUMA
-asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
-		compat_uptr_t __user *pages32,
-		const int __user *nodes,
-		int __user *status,
-		int flags)
+COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
+		       compat_uptr_t __user *, pages32,
+		       const int __user *, nodes,
+		       int __user *, status,
+		       int, flags)
 {
 	const void __user * __user *pages;
 	int i;
@@ -1085,10 +1085,10 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
 	return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
 }
 
-asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
-		compat_ulong_t maxnode,
-		const compat_ulong_t __user *old_nodes,
-		const compat_ulong_t __user *new_nodes)
+COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
+		       compat_ulong_t, maxnode,
+		       const compat_ulong_t __user *, old_nodes,
+		       const compat_ulong_t __user *, new_nodes)
 {
 	unsigned long __user *old = NULL;
 	unsigned long __user *new = NULL;
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
deleted file mode 100644
index 59ab052ef7a0..000000000000
--- a/kernel/cpu/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y = idle.o
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4410ac6a55f1..e6b1b66afe52 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -974,12 +974,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 * Temporarilly set tasks mems_allowed to target nodes of migration,
 * so that the migration code can allocate pages on these nodes.
 *
- * Call holding cpuset_mutex, so current's cpuset won't change
- * during this call, as manage_mutex holds off any cpuset_attach()
- * calls. Therefore we don't need to take task_lock around the
- * call to guarantee_online_mems(), as we know no one is changing
- * our task's cpuset.
- *
 * While the mm_struct we are migrating is typically from some
 * other task, the task_struct mems_allowed that we are hacking
 * is for our current task, which must allocate new pages for that
@@ -996,8 +990,10 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 
 	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
 
+	rcu_read_lock();
 	mems_cs = effective_nodemask_cpuset(task_cs(tsk));
 	guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+	rcu_read_unlock();
 }
 
 /*
@@ -2486,9 +2482,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 
 	task_lock(current);
 	cs = nearest_hardwall_ancestor(task_cs(current));
+	allowed = node_isset(node, cs->mems_allowed);
 	task_unlock(current);
 
-	allowed = node_isset(node, cs->mems_allowed);
 	mutex_unlock(&callback_mutex);
 	return allowed;
 }
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 334b3980ffc1..99982a70ddad 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1035,7 +1035,7 @@ int dbg_io_get_char(void)
 * otherwise as a quick means to stop program execution and "break" into
 * the debugger.
 */
-void kgdb_breakpoint(void)
+noinline void kgdb_breakpoint(void)
 {
 	atomic_inc(&kgdb_setting_breakpoint);
 	wmb(); /* Sync point before breakpoint */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 56003c6edfd3..661951ab8ae7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -231,11 +231,29 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 #define NR_ACCUMULATED_SAMPLES 128
 static DEFINE_PER_CPU(u64, running_sample_length);
 
-void perf_sample_event_took(u64 sample_len_ns)
+static void perf_duration_warn(struct irq_work *w)
 {
+	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 	u64 avg_local_sample_len;
 	u64 local_samples_len;
+
+	local_samples_len = __get_cpu_var(running_sample_length);
+	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+	printk_ratelimited(KERN_WARNING
+			"perf interrupt took too long (%lld > %lld), lowering "
+			"kernel.perf_event_max_sample_rate to %d\n",
+			avg_local_sample_len, allowed_ns >> 1,
+			sysctl_perf_event_sample_rate);
+}
+
+static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
 	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+	u64 avg_local_sample_len;
+	u64 local_samples_len;
 
 	if (allowed_ns == 0)
 		return;
@@ -263,13 +281,14 @@ void perf_sample_event_took(u64 sample_len_ns)
 	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
 	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 
-	printk_ratelimited(KERN_WARNING
-			"perf samples too long (%lld > %lld), lowering "
-			"kernel.perf_event_max_sample_rate to %d\n",
-			avg_local_sample_len, allowed_ns,
-			sysctl_perf_event_sample_rate);
-
 	update_perf_cpu_limits();
+
+	if (!irq_work_queue(&perf_duration_work)) {
+		early_printk("perf interrupt took too long (%lld > %lld), lowering "
+			     "kernel.perf_event_max_sample_rate to %d\n",
+			     avg_local_sample_len, allowed_ns >> 1,
+			     sysctl_perf_event_sample_rate);
+	}
 }
 
 static atomic64_t perf_event_id;
@@ -1714,7 +1733,7 @@ group_sched_in(struct perf_event *group_event,
 	       struct perf_event_context *ctx)
 {
 	struct perf_event *event, *partial_group = NULL;
-	struct pmu *pmu = group_event->pmu;
+	struct pmu *pmu = ctx->pmu;
 	u64 now = ctx->time;
 	bool simulate = false;
 
@@ -2563,8 +2582,6 @@ static void perf_branch_stack_sched_in(struct task_struct *prev,
 	if (cpuctx->ctx.nr_branch_stack > 0
 	    && pmu->flush_branch_stack) {
 
-		pmu = cpuctx->ctx.pmu;
-
 		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 
 		perf_pmu_disable(pmu);
@@ -6294,7 +6311,7 @@ static int perf_event_idx_default(struct perf_event *event)
 * Ensures all contexts with the same task_ctx_nr have the same
 * pmu_cpu_context too.
 */
-static void *find_pmu_context(int ctxn)
+static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
 {
 	struct pmu *pmu;
 
@@ -7856,14 +7873,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
 static void __perf_event_exit_context(void *__info)
 {
 	struct perf_event_context *ctx = __info;
-	struct perf_event *event, *tmp;
+	struct perf_event *event;
 
 	perf_pmu_rotate_stop(ctx->pmu);
 
-	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-		__perf_remove_from_context(event);
-	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &ctx->event_list, event_entry)
 		__perf_remove_from_context(event);
+	rcu_read_unlock();
 }
 
 static void perf_event_exit_cpu_context(int cpu)
@@ -7887,11 +7904,11 @@ static void perf_event_exit_cpu(int cpu)
 {
 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
+	perf_event_exit_cpu_context(cpu);
+
 	mutex_lock(&swhash->hlist_mutex);
 	swevent_hlist_release(swhash);
 	mutex_unlock(&swhash->hlist_mutex);
-
-	perf_event_exit_cpu_context(cpu);
 }
 #else
 static inline void perf_event_exit_cpu(int cpu) { }
diff --git a/kernel/extable.c b/kernel/extable.c
index 763faf037ec1..d8a6446adbcb 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -36,7 +36,7 @@ extern struct exception_table_entry __start___ex_table[];
 extern struct exception_table_entry __stop___ex_table[];
 
 /* Cleared by build time tools if the table is already sorted. */
-u32 __initdata main_extable_sort_needed = 1;
+u32 __initdata __visible main_extable_sort_needed = 1;
 
 /* Sort the kernel's built-in exception table */
 void __init sort_main_extable(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index a17621c6cd42..332688e5e7b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -237,6 +237,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	task_numa_free(tsk);
 	security_task_free(tsk);
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
diff --git a/kernel/futex.c b/kernel/futex.c
index 44a1261cb9ff..67dacaf93e56 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -157,7 +157,9 @@
 * enqueue.
 */
 
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
 int __read_mostly futex_cmpxchg_enabled;
+#endif
 
 /*
 * Futex flags used to encode options to functions and preserve them across
@@ -234,6 +236,7 @@ static const struct futex_q futex_q_init = {
 * waiting on a futex.
 */
 struct futex_hash_bucket {
+	atomic_t waiters;
 	spinlock_t lock;
 	struct plist_head chain;
 } ____cacheline_aligned_in_smp;
@@ -253,22 +256,37 @@ static inline void futex_get_mm(union futex_key *key)
 	smp_mb__after_atomic_inc();
 }
 
-static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
 {
 #ifdef CONFIG_SMP
+	atomic_inc(&hb->waiters);
 	/*
-	 * Tasks trying to enter the critical region are most likely
-	 * potential waiters that will be added to the plist. Ensure
-	 * that wakers won't miss to-be-slept tasks in the window between
-	 * the wait call and the actual plist_add.
+	 * Full barrier (A), see the ordering comment above.
 	 */
-	if (spin_is_locked(&hb->lock))
-		return true;
-	smp_rmb(); /* Make sure we check the lock state first */
+	smp_mb__after_atomic_inc();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+	atomic_dec(&hb->waiters);
+#endif
+}
 
-	return !plist_head_empty(&hb->chain);
+static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+	return atomic_read(&hb->waiters);
 #else
-	return true;
+	return 1;
 #endif
 }
 
@@ -954,6 +972,7 @@ static void __unqueue_futex(struct futex_q *q)
 
 	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
 	plist_del(&q->list, &hb->chain);
+	hb_waiters_dec(hb);
 }
 
 /*
@@ -1257,7 +1276,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 	 */
 	if (likely(&hb1->chain != &hb2->chain)) {
 		plist_del(&q->list, &hb1->chain);
+		hb_waiters_dec(hb1);
 		plist_add(&q->list, &hb2->chain);
+		hb_waiters_inc(hb2);
 		q->lock_ptr = &hb2->lock;
 	}
 	get_futex_key_refs(key2);
@@ -1600,6 +1621,17 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 	struct futex_hash_bucket *hb;
 
 	hb = hash_futex(&q->key);
+
+	/*
+	 * Increment the counter before taking the lock so that
+	 * a potential waker won't miss a to-be-slept task that is
+	 * waiting for the spinlock. This is safe as all queue_lock()
+	 * users end up calling queue_me(). Similarly, for housekeeping,
+	 * decrement the counter at queue_unlock() when some error has
+	 * occurred and we don't end up adding the task to the list.
+	 */
+	hb_waiters_inc(hb);
+
 	q->lock_ptr = &hb->lock;
 
 	spin_lock(&hb->lock); /* implies MB (A) */
@@ -1611,6 +1643,7 @@ queue_unlock(struct futex_hash_bucket *hb)
 	__releases(&hb->lock)
 {
 	spin_unlock(&hb->lock);
+	hb_waiters_dec(hb);
 }
 
 /**
@@ -2342,6 +2375,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 	 * Unqueue the futex_q and determine which it was.
 	 */
 	plist_del(&q->list, &hb->chain);
+	hb_waiters_dec(hb);
 
 	/* Handle spurious wakeups gracefully */
 	ret = -EWOULDBLOCK;
@@ -2843,9 +2877,28 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
 
-static int __init futex_init(void)
+static void __init futex_detect_cmpxchg(void)
 {
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
 	u32 curval;
+
+	/*
+	 * This will fail and we want it. Some arch implementations do
+	 * runtime detection of the futex_atomic_cmpxchg_inatomic()
+	 * functionality. We want to know that before we call in any
+	 * of the complex code paths. Also we want to prevent
+	 * registration of robust lists in that case. NULL is
+	 * guaranteed to fault and we get -EFAULT on functional
+	 * implementation, the non-functional ones will return
+	 * -ENOSYS.
+	 */
+	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
+		futex_cmpxchg_enabled = 1;
+#endif
+}
+
+static int __init futex_init(void)
+{
 	unsigned int futex_shift;
 	unsigned long i;
 
@@ -2861,20 +2914,11 @@ static int __init futex_init(void)
 					&futex_shift, NULL,
 					futex_hashsize, futex_hashsize);
 	futex_hashsize = 1UL << futex_shift;
-	/*
-	 * This will fail and we want it. Some arch implementations do
-	 * runtime detection of the futex_atomic_cmpxchg_inatomic()
-	 * functionality. We want to know that before we call in any
-	 * of the complex code paths. Also we want to prevent
-	 * registration of robust lists in that case. NULL is
-	 * guaranteed to fault and we get -EFAULT on functional
-	 * implementation, the non-functional ones will return
-	 * -ENOSYS.
-	 */
-	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
-		futex_cmpxchg_enabled = 1;
+
+	futex_detect_cmpxchg();
 
 	for (i = 0; i < futex_hashsize; i++) {
+		atomic_set(&futex_queues[i].waiters, 0);
 		plist_head_init(&futex_queues[i].chain);
 		spin_lock_init(&futex_queues[i].lock);
 	}
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index cf68bb36fe58..f14033700c25 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -10,6 +10,7 @@
 #include <linux/mutex.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/of_irq.h>
 #include <linux/topology.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 481a13c43b17..d3bf660cb57f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -802,8 +802,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
 
 static void wake_threads_waitq(struct irq_desc *desc)
 {
-	if (atomic_dec_and_test(&desc->threads_active) &&
-	    waitqueue_active(&desc->wait_for_threads))
+	if (atomic_dec_and_test(&desc->threads_active))
 		wake_up(&desc->wait_for_threads);
 }
 
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 55fcce6065cf..a82170e2fa78 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -61,11 +61,11 @@ void __weak arch_irq_work_raise(void)
 *
 * Can be re-enqueued while the callback is still in progress.
 */
-void irq_work_queue(struct irq_work *work)
+bool irq_work_queue(struct irq_work *work)
 {
 	/* Only queue if not already pending */
 	if (!irq_work_claim(work))
-		return;
+		return false;
 
 	/* Queue the entry and raise the IPI if needed. */
 	preempt_disable();
@@ -83,6 +83,8 @@ void irq_work_queue(struct irq_work *work)
 	}
 
 	preempt_enable();
+
+	return true;
 }
 EXPORT_SYMBOL_GPL(irq_work_queue);
 
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 60bafbed06ab..45601cf41bee 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1039,10 +1039,10 @@ void __weak crash_unmap_reserved_pages(void)
 {}
 
 #ifdef CONFIG_COMPAT
-asmlinkage long compat_sys_kexec_load(unsigned long entry,
-				unsigned long nr_segments,
-				struct compat_kexec_segment __user *segments,
-				unsigned long flags)
+COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
+		       compat_ulong_t, nr_segments,
+		       struct compat_kexec_segment __user *, segments,
+		       compat_ulong_t, flags)
 {
 	struct compat_kexec_segment in;
 	struct kexec_segment out, __user *ksegments;
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d945a949760f..e660964086e2 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -19,6 +19,8 @@
 #include <linux/sched.h>
 #include <linux/capability.h>
 
+#include <linux/rcupdate.h>	/* rcu_expedited */
+
 #define KERNEL_ATTR_RO(_name) \
 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index baab8e5e7f66..306a76b51e0f 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
 
-obj-y += mutex.o semaphore.o rwsem.o lglock.o
+obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o
 
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_lockdep.o = -pg
@@ -23,3 +23,4 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
 obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
+obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index eb8a54783fa0..b0e9467922e1 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1936,12 +1936,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1936 1936
1937 for (;;) { 1937 for (;;) {
1938 int distance = curr->lockdep_depth - depth + 1; 1938 int distance = curr->lockdep_depth - depth + 1;
1939 hlock = curr->held_locks + depth-1; 1939 hlock = curr->held_locks + depth - 1;
1940 /* 1940 /*
1941 * Only non-recursive-read entries get new dependencies 1941 * Only non-recursive-read entries get new dependencies
1942 * added: 1942 * added:
1943 */ 1943 */
1944 if (hlock->read != 2) { 1944 if (hlock->read != 2 && hlock->check) {
1945 if (!check_prev_add(curr, hlock, next, 1945 if (!check_prev_add(curr, hlock, next,
1946 distance, trylock_loop)) 1946 distance, trylock_loop))
1947 return 0; 1947 return 0;
@@ -2098,7 +2098,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
2098 * (If lookup_chain_cache() returns with 1 it acquires 2098 * (If lookup_chain_cache() returns with 1 it acquires
2099 * graph_lock for us) 2099 * graph_lock for us)
2100 */ 2100 */
2101 if (!hlock->trylock && (hlock->check == 2) && 2101 if (!hlock->trylock && hlock->check &&
2102 lookup_chain_cache(curr, hlock, chain_key)) { 2102 lookup_chain_cache(curr, hlock, chain_key)) {
2103 /* 2103 /*
2104 * Check whether last held lock: 2104 * Check whether last held lock:
@@ -2517,7 +2517,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2517 2517
2518 BUG_ON(usage_bit >= LOCK_USAGE_STATES); 2518 BUG_ON(usage_bit >= LOCK_USAGE_STATES);
2519 2519
2520 if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) 2520 if (!hlock->check)
2521 continue; 2521 continue;
2522 2522
2523 if (!mark_lock(curr, hlock, usage_bit)) 2523 if (!mark_lock(curr, hlock, usage_bit))
@@ -2557,7 +2557,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
2557 debug_atomic_inc(hardirqs_on_events); 2557 debug_atomic_inc(hardirqs_on_events);
2558} 2558}
2559 2559
2560void trace_hardirqs_on_caller(unsigned long ip) 2560__visible void trace_hardirqs_on_caller(unsigned long ip)
2561{ 2561{
2562 time_hardirqs_on(CALLER_ADDR0, ip); 2562 time_hardirqs_on(CALLER_ADDR0, ip);
2563 2563
@@ -2610,7 +2610,7 @@ EXPORT_SYMBOL(trace_hardirqs_on);
2610/* 2610/*
2611 * Hardirqs were disabled: 2611 * Hardirqs were disabled:
2612 */ 2612 */
2613void trace_hardirqs_off_caller(unsigned long ip) 2613__visible void trace_hardirqs_off_caller(unsigned long ip)
2614{ 2614{
2615 struct task_struct *curr = current; 2615 struct task_struct *curr = current;
2616 2616
@@ -3055,9 +3055,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3055 int class_idx; 3055 int class_idx;
3056 u64 chain_key; 3056 u64 chain_key;
3057 3057
3058 if (!prove_locking)
3059 check = 1;
3060
3061 if (unlikely(!debug_locks)) 3058 if (unlikely(!debug_locks))
3062 return 0; 3059 return 0;
3063 3060
@@ -3069,8 +3066,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3069 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3066 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
3070 return 0; 3067 return 0;
3071 3068
3072 if (lock->key == &__lockdep_no_validate__) 3069 if (!prove_locking || lock->key == &__lockdep_no_validate__)
3073 check = 1; 3070 check = 0;
3074 3071
3075 if (subclass < NR_LOCKDEP_CACHING_CLASSES) 3072 if (subclass < NR_LOCKDEP_CACHING_CLASSES)
3076 class = lock->class_cache[subclass]; 3073 class = lock->class_cache[subclass];
@@ -3138,7 +3135,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3138 hlock->holdtime_stamp = lockstat_clock(); 3135 hlock->holdtime_stamp = lockstat_clock();
3139#endif 3136#endif
3140 3137
3141 if (check == 2 && !mark_irqflags(curr, hlock)) 3138 if (check && !mark_irqflags(curr, hlock))
3142 return 0; 3139 return 0;
3143 3140
3144 /* mark it as used: */ 3141 /* mark it as used: */
@@ -4191,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task)
4191} 4188}
4192EXPORT_SYMBOL_GPL(debug_show_held_locks); 4189EXPORT_SYMBOL_GPL(debug_show_held_locks);
4193 4190
4194void lockdep_sys_exit(void) 4191asmlinkage void lockdep_sys_exit(void)
4195{ 4192{
4196 struct task_struct *curr = current; 4193 struct task_struct *curr = current;
4197 4194
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
new file mode 100644
index 000000000000..f26b1a18e34e
--- /dev/null
+++ b/kernel/locking/locktorture.c
@@ -0,0 +1,452 @@
1/*
2 * Module-based torture test facility for locking
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright (C) IBM Corporation, 2014
19 *
20 * Author: Paul E. McKenney <paulmck@us.ibm.com>
21 * Based on kernel/rcu/torture.c.
22 */
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/init.h>
26#include <linux/module.h>
27#include <linux/kthread.h>
28#include <linux/err.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/interrupt.h>
32#include <linux/sched.h>
33#include <linux/atomic.h>
34#include <linux/bitops.h>
35#include <linux/completion.h>
36#include <linux/moduleparam.h>
37#include <linux/percpu.h>
38#include <linux/notifier.h>
39#include <linux/reboot.h>
40#include <linux/freezer.h>
41#include <linux/cpu.h>
42#include <linux/delay.h>
43#include <linux/stat.h>
44#include <linux/slab.h>
45#include <linux/trace_clock.h>
46#include <asm/byteorder.h>
47#include <linux/torture.h>
48
49MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
51
52torture_param(int, nwriters_stress, -1,
53 "Number of write-locking stress-test threads");
54torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
55torture_param(int, onoff_interval, 0,
56 "Time between CPU hotplugs (s), 0=disable");
57torture_param(int, shuffle_interval, 3,
58 "Number of jiffies between shuffles, 0=disable");
 59torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
60torture_param(int, stat_interval, 60,
61 "Number of seconds between stats printk()s");
62torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
63torture_param(bool, verbose, true,
64 "Enable verbose debugging printk()s");
65
66static char *torture_type = "spin_lock";
67module_param(torture_type, charp, 0444);
68MODULE_PARM_DESC(torture_type,
69 "Type of lock to torture (spin_lock, spin_lock_irq, ...)");
70
71static atomic_t n_lock_torture_errors;
72
73static struct task_struct *stats_task;
74static struct task_struct **writer_tasks;
75
76static int nrealwriters_stress;
77static bool lock_is_write_held;
78
79struct lock_writer_stress_stats {
80 long n_write_lock_fail;
81 long n_write_lock_acquired;
82};
83static struct lock_writer_stress_stats *lwsa;
84
85#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE)
86#define LOCKTORTURE_RUNNABLE_INIT 1
87#else
88#define LOCKTORTURE_RUNNABLE_INIT 0
89#endif
90int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;
91module_param(locktorture_runnable, int, 0444);
92MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot");
93
94/* Forward reference. */
95static void lock_torture_cleanup(void);
96
97/*
98 * Operations vector for selecting different types of tests.
99 */
100struct lock_torture_ops {
101 void (*init)(void);
102 int (*writelock)(void);
103 void (*write_delay)(struct torture_random_state *trsp);
104 void (*writeunlock)(void);
105 unsigned long flags;
106 const char *name;
107};
108
109static struct lock_torture_ops *cur_ops;
110
111/*
112 * Definitions for lock torture testing.
113 */
114
115static int torture_lock_busted_write_lock(void)
116{
117 return 0; /* BUGGY, do not use in real life!!! */
118}
119
120static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
121{
122 const unsigned long longdelay_us = 100;
123
124 /* We want a long delay occasionally to force massive contention. */
125 if (!(torture_random(trsp) %
126 (nrealwriters_stress * 2000 * longdelay_us)))
127 mdelay(longdelay_us);
128#ifdef CONFIG_PREEMPT
129 if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
130 preempt_schedule(); /* Allow test to be preempted. */
131#endif
132}
133
134static void torture_lock_busted_write_unlock(void)
135{
136 /* BUGGY, do not use in real life!!! */
137}
138
139static struct lock_torture_ops lock_busted_ops = {
140 .writelock = torture_lock_busted_write_lock,
141 .write_delay = torture_lock_busted_write_delay,
142 .writeunlock = torture_lock_busted_write_unlock,
143 .name = "lock_busted"
144};
145
146static DEFINE_SPINLOCK(torture_spinlock);
147
148static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
149{
150 spin_lock(&torture_spinlock);
151 return 0;
152}
153
154static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
155{
156 const unsigned long shortdelay_us = 2;
157 const unsigned long longdelay_us = 100;
158
159 /* We want a short delay mostly to emulate likely code, and
160 * we want a long delay occasionally to force massive contention.
161 */
162 if (!(torture_random(trsp) %
163 (nrealwriters_stress * 2000 * longdelay_us)))
164 mdelay(longdelay_us);
165 if (!(torture_random(trsp) %
166 (nrealwriters_stress * 2 * shortdelay_us)))
167 udelay(shortdelay_us);
168#ifdef CONFIG_PREEMPT
169 if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
170 preempt_schedule(); /* Allow test to be preempted. */
171#endif
172}
173
174static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
175{
176 spin_unlock(&torture_spinlock);
177}
178
179static struct lock_torture_ops spin_lock_ops = {
180 .writelock = torture_spin_lock_write_lock,
181 .write_delay = torture_spin_lock_write_delay,
182 .writeunlock = torture_spin_lock_write_unlock,
183 .name = "spin_lock"
184};
185
186static int torture_spin_lock_write_lock_irq(void)
187__acquires(torture_spinlock_irq)
188{
189 unsigned long flags;
190
191 spin_lock_irqsave(&torture_spinlock, flags);
192 cur_ops->flags = flags;
193 return 0;
194}
195
196static void torture_lock_spin_write_unlock_irq(void)
197__releases(torture_spinlock)
198{
199 spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags);
200}
201
202static struct lock_torture_ops spin_lock_irq_ops = {
203 .writelock = torture_spin_lock_write_lock_irq,
204 .write_delay = torture_spin_lock_write_delay,
205 .writeunlock = torture_lock_spin_write_unlock_irq,
206 .name = "spin_lock_irq"
207};
208
209/*
210 * Lock torture writer kthread. Repeatedly acquires and releases
211 * the lock, checking for duplicate acquisitions.
212 */
213static int lock_torture_writer(void *arg)
214{
215 struct lock_writer_stress_stats *lwsp = arg;
216 static DEFINE_TORTURE_RANDOM(rand);
217
218 VERBOSE_TOROUT_STRING("lock_torture_writer task started");
219 set_user_nice(current, 19);
220
221 do {
222 schedule_timeout_uninterruptible(1);
223 cur_ops->writelock();
224 if (WARN_ON_ONCE(lock_is_write_held))
225 lwsp->n_write_lock_fail++;
226 lock_is_write_held = 1;
227 lwsp->n_write_lock_acquired++;
228 cur_ops->write_delay(&rand);
229 lock_is_write_held = 0;
230 cur_ops->writeunlock();
231 stutter_wait("lock_torture_writer");
232 } while (!torture_must_stop());
233 torture_kthread_stopping("lock_torture_writer");
234 return 0;
235}
236
237/*
238 * Create a lock-torture-statistics message in the specified buffer.
239 */
240static void lock_torture_printk(char *page)
241{
242 bool fail = 0;
243 int i;
244 long max = 0;
245 long min = lwsa[0].n_write_lock_acquired;
246 long long sum = 0;
247
248 for (i = 0; i < nrealwriters_stress; i++) {
249 if (lwsa[i].n_write_lock_fail)
250 fail = true;
251 sum += lwsa[i].n_write_lock_acquired;
252 if (max < lwsa[i].n_write_lock_acquired)
253 max = lwsa[i].n_write_lock_acquired;
254 if (min > lwsa[i].n_write_lock_acquired)
255 min = lwsa[i].n_write_lock_acquired;
256 }
257 page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
258 page += sprintf(page,
259 "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
260 sum, max, min, max / 2 > min ? "???" : "",
261 fail, fail ? "!!!" : "");
262 if (fail)
263 atomic_inc(&n_lock_torture_errors);
264}
265
266/*
267 * Print torture statistics. Caller must ensure that there is only one
268 * call to this function at a given time!!! This is normally accomplished
269 * by relying on the module system to only have one copy of the module
270 * loaded, and then by giving the lock_torture_stats kthread full control
271 * (or the init/cleanup functions when lock_torture_stats thread is not
272 * running).
273 */
274static void lock_torture_stats_print(void)
275{
276 int size = nrealwriters_stress * 200 + 8192;
277 char *buf;
278
279 buf = kmalloc(size, GFP_KERNEL);
280 if (!buf) {
281 pr_err("lock_torture_stats_print: Out of memory, need: %d",
282 size);
283 return;
284 }
285 lock_torture_printk(buf);
286 pr_alert("%s", buf);
287 kfree(buf);
288}
289
290/*
291 * Periodically prints torture statistics, if periodic statistics printing
292 * was specified via the stat_interval module parameter.
293 *
294 * No need to worry about fullstop here, since this one doesn't reference
295 * volatile state or register callbacks.
296 */
297static int lock_torture_stats(void *arg)
298{
299 VERBOSE_TOROUT_STRING("lock_torture_stats task started");
300 do {
301 schedule_timeout_interruptible(stat_interval * HZ);
302 lock_torture_stats_print();
303 torture_shutdown_absorb("lock_torture_stats");
304 } while (!torture_must_stop());
305 torture_kthread_stopping("lock_torture_stats");
306 return 0;
307}
308
309static inline void
310lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
311 const char *tag)
312{
313 pr_alert("%s" TORTURE_FLAG
314 "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
315 torture_type, tag, nrealwriters_stress, stat_interval, verbose,
316 shuffle_interval, stutter, shutdown_secs,
317 onoff_interval, onoff_holdoff);
318}
319
320static void lock_torture_cleanup(void)
321{
322 int i;
323
324 if (torture_cleanup())
325 return;
326
327 if (writer_tasks) {
328 for (i = 0; i < nrealwriters_stress; i++)
329 torture_stop_kthread(lock_torture_writer,
330 writer_tasks[i]);
331 kfree(writer_tasks);
332 writer_tasks = NULL;
333 }
334
335 torture_stop_kthread(lock_torture_stats, stats_task);
336 lock_torture_stats_print(); /* -After- the stats thread is stopped! */
337
338 if (atomic_read(&n_lock_torture_errors))
339 lock_torture_print_module_parms(cur_ops,
340 "End of test: FAILURE");
341 else if (torture_onoff_failures())
342 lock_torture_print_module_parms(cur_ops,
343 "End of test: LOCK_HOTPLUG");
344 else
345 lock_torture_print_module_parms(cur_ops,
346 "End of test: SUCCESS");
347}
348
349static int __init lock_torture_init(void)
350{
351 int i;
352 int firsterr = 0;
353 static struct lock_torture_ops *torture_ops[] = {
354 &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,
355 };
356
357 torture_init_begin(torture_type, verbose, &locktorture_runnable);
358
359 /* Process args and tell the world that the torturer is on the job. */
360 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
361 cur_ops = torture_ops[i];
362 if (strcmp(torture_type, cur_ops->name) == 0)
363 break;
364 }
365 if (i == ARRAY_SIZE(torture_ops)) {
366 pr_alert("lock-torture: invalid torture type: \"%s\"\n",
367 torture_type);
368 pr_alert("lock-torture types:");
369 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
370 pr_alert(" %s", torture_ops[i]->name);
371 pr_alert("\n");
372 torture_init_end();
373 return -EINVAL;
374 }
375 if (cur_ops->init)
376 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
377
378 if (nwriters_stress >= 0)
379 nrealwriters_stress = nwriters_stress;
380 else
381 nrealwriters_stress = 2 * num_online_cpus();
382 lock_torture_print_module_parms(cur_ops, "Start of test");
383
384 /* Initialize the statistics so that each run gets its own numbers. */
385
386 lock_is_write_held = 0;
387 lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL);
388 if (lwsa == NULL) {
389 VERBOSE_TOROUT_STRING("lwsa: Out of memory");
390 firsterr = -ENOMEM;
391 goto unwind;
392 }
393 for (i = 0; i < nrealwriters_stress; i++) {
394 lwsa[i].n_write_lock_fail = 0;
395 lwsa[i].n_write_lock_acquired = 0;
396 }
397
398 /* Start up the kthreads. */
399
400 if (onoff_interval > 0) {
401 firsterr = torture_onoff_init(onoff_holdoff * HZ,
402 onoff_interval * HZ);
403 if (firsterr)
404 goto unwind;
405 }
406 if (shuffle_interval > 0) {
407 firsterr = torture_shuffle_init(shuffle_interval);
408 if (firsterr)
409 goto unwind;
410 }
411 if (shutdown_secs > 0) {
412 firsterr = torture_shutdown_init(shutdown_secs,
413 lock_torture_cleanup);
414 if (firsterr)
415 goto unwind;
416 }
417 if (stutter > 0) {
418 firsterr = torture_stutter_init(stutter);
419 if (firsterr)
420 goto unwind;
421 }
422
423 writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]),
424 GFP_KERNEL);
425 if (writer_tasks == NULL) {
426 VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
427 firsterr = -ENOMEM;
428 goto unwind;
429 }
430 for (i = 0; i < nrealwriters_stress; i++) {
431 firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i],
432 writer_tasks[i]);
433 if (firsterr)
434 goto unwind;
435 }
436 if (stat_interval > 0) {
437 firsterr = torture_create_kthread(lock_torture_stats, NULL,
438 stats_task);
439 if (firsterr)
440 goto unwind;
441 }
442 torture_init_end();
443 return 0;
444
445unwind:
446 torture_init_end();
447 lock_torture_cleanup();
448 return firsterr;
449}
450
451module_init(lock_torture_init);
452module_exit(lock_torture_cleanup);
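lock_torture_init() selects the lock flavour by matching the torture_type module parameter against the name field of one of the lock_torture_ops entries, so covering another primitive only means supplying writelock/write_delay/writeunlock callbacks and listing the new ops in torture_ops[]. As a hedged illustration (not part of this patch), a mutex flavour could look roughly like this, reusing the existing spinlock delay function:

    /* Hypothetical extra flavour for kernel/locking/locktorture.c. */
    #include <linux/mutex.h>        /* not currently included by the file */

    static DEFINE_MUTEX(torture_mutex);

    static int torture_mutex_lock(void) __acquires(torture_mutex)
    {
            mutex_lock(&torture_mutex);
            return 0;
    }

    static void torture_mutex_unlock(void) __releases(torture_mutex)
    {
            mutex_unlock(&torture_mutex);
    }

    static struct lock_torture_ops mutex_lock_ops = {
            .writelock      = torture_mutex_lock,
            .write_delay    = torture_spin_lock_write_delay,
            .writeunlock    = torture_mutex_unlock,
            .name           = "mutex_lock"
    };

    /* ...and &mutex_lock_ops would be added to torture_ops[] in lock_torture_init(). */

The module would then be exercised with something like torture_type=mutex_lock nwriters_stress=8 on the modprobe command line.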
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
new file mode 100644
index 000000000000..838dc9e00669
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.c
@@ -0,0 +1,178 @@
1
2#include <linux/percpu.h>
3#include <linux/mutex.h>
4#include <linux/sched.h>
5#include "mcs_spinlock.h"
6
7#ifdef CONFIG_SMP
8
9/*
10 * An MCS like lock especially tailored for optimistic spinning for sleeping
11 * lock implementations (mutex, rwsem, etc).
12 *
13 * Using a single mcs node per CPU is safe because sleeping locks should not be
14 * called from interrupt context and we have preemption disabled while
15 * spinning.
16 */
17static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
18
19/*
20 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
21 * Can return NULL in case we were the last queued and we updated @lock instead.
22 */
23static inline struct optimistic_spin_queue *
24osq_wait_next(struct optimistic_spin_queue **lock,
25 struct optimistic_spin_queue *node,
26 struct optimistic_spin_queue *prev)
27{
28 struct optimistic_spin_queue *next = NULL;
29
30 for (;;) {
31 if (*lock == node && cmpxchg(lock, node, prev) == node) {
32 /*
33 * We were the last queued, we moved @lock back. @prev
34 * will now observe @lock and will complete its
35 * unlock()/unqueue().
36 */
37 break;
38 }
39
40 /*
41 * We must xchg() the @node->next value, because if we were to
42 * leave it in, a concurrent unlock()/unqueue() from
43 * @node->next might complete Step-A and think its @prev is
44 * still valid.
45 *
46 * If the concurrent unlock()/unqueue() wins the race, we'll
47 * wait for either @lock to point to us, through its Step-B, or
48 * wait for a new @node->next from its Step-C.
49 */
50 if (node->next) {
51 next = xchg(&node->next, NULL);
52 if (next)
53 break;
54 }
55
56 arch_mutex_cpu_relax();
57 }
58
59 return next;
60}
61
62bool osq_lock(struct optimistic_spin_queue **lock)
63{
64 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
65 struct optimistic_spin_queue *prev, *next;
66
67 node->locked = 0;
68 node->next = NULL;
69
70 node->prev = prev = xchg(lock, node);
71 if (likely(prev == NULL))
72 return true;
73
74 ACCESS_ONCE(prev->next) = node;
75
76 /*
77 * Normally @prev is untouchable after the above store; because at that
78 * moment unlock can proceed and wipe the node element from stack.
79 *
80 * However, since our nodes are static per-cpu storage, we're
81 * guaranteed their existence -- this allows us to apply
82 * cmpxchg in an attempt to undo our queueing.
83 */
84
85 while (!smp_load_acquire(&node->locked)) {
86 /*
 87 * If we need to reschedule, bail out so we can block.
88 */
89 if (need_resched())
90 goto unqueue;
91
92 arch_mutex_cpu_relax();
93 }
94 return true;
95
96unqueue:
97 /*
98 * Step - A -- stabilize @prev
99 *
100 * Undo our @prev->next assignment; this will make @prev's
101 * unlock()/unqueue() wait for a next pointer since @lock points to us
102 * (or later).
103 */
104
105 for (;;) {
106 if (prev->next == node &&
107 cmpxchg(&prev->next, node, NULL) == node)
108 break;
109
110 /*
111 * We can only fail the cmpxchg() racing against an unlock(),
112 * in which case we should observe @node->locked becoming
113 * true.
114 */
115 if (smp_load_acquire(&node->locked))
116 return true;
117
118 arch_mutex_cpu_relax();
119
120 /*
121 * Or we race against a concurrent unqueue()'s step-B, in which
122 * case its step-C will write us a new @node->prev pointer.
123 */
124 prev = ACCESS_ONCE(node->prev);
125 }
126
127 /*
128 * Step - B -- stabilize @next
129 *
130 * Similar to unlock(), wait for @node->next or move @lock from @node
131 * back to @prev.
132 */
133
134 next = osq_wait_next(lock, node, prev);
135 if (!next)
136 return false;
137
138 /*
139 * Step - C -- unlink
140 *
141 * @prev is stable because it's still waiting for a new @prev->next
142 * pointer, @next is stable because our @node->next pointer is NULL and
143 * it will wait in Step-A.
144 */
145
146 ACCESS_ONCE(next->prev) = prev;
147 ACCESS_ONCE(prev->next) = next;
148
149 return false;
150}
151
152void osq_unlock(struct optimistic_spin_queue **lock)
153{
154 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
155 struct optimistic_spin_queue *next;
156
157 /*
158 * Fast path for the uncontended case.
159 */
160 if (likely(cmpxchg(lock, node, NULL) == node))
161 return;
162
163 /*
164 * Second most likely case.
165 */
166 next = xchg(&node->next, NULL);
167 if (next) {
168 ACCESS_ONCE(next->locked) = 1;
169 return;
170 }
171
172 next = osq_wait_next(lock, node, NULL);
173 if (next)
174 ACCESS_ONCE(next->locked) = 1;
175}
176
177#endif
178
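Unlike the plain MCS lock, osq_lock() can give up: when need_resched() is observed while waiting, it unqueues itself (Steps A-C above) and returns false, and the caller is expected to stop optimistic spinning and fall back to its sleeping slowpath. A schematic of that calling pattern is sketched below; struct demo_sleeping_lock and demo_try_spin() are illustrative stand-ins for the real users (the mutex conversion later in this patch is the first one).

    #include <linux/sched.h>
    #include <linux/mutex.h>        /* arch_mutex_cpu_relax() fallback */
    #include "mcs_spinlock.h"

    /* Illustrative only: the shape of an osq-based optimistic spin. */
    struct demo_sleeping_lock {
            struct task_struct              *owner;
            struct optimistic_spin_queue    *osq;
    };

    static bool demo_try_spin(struct demo_sleeping_lock *lock)
    {
            if (!osq_lock(&lock->osq))
                    return false;   /* unqueued (need_resched()); go block instead */

            while (ACCESS_ONCE(lock->owner)) {
                    if (need_resched())
                            break;  /* stop spinning so this CPU can reschedule */
                    arch_mutex_cpu_relax();
            }

            osq_unlock(&lock->osq);
            return true;            /* caller now try-locks or falls into its slowpath */
    }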
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
new file mode 100644
index 000000000000..a2dbac4aca6b
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.h
@@ -0,0 +1,129 @@
1/*
2 * MCS lock defines
3 *
4 * This file contains the main data structure and API definitions of MCS lock.
5 *
6 * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
7 * with the desirable properties of being fair, and with each cpu trying
8 * to acquire the lock spinning on a local variable.
9 * It avoids expensive cache bouncings that common test-and-set spin-lock
10 * implementations incur.
11 */
12#ifndef __LINUX_MCS_SPINLOCK_H
13#define __LINUX_MCS_SPINLOCK_H
14
15#include <asm/mcs_spinlock.h>
16
17struct mcs_spinlock {
18 struct mcs_spinlock *next;
19 int locked; /* 1 if lock acquired */
20};
21
22#ifndef arch_mcs_spin_lock_contended
23/*
24 * Using smp_load_acquire() provides a memory barrier that ensures
25 * subsequent operations happen after the lock is acquired.
26 */
27#define arch_mcs_spin_lock_contended(l) \
28do { \
29 while (!(smp_load_acquire(l))) \
30 arch_mutex_cpu_relax(); \
31} while (0)
32#endif
33
34#ifndef arch_mcs_spin_unlock_contended
35/*
36 * smp_store_release() provides a memory barrier to ensure all
 37 * operations in the critical section have been completed before
38 * unlocking.
39 */
40#define arch_mcs_spin_unlock_contended(l) \
41 smp_store_release((l), 1)
42#endif
43
44/*
45 * Note: the smp_load_acquire/smp_store_release pair is not
46 * sufficient to form a full memory barrier across
47 * cpus for many architectures (except x86) for mcs_unlock and mcs_lock.
48 * For applications that need a full barrier across multiple cpus
49 * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be
50 * used after mcs_lock.
51 */
52
53/*
54 * In order to acquire the lock, the caller should declare a local node and
55 * pass a reference of the node to this function in addition to the lock.
56 * If the lock has already been acquired, then this will proceed to spin
57 * on this node->locked until the previous lock holder sets the node->locked
58 * in mcs_spin_unlock().
59 *
60 * We don't inline mcs_spin_lock() so that perf can correctly account for the
61 * time spent in this lock function.
62 */
63static inline
64void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
65{
66 struct mcs_spinlock *prev;
67
68 /* Init node */
69 node->locked = 0;
70 node->next = NULL;
71
72 prev = xchg(lock, node);
73 if (likely(prev == NULL)) {
74 /*
75 * Lock acquired, don't need to set node->locked to 1. Threads
76 * only spin on its own node->locked value for lock acquisition.
77 * However, since this thread can immediately acquire the lock
78 * and does not proceed to spin on its own node->locked, this
79 * value won't be used. If a debug mode is needed to
80 * audit lock status, then set node->locked value here.
81 */
82 return;
83 }
84 ACCESS_ONCE(prev->next) = node;
85
86 /* Wait until the lock holder passes the lock down. */
87 arch_mcs_spin_lock_contended(&node->locked);
88}
89
90/*
91 * Releases the lock. The caller should pass in the corresponding node that
92 * was used to acquire the lock.
93 */
94static inline
95void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
96{
97 struct mcs_spinlock *next = ACCESS_ONCE(node->next);
98
99 if (likely(!next)) {
100 /*
101 * Release the lock by setting it to NULL
102 */
103 if (likely(cmpxchg(lock, node, NULL) == node))
104 return;
105 /* Wait until the next pointer is set */
106 while (!(next = ACCESS_ONCE(node->next)))
107 arch_mutex_cpu_relax();
108 }
109
110 /* Pass lock to next waiter. */
111 arch_mcs_spin_unlock_contended(&next->locked);
112}
113
114/*
115 * Cancellable version of the MCS lock above.
116 *
117 * Intended for adaptive spinning of sleeping locks:
118 * mutex_lock()/rwsem_down_{read,write}() etc.
119 */
120
121struct optimistic_spin_queue {
122 struct optimistic_spin_queue *next, *prev;
123 int locked; /* 1 if lock acquired */
124};
125
126extern bool osq_lock(struct optimistic_spin_queue **lock);
127extern void osq_unlock(struct optimistic_spin_queue **lock);
128
129#endif /* __LINUX_MCS_SPINLOCK_H */
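As the header comment says, the non-cancellable lock is used with one queue node per acquisition, typically on the caller's stack: the node is appended at the tail, the CPU spins on its own node->locked, and the previous holder hands the lock over through arch_mcs_spin_unlock_contended(). A minimal usage sketch, with the lock pointer and function invented for illustration:

    #include "mcs_spinlock.h"

    static struct mcs_spinlock *demo_mcs;   /* NULL = unlocked, otherwise tail of the queue */

    static void demo_locked_work(void)
    {
            struct mcs_spinlock node;       /* one node per acquisition, lives on this stack */

            mcs_spin_lock(&demo_mcs, &node);
            /* critical section: exactly one CPU at a time runs this */
            mcs_spin_unlock(&demo_mcs, &node);
    }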
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index faf6f5b53e77..e1191c996c59 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -83,6 +83,12 @@ void debug_mutex_unlock(struct mutex *lock)
83 83
84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
85 mutex_clear_owner(lock); 85 mutex_clear_owner(lock);
86
87 /*
88 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
89 * mutexes so that we can do it here after we've verified state.
90 */
91 atomic_set(&lock->count, 1);
86} 92}
87 93
88void debug_mutex_init(struct mutex *lock, const char *name, 94void debug_mutex_init(struct mutex *lock, const char *name,
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4dd6e4c219de..bc73d33c6760 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,6 +25,7 @@
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26#include <linux/interrupt.h> 26#include <linux/interrupt.h>
27#include <linux/debug_locks.h> 27#include <linux/debug_locks.h>
28#include "mcs_spinlock.h"
28 29
29/* 30/*
30 * In the DEBUG case we are using the "NULL fastpath" for mutexes, 31 * In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -33,6 +34,13 @@
33#ifdef CONFIG_DEBUG_MUTEXES 34#ifdef CONFIG_DEBUG_MUTEXES
34# include "mutex-debug.h" 35# include "mutex-debug.h"
35# include <asm-generic/mutex-null.h> 36# include <asm-generic/mutex-null.h>
37/*
38 * Must be 0 for the debug case so we do not do the unlock outside of the
39 * wait_lock region. debug_mutex_unlock() will do the actual unlock in this
40 * case.
41 */
42# undef __mutex_slowpath_needs_to_unlock
43# define __mutex_slowpath_needs_to_unlock() 0
36#else 44#else
37# include "mutex.h" 45# include "mutex.h"
38# include <asm/mutex.h> 46# include <asm/mutex.h>
@@ -52,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
52 INIT_LIST_HEAD(&lock->wait_list); 60 INIT_LIST_HEAD(&lock->wait_list);
53 mutex_clear_owner(lock); 61 mutex_clear_owner(lock);
54#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 62#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
55 lock->spin_mlock = NULL; 63 lock->osq = NULL;
56#endif 64#endif
57 65
58 debug_mutex_init(lock, name, key); 66 debug_mutex_init(lock, name, key);
@@ -67,8 +75,7 @@ EXPORT_SYMBOL(__mutex_init);
67 * We also put the fastpath first in the kernel image, to make sure the 75 * We also put the fastpath first in the kernel image, to make sure the
68 * branch is predicted by the CPU as default-untaken. 76 * branch is predicted by the CPU as default-untaken.
69 */ 77 */
70static __used noinline void __sched 78__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
71__mutex_lock_slowpath(atomic_t *lock_count);
72 79
73/** 80/**
74 * mutex_lock - acquire the mutex 81 * mutex_lock - acquire the mutex
@@ -111,54 +118,7 @@ EXPORT_SYMBOL(mutex_lock);
111 * more or less simultaneously, the spinners need to acquire a MCS lock 118 * more or less simultaneously, the spinners need to acquire a MCS lock
112 * first before spinning on the owner field. 119 * first before spinning on the owner field.
113 * 120 *
114 * We don't inline mspin_lock() so that perf can correctly account for the
115 * time spent in this lock function.
116 */ 121 */
117struct mspin_node {
118 struct mspin_node *next ;
119 int locked; /* 1 if lock acquired */
120};
121#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
122
123static noinline
124void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
125{
126 struct mspin_node *prev;
127
128 /* Init node */
129 node->locked = 0;
130 node->next = NULL;
131
132 prev = xchg(lock, node);
133 if (likely(prev == NULL)) {
134 /* Lock acquired */
135 node->locked = 1;
136 return;
137 }
138 ACCESS_ONCE(prev->next) = node;
139 smp_wmb();
140 /* Wait until the lock holder passes the lock down */
141 while (!ACCESS_ONCE(node->locked))
142 arch_mutex_cpu_relax();
143}
144
145static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
146{
147 struct mspin_node *next = ACCESS_ONCE(node->next);
148
149 if (likely(!next)) {
150 /*
151 * Release the lock by setting it to NULL
152 */
153 if (cmpxchg(lock, node, NULL) == node)
154 return;
155 /* Wait until the next pointer is set */
156 while (!(next = ACCESS_ONCE(node->next)))
157 arch_mutex_cpu_relax();
158 }
159 ACCESS_ONCE(next->locked) = 1;
160 smp_wmb();
161}
162 122
163/* 123/*
164 * Mutex spinning code migrated from kernel/sched/core.c 124 * Mutex spinning code migrated from kernel/sched/core.c
@@ -212,6 +172,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
212 struct task_struct *owner; 172 struct task_struct *owner;
213 int retval = 1; 173 int retval = 1;
214 174
175 if (need_resched())
176 return 0;
177
215 rcu_read_lock(); 178 rcu_read_lock();
216 owner = ACCESS_ONCE(lock->owner); 179 owner = ACCESS_ONCE(lock->owner);
217 if (owner) 180 if (owner)
@@ -225,7 +188,8 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
225} 188}
226#endif 189#endif
227 190
228static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 191__visible __used noinline
192void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
229 193
230/** 194/**
231 * mutex_unlock - release the mutex 195 * mutex_unlock - release the mutex
@@ -446,9 +410,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
446 if (!mutex_can_spin_on_owner(lock)) 410 if (!mutex_can_spin_on_owner(lock))
447 goto slowpath; 411 goto slowpath;
448 412
413 if (!osq_lock(&lock->osq))
414 goto slowpath;
415
449 for (;;) { 416 for (;;) {
450 struct task_struct *owner; 417 struct task_struct *owner;
451 struct mspin_node node;
452 418
453 if (use_ww_ctx && ww_ctx->acquired > 0) { 419 if (use_ww_ctx && ww_ctx->acquired > 0) {
454 struct ww_mutex *ww; 420 struct ww_mutex *ww;
@@ -463,19 +429,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
463 * performed the optimistic spinning cannot be done. 429 * performed the optimistic spinning cannot be done.
464 */ 430 */
465 if (ACCESS_ONCE(ww->ctx)) 431 if (ACCESS_ONCE(ww->ctx))
466 goto slowpath; 432 break;
467 } 433 }
468 434
469 /* 435 /*
470 * If there's an owner, wait for it to either 436 * If there's an owner, wait for it to either
471 * release the lock or go to sleep. 437 * release the lock or go to sleep.
472 */ 438 */
473 mspin_lock(MLOCK(lock), &node);
474 owner = ACCESS_ONCE(lock->owner); 439 owner = ACCESS_ONCE(lock->owner);
475 if (owner && !mutex_spin_on_owner(lock, owner)) { 440 if (owner && !mutex_spin_on_owner(lock, owner))
476 mspin_unlock(MLOCK(lock), &node); 441 break;
477 goto slowpath;
478 }
479 442
480 if ((atomic_read(&lock->count) == 1) && 443 if ((atomic_read(&lock->count) == 1) &&
481 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 444 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
@@ -488,11 +451,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
488 } 451 }
489 452
490 mutex_set_owner(lock); 453 mutex_set_owner(lock);
491 mspin_unlock(MLOCK(lock), &node); 454 osq_unlock(&lock->osq);
492 preempt_enable(); 455 preempt_enable();
493 return 0; 456 return 0;
494 } 457 }
495 mspin_unlock(MLOCK(lock), &node);
496 458
497 /* 459 /*
498 * When there's no owner, we might have preempted between the 460 * When there's no owner, we might have preempted between the
@@ -501,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
501 * the owner complete. 463 * the owner complete.
502 */ 464 */
503 if (!owner && (need_resched() || rt_task(task))) 465 if (!owner && (need_resched() || rt_task(task)))
504 goto slowpath; 466 break;
505 467
506 /* 468 /*
507 * The cpu_relax() call is a compiler barrier which forces 469 * The cpu_relax() call is a compiler barrier which forces
@@ -511,7 +473,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
511 */ 473 */
512 arch_mutex_cpu_relax(); 474 arch_mutex_cpu_relax();
513 } 475 }
476 osq_unlock(&lock->osq);
514slowpath: 477slowpath:
478 /*
479 * If we fell out of the spin path because of need_resched(),
480 * reschedule now, before we try-lock the mutex. This avoids getting
481 * scheduled out right after we obtained the mutex.
482 */
483 if (need_resched())
484 schedule_preempt_disabled();
515#endif 485#endif
516 spin_lock_mutex(&lock->wait_lock, flags); 486 spin_lock_mutex(&lock->wait_lock, flags);
517 487
@@ -717,10 +687,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
717 struct mutex *lock = container_of(lock_count, struct mutex, count); 687 struct mutex *lock = container_of(lock_count, struct mutex, count);
718 unsigned long flags; 688 unsigned long flags;
719 689
720 spin_lock_mutex(&lock->wait_lock, flags);
721 mutex_release(&lock->dep_map, nested, _RET_IP_);
722 debug_mutex_unlock(lock);
723
724 /* 690 /*
725 * some architectures leave the lock unlocked in the fastpath failure 691 * some architectures leave the lock unlocked in the fastpath failure
726 * case, others need to leave it locked. In the later case we have to 692 * case, others need to leave it locked. In the later case we have to
@@ -729,6 +695,10 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
729 if (__mutex_slowpath_needs_to_unlock()) 695 if (__mutex_slowpath_needs_to_unlock())
730 atomic_set(&lock->count, 1); 696 atomic_set(&lock->count, 1);
731 697
698 spin_lock_mutex(&lock->wait_lock, flags);
699 mutex_release(&lock->dep_map, nested, _RET_IP_);
700 debug_mutex_unlock(lock);
701
732 if (!list_empty(&lock->wait_list)) { 702 if (!list_empty(&lock->wait_list)) {
733 /* get the first entry from the wait-list: */ 703 /* get the first entry from the wait-list: */
734 struct mutex_waiter *waiter = 704 struct mutex_waiter *waiter =
@@ -746,7 +716,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
746/* 716/*
747 * Release the lock, slowpath: 717 * Release the lock, slowpath:
748 */ 718 */
749static __used noinline void 719__visible void
750__mutex_unlock_slowpath(atomic_t *lock_count) 720__mutex_unlock_slowpath(atomic_t *lock_count)
751{ 721{
752 __mutex_unlock_common_slowpath(lock_count, 1); 722 __mutex_unlock_common_slowpath(lock_count, 1);
@@ -803,7 +773,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
803} 773}
804EXPORT_SYMBOL(mutex_lock_killable); 774EXPORT_SYMBOL(mutex_lock_killable);
805 775
806static __used noinline void __sched 776__visible void __sched
807__mutex_lock_slowpath(atomic_t *lock_count) 777__mutex_lock_slowpath(atomic_t *lock_count)
808{ 778{
809 struct mutex *lock = container_of(lock_count, struct mutex, count); 779 struct mutex *lock = container_of(lock_count, struct mutex, count);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 2e960a2bab81..aa4dff04b594 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -213,6 +213,18 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
213} 213}
214 214
215/* 215/*
216 * Called by sched_setscheduler() to check whether the priority change
217 * is overruled by a possible priority boosting.
218 */
219int rt_mutex_check_prio(struct task_struct *task, int newprio)
220{
221 if (!task_has_pi_waiters(task))
222 return 0;
223
224 return task_top_pi_waiter(task)->task->prio <= newprio;
225}
226
227/*
216 * Adjust the priority of a task, after its pi_waiters got modified. 228 * Adjust the priority of a task, after its pi_waiters got modified.
217 * 229 *
218 * This can be both boosting and unboosting. task->pi_lock must be held. 230 * This can be both boosting and unboosting. task->pi_lock must be held.
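rt_mutex_check_prio() lets the scheduler ask whether a requested priority change would be masked by priority inheritance: it returns non-zero when the task's highest-priority PI waiter already demands newprio or better (lower prio value means higher priority, hence the <= comparison). A hedged sketch of the kind of caller this is intended for; the demo_* helpers are placeholders, not the actual sched_setscheduler() code:

    #include <linux/sched.h>

    /* Mirrors the definition added above; the real prototype lives in the headers. */
    extern int rt_mutex_check_prio(struct task_struct *task, int newprio);

    /* Placeholder stand-ins for the real scheduler-side bookkeeping. */
    static void demo_store_params(struct task_struct *p, int newprio) { }
    static void demo_requeue(struct task_struct *p) { }

    static int demo_setscheduler(struct task_struct *p, int newprio)
    {
            if (rt_mutex_check_prio(p, newprio)) {
                    /*
                     * A boosted waiter keeps the effective priority at or
                     * above newprio: record the new parameters only and let
                     * rt_mutex deboosting apply them later.
                     */
                    demo_store_params(p, newprio);
                    return 0;
            }

            demo_store_params(p, newprio);
            demo_requeue(p);        /* the effective priority really changes */
            return 0;
    }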
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 19c5fa95e0b4..1d66e08e897d 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -143,6 +143,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
143/* 143/*
144 * wait for the read lock to be granted 144 * wait for the read lock to be granted
145 */ 145 */
146__visible
146struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) 147struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
147{ 148{
148 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; 149 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
@@ -190,6 +191,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
190/* 191/*
191 * wait until we successfully acquire the write lock 192 * wait until we successfully acquire the write lock
192 */ 193 */
194__visible
193struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) 195struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
194{ 196{
195 long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; 197 long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
@@ -252,6 +254,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
252 * handle waking up a waiter on the semaphore 254 * handle waking up a waiter on the semaphore
253 * - up_read/up_write has decremented the active part of count if we come here 255 * - up_read/up_write has decremented the active part of count if we come here
254 */ 256 */
257__visible
255struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) 258struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
256{ 259{
257 unsigned long flags; 260 unsigned long flags;
@@ -272,6 +275,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
272 * - caller incremented waiting part of count and discovered it still negative 275 * - caller incremented waiting part of count and discovered it still negative
273 * - just wake up any readers at the front of the queue 276 * - just wake up any readers at the front of the queue
274 */ 277 */
278__visible
275struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) 279struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
276{ 280{
277 unsigned long flags; 281 unsigned long flags;
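The __visible annotations added here (and in the lockdep, mutex and panic hunks elsewhere in this diff) mark slow-path entry points that have no ordinary C caller in their own translation unit, typically because they are reached from fast-path assembly or register-call glue; presumably the point is to keep link-time optimization from localizing or discarding the symbols. The pattern, sketched with a placeholder function (the mapping of __visible to gcc's externally_visible attribute is the usual compiler-header definition, not something this patch shows):

    #include <linux/compiler.h>

    /*
     * Entry point reached only from assembly / fastpath glue: annotate it
     * so its symbol stays externally visible under LTO.
     */
    __visible void demo_slowpath_entry(void)
    {
            /* the real slow-path work would live here */
    }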
diff --git a/kernel/module.c b/kernel/module.c
index d24fcf29cb64..8dc7f5e80dd8 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1015,7 +1015,7 @@ static size_t module_flags_taint(struct module *mod, char *buf)
1015 buf[l++] = 'C'; 1015 buf[l++] = 'C';
1016 /* 1016 /*
1017 * TAINT_FORCED_RMMOD: could be added. 1017 * TAINT_FORCED_RMMOD: could be added.
1018 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't 1018 * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
1019 * apply to modules. 1019 * apply to modules.
1020 */ 1020 */
1021 return l; 1021 return l;
@@ -1948,6 +1948,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1948 1948
1949 switch (sym[i].st_shndx) { 1949 switch (sym[i].st_shndx) {
1950 case SHN_COMMON: 1950 case SHN_COMMON:
1951 /* Ignore common symbols */
1952 if (!strncmp(name, "__gnu_lto", 9))
1953 break;
1954
1951 /* We compiled with -fno-common. These are not 1955 /* We compiled with -fno-common. These are not
1952 supposed to happen. */ 1956 supposed to happen. */
1953 pr_debug("Common symbol: %s\n", name); 1957 pr_debug("Common symbol: %s\n", name);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2d5cc4ccff7f..db4c8b08a50c 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
309 * racy then it does not matter what the result of the test 309 * racy then it does not matter what the result of the test
310 * is, we re-check the list after having taken the lock anyway: 310 * is, we re-check the list after having taken the lock anyway:
311 */ 311 */
312 if (rcu_dereference_raw(nh->head)) { 312 if (rcu_access_pointer(nh->head)) {
313 down_read(&nh->rwsem); 313 down_read(&nh->rwsem);
314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, 314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
315 nr_calls); 315 nr_calls);
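rcu_access_pointer() fits here because nh->head is only being tested against NULL, never dereferenced: unlike rcu_dereference_raw() it carries no implication that a dereference follows, so it needs no RCU read-side critical section and better documents the intent. A small contrast sketch with an invented RCU-protected pointer:

    #include <linux/rcupdate.h>

    struct demo_cfg { int value; };
    static struct demo_cfg __rcu *demo_cfg_ptr;     /* illustrative global */

    static int demo_read_value(void)
    {
            struct demo_cfg *cfg;
            int val = 0;

            /* Cheap emptiness test: no rcu_read_lock() required. */
            if (!rcu_access_pointer(demo_cfg_ptr))
                    return 0;

            rcu_read_lock();
            cfg = rcu_dereference(demo_cfg_ptr);    /* actual dereference */
            if (cfg)
                    val = cfg->value;
            rcu_read_unlock();

            return val;
    }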
diff --git a/kernel/panic.c b/kernel/panic.c
index 6d6300375090..cca8a913ae7c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -199,7 +199,7 @@ struct tnt {
199static const struct tnt tnts[] = { 199static const struct tnt tnts[] = {
200 { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, 200 { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
201 { TAINT_FORCED_MODULE, 'F', ' ' }, 201 { TAINT_FORCED_MODULE, 'F', ' ' },
202 { TAINT_UNSAFE_SMP, 'S', ' ' }, 202 { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' },
203 { TAINT_FORCED_RMMOD, 'R', ' ' }, 203 { TAINT_FORCED_RMMOD, 'R', ' ' },
204 { TAINT_MACHINE_CHECK, 'M', ' ' }, 204 { TAINT_MACHINE_CHECK, 'M', ' ' },
205 { TAINT_BAD_PAGE, 'B', ' ' }, 205 { TAINT_BAD_PAGE, 'B', ' ' },
@@ -459,7 +459,7 @@ EXPORT_SYMBOL(warn_slowpath_null);
459 * Called when gcc's -fstack-protector feature is used, and 459 * Called when gcc's -fstack-protector feature is used, and
460 * gcc detects corruption of the on-stack canary value 460 * gcc detects corruption of the on-stack canary value
461 */ 461 */
462void __stack_chk_fail(void) 462__visible void __stack_chk_fail(void)
463{ 463{
464 panic("stack-protector: Kernel stack is corrupted in: %p\n", 464 panic("stack-protector: Kernel stack is corrupted in: %p\n",
465 __builtin_return_address(0)); 465 __builtin_return_address(0));
diff --git a/kernel/power/console.c b/kernel/power/console.c
index eacb8bd8cab4..aba9c545a0e3 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -9,6 +9,7 @@
9#include <linux/kbd_kern.h> 9#include <linux/kbd_kern.h>
10#include <linux/vt.h> 10#include <linux/vt.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
12#include "power.h" 13#include "power.h"
13 14
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 15#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b1d255f04135..4dae9cbe9259 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1076,7 +1076,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1076 next_seq = log_next_seq; 1076 next_seq = log_next_seq;
1077 1077
1078 len = 0; 1078 len = 0;
1079 prev = 0;
1080 while (len >= 0 && seq < next_seq) { 1079 while (len >= 0 && seq < next_seq) {
1081 struct printk_log *msg = log_from_idx(idx); 1080 struct printk_log *msg = log_from_idx(idx);
1082 int textlen; 1081 int textlen;
@@ -2788,7 +2787,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2788 next_idx = idx; 2787 next_idx = idx;
2789 2788
2790 l = 0; 2789 l = 0;
2791 prev = 0;
2792 while (seq < dumper->next_seq) { 2790 while (seq < dumper->next_seq) {
2793 struct printk_log *msg = log_from_idx(idx); 2791 struct printk_log *msg = log_from_idx(idx);
2794 2792
diff --git a/kernel/profile.c b/kernel/profile.c
index 6631e1ef55ab..ebdd9c1a86b4 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -549,14 +549,14 @@ static int create_hash_tables(void)
549 struct page *page; 549 struct page *page;
550 550
551 page = alloc_pages_exact_node(node, 551 page = alloc_pages_exact_node(node,
552 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 552 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
553 0); 553 0);
554 if (!page) 554 if (!page)
555 goto out_cleanup; 555 goto out_cleanup;
556 per_cpu(cpu_profile_hits, cpu)[1] 556 per_cpu(cpu_profile_hits, cpu)[1]
557 = (struct profile_hit *)page_address(page); 557 = (struct profile_hit *)page_address(page);
558 page = alloc_pages_exact_node(node, 558 page = alloc_pages_exact_node(node,
559 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 559 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
560 0); 560 0);
561 if (!page) 561 if (!page)
562 goto out_cleanup; 562 goto out_cleanup;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1f4bcb3cc21c..adf98622cb32 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1180,8 +1180,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
1180 return ret; 1180 return ret;
1181} 1181}
1182 1182
1183asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, 1183COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
1184 compat_long_t addr, compat_long_t data) 1184 compat_long_t, addr, compat_long_t, data)
1185{ 1185{
1186 struct task_struct *child; 1186 struct task_struct *child;
1187 long ret; 1187 long ret;
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 01e9ec37a3e3..807ccfbf69b3 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,5 +1,5 @@
1obj-y += update.o srcu.o 1obj-y += update.o srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o 2obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
3obj-$(CONFIG_TREE_RCU) += tree.o 3obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o 4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o 5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 79c3877e9c5b..bfda2726ca45 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2011 18 * Copyright IBM Corporation, 2011
19 * 19 *
@@ -23,6 +23,7 @@
23#ifndef __LINUX_RCU_H 23#ifndef __LINUX_RCU_H
24#define __LINUX_RCU_H 24#define __LINUX_RCU_H
25 25
26#include <trace/events/rcu.h>
26#ifdef CONFIG_RCU_TRACE 27#ifdef CONFIG_RCU_TRACE
27#define RCU_TRACE(stmt) stmt 28#define RCU_TRACE(stmt) stmt
28#else /* #ifdef CONFIG_RCU_TRACE */ 29#else /* #ifdef CONFIG_RCU_TRACE */
@@ -116,8 +117,6 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
116 } 117 }
117} 118}
118 119
119extern int rcu_expedited;
120
121#ifdef CONFIG_RCU_STALL_COMMON 120#ifdef CONFIG_RCU_STALL_COMMON
122 121
123extern int rcu_cpu_stall_suppress; 122extern int rcu_cpu_stall_suppress;
diff --git a/kernel/rcu/torture.c b/kernel/rcu/rcutorture.c
index 732f8ae3086a..bd30bc61bc05 100644
--- a/kernel/rcu/torture.c
+++ b/kernel/rcu/rcutorture.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2005, 2006 18 * Copyright (C) IBM Corporation, 2005, 2006
19 * 19 *
@@ -48,110 +48,58 @@
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <linux/trace_clock.h> 49#include <linux/trace_clock.h>
50#include <asm/byteorder.h> 50#include <asm/byteorder.h>
51#include <linux/torture.h>
51 52
52MODULE_LICENSE("GPL"); 53MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 54MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
54 55
55MODULE_ALIAS("rcutorture"); 56
56#ifdef MODULE_PARAM_PREFIX 57torture_param(int, fqs_duration, 0,
57#undef MODULE_PARAM_PREFIX 58 "Duration of fqs bursts (us), 0 to disable");
58#endif 59torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
59#define MODULE_PARAM_PREFIX "rcutorture." 60torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
60 61torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
61static int fqs_duration; 62torture_param(bool, gp_normal, false,
62module_param(fqs_duration, int, 0444); 63 "Use normal (non-expedited) GP wait primitives");
63MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); 64torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
64static int fqs_holdoff; 65torture_param(int, n_barrier_cbs, 0,
65module_param(fqs_holdoff, int, 0444); 66 "# of callbacks/kthreads for barrier testing");
66MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 67torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
67static int fqs_stutter = 3; 68torture_param(int, nreaders, -1, "Number of RCU reader threads");
68module_param(fqs_stutter, int, 0444); 69torture_param(int, object_debug, 0,
69MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 70 "Enable debug-object double call_rcu() testing");
70static bool gp_exp; 71torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
71module_param(gp_exp, bool, 0444); 72torture_param(int, onoff_interval, 0,
72MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); 73 "Time between CPU hotplugs (s), 0=disable");
73static bool gp_normal; 74torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
74module_param(gp_normal, bool, 0444); 75torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
75MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); 76torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
76static int irqreader = 1; 77torture_param(int, stall_cpu_holdoff, 10,
77module_param(irqreader, int, 0444); 78 "Time to wait before starting stall (s).");
78MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 79torture_param(int, stat_interval, 60,
79static int n_barrier_cbs; 80 "Number of seconds between stats printk()s");
80module_param(n_barrier_cbs, int, 0444); 81torture_param(int, stutter, 5, "Number of seconds to run/halt test");
81MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); 82torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
82static int nfakewriters = 4; 83torture_param(int, test_boost_duration, 4,
83module_param(nfakewriters, int, 0444); 84 "Duration of each boost test, seconds.");
84MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); 85torture_param(int, test_boost_interval, 7,
85static int nreaders = -1; 86 "Interval between boost tests, seconds.");
86module_param(nreaders, int, 0444); 87torture_param(bool, test_no_idle_hz, true,
87MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 88 "Test support for tickless idle CPUs");
88static int object_debug; 89torture_param(bool, verbose, true,
89module_param(object_debug, int, 0444); 90 "Enable verbose debugging printk()s");
90MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); 91
91static int onoff_holdoff;
92module_param(onoff_holdoff, int, 0444);
93MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
94static int onoff_interval;
95module_param(onoff_interval, int, 0444);
96MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
97static int shuffle_interval = 3;
98module_param(shuffle_interval, int, 0444);
99MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
100static int shutdown_secs;
101module_param(shutdown_secs, int, 0444);
102MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable.");
103static int stall_cpu;
104module_param(stall_cpu, int, 0444);
105MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
106static int stall_cpu_holdoff = 10;
107module_param(stall_cpu_holdoff, int, 0444);
108MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
109static int stat_interval = 60;
110module_param(stat_interval, int, 0644);
111MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
112static int stutter = 5;
113module_param(stutter, int, 0444);
114MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
115static int test_boost = 1;
116module_param(test_boost, int, 0444);
117MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
118static int test_boost_duration = 4;
119module_param(test_boost_duration, int, 0444);
120MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
121static int test_boost_interval = 7;
122module_param(test_boost_interval, int, 0444);
123MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
124static bool test_no_idle_hz = true;
125module_param(test_no_idle_hz, bool, 0444);
126MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
127static char *torture_type = "rcu"; 92static char *torture_type = "rcu";
128module_param(torture_type, charp, 0444); 93module_param(torture_type, charp, 0444);
129MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); 94MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
130static bool verbose;
131module_param(verbose, bool, 0444);
132MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
133
134#define TORTURE_FLAG "-torture:"
135#define PRINTK_STRING(s) \
136 do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
137#define VERBOSE_PRINTK_STRING(s) \
138 do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
139#define VERBOSE_PRINTK_ERRSTRING(s) \
140 do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
141 95
142static int nrealreaders; 96static int nrealreaders;
143static struct task_struct *writer_task; 97static struct task_struct *writer_task;
144static struct task_struct **fakewriter_tasks; 98static struct task_struct **fakewriter_tasks;
145static struct task_struct **reader_tasks; 99static struct task_struct **reader_tasks;
146static struct task_struct *stats_task; 100static struct task_struct *stats_task;
147static struct task_struct *shuffler_task;
148static struct task_struct *stutter_task;
149static struct task_struct *fqs_task; 101static struct task_struct *fqs_task;
150static struct task_struct *boost_tasks[NR_CPUS]; 102static struct task_struct *boost_tasks[NR_CPUS];
151static struct task_struct *shutdown_task;
152#ifdef CONFIG_HOTPLUG_CPU
153static struct task_struct *onoff_task;
154#endif /* #ifdef CONFIG_HOTPLUG_CPU */
155static struct task_struct *stall_task; 103static struct task_struct *stall_task;
156static struct task_struct **barrier_cbs_tasks; 104static struct task_struct **barrier_cbs_tasks;
157static struct task_struct *barrier_task; 105static struct task_struct *barrier_task;
@@ -170,10 +118,10 @@ static struct rcu_torture __rcu *rcu_torture_current;
170static unsigned long rcu_torture_current_version; 118static unsigned long rcu_torture_current_version;
171static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 119static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
172static DEFINE_SPINLOCK(rcu_torture_lock); 120static DEFINE_SPINLOCK(rcu_torture_lock);
173static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 121static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
174 { 0 }; 122 rcu_torture_count) = { 0 };
175static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = 123static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
176 { 0 }; 124 rcu_torture_batch) = { 0 };
177static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; 125static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
178static atomic_t n_rcu_torture_alloc; 126static atomic_t n_rcu_torture_alloc;
179static atomic_t n_rcu_torture_alloc_fail; 127static atomic_t n_rcu_torture_alloc_fail;
@@ -186,22 +134,9 @@ static long n_rcu_torture_boost_rterror;
186static long n_rcu_torture_boost_failure; 134static long n_rcu_torture_boost_failure;
187static long n_rcu_torture_boosts; 135static long n_rcu_torture_boosts;
188static long n_rcu_torture_timers; 136static long n_rcu_torture_timers;
189static long n_offline_attempts;
190static long n_offline_successes;
191static unsigned long sum_offline;
192static int min_offline = -1;
193static int max_offline;
194static long n_online_attempts;
195static long n_online_successes;
196static unsigned long sum_online;
197static int min_online = -1;
198static int max_online;
199static long n_barrier_attempts; 137static long n_barrier_attempts;
200static long n_barrier_successes; 138static long n_barrier_successes;
201static struct list_head rcu_torture_removed; 139static struct list_head rcu_torture_removed;
202static cpumask_var_t shuffle_tmp_mask;
203
204static int stutter_pause_test;
205 140
206#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 141#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
207#define RCUTORTURE_RUNNABLE_INIT 1 142#define RCUTORTURE_RUNNABLE_INIT 1
@@ -232,7 +167,6 @@ static u64 notrace rcu_trace_clock_local(void)
232} 167}
233#endif /* #else #ifdef CONFIG_RCU_TRACE */ 168#endif /* #else #ifdef CONFIG_RCU_TRACE */
234 169
235static unsigned long shutdown_time; /* jiffies to system shutdown. */
236static unsigned long boost_starttime; /* jiffies of next boost test start. */ 170static unsigned long boost_starttime; /* jiffies of next boost test start. */
237DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
238 /* and boost task create/destroy. */ 172 /* and boost task create/destroy. */
@@ -242,51 +176,6 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
242static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ 176static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
243static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); 177static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
244 178
245/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
246
247#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
248#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
249#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
250static int fullstop = FULLSTOP_RMMOD;
251/*
252 * Protect fullstop transitions and spawning of kthreads.
253 */
254static DEFINE_MUTEX(fullstop_mutex);
255
256/* Forward reference. */
257static void rcu_torture_cleanup(void);
258
259/*
260 * Detect and respond to a system shutdown.
261 */
262static int
263rcutorture_shutdown_notify(struct notifier_block *unused1,
264 unsigned long unused2, void *unused3)
265{
266 mutex_lock(&fullstop_mutex);
267 if (fullstop == FULLSTOP_DONTSTOP)
268 fullstop = FULLSTOP_SHUTDOWN;
269 else
270 pr_warn(/* but going down anyway, so... */
271 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
272 mutex_unlock(&fullstop_mutex);
273 return NOTIFY_DONE;
274}
275
276/*
277 * Absorb kthreads into a kernel function that won't return, so that
278 * they won't ever access module text or data again.
279 */
280static void rcutorture_shutdown_absorb(const char *title)
281{
282 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
283 pr_notice(
284 "rcutorture thread %s parking due to system shutdown\n",
285 title);
286 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
287 }
288}
289
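Note on the conversion: throughout this patch the open-coded "kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP" tests become torture_must_stop() and torture_must_stop_irq(). The following is a minimal sketch of the equivalent predicates, reconstructed from the removed code above; it assumes the new kernel/torture.c keeps the FULLSTOP state shown here, and is not the kernel source itself.

#include <linux/kthread.h>

#define FULLSTOP_DONTSTOP 0	/* Normal operation. */
#define FULLSTOP_SHUTDOWN 1	/* System shutdown with torture test running. */
#define FULLSTOP_RMMOD    2	/* Normal rmmod of the torture module. */

static int fullstop = FULLSTOP_RMMOD;	/* Now owned by kernel/torture.c. */

/* Callback/irq context cannot call kthread_should_stop(); see rcu_torture_cb(). */
static inline bool torture_must_stop_irq(void)
{
	return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
}

/* True when a torture kthread must break out of its main loop. */
static inline bool torture_must_stop(void)
{
	return torture_must_stop_irq() || kthread_should_stop();
}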
290/* 179/*
291 * Allocate an element from the rcu_tortures pool. 180 * Allocate an element from the rcu_tortures pool.
292 */ 181 */
@@ -320,44 +209,6 @@ rcu_torture_free(struct rcu_torture *p)
320 spin_unlock_bh(&rcu_torture_lock); 209 spin_unlock_bh(&rcu_torture_lock);
321} 210}
322 211
323struct rcu_random_state {
324 unsigned long rrs_state;
325 long rrs_count;
326};
327
328#define RCU_RANDOM_MULT 39916801 /* prime */
329#define RCU_RANDOM_ADD 479001701 /* prime */
330#define RCU_RANDOM_REFRESH 10000
331
332#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
333
334/*
335 * Crude but fast random-number generator. Uses a linear congruential
336 * generator, with occasional help from cpu_clock().
337 */
338static unsigned long
339rcu_random(struct rcu_random_state *rrsp)
340{
341 if (--rrsp->rrs_count < 0) {
342 rrsp->rrs_state += (unsigned long)local_clock();
343 rrsp->rrs_count = RCU_RANDOM_REFRESH;
344 }
345 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
346 return swahw32(rrsp->rrs_state);
347}
348
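For reference, the rcu_random() helper above moves to kernel/torture.c as torture_random(). Below is a standalone userspace sketch of the same linear-congruential scheme, using the constants shown above; clock_gettime() and an explicit halfword swap stand in for local_clock() and swahw32(), so this is an approximation rather than the kernel source.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define TORTURE_RANDOM_MULT	39916801	/* prime */
#define TORTURE_RANDOM_ADD	479001701	/* prime */
#define TORTURE_RANDOM_REFRESH	10000

struct torture_random_state {
	unsigned long trs_state;
	long trs_count;
};

/* Crude but fast LCG, periodically reseeded from the clock. */
static unsigned long torture_random(struct torture_random_state *trsp)
{
	if (--trsp->trs_count < 0) {
		struct timespec ts;

		/* Fold in a little clock entropy, as local_clock() did. */
		clock_gettime(CLOCK_MONOTONIC, &ts);
		trsp->trs_state += (unsigned long)ts.tv_nsec;
		trsp->trs_count = TORTURE_RANDOM_REFRESH;
	}
	trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT +
			  TORTURE_RANDOM_ADD;
	/* swahw32() stand-in: swap the 16-bit halfwords of the low 32 bits. */
	return ((trsp->trs_state & 0xffff) << 16) |
	       ((trsp->trs_state >> 16) & 0xffff);
}

int main(void)
{
	struct torture_random_state trs = { 0, 0 };
	int i;

	for (i = 0; i < 4; i++)
		printf("%lu\n", torture_random(&trs) % 10);
	return 0;
}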
349static void
350rcu_stutter_wait(const char *title)
351{
352 while (stutter_pause_test || !rcutorture_runnable) {
353 if (rcutorture_runnable)
354 schedule_timeout_interruptible(1);
355 else
356 schedule_timeout_interruptible(round_jiffies_relative(HZ));
357 rcutorture_shutdown_absorb(title);
358 }
359}
360
361/* 212/*
362 * Operations vector for selecting different types of tests. 213 * Operations vector for selecting different types of tests.
363 */ 214 */
@@ -365,7 +216,7 @@ rcu_stutter_wait(const char *title)
365struct rcu_torture_ops { 216struct rcu_torture_ops {
366 void (*init)(void); 217 void (*init)(void);
367 int (*readlock)(void); 218 int (*readlock)(void);
368 void (*read_delay)(struct rcu_random_state *rrsp); 219 void (*read_delay)(struct torture_random_state *rrsp);
369 void (*readunlock)(int idx); 220 void (*readunlock)(int idx);
370 int (*completed)(void); 221 int (*completed)(void);
371 void (*deferred_free)(struct rcu_torture *p); 222 void (*deferred_free)(struct rcu_torture *p);
@@ -392,7 +243,7 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
392 return 0; 243 return 0;
393} 244}
394 245
395static void rcu_read_delay(struct rcu_random_state *rrsp) 246static void rcu_read_delay(struct torture_random_state *rrsp)
396{ 247{
397 const unsigned long shortdelay_us = 200; 248 const unsigned long shortdelay_us = 200;
398 const unsigned long longdelay_ms = 50; 249 const unsigned long longdelay_ms = 50;
@@ -401,12 +252,13 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
401 * period, and we want a long delay occasionally to trigger 252 * period, and we want a long delay occasionally to trigger
402 * force_quiescent_state. */ 253 * force_quiescent_state. */
403 254
404 if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) 255 if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
405 mdelay(longdelay_ms); 256 mdelay(longdelay_ms);
406 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) 257 if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
407 udelay(shortdelay_us); 258 udelay(shortdelay_us);
408#ifdef CONFIG_PREEMPT 259#ifdef CONFIG_PREEMPT
409 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) 260 if (!preempt_count() &&
261 !(torture_random(rrsp) % (nrealreaders * 20000)))
410 preempt_schedule(); /* No QS if preempt_disable() in effect */ 262 preempt_schedule(); /* No QS if preempt_disable() in effect */
411#endif 263#endif
412} 264}
@@ -427,7 +279,7 @@ rcu_torture_cb(struct rcu_head *p)
427 int i; 279 int i;
428 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); 280 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
429 281
430 if (fullstop != FULLSTOP_DONTSTOP) { 282 if (torture_must_stop_irq()) {
431 /* Test is ending, just drop callbacks on the floor. */ 283 /* Test is ending, just drop callbacks on the floor. */
432 /* The next initialization will pick up the pieces. */ 284 /* The next initialization will pick up the pieces. */
433 return; 285 return;
@@ -520,6 +372,48 @@ static struct rcu_torture_ops rcu_bh_ops = {
520}; 372};
521 373
522/* 374/*
375 * Don't even think about trying any of these in real life!!!
 376 * The names include "busted", and they really mean it!
377 * The only purpose of these functions is to provide a buggy RCU
378 * implementation to make sure that rcutorture correctly emits
379 * buggy-RCU error messages.
380 */
381static void rcu_busted_torture_deferred_free(struct rcu_torture *p)
382{
383 /* This is a deliberate bug for testing purposes only! */
384 rcu_torture_cb(&p->rtort_rcu);
385}
386
387static void synchronize_rcu_busted(void)
388{
389 /* This is a deliberate bug for testing purposes only! */
390}
391
392static void
393call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
394{
395 /* This is a deliberate bug for testing purposes only! */
396 func(head);
397}
398
399static struct rcu_torture_ops rcu_busted_ops = {
400 .init = rcu_sync_torture_init,
401 .readlock = rcu_torture_read_lock,
402 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
403 .readunlock = rcu_torture_read_unlock,
404 .completed = rcu_no_completed,
405 .deferred_free = rcu_busted_torture_deferred_free,
406 .sync = synchronize_rcu_busted,
407 .exp_sync = synchronize_rcu_busted,
408 .call = call_rcu_busted,
409 .cb_barrier = NULL,
410 .fqs = NULL,
411 .stats = NULL,
412 .irq_capable = 1,
413 .name = "rcu_busted"
414};
415
416/*
523 * Definitions for srcu torture testing. 417 * Definitions for srcu torture testing.
524 */ 418 */
525 419
@@ -530,7 +424,7 @@ static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
530 return srcu_read_lock(&srcu_ctl); 424 return srcu_read_lock(&srcu_ctl);
531} 425}
532 426
533static void srcu_read_delay(struct rcu_random_state *rrsp) 427static void srcu_read_delay(struct torture_random_state *rrsp)
534{ 428{
535 long delay; 429 long delay;
536 const long uspertick = 1000000 / HZ; 430 const long uspertick = 1000000 / HZ;
@@ -538,7 +432,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
538 432
539 /* We want there to be long-running readers, but not all the time. */ 433 /* We want there to be long-running readers, but not all the time. */
540 434
541 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); 435 delay = torture_random(rrsp) %
436 (nrealreaders * 2 * longdelay * uspertick);
542 if (!delay) 437 if (!delay)
543 schedule_timeout_interruptible(longdelay); 438 schedule_timeout_interruptible(longdelay);
544 else 439 else
@@ -677,12 +572,12 @@ static int rcu_torture_boost(void *arg)
677 struct rcu_boost_inflight rbi = { .inflight = 0 }; 572 struct rcu_boost_inflight rbi = { .inflight = 0 };
678 struct sched_param sp; 573 struct sched_param sp;
679 574
680 VERBOSE_PRINTK_STRING("rcu_torture_boost started"); 575 VERBOSE_TOROUT_STRING("rcu_torture_boost started");
681 576
682 /* Set real-time priority. */ 577 /* Set real-time priority. */
683 sp.sched_priority = 1; 578 sp.sched_priority = 1;
684 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { 579 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
685 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); 580 VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
686 n_rcu_torture_boost_rterror++; 581 n_rcu_torture_boost_rterror++;
687 } 582 }
688 583
@@ -693,9 +588,8 @@ static int rcu_torture_boost(void *arg)
693 oldstarttime = boost_starttime; 588 oldstarttime = boost_starttime;
694 while (ULONG_CMP_LT(jiffies, oldstarttime)) { 589 while (ULONG_CMP_LT(jiffies, oldstarttime)) {
695 schedule_timeout_interruptible(oldstarttime - jiffies); 590 schedule_timeout_interruptible(oldstarttime - jiffies);
696 rcu_stutter_wait("rcu_torture_boost"); 591 stutter_wait("rcu_torture_boost");
697 if (kthread_should_stop() || 592 if (torture_must_stop())
698 fullstop != FULLSTOP_DONTSTOP)
699 goto checkwait; 593 goto checkwait;
700 } 594 }
701 595
@@ -710,15 +604,14 @@ static int rcu_torture_boost(void *arg)
710 call_rcu(&rbi.rcu, rcu_torture_boost_cb); 604 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
711 if (jiffies - call_rcu_time > 605 if (jiffies - call_rcu_time >
712 test_boost_duration * HZ - HZ / 2) { 606 test_boost_duration * HZ - HZ / 2) {
713 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); 607 VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
714 n_rcu_torture_boost_failure++; 608 n_rcu_torture_boost_failure++;
715 } 609 }
716 call_rcu_time = jiffies; 610 call_rcu_time = jiffies;
717 } 611 }
718 cond_resched(); 612 cond_resched();
719 rcu_stutter_wait("rcu_torture_boost"); 613 stutter_wait("rcu_torture_boost");
720 if (kthread_should_stop() || 614 if (torture_must_stop())
721 fullstop != FULLSTOP_DONTSTOP)
722 goto checkwait; 615 goto checkwait;
723 } 616 }
724 617
@@ -742,16 +635,17 @@ static int rcu_torture_boost(void *arg)
742 } 635 }
743 636
744 /* Go do the stutter. */ 637 /* Go do the stutter. */
745checkwait: rcu_stutter_wait("rcu_torture_boost"); 638checkwait: stutter_wait("rcu_torture_boost");
746 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 639 } while (!torture_must_stop());
747 640
748 /* Clean up and exit. */ 641 /* Clean up and exit. */
749 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 642 while (!kthread_should_stop() || rbi.inflight) {
750 rcutorture_shutdown_absorb("rcu_torture_boost"); 643 torture_shutdown_absorb("rcu_torture_boost");
751 while (!kthread_should_stop() || rbi.inflight)
752 schedule_timeout_uninterruptible(1); 644 schedule_timeout_uninterruptible(1);
645 }
753 smp_mb(); /* order accesses to ->inflight before stack-frame death. */ 646 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
754 destroy_rcu_head_on_stack(&rbi.rcu); 647 destroy_rcu_head_on_stack(&rbi.rcu);
648 torture_kthread_stopping("rcu_torture_boost");
755 return 0; 649 return 0;
756} 650}
757 651
@@ -766,7 +660,7 @@ rcu_torture_fqs(void *arg)
766 unsigned long fqs_resume_time; 660 unsigned long fqs_resume_time;
767 int fqs_burst_remaining; 661 int fqs_burst_remaining;
768 662
769 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); 663 VERBOSE_TOROUT_STRING("rcu_torture_fqs task started");
770 do { 664 do {
771 fqs_resume_time = jiffies + fqs_stutter * HZ; 665 fqs_resume_time = jiffies + fqs_stutter * HZ;
772 while (ULONG_CMP_LT(jiffies, fqs_resume_time) && 666 while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
@@ -780,12 +674,9 @@ rcu_torture_fqs(void *arg)
780 udelay(fqs_holdoff); 674 udelay(fqs_holdoff);
781 fqs_burst_remaining -= fqs_holdoff; 675 fqs_burst_remaining -= fqs_holdoff;
782 } 676 }
783 rcu_stutter_wait("rcu_torture_fqs"); 677 stutter_wait("rcu_torture_fqs");
784 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 678 } while (!torture_must_stop());
785 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); 679 torture_kthread_stopping("rcu_torture_fqs");
786 rcutorture_shutdown_absorb("rcu_torture_fqs");
787 while (!kthread_should_stop())
788 schedule_timeout_uninterruptible(1);
789 return 0; 680 return 0;
790} 681}
791 682
@@ -802,10 +693,10 @@ rcu_torture_writer(void *arg)
802 struct rcu_torture *rp; 693 struct rcu_torture *rp;
803 struct rcu_torture *rp1; 694 struct rcu_torture *rp1;
804 struct rcu_torture *old_rp; 695 struct rcu_torture *old_rp;
805 static DEFINE_RCU_RANDOM(rand); 696 static DEFINE_TORTURE_RANDOM(rand);
806 697
807 VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); 698 VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
808 set_user_nice(current, 19); 699 set_user_nice(current, MAX_NICE);
809 700
810 do { 701 do {
811 schedule_timeout_uninterruptible(1); 702 schedule_timeout_uninterruptible(1);
@@ -813,7 +704,7 @@ rcu_torture_writer(void *arg)
813 if (rp == NULL) 704 if (rp == NULL)
814 continue; 705 continue;
815 rp->rtort_pipe_count = 0; 706 rp->rtort_pipe_count = 0;
816 udelay(rcu_random(&rand) & 0x3ff); 707 udelay(torture_random(&rand) & 0x3ff);
817 old_rp = rcu_dereference_check(rcu_torture_current, 708 old_rp = rcu_dereference_check(rcu_torture_current,
818 current == writer_task); 709 current == writer_task);
819 rp->rtort_mbtest = 1; 710 rp->rtort_mbtest = 1;
@@ -826,7 +717,7 @@ rcu_torture_writer(void *arg)
826 atomic_inc(&rcu_torture_wcount[i]); 717 atomic_inc(&rcu_torture_wcount[i]);
827 old_rp->rtort_pipe_count++; 718 old_rp->rtort_pipe_count++;
828 if (gp_normal == gp_exp) 719 if (gp_normal == gp_exp)
829 exp = !!(rcu_random(&rand) & 0x80); 720 exp = !!(torture_random(&rand) & 0x80);
830 else 721 else
831 exp = gp_exp; 722 exp = gp_exp;
832 if (!exp) { 723 if (!exp) {
@@ -852,12 +743,9 @@ rcu_torture_writer(void *arg)
852 } 743 }
853 } 744 }
854 rcutorture_record_progress(++rcu_torture_current_version); 745 rcutorture_record_progress(++rcu_torture_current_version);
855 rcu_stutter_wait("rcu_torture_writer"); 746 stutter_wait("rcu_torture_writer");
856 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 747 } while (!torture_must_stop());
857 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 748 torture_kthread_stopping("rcu_torture_writer");
858 rcutorture_shutdown_absorb("rcu_torture_writer");
859 while (!kthread_should_stop())
860 schedule_timeout_uninterruptible(1);
861 return 0; 749 return 0;
862} 750}
863 751
@@ -868,19 +756,19 @@ rcu_torture_writer(void *arg)
868static int 756static int
869rcu_torture_fakewriter(void *arg) 757rcu_torture_fakewriter(void *arg)
870{ 758{
871 DEFINE_RCU_RANDOM(rand); 759 DEFINE_TORTURE_RANDOM(rand);
872 760
873 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); 761 VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
874 set_user_nice(current, 19); 762 set_user_nice(current, MAX_NICE);
875 763
876 do { 764 do {
877 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 765 schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
878 udelay(rcu_random(&rand) & 0x3ff); 766 udelay(torture_random(&rand) & 0x3ff);
879 if (cur_ops->cb_barrier != NULL && 767 if (cur_ops->cb_barrier != NULL &&
880 rcu_random(&rand) % (nfakewriters * 8) == 0) { 768 torture_random(&rand) % (nfakewriters * 8) == 0) {
881 cur_ops->cb_barrier(); 769 cur_ops->cb_barrier();
882 } else if (gp_normal == gp_exp) { 770 } else if (gp_normal == gp_exp) {
883 if (rcu_random(&rand) & 0x80) 771 if (torture_random(&rand) & 0x80)
884 cur_ops->sync(); 772 cur_ops->sync();
885 else 773 else
886 cur_ops->exp_sync(); 774 cur_ops->exp_sync();
@@ -889,13 +777,10 @@ rcu_torture_fakewriter(void *arg)
889 } else { 777 } else {
890 cur_ops->exp_sync(); 778 cur_ops->exp_sync();
891 } 779 }
892 rcu_stutter_wait("rcu_torture_fakewriter"); 780 stutter_wait("rcu_torture_fakewriter");
893 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 781 } while (!torture_must_stop());
894 782
895 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 783 torture_kthread_stopping("rcu_torture_fakewriter");
896 rcutorture_shutdown_absorb("rcu_torture_fakewriter");
897 while (!kthread_should_stop())
898 schedule_timeout_uninterruptible(1);
899 return 0; 784 return 0;
900} 785}
901 786
@@ -921,7 +806,7 @@ static void rcu_torture_timer(unsigned long unused)
921 int idx; 806 int idx;
922 int completed; 807 int completed;
923 int completed_end; 808 int completed_end;
924 static DEFINE_RCU_RANDOM(rand); 809 static DEFINE_TORTURE_RANDOM(rand);
925 static DEFINE_SPINLOCK(rand_lock); 810 static DEFINE_SPINLOCK(rand_lock);
926 struct rcu_torture *p; 811 struct rcu_torture *p;
927 int pipe_count; 812 int pipe_count;
@@ -980,14 +865,14 @@ rcu_torture_reader(void *arg)
980 int completed; 865 int completed;
981 int completed_end; 866 int completed_end;
982 int idx; 867 int idx;
983 DEFINE_RCU_RANDOM(rand); 868 DEFINE_TORTURE_RANDOM(rand);
984 struct rcu_torture *p; 869 struct rcu_torture *p;
985 int pipe_count; 870 int pipe_count;
986 struct timer_list t; 871 struct timer_list t;
987 unsigned long long ts; 872 unsigned long long ts;
988 873
989 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 874 VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
990 set_user_nice(current, 19); 875 set_user_nice(current, MAX_NICE);
991 if (irqreader && cur_ops->irq_capable) 876 if (irqreader && cur_ops->irq_capable)
992 setup_timer_on_stack(&t, rcu_torture_timer, 0); 877 setup_timer_on_stack(&t, rcu_torture_timer, 0);
993 878
@@ -1034,14 +919,11 @@ rcu_torture_reader(void *arg)
1034 preempt_enable(); 919 preempt_enable();
1035 cur_ops->readunlock(idx); 920 cur_ops->readunlock(idx);
1036 schedule(); 921 schedule();
1037 rcu_stutter_wait("rcu_torture_reader"); 922 stutter_wait("rcu_torture_reader");
1038 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 923 } while (!torture_must_stop());
1039 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
1040 rcutorture_shutdown_absorb("rcu_torture_reader");
1041 if (irqreader && cur_ops->irq_capable) 924 if (irqreader && cur_ops->irq_capable)
1042 del_timer_sync(&t); 925 del_timer_sync(&t);
1043 while (!kthread_should_stop()) 926 torture_kthread_stopping("rcu_torture_reader");
1044 schedule_timeout_uninterruptible(1);
1045 return 0; 927 return 0;
1046} 928}
1047 929
@@ -1083,13 +965,7 @@ rcu_torture_printk(char *page)
1083 n_rcu_torture_boost_failure, 965 n_rcu_torture_boost_failure,
1084 n_rcu_torture_boosts, 966 n_rcu_torture_boosts,
1085 n_rcu_torture_timers); 967 n_rcu_torture_timers);
1086 page += sprintf(page, 968 page = torture_onoff_stats(page);
1087 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
1088 n_online_successes, n_online_attempts,
1089 n_offline_successes, n_offline_attempts,
1090 min_online, max_online,
1091 min_offline, max_offline,
1092 sum_online, sum_offline, HZ);
1093 page += sprintf(page, "barrier: %ld/%ld:%ld", 969 page += sprintf(page, "barrier: %ld/%ld:%ld",
1094 n_barrier_successes, 970 n_barrier_successes,
1095 n_barrier_attempts, 971 n_barrier_attempts,
@@ -1150,123 +1026,17 @@ rcu_torture_stats_print(void)
1150/* 1026/*
1151 * Periodically prints torture statistics, if periodic statistics printing 1027 * Periodically prints torture statistics, if periodic statistics printing
1152 * was specified via the stat_interval module parameter. 1028 * was specified via the stat_interval module parameter.
1153 *
1154 * No need to worry about fullstop here, since this one doesn't reference
1155 * volatile state or register callbacks.
1156 */ 1029 */
1157static int 1030static int
1158rcu_torture_stats(void *arg) 1031rcu_torture_stats(void *arg)
1159{ 1032{
1160 VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); 1033 VERBOSE_TOROUT_STRING("rcu_torture_stats task started");
1161 do { 1034 do {
1162 schedule_timeout_interruptible(stat_interval * HZ); 1035 schedule_timeout_interruptible(stat_interval * HZ);
1163 rcu_torture_stats_print(); 1036 rcu_torture_stats_print();
1164 rcutorture_shutdown_absorb("rcu_torture_stats"); 1037 torture_shutdown_absorb("rcu_torture_stats");
1165 } while (!kthread_should_stop()); 1038 } while (!torture_must_stop());
1166 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); 1039 torture_kthread_stopping("rcu_torture_stats");
1167 return 0;
1168}
1169
1170static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
1171
1172/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case
1173 * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs.
1174 */
1175static void rcu_torture_shuffle_tasks(void)
1176{
1177 int i;
1178
1179 cpumask_setall(shuffle_tmp_mask);
1180 get_online_cpus();
1181
1182 /* No point in shuffling if there is only one online CPU (ex: UP) */
1183 if (num_online_cpus() == 1) {
1184 put_online_cpus();
1185 return;
1186 }
1187
1188 if (rcu_idle_cpu != -1)
1189 cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
1190
1191 set_cpus_allowed_ptr(current, shuffle_tmp_mask);
1192
1193 if (reader_tasks) {
1194 for (i = 0; i < nrealreaders; i++)
1195 if (reader_tasks[i])
1196 set_cpus_allowed_ptr(reader_tasks[i],
1197 shuffle_tmp_mask);
1198 }
1199 if (fakewriter_tasks) {
1200 for (i = 0; i < nfakewriters; i++)
1201 if (fakewriter_tasks[i])
1202 set_cpus_allowed_ptr(fakewriter_tasks[i],
1203 shuffle_tmp_mask);
1204 }
1205 if (writer_task)
1206 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
1207 if (stats_task)
1208 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
1209 if (stutter_task)
1210 set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask);
1211 if (fqs_task)
1212 set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask);
1213 if (shutdown_task)
1214 set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask);
1215#ifdef CONFIG_HOTPLUG_CPU
1216 if (onoff_task)
1217 set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask);
1218#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1219 if (stall_task)
1220 set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask);
1221 if (barrier_cbs_tasks)
1222 for (i = 0; i < n_barrier_cbs; i++)
1223 if (barrier_cbs_tasks[i])
1224 set_cpus_allowed_ptr(barrier_cbs_tasks[i],
1225 shuffle_tmp_mask);
1226 if (barrier_task)
1227 set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);
1228
1229 if (rcu_idle_cpu == -1)
1230 rcu_idle_cpu = num_online_cpus() - 1;
1231 else
1232 rcu_idle_cpu--;
1233
1234 put_online_cpus();
1235}
1236
1237/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
1238 * system to become idle at a time and cut off its timer ticks. This is meant
1239 * to test the support for such tickless idle CPU in RCU.
1240 */
1241static int
1242rcu_torture_shuffle(void *arg)
1243{
1244 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started");
1245 do {
1246 schedule_timeout_interruptible(shuffle_interval * HZ);
1247 rcu_torture_shuffle_tasks();
1248 rcutorture_shutdown_absorb("rcu_torture_shuffle");
1249 } while (!kthread_should_stop());
1250 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
1251 return 0;
1252}
1253
1254/* Cause the rcutorture test to "stutter", starting and stopping all
1255 * threads periodically.
1256 */
1257static int
1258rcu_torture_stutter(void *arg)
1259{
1260 VERBOSE_PRINTK_STRING("rcu_torture_stutter task started");
1261 do {
1262 schedule_timeout_interruptible(stutter * HZ);
1263 stutter_pause_test = 1;
1264 if (!kthread_should_stop())
1265 schedule_timeout_interruptible(stutter * HZ);
1266 stutter_pause_test = 0;
1267 rcutorture_shutdown_absorb("rcu_torture_stutter");
1268 } while (!kthread_should_stop());
1269 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
1270 return 0; 1040 return 0;
1271} 1041}
1272 1042
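The stutter machinery (stutter_pause_test, rcu_stutter_wait(), and the controller kthread removed above) likewise moves behind stutter_wait() and torture_stutter_init(). A hedged sketch of the handshake, reconstructed from the removed code; the runnable-module-parameter check is omitted here for brevity, and the real helpers live in kernel/torture.c.

#include <linux/kthread.h>
#include <linux/sched.h>

static int stutter_pause_test;	/* Nonzero while worker kthreads should idle. */
static int stutter_gap;		/* Jiffies to run, then to pause; set by torture_stutter_init(). */

/* Each torture kthread calls this once per loop pass and parks here while paused. */
static void stutter_wait(const char *title)
{
	while (ACCESS_ONCE(stutter_pause_test)) {
		schedule_timeout_interruptible(1);
		torture_shutdown_absorb(title);
	}
}

/* Controller kthread: let the workers run, then pause them, alternating. */
static int torture_stutter(void *arg)
{
	do {
		schedule_timeout_interruptible(stutter_gap);
		stutter_pause_test = 1;
		if (!torture_must_stop())
			schedule_timeout_interruptible(stutter_gap);
		stutter_pause_test = 0;
		torture_shutdown_absorb("torture_stutter");
	} while (!torture_must_stop());
	torture_kthread_stopping("torture_stutter");
	return 0;
}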
@@ -1293,10 +1063,6 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
1293 onoff_interval, onoff_holdoff); 1063 onoff_interval, onoff_holdoff);
1294} 1064}
1295 1065
1296static struct notifier_block rcutorture_shutdown_nb = {
1297 .notifier_call = rcutorture_shutdown_notify,
1298};
1299
1300static void rcutorture_booster_cleanup(int cpu) 1066static void rcutorture_booster_cleanup(int cpu)
1301{ 1067{
1302 struct task_struct *t; 1068 struct task_struct *t;
@@ -1304,14 +1070,12 @@ static void rcutorture_booster_cleanup(int cpu)
1304 if (boost_tasks[cpu] == NULL) 1070 if (boost_tasks[cpu] == NULL)
1305 return; 1071 return;
1306 mutex_lock(&boost_mutex); 1072 mutex_lock(&boost_mutex);
1307 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1308 t = boost_tasks[cpu]; 1073 t = boost_tasks[cpu];
1309 boost_tasks[cpu] = NULL; 1074 boost_tasks[cpu] = NULL;
1310 mutex_unlock(&boost_mutex); 1075 mutex_unlock(&boost_mutex);
1311 1076
1312 /* This must be outside of the mutex, otherwise deadlock! */ 1077 /* This must be outside of the mutex, otherwise deadlock! */
1313 kthread_stop(t); 1078 torture_stop_kthread(rcu_torture_boost, t);
1314 boost_tasks[cpu] = NULL;
1315} 1079}
1316 1080
1317static int rcutorture_booster_init(int cpu) 1081static int rcutorture_booster_init(int cpu)
@@ -1323,13 +1087,13 @@ static int rcutorture_booster_init(int cpu)
1323 1087
1324 /* Don't allow time recalculation while creating a new task. */ 1088 /* Don't allow time recalculation while creating a new task. */
1325 mutex_lock(&boost_mutex); 1089 mutex_lock(&boost_mutex);
1326 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); 1090 VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
1327 boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, 1091 boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
1328 cpu_to_node(cpu), 1092 cpu_to_node(cpu),
1329 "rcu_torture_boost"); 1093 "rcu_torture_boost");
1330 if (IS_ERR(boost_tasks[cpu])) { 1094 if (IS_ERR(boost_tasks[cpu])) {
1331 retval = PTR_ERR(boost_tasks[cpu]); 1095 retval = PTR_ERR(boost_tasks[cpu]);
1332 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); 1096 VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
1333 n_rcu_torture_boost_ktrerror++; 1097 n_rcu_torture_boost_ktrerror++;
1334 boost_tasks[cpu] = NULL; 1098 boost_tasks[cpu] = NULL;
1335 mutex_unlock(&boost_mutex); 1099 mutex_unlock(&boost_mutex);
@@ -1342,175 +1106,6 @@ static int rcutorture_booster_init(int cpu)
1342} 1106}
1343 1107
1344/* 1108/*
1345 * Cause the rcutorture test to shutdown the system after the test has
1346 * run for the time specified by the shutdown_secs module parameter.
1347 */
1348static int
1349rcu_torture_shutdown(void *arg)
1350{
1351 long delta;
1352 unsigned long jiffies_snap;
1353
1354 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
1355 jiffies_snap = ACCESS_ONCE(jiffies);
1356 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
1357 !kthread_should_stop()) {
1358 delta = shutdown_time - jiffies_snap;
1359 if (verbose)
1360 pr_alert("%s" TORTURE_FLAG
1361 "rcu_torture_shutdown task: %lu jiffies remaining\n",
1362 torture_type, delta);
1363 schedule_timeout_interruptible(delta);
1364 jiffies_snap = ACCESS_ONCE(jiffies);
1365 }
1366 if (kthread_should_stop()) {
1367 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
1368 return 0;
1369 }
1370
1371 /* OK, shut down the system. */
1372
1373 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
1374 shutdown_task = NULL; /* Avoid self-kill deadlock. */
1375 rcu_torture_cleanup(); /* Get the success/failure message. */
1376 kernel_power_off(); /* Shut down the system. */
1377 return 0;
1378}
1379
1380#ifdef CONFIG_HOTPLUG_CPU
1381
1382/*
1383 * Execute random CPU-hotplug operations at the interval specified
1384 * by the onoff_interval.
1385 */
1386static int
1387rcu_torture_onoff(void *arg)
1388{
1389 int cpu;
1390 unsigned long delta;
1391 int maxcpu = -1;
1392 DEFINE_RCU_RANDOM(rand);
1393 int ret;
1394 unsigned long starttime;
1395
1396 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1397 for_each_online_cpu(cpu)
1398 maxcpu = cpu;
1399 WARN_ON(maxcpu < 0);
1400 if (onoff_holdoff > 0) {
1401 VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff");
1402 schedule_timeout_interruptible(onoff_holdoff * HZ);
1403 VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff");
1404 }
1405 while (!kthread_should_stop()) {
1406 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1407 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1408 if (verbose)
1409 pr_alert("%s" TORTURE_FLAG
1410 "rcu_torture_onoff task: offlining %d\n",
1411 torture_type, cpu);
1412 starttime = jiffies;
1413 n_offline_attempts++;
1414 ret = cpu_down(cpu);
1415 if (ret) {
1416 if (verbose)
1417 pr_alert("%s" TORTURE_FLAG
1418 "rcu_torture_onoff task: offline %d failed: errno %d\n",
1419 torture_type, cpu, ret);
1420 } else {
1421 if (verbose)
1422 pr_alert("%s" TORTURE_FLAG
1423 "rcu_torture_onoff task: offlined %d\n",
1424 torture_type, cpu);
1425 n_offline_successes++;
1426 delta = jiffies - starttime;
1427 sum_offline += delta;
1428 if (min_offline < 0) {
1429 min_offline = delta;
1430 max_offline = delta;
1431 }
1432 if (min_offline > delta)
1433 min_offline = delta;
1434 if (max_offline < delta)
1435 max_offline = delta;
1436 }
1437 } else if (cpu_is_hotpluggable(cpu)) {
1438 if (verbose)
1439 pr_alert("%s" TORTURE_FLAG
1440 "rcu_torture_onoff task: onlining %d\n",
1441 torture_type, cpu);
1442 starttime = jiffies;
1443 n_online_attempts++;
1444 ret = cpu_up(cpu);
1445 if (ret) {
1446 if (verbose)
1447 pr_alert("%s" TORTURE_FLAG
1448 "rcu_torture_onoff task: online %d failed: errno %d\n",
1449 torture_type, cpu, ret);
1450 } else {
1451 if (verbose)
1452 pr_alert("%s" TORTURE_FLAG
1453 "rcu_torture_onoff task: onlined %d\n",
1454 torture_type, cpu);
1455 n_online_successes++;
1456 delta = jiffies - starttime;
1457 sum_online += delta;
1458 if (min_online < 0) {
1459 min_online = delta;
1460 max_online = delta;
1461 }
1462 if (min_online > delta)
1463 min_online = delta;
1464 if (max_online < delta)
1465 max_online = delta;
1466 }
1467 }
1468 schedule_timeout_interruptible(onoff_interval * HZ);
1469 }
1470 VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
1471 return 0;
1472}
1473
1474static int
1475rcu_torture_onoff_init(void)
1476{
1477 int ret;
1478
1479 if (onoff_interval <= 0)
1480 return 0;
1481 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1482 if (IS_ERR(onoff_task)) {
1483 ret = PTR_ERR(onoff_task);
1484 onoff_task = NULL;
1485 return ret;
1486 }
1487 return 0;
1488}
1489
1490static void rcu_torture_onoff_cleanup(void)
1491{
1492 if (onoff_task == NULL)
1493 return;
1494 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1495 kthread_stop(onoff_task);
1496 onoff_task = NULL;
1497}
1498
1499#else /* #ifdef CONFIG_HOTPLUG_CPU */
1500
1501static int
1502rcu_torture_onoff_init(void)
1503{
1504 return 0;
1505}
1506
1507static void rcu_torture_onoff_cleanup(void)
1508{
1509}
1510
1511#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1512
1513/*
1514 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then 1109 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
1515 * induces a CPU stall for the time specified by stall_cpu. 1110 * induces a CPU stall for the time specified by stall_cpu.
1516 */ 1111 */
@@ -1518,11 +1113,11 @@ static int rcu_torture_stall(void *args)
1518{ 1113{
1519 unsigned long stop_at; 1114 unsigned long stop_at;
1520 1115
1521 VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); 1116 VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
1522 if (stall_cpu_holdoff > 0) { 1117 if (stall_cpu_holdoff > 0) {
1523 VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); 1118 VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
1524 schedule_timeout_interruptible(stall_cpu_holdoff * HZ); 1119 schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
1525 VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); 1120 VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff");
1526 } 1121 }
1527 if (!kthread_should_stop()) { 1122 if (!kthread_should_stop()) {
1528 stop_at = get_seconds() + stall_cpu; 1123 stop_at = get_seconds() + stall_cpu;
@@ -1536,7 +1131,7 @@ static int rcu_torture_stall(void *args)
1536 rcu_read_unlock(); 1131 rcu_read_unlock();
1537 pr_alert("rcu_torture_stall end.\n"); 1132 pr_alert("rcu_torture_stall end.\n");
1538 } 1133 }
1539 rcutorture_shutdown_absorb("rcu_torture_stall"); 1134 torture_shutdown_absorb("rcu_torture_stall");
1540 while (!kthread_should_stop()) 1135 while (!kthread_should_stop())
1541 schedule_timeout_interruptible(10 * HZ); 1136 schedule_timeout_interruptible(10 * HZ);
1542 return 0; 1137 return 0;
@@ -1545,27 +1140,9 @@ static int rcu_torture_stall(void *args)
1545/* Spawn CPU-stall kthread, if stall_cpu specified. */ 1140/* Spawn CPU-stall kthread, if stall_cpu specified. */
1546static int __init rcu_torture_stall_init(void) 1141static int __init rcu_torture_stall_init(void)
1547{ 1142{
1548 int ret;
1549
1550 if (stall_cpu <= 0) 1143 if (stall_cpu <= 0)
1551 return 0; 1144 return 0;
1552 stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); 1145 return torture_create_kthread(rcu_torture_stall, NULL, stall_task);
1553 if (IS_ERR(stall_task)) {
1554 ret = PTR_ERR(stall_task);
1555 stall_task = NULL;
1556 return ret;
1557 }
1558 return 0;
1559}
1560
1561/* Clean up after the CPU-stall kthread, if one was spawned. */
1562static void rcu_torture_stall_cleanup(void)
1563{
1564 if (stall_task == NULL)
1565 return;
1566 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1567 kthread_stop(stall_task);
1568 stall_task = NULL;
1569} 1146}
1570 1147
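As in the stall hunk above, the open-coded kthread_run()/IS_ERR()/kthread_stop() boilerplate collapses into torture_create_kthread() and torture_stop_kthread() everywhere in this patch. A rough sketch of what those helpers plausibly wrap, reconstructed from the removed code; the real versions ship in the new kernel/torture.c and its header, and also print per-thread progress messages.

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/torture.h>	/* New header added alongside kernel/torture.c. */

/*
 * Create a torture kthread named after its function; on failure report it,
 * NULL out the task pointer, and return the error so callers can "goto unwind".
 * Written as a macro so the thread name can be stringified.
 */
#define torture_create_kthread(fn, arg, tp)				\
({									\
	int __ret = 0;							\
									\
	(tp) = kthread_run(fn, arg, #fn);				\
	if (IS_ERR(tp)) {						\
		VERBOSE_TOROUT_ERRSTRING("Failed to create " #fn);	\
		__ret = PTR_ERR(tp);					\
		(tp) = NULL;						\
	}								\
	__ret;								\
})

/* Stop a torture kthread if it was created, then forget it. */
#define torture_stop_kthread(name, tp)					\
do {									\
	if (tp) {							\
		VERBOSE_TOROUT_STRING("Stopping " #name " task");	\
		kthread_stop(tp);					\
		(tp) = NULL;						\
	}								\
} while (0)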
1571/* Callback function for RCU barrier testing. */ 1148/* Callback function for RCU barrier testing. */
@@ -1583,28 +1160,24 @@ static int rcu_torture_barrier_cbs(void *arg)
1583 struct rcu_head rcu; 1160 struct rcu_head rcu;
1584 1161
1585 init_rcu_head_on_stack(&rcu); 1162 init_rcu_head_on_stack(&rcu);
1586 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); 1163 VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started");
1587 set_user_nice(current, 19); 1164 set_user_nice(current, MAX_NICE);
1588 do { 1165 do {
1589 wait_event(barrier_cbs_wq[myid], 1166 wait_event(barrier_cbs_wq[myid],
1590 (newphase = 1167 (newphase =
1591 ACCESS_ONCE(barrier_phase)) != lastphase || 1168 ACCESS_ONCE(barrier_phase)) != lastphase ||
1592 kthread_should_stop() || 1169 torture_must_stop());
1593 fullstop != FULLSTOP_DONTSTOP);
1594 lastphase = newphase; 1170 lastphase = newphase;
1595 smp_mb(); /* ensure barrier_phase load before ->call(). */ 1171 smp_mb(); /* ensure barrier_phase load before ->call(). */
1596 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) 1172 if (torture_must_stop())
1597 break; 1173 break;
1598 cur_ops->call(&rcu, rcu_torture_barrier_cbf); 1174 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
1599 if (atomic_dec_and_test(&barrier_cbs_count)) 1175 if (atomic_dec_and_test(&barrier_cbs_count))
1600 wake_up(&barrier_wq); 1176 wake_up(&barrier_wq);
1601 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1177 } while (!torture_must_stop());
1602 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
1603 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1604 while (!kthread_should_stop())
1605 schedule_timeout_interruptible(1);
1606 cur_ops->cb_barrier(); 1178 cur_ops->cb_barrier();
1607 destroy_rcu_head_on_stack(&rcu); 1179 destroy_rcu_head_on_stack(&rcu);
1180 torture_kthread_stopping("rcu_torture_barrier_cbs");
1608 return 0; 1181 return 0;
1609} 1182}
1610 1183
@@ -1613,7 +1186,7 @@ static int rcu_torture_barrier(void *arg)
1613{ 1186{
1614 int i; 1187 int i;
1615 1188
1616 VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); 1189 VERBOSE_TOROUT_STRING("rcu_torture_barrier task starting");
1617 do { 1190 do {
1618 atomic_set(&barrier_cbs_invoked, 0); 1191 atomic_set(&barrier_cbs_invoked, 0);
1619 atomic_set(&barrier_cbs_count, n_barrier_cbs); 1192 atomic_set(&barrier_cbs_count, n_barrier_cbs);
@@ -1623,9 +1196,8 @@ static int rcu_torture_barrier(void *arg)
1623 wake_up(&barrier_cbs_wq[i]); 1196 wake_up(&barrier_cbs_wq[i]);
1624 wait_event(barrier_wq, 1197 wait_event(barrier_wq,
1625 atomic_read(&barrier_cbs_count) == 0 || 1198 atomic_read(&barrier_cbs_count) == 0 ||
1626 kthread_should_stop() || 1199 torture_must_stop());
1627 fullstop != FULLSTOP_DONTSTOP); 1200 if (torture_must_stop())
1628 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1629 break; 1201 break;
1630 n_barrier_attempts++; 1202 n_barrier_attempts++;
1631 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ 1203 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
@@ -1635,11 +1207,8 @@ static int rcu_torture_barrier(void *arg)
1635 } 1207 }
1636 n_barrier_successes++; 1208 n_barrier_successes++;
1637 schedule_timeout_interruptible(HZ / 10); 1209 schedule_timeout_interruptible(HZ / 10);
1638 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1210 } while (!torture_must_stop());
1639 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); 1211 torture_kthread_stopping("rcu_torture_barrier");
1640 rcutorture_shutdown_absorb("rcu_torture_barrier");
1641 while (!kthread_should_stop())
1642 schedule_timeout_interruptible(1);
1643 return 0; 1212 return 0;
1644} 1213}
1645 1214
@@ -1672,24 +1241,13 @@ static int rcu_torture_barrier_init(void)
1672 return -ENOMEM; 1241 return -ENOMEM;
1673 for (i = 0; i < n_barrier_cbs; i++) { 1242 for (i = 0; i < n_barrier_cbs; i++) {
1674 init_waitqueue_head(&barrier_cbs_wq[i]); 1243 init_waitqueue_head(&barrier_cbs_wq[i]);
1675 barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, 1244 ret = torture_create_kthread(rcu_torture_barrier_cbs,
1676 (void *)(long)i, 1245 (void *)(long)i,
1677 "rcu_torture_barrier_cbs"); 1246 barrier_cbs_tasks[i]);
1678 if (IS_ERR(barrier_cbs_tasks[i])) { 1247 if (ret)
1679 ret = PTR_ERR(barrier_cbs_tasks[i]);
1680 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
1681 barrier_cbs_tasks[i] = NULL;
1682 return ret; 1248 return ret;
1683 }
1684 } 1249 }
1685 barrier_task = kthread_run(rcu_torture_barrier, NULL, 1250 return torture_create_kthread(rcu_torture_barrier, NULL, barrier_task);
1686 "rcu_torture_barrier");
1687 if (IS_ERR(barrier_task)) {
1688 ret = PTR_ERR(barrier_task);
1689 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
1690 barrier_task = NULL;
1691 }
1692 return 0;
1693} 1251}
1694 1252
1695/* Clean up after RCU barrier testing. */ 1253/* Clean up after RCU barrier testing. */
@@ -1697,19 +1255,11 @@ static void rcu_torture_barrier_cleanup(void)
1697{ 1255{
1698 int i; 1256 int i;
1699 1257
1700 if (barrier_task != NULL) { 1258 torture_stop_kthread(rcu_torture_barrier, barrier_task);
1701 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
1702 kthread_stop(barrier_task);
1703 barrier_task = NULL;
1704 }
1705 if (barrier_cbs_tasks != NULL) { 1259 if (barrier_cbs_tasks != NULL) {
1706 for (i = 0; i < n_barrier_cbs; i++) { 1260 for (i = 0; i < n_barrier_cbs; i++)
1707 if (barrier_cbs_tasks[i] != NULL) { 1261 torture_stop_kthread(rcu_torture_barrier_cbs,
1708 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); 1262 barrier_cbs_tasks[i]);
1709 kthread_stop(barrier_cbs_tasks[i]);
1710 barrier_cbs_tasks[i] = NULL;
1711 }
1712 }
1713 kfree(barrier_cbs_tasks); 1263 kfree(barrier_cbs_tasks);
1714 barrier_cbs_tasks = NULL; 1264 barrier_cbs_tasks = NULL;
1715 } 1265 }
@@ -1747,90 +1297,42 @@ rcu_torture_cleanup(void)
1747{ 1297{
1748 int i; 1298 int i;
1749 1299
1750 mutex_lock(&fullstop_mutex);
1751 rcutorture_record_test_transition(); 1300 rcutorture_record_test_transition();
1752 if (fullstop == FULLSTOP_SHUTDOWN) { 1301 if (torture_cleanup()) {
1753 pr_warn(/* but going down anyway, so... */
1754 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
1755 mutex_unlock(&fullstop_mutex);
1756 schedule_timeout_uninterruptible(10);
1757 if (cur_ops->cb_barrier != NULL) 1302 if (cur_ops->cb_barrier != NULL)
1758 cur_ops->cb_barrier(); 1303 cur_ops->cb_barrier();
1759 return; 1304 return;
1760 } 1305 }
1761 fullstop = FULLSTOP_RMMOD;
1762 mutex_unlock(&fullstop_mutex);
1763 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1764 rcu_torture_barrier_cleanup();
1765 rcu_torture_stall_cleanup();
1766 if (stutter_task) {
1767 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1768 kthread_stop(stutter_task);
1769 }
1770 stutter_task = NULL;
1771 if (shuffler_task) {
1772 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
1773 kthread_stop(shuffler_task);
1774 free_cpumask_var(shuffle_tmp_mask);
1775 }
1776 shuffler_task = NULL;
1777 1306
1778 if (writer_task) { 1307 rcu_torture_barrier_cleanup();
1779 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); 1308 torture_stop_kthread(rcu_torture_stall, stall_task);
1780 kthread_stop(writer_task); 1309 torture_stop_kthread(rcu_torture_writer, writer_task);
1781 }
1782 writer_task = NULL;
1783 1310
1784 if (reader_tasks) { 1311 if (reader_tasks) {
1785 for (i = 0; i < nrealreaders; i++) { 1312 for (i = 0; i < nrealreaders; i++)
1786 if (reader_tasks[i]) { 1313 torture_stop_kthread(rcu_torture_reader,
1787 VERBOSE_PRINTK_STRING( 1314 reader_tasks[i]);
1788 "Stopping rcu_torture_reader task");
1789 kthread_stop(reader_tasks[i]);
1790 }
1791 reader_tasks[i] = NULL;
1792 }
1793 kfree(reader_tasks); 1315 kfree(reader_tasks);
1794 reader_tasks = NULL;
1795 } 1316 }
1796 rcu_torture_current = NULL; 1317 rcu_torture_current = NULL;
1797 1318
1798 if (fakewriter_tasks) { 1319 if (fakewriter_tasks) {
1799 for (i = 0; i < nfakewriters; i++) { 1320 for (i = 0; i < nfakewriters; i++) {
1800 if (fakewriter_tasks[i]) { 1321 torture_stop_kthread(rcu_torture_fakewriter,
1801 VERBOSE_PRINTK_STRING( 1322 fakewriter_tasks[i]);
1802 "Stopping rcu_torture_fakewriter task");
1803 kthread_stop(fakewriter_tasks[i]);
1804 }
1805 fakewriter_tasks[i] = NULL;
1806 } 1323 }
1807 kfree(fakewriter_tasks); 1324 kfree(fakewriter_tasks);
1808 fakewriter_tasks = NULL; 1325 fakewriter_tasks = NULL;
1809 } 1326 }
1810 1327
1811 if (stats_task) { 1328 torture_stop_kthread(rcu_torture_stats, stats_task);
1812 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); 1329 torture_stop_kthread(rcu_torture_fqs, fqs_task);
1813 kthread_stop(stats_task);
1814 }
1815 stats_task = NULL;
1816
1817 if (fqs_task) {
1818 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1819 kthread_stop(fqs_task);
1820 }
1821 fqs_task = NULL;
1822 if ((test_boost == 1 && cur_ops->can_boost) || 1330 if ((test_boost == 1 && cur_ops->can_boost) ||
1823 test_boost == 2) { 1331 test_boost == 2) {
1824 unregister_cpu_notifier(&rcutorture_cpu_nb); 1332 unregister_cpu_notifier(&rcutorture_cpu_nb);
1825 for_each_possible_cpu(i) 1333 for_each_possible_cpu(i)
1826 rcutorture_booster_cleanup(i); 1334 rcutorture_booster_cleanup(i);
1827 } 1335 }
1828 if (shutdown_task != NULL) {
1829 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1830 kthread_stop(shutdown_task);
1831 }
1832 shutdown_task = NULL;
1833 rcu_torture_onoff_cleanup();
1834 1336
1835 /* Wait for all RCU callbacks to fire. */ 1337 /* Wait for all RCU callbacks to fire. */
1836 1338
@@ -1841,8 +1343,7 @@ rcu_torture_cleanup(void)
1841 1343
1842 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) 1344 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1843 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1345 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1844 else if (n_online_successes != n_online_attempts || 1346 else if (torture_onoff_failures())
1845 n_offline_successes != n_offline_attempts)
1846 rcu_torture_print_module_parms(cur_ops, 1347 rcu_torture_print_module_parms(cur_ops,
1847 "End of test: RCU_HOTPLUG"); 1348 "End of test: RCU_HOTPLUG");
1848 else 1349 else
@@ -1911,12 +1412,11 @@ rcu_torture_init(void)
1911 int i; 1412 int i;
1912 int cpu; 1413 int cpu;
1913 int firsterr = 0; 1414 int firsterr = 0;
1914 int retval;
1915 static struct rcu_torture_ops *torture_ops[] = { 1415 static struct rcu_torture_ops *torture_ops[] = {
1916 &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, 1416 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
1917 }; 1417 };
1918 1418
1919 mutex_lock(&fullstop_mutex); 1419 torture_init_begin(torture_type, verbose, &rcutorture_runnable);
1920 1420
1921 /* Process args and tell the world that the torturer is on the job. */ 1421 /* Process args and tell the world that the torturer is on the job. */
1922 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1422 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
@@ -1931,7 +1431,7 @@ rcu_torture_init(void)
1931 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) 1431 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1932 pr_alert(" %s", torture_ops[i]->name); 1432 pr_alert(" %s", torture_ops[i]->name);
1933 pr_alert("\n"); 1433 pr_alert("\n");
1934 mutex_unlock(&fullstop_mutex); 1434 torture_init_end();
1935 return -EINVAL; 1435 return -EINVAL;
1936 } 1436 }
1937 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1437 if (cur_ops->fqs == NULL && fqs_duration != 0) {
@@ -1946,7 +1446,6 @@ rcu_torture_init(void)
1946 else 1446 else
1947 nrealreaders = 2 * num_online_cpus(); 1447 nrealreaders = 2 * num_online_cpus();
1948 rcu_torture_print_module_parms(cur_ops, "Start of test"); 1448 rcu_torture_print_module_parms(cur_ops, "Start of test");
1949 fullstop = FULLSTOP_DONTSTOP;
1950 1449
1951 /* Set up the freelist. */ 1450 /* Set up the freelist. */
1952 1451
@@ -1982,108 +1481,61 @@ rcu_torture_init(void)
1982 1481
1983 /* Start up the kthreads. */ 1482 /* Start up the kthreads. */
1984 1483
1985 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); 1484 firsterr = torture_create_kthread(rcu_torture_writer, NULL,
1986 writer_task = kthread_create(rcu_torture_writer, NULL, 1485 writer_task);
1987 "rcu_torture_writer"); 1486 if (firsterr)
1988 if (IS_ERR(writer_task)) {
1989 firsterr = PTR_ERR(writer_task);
1990 VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
1991 writer_task = NULL;
1992 goto unwind; 1487 goto unwind;
1993 }
1994 wake_up_process(writer_task);
1995 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1488 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
1996 GFP_KERNEL); 1489 GFP_KERNEL);
1997 if (fakewriter_tasks == NULL) { 1490 if (fakewriter_tasks == NULL) {
1998 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1491 VERBOSE_TOROUT_ERRSTRING("out of memory");
1999 firsterr = -ENOMEM; 1492 firsterr = -ENOMEM;
2000 goto unwind; 1493 goto unwind;
2001 } 1494 }
2002 for (i = 0; i < nfakewriters; i++) { 1495 for (i = 0; i < nfakewriters; i++) {
2003 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); 1496 firsterr = torture_create_kthread(rcu_torture_fakewriter,
2004 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, 1497 NULL, fakewriter_tasks[i]);
2005 "rcu_torture_fakewriter"); 1498 if (firsterr)
2006 if (IS_ERR(fakewriter_tasks[i])) {
2007 firsterr = PTR_ERR(fakewriter_tasks[i]);
2008 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
2009 fakewriter_tasks[i] = NULL;
2010 goto unwind; 1499 goto unwind;
2011 }
2012 } 1500 }
2013 reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), 1501 reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
2014 GFP_KERNEL); 1502 GFP_KERNEL);
2015 if (reader_tasks == NULL) { 1503 if (reader_tasks == NULL) {
2016 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1504 VERBOSE_TOROUT_ERRSTRING("out of memory");
2017 firsterr = -ENOMEM; 1505 firsterr = -ENOMEM;
2018 goto unwind; 1506 goto unwind;
2019 } 1507 }
2020 for (i = 0; i < nrealreaders; i++) { 1508 for (i = 0; i < nrealreaders; i++) {
2021 VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); 1509 firsterr = torture_create_kthread(rcu_torture_reader, NULL,
2022 reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, 1510 reader_tasks[i]);
2023 "rcu_torture_reader"); 1511 if (firsterr)
2024 if (IS_ERR(reader_tasks[i])) {
2025 firsterr = PTR_ERR(reader_tasks[i]);
2026 VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
2027 reader_tasks[i] = NULL;
2028 goto unwind; 1512 goto unwind;
2029 }
2030 } 1513 }
2031 if (stat_interval > 0) { 1514 if (stat_interval > 0) {
2032 VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); 1515 firsterr = torture_create_kthread(rcu_torture_stats, NULL,
2033 stats_task = kthread_run(rcu_torture_stats, NULL, 1516 stats_task);
2034 "rcu_torture_stats"); 1517 if (firsterr)
2035 if (IS_ERR(stats_task)) {
2036 firsterr = PTR_ERR(stats_task);
2037 VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
2038 stats_task = NULL;
2039 goto unwind; 1518 goto unwind;
2040 }
2041 } 1519 }
2042 if (test_no_idle_hz) { 1520 if (test_no_idle_hz) {
2043 rcu_idle_cpu = num_online_cpus() - 1; 1521 firsterr = torture_shuffle_init(shuffle_interval * HZ);
2044 1522 if (firsterr)
2045 if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
2046 firsterr = -ENOMEM;
2047 VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
2048 goto unwind;
2049 }
2050
2051 /* Create the shuffler thread */
2052 shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
2053 "rcu_torture_shuffle");
2054 if (IS_ERR(shuffler_task)) {
2055 free_cpumask_var(shuffle_tmp_mask);
2056 firsterr = PTR_ERR(shuffler_task);
2057 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
2058 shuffler_task = NULL;
2059 goto unwind; 1523 goto unwind;
2060 }
2061 } 1524 }
2062 if (stutter < 0) 1525 if (stutter < 0)
2063 stutter = 0; 1526 stutter = 0;
2064 if (stutter) { 1527 if (stutter) {
2065 /* Create the stutter thread */ 1528 firsterr = torture_stutter_init(stutter * HZ);
2066 stutter_task = kthread_run(rcu_torture_stutter, NULL, 1529 if (firsterr)
2067 "rcu_torture_stutter");
2068 if (IS_ERR(stutter_task)) {
2069 firsterr = PTR_ERR(stutter_task);
2070 VERBOSE_PRINTK_ERRSTRING("Failed to create stutter");
2071 stutter_task = NULL;
2072 goto unwind; 1530 goto unwind;
2073 }
2074 } 1531 }
2075 if (fqs_duration < 0) 1532 if (fqs_duration < 0)
2076 fqs_duration = 0; 1533 fqs_duration = 0;
2077 if (fqs_duration) { 1534 if (fqs_duration) {
2078 /* Create the stutter thread */ 1535 /* Create the fqs thread */
 2079 fqs_task = kthread_run(rcu_torture_fqs, NULL, 1536 firsterr = torture_create_kthread(rcu_torture_fqs, NULL, fqs_task);
2080 "rcu_torture_fqs"); 1537 if (firsterr)
2081 if (IS_ERR(fqs_task)) {
2082 firsterr = PTR_ERR(fqs_task);
2083 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
2084 fqs_task = NULL;
2085 goto unwind; 1538 goto unwind;
2086 }
2087 } 1539 }
2088 if (test_boost_interval < 1) 1540 if (test_boost_interval < 1)
2089 test_boost_interval = 1; 1541 test_boost_interval = 1;
@@ -2097,49 +1549,31 @@ rcu_torture_init(void)
2097 for_each_possible_cpu(i) { 1549 for_each_possible_cpu(i) {
2098 if (cpu_is_offline(i)) 1550 if (cpu_is_offline(i))
2099 continue; /* Heuristic: CPU can go offline. */ 1551 continue; /* Heuristic: CPU can go offline. */
2100 retval = rcutorture_booster_init(i); 1552 firsterr = rcutorture_booster_init(i);
2101 if (retval < 0) { 1553 if (firsterr)
2102 firsterr = retval;
2103 goto unwind; 1554 goto unwind;
2104 }
2105 } 1555 }
2106 } 1556 }
2107 if (shutdown_secs > 0) { 1557 firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
2108 shutdown_time = jiffies + shutdown_secs * HZ; 1558 if (firsterr)
2109 shutdown_task = kthread_create(rcu_torture_shutdown, NULL,
2110 "rcu_torture_shutdown");
2111 if (IS_ERR(shutdown_task)) {
2112 firsterr = PTR_ERR(shutdown_task);
2113 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
2114 shutdown_task = NULL;
2115 goto unwind;
2116 }
2117 wake_up_process(shutdown_task);
2118 }
2119 i = rcu_torture_onoff_init();
2120 if (i != 0) {
2121 firsterr = i;
2122 goto unwind; 1559 goto unwind;
2123 } 1560 firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ);
2124 register_reboot_notifier(&rcutorture_shutdown_nb); 1561 if (firsterr)
2125 i = rcu_torture_stall_init();
2126 if (i != 0) {
2127 firsterr = i;
2128 goto unwind; 1562 goto unwind;
2129 } 1563 firsterr = rcu_torture_stall_init();
2130 retval = rcu_torture_barrier_init(); 1564 if (firsterr)
2131 if (retval != 0) { 1565 goto unwind;
2132 firsterr = retval; 1566 firsterr = rcu_torture_barrier_init();
1567 if (firsterr)
2133 goto unwind; 1568 goto unwind;
2134 }
2135 if (object_debug) 1569 if (object_debug)
2136 rcu_test_debug_objects(); 1570 rcu_test_debug_objects();
2137 rcutorture_record_test_transition(); 1571 rcutorture_record_test_transition();
2138 mutex_unlock(&fullstop_mutex); 1572 torture_init_end();
2139 return 0; 1573 return 0;
2140 1574
2141unwind: 1575unwind:
2142 mutex_unlock(&fullstop_mutex); 1576 torture_init_end();
2143 rcu_torture_cleanup(); 1577 rcu_torture_cleanup();
2144 return firsterr; 1578 return firsterr;
2145} 1579}
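
The rcutorture_init() hunks above collapse the repeated kthread_run()/IS_ERR()/PTR_ERR() boilerplate into the new torture_*_init() helpers and torture_create_kthread(), which print their own diagnostics and report a single error code that the caller stores in firsterr. A minimal sketch of that create-and-record pattern, with hypothetical names (the real helpers live in kernel/torture.c and include/linux/torture.h and differ in detail):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/printk.h>

/*
 * Illustrative only: create one torture kthread, report any failure,
 * and hand back an error code suitable for "firsterr" bookkeeping.
 */
static int example_create_torture_kthread(int (*fn)(void *), void *arg,
					  const char *name,
					  struct task_struct **tp)
{
	*tp = kthread_run(fn, arg, "%s", name);
	if (IS_ERR(*tp)) {
		int err = PTR_ERR(*tp);

		pr_alert("torture: failed to create %s: %d\n", name, err);
		*tp = NULL;
		return err;
	}
	return 0;
}

Each converted call site then reduces to the shape "firsterr = helper(...); if (firsterr) goto unwind;", which is why the error paths above shrink so much.
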
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 3318d8284384..c639556f3fa0 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2006 18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012 19 * Copyright (C) Fujitsu, 2012
@@ -36,8 +36,6 @@
36#include <linux/delay.h> 36#include <linux/delay.h>
37#include <linux/srcu.h> 37#include <linux/srcu.h>
38 38
39#include <trace/events/rcu.h>
40
41#include "rcu.h" 39#include "rcu.h"
42 40
43/* 41/*
@@ -398,7 +396,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
398 rcu_batch_queue(&sp->batch_queue, head); 396 rcu_batch_queue(&sp->batch_queue, head);
399 if (!sp->running) { 397 if (!sp->running) {
400 sp->running = true; 398 sp->running = true;
401 schedule_delayed_work(&sp->work, 0); 399 queue_delayed_work(system_power_efficient_wq, &sp->work, 0);
402 } 400 }
403 spin_unlock_irqrestore(&sp->queue_lock, flags); 401 spin_unlock_irqrestore(&sp->queue_lock, flags);
404} 402}
@@ -674,7 +672,8 @@ static void srcu_reschedule(struct srcu_struct *sp)
674 } 672 }
675 673
676 if (pending) 674 if (pending)
677 schedule_delayed_work(&sp->work, SRCU_INTERVAL); 675 queue_delayed_work(system_power_efficient_wq,
676 &sp->work, SRCU_INTERVAL);
678} 677}
679 678
680/* 679/*
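
The srcu.c hunks above stop using schedule_delayed_work(), which always targets system_wq, and instead queue the SRCU state machine on system_power_efficient_wq; when power-efficient workqueues are enabled (CONFIG_WQ_POWER_EFFICIENT_DEFAULT or the workqueue.power_efficient boot parameter) that workqueue is unbound, so idle CPUs need not be woken just to advance SRCU, and otherwise it behaves like system_wq. A hedged sketch of the same substitution in an arbitrary, hypothetical driver:

#include <linux/workqueue.h>

static void my_poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_poll_work, my_poll_fn);

static void my_poll_fn(struct work_struct *work)
{
	/* ... periodic, non-latency-critical housekeeping ... */

	/* Before: schedule_delayed_work(&my_poll_work, HZ); */
	queue_delayed_work(system_power_efficient_wq, &my_poll_work, HZ);
}
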
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 1254f312d024..d9efcc13008c 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2008 18 * Copyright IBM Corporation, 2008
19 * 19 *
@@ -37,10 +37,6 @@
37#include <linux/prefetch.h> 37#include <linux/prefetch.h>
38#include <linux/ftrace_event.h> 38#include <linux/ftrace_event.h>
39 39
40#ifdef CONFIG_RCU_TRACE
41#include <trace/events/rcu.h>
42#endif /* #else #ifdef CONFIG_RCU_TRACE */
43
44#include "rcu.h" 40#include "rcu.h"
45 41
46/* Forward declarations for tiny_plugin.h. */ 42/* Forward declarations for tiny_plugin.h. */
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 280d06cae352..431528520562 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -14,8 +14,8 @@
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, you can access it online at
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * http://www.gnu.org/licenses/gpl-2.0.html.
19 * 19 *
20 * Copyright (c) 2010 Linaro 20 * Copyright (c) 2010 Linaro
21 * 21 *
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b3d116cd072d..0c47e300210a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2008 18 * Copyright IBM Corporation, 2008
19 * 19 *
@@ -58,8 +58,6 @@
58#include <linux/suspend.h> 58#include <linux/suspend.h>
59 59
60#include "tree.h" 60#include "tree.h"
61#include <trace/events/rcu.h>
62
63#include "rcu.h" 61#include "rcu.h"
64 62
65MODULE_ALIAS("rcutree"); 63MODULE_ALIAS("rcutree");
@@ -837,7 +835,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
837 * to the next. Only do this for the primary flavor of RCU. 835 * to the next. Only do this for the primary flavor of RCU.
838 */ 836 */
839 if (rdp->rsp == rcu_state && 837 if (rdp->rsp == rcu_state &&
840 ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) { 838 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
841 rdp->rsp->jiffies_resched += 5; 839 rdp->rsp->jiffies_resched += 5;
842 resched_cpu(rdp->cpu); 840 resched_cpu(rdp->cpu);
843 } 841 }
@@ -847,7 +845,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
847 845
848static void record_gp_stall_check_time(struct rcu_state *rsp) 846static void record_gp_stall_check_time(struct rcu_state *rsp)
849{ 847{
850 unsigned long j = ACCESS_ONCE(jiffies); 848 unsigned long j = jiffies;
851 unsigned long j1; 849 unsigned long j1;
852 850
853 rsp->gp_start = j; 851 rsp->gp_start = j;
@@ -1005,7 +1003,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
1005 1003
1006 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) 1004 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
1007 return; 1005 return;
1008 j = ACCESS_ONCE(jiffies); 1006 j = jiffies;
1009 1007
1010 /* 1008 /*
1011 * Lots of memory barriers to reject false positives. 1009 * Lots of memory barriers to reject false positives.
@@ -1423,13 +1421,14 @@ static int rcu_gp_init(struct rcu_state *rsp)
1423 1421
1424 /* Advance to a new grace period and initialize state. */ 1422 /* Advance to a new grace period and initialize state. */
1425 record_gp_stall_check_time(rsp); 1423 record_gp_stall_check_time(rsp);
1426 smp_wmb(); /* Record GP times before starting GP. */ 1424 /* Record GP times before starting GP, hence smp_store_release(). */
1427 rsp->gpnum++; 1425 smp_store_release(&rsp->gpnum, rsp->gpnum + 1);
1428 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 1426 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1429 raw_spin_unlock_irq(&rnp->lock); 1427 raw_spin_unlock_irq(&rnp->lock);
1430 1428
1431 /* Exclude any concurrent CPU-hotplug operations. */ 1429 /* Exclude any concurrent CPU-hotplug operations. */
1432 mutex_lock(&rsp->onoff_mutex); 1430 mutex_lock(&rsp->onoff_mutex);
1431 smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */
1433 1432
1434 /* 1433 /*
1435 * Set the quiescent-state-needed bits in all the rcu_node 1434 * Set the quiescent-state-needed bits in all the rcu_node
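
The rcu_gp_init() hunk above replaces smp_wmb() plus a plain ->gpnum increment with smp_store_release(); this is the release half that pairs with the smp_load_acquire() of ->gpnum added below in get_state_synchronize_rcu(). A generic, non-RCU illustration of that pairing, using the ACCESS_ONCE() idiom seen elsewhere in this diff (single writer assumed; names are illustrative):

static unsigned long counter;	/* illustrative only */
static int shared_data;

static void writer(int v)
{
	ACCESS_ONCE(shared_data) = v;
	/* All stores above are visible before the new counter value. */
	smp_store_release(&counter, counter + 1);
}

static int reader(unsigned long *snap)
{
	/* The acquire load is ordered before the later data read. */
	*snap = smp_load_acquire(&counter);
	return ACCESS_ONCE(shared_data);
}
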
@@ -1557,10 +1556,11 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1557 } 1556 }
1558 rnp = rcu_get_root(rsp); 1557 rnp = rcu_get_root(rsp);
1559 raw_spin_lock_irq(&rnp->lock); 1558 raw_spin_lock_irq(&rnp->lock);
1560 smp_mb__after_unlock_lock(); 1559 smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
1561 rcu_nocb_gp_set(rnp, nocb); 1560 rcu_nocb_gp_set(rnp, nocb);
1562 1561
1563 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1562 /* Declare grace period done. */
1563 ACCESS_ONCE(rsp->completed) = rsp->gpnum;
1564 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); 1564 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
1565 rsp->fqs_state = RCU_GP_IDLE; 1565 rsp->fqs_state = RCU_GP_IDLE;
1566 rdp = this_cpu_ptr(rsp->rda); 1566 rdp = this_cpu_ptr(rsp->rda);
@@ -2304,7 +2304,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2304 if (rnp_old != NULL) 2304 if (rnp_old != NULL)
2305 raw_spin_unlock(&rnp_old->fqslock); 2305 raw_spin_unlock(&rnp_old->fqslock);
2306 if (ret) { 2306 if (ret) {
2307 rsp->n_force_qs_lh++; 2307 ACCESS_ONCE(rsp->n_force_qs_lh)++;
2308 return; 2308 return;
2309 } 2309 }
2310 rnp_old = rnp; 2310 rnp_old = rnp;
@@ -2316,7 +2316,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2316 smp_mb__after_unlock_lock(); 2316 smp_mb__after_unlock_lock();
2317 raw_spin_unlock(&rnp_old->fqslock); 2317 raw_spin_unlock(&rnp_old->fqslock);
2318 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2318 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
2319 rsp->n_force_qs_lh++; 2319 ACCESS_ONCE(rsp->n_force_qs_lh)++;
2320 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2320 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2321 return; /* Someone beat us to it. */ 2321 return; /* Someone beat us to it. */
2322 } 2322 }
@@ -2639,6 +2639,58 @@ void synchronize_rcu_bh(void)
2639} 2639}
2640EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 2640EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2641 2641
2642/**
2643 * get_state_synchronize_rcu - Snapshot current RCU state
2644 *
2645 * Returns a cookie that is used by a later call to cond_synchronize_rcu()
2646 * to determine whether or not a full grace period has elapsed in the
2647 * meantime.
2648 */
2649unsigned long get_state_synchronize_rcu(void)
2650{
2651 /*
2652 * Any prior manipulation of RCU-protected data must happen
2653 * before the load from ->gpnum.
2654 */
2655 smp_mb(); /* ^^^ */
2656
2657 /*
2658 * Make sure this load happens before the purportedly
2659 * time-consuming work between get_state_synchronize_rcu()
2660 * and cond_synchronize_rcu().
2661 */
2662 return smp_load_acquire(&rcu_state->gpnum);
2663}
2664EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
2665
2666/**
2667 * cond_synchronize_rcu - Conditionally wait for an RCU grace period
2668 *
2669 * @oldstate: return value from earlier call to get_state_synchronize_rcu()
2670 *
2671 * If a full RCU grace period has elapsed since the earlier call to
2672 * get_state_synchronize_rcu(), just return. Otherwise, invoke
2673 * synchronize_rcu() to wait for a full grace period.
2674 *
2675 * Yes, this function does not take counter wrap into account. But
2676 * counter wrap is harmless. If the counter wraps, we have waited for
2677 * more than 2 billion grace periods (and way more on a 64-bit system!),
2678 * so waiting for one additional grace period should be just fine.
2679 */
2680void cond_synchronize_rcu(unsigned long oldstate)
2681{
2682 unsigned long newstate;
2683
2684 /*
2685 * Ensure that this load happens before any RCU-destructive
2686 * actions the caller might carry out after we return.
2687 */
2688 newstate = smp_load_acquire(&rcu_state->completed);
2689 if (ULONG_CMP_GE(oldstate, newstate))
2690 synchronize_rcu();
2691}
2692EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
2693
2642static int synchronize_sched_expedited_cpu_stop(void *data) 2694static int synchronize_sched_expedited_cpu_stop(void *data)
2643{ 2695{
2644 /* 2696 /*
@@ -2880,7 +2932,7 @@ static int rcu_pending(int cpu)
2880 * non-NULL, store an indication of whether all callbacks are lazy. 2932 * non-NULL, store an indication of whether all callbacks are lazy.
2881 * (If there are no callbacks, all of them are deemed to be lazy.) 2933 * (If there are no callbacks, all of them are deemed to be lazy.)
2882 */ 2934 */
2883static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) 2935static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2884{ 2936{
2885 bool al = true; 2937 bool al = true;
2886 bool hc = false; 2938 bool hc = false;
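
The kernel-doc above introduces a grace-period piggybacking API: an updater snapshots RCU state with get_state_synchronize_rcu(), does its slow, unrelated work, and then calls cond_synchronize_rcu(), which returns immediately if a full grace period already elapsed and only otherwise blocks in synchronize_rcu(). A usage sketch; the structure and field names are hypothetical:

struct foo {
	int data;
	unsigned long rcu_cookie;	/* hypothetical field */
};

static void foo_update_prepare(struct foo *fp)
{
	/* Snapshot grace-period state before the slow part. */
	fp->rcu_cookie = get_state_synchronize_rcu();
	/* ... lengthy processing that does not touch RCU readers ... */
}

static void foo_update_finish(struct foo *fp)
{
	/*
	 * Blocks only if no full grace period has elapsed since
	 * foo_update_prepare(); otherwise returns immediately.
	 */
	cond_synchronize_rcu(fp->rcu_cookie);
	/* Old readers are now guaranteed to be done. */
}
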
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8c19873f1ac9..75dc3c39a02a 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -13,8 +13,8 @@
13 * GNU General Public License for more details. 13 * GNU General Public License for more details.
14 * 14 *
15 * You should have received a copy of the GNU General Public License 15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software 16 * along with this program; if not, you can access it online at
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 17 * http://www.gnu.org/licenses/gpl-2.0.html.
18 * 18 *
19 * Copyright IBM Corporation, 2008 19 * Copyright IBM Corporation, 2008
20 * 20 *
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 6e2ef4b2b920..962d1d589929 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -14,8 +14,8 @@
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, you can access it online at
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * http://www.gnu.org/licenses/gpl-2.0.html.
19 * 19 *
20 * Copyright Red Hat, 2009 20 * Copyright Red Hat, 2009
21 * Copyright IBM Corporation, 2009 21 * Copyright IBM Corporation, 2009
@@ -1586,11 +1586,13 @@ static void rcu_prepare_kthreads(int cpu)
 1586 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs 1586 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1587 * any flavor of RCU. 1587 * any flavor of RCU.
1588 */ 1588 */
1589#ifndef CONFIG_RCU_NOCB_CPU_ALL
1589int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1590int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1590{ 1591{
1591 *delta_jiffies = ULONG_MAX; 1592 *delta_jiffies = ULONG_MAX;
1592 return rcu_cpu_has_callbacks(cpu, NULL); 1593 return rcu_cpu_has_callbacks(cpu, NULL);
1593} 1594}
1595#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1594 1596
1595/* 1597/*
1596 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up 1598 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1656,7 +1658,7 @@ extern int tick_nohz_active;
1656 * only if it has been awhile since the last time we did so. Afterwards, 1658 * only if it has been awhile since the last time we did so. Afterwards,
1657 * if there are any callbacks ready for immediate invocation, return true. 1659 * if there are any callbacks ready for immediate invocation, return true.
1658 */ 1660 */
1659static bool rcu_try_advance_all_cbs(void) 1661static bool __maybe_unused rcu_try_advance_all_cbs(void)
1660{ 1662{
1661 bool cbs_ready = false; 1663 bool cbs_ready = false;
1662 struct rcu_data *rdp; 1664 struct rcu_data *rdp;
@@ -1696,6 +1698,7 @@ static bool rcu_try_advance_all_cbs(void)
1696 * 1698 *
1697 * The caller must have disabled interrupts. 1699 * The caller must have disabled interrupts.
1698 */ 1700 */
1701#ifndef CONFIG_RCU_NOCB_CPU_ALL
1699int rcu_needs_cpu(int cpu, unsigned long *dj) 1702int rcu_needs_cpu(int cpu, unsigned long *dj)
1700{ 1703{
1701 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1704 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
@@ -1726,6 +1729,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
1726 } 1729 }
1727 return 0; 1730 return 0;
1728} 1731}
1732#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1729 1733
1730/* 1734/*
1731 * Prepare a CPU for idle from an RCU perspective. The first major task 1735 * Prepare a CPU for idle from an RCU perspective. The first major task
@@ -1739,6 +1743,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
1739 */ 1743 */
1740static void rcu_prepare_for_idle(int cpu) 1744static void rcu_prepare_for_idle(int cpu)
1741{ 1745{
1746#ifndef CONFIG_RCU_NOCB_CPU_ALL
1742 struct rcu_data *rdp; 1747 struct rcu_data *rdp;
1743 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1748 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1744 struct rcu_node *rnp; 1749 struct rcu_node *rnp;
@@ -1790,6 +1795,7 @@ static void rcu_prepare_for_idle(int cpu)
1790 rcu_accelerate_cbs(rsp, rnp, rdp); 1795 rcu_accelerate_cbs(rsp, rnp, rdp);
1791 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1796 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1792 } 1797 }
1798#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1793} 1799}
1794 1800
1795/* 1801/*
@@ -1799,11 +1805,12 @@ static void rcu_prepare_for_idle(int cpu)
1799 */ 1805 */
1800static void rcu_cleanup_after_idle(int cpu) 1806static void rcu_cleanup_after_idle(int cpu)
1801{ 1807{
1802 1808#ifndef CONFIG_RCU_NOCB_CPU_ALL
1803 if (rcu_is_nocb_cpu(cpu)) 1809 if (rcu_is_nocb_cpu(cpu))
1804 return; 1810 return;
1805 if (rcu_try_advance_all_cbs()) 1811 if (rcu_try_advance_all_cbs())
1806 invoke_rcu_core(); 1812 invoke_rcu_core();
1813#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1807} 1814}
1808 1815
1809/* 1816/*
@@ -2101,6 +2108,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2101 init_waitqueue_head(&rnp->nocb_gp_wq[1]); 2108 init_waitqueue_head(&rnp->nocb_gp_wq[1]);
2102} 2109}
2103 2110
2111#ifndef CONFIG_RCU_NOCB_CPU_ALL
 2104/* Is the specified CPU a no-CBs CPU? */ 2112/* Is the specified CPU a no-CBs CPU? */
2105bool rcu_is_nocb_cpu(int cpu) 2113bool rcu_is_nocb_cpu(int cpu)
2106{ 2114{
@@ -2108,6 +2116,7 @@ bool rcu_is_nocb_cpu(int cpu)
2108 return cpumask_test_cpu(cpu, rcu_nocb_mask); 2116 return cpumask_test_cpu(cpu, rcu_nocb_mask);
2109 return false; 2117 return false;
2110} 2118}
2119#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
2111 2120
2112/* 2121/*
2113 * Enqueue the specified string of rcu_head structures onto the specified 2122 * Enqueue the specified string of rcu_head structures onto the specified
@@ -2893,7 +2902,7 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2893 * CPU unless the grace period has extended for too long. 2902 * CPU unless the grace period has extended for too long.
2894 * 2903 *
2895 * This code relies on the fact that all NO_HZ_FULL CPUs are also 2904 * This code relies on the fact that all NO_HZ_FULL CPUs are also
2896 * CONFIG_RCU_NOCB_CPUs. 2905 * CONFIG_RCU_NOCB_CPU CPUs.
2897 */ 2906 */
2898static bool rcu_nohz_full_cpu(struct rcu_state *rsp) 2907static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
2899{ 2908{
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 4def475336d4..5cdc62e1beeb 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2008 18 * Copyright IBM Corporation, 2008
19 * 19 *
@@ -273,7 +273,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
273 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 273 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
274 rsp->n_force_qs, rsp->n_force_qs_ngp, 274 rsp->n_force_qs, rsp->n_force_qs_ngp,
275 rsp->n_force_qs - rsp->n_force_qs_ngp, 275 rsp->n_force_qs - rsp->n_force_qs_ngp,
276 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); 276 ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
277 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { 277 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
278 if (rnp->level != level) { 278 if (rnp->level != level) {
279 seq_puts(m, "\n"); 279 seq_puts(m, "\n");
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c54609faf233..4c0a9b0af469 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2001 18 * Copyright IBM Corporation, 2001
19 * 19 *
@@ -49,7 +49,6 @@
49#include <linux/module.h> 49#include <linux/module.h>
50 50
51#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
52#include <trace/events/rcu.h>
53 52
54#include "rcu.h" 53#include "rcu.h"
55 54
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a95c8c2af2a..ab32b7b0db5c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
13 13
14obj-y += core.o proc.o clock.o cputime.o 14obj-y += core.o proc.o clock.o cputime.o
15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o 15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
16obj-y += wait.o completion.o 16obj-y += wait.o completion.o idle.o
17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o 17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
19obj-$(CONFIG_SCHEDSTATS) += stats.o 19obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 4a073539c58e..e73efba98301 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -203,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
203 struct autogroup *ag; 203 struct autogroup *ag;
204 int err; 204 int err;
205 205
206 if (nice < -20 || nice > 19) 206 if (nice < MIN_NICE || nice > MAX_NICE)
207 return -EINVAL; 207 return -EINVAL;
208 208
209 err = security_task_setnice(current, nice); 209 err = security_task_setnice(current, nice);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 43c2bcc35761..b30a2924ef14 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -301,14 +301,14 @@ u64 sched_clock_cpu(int cpu)
301 if (unlikely(!sched_clock_running)) 301 if (unlikely(!sched_clock_running))
302 return 0ull; 302 return 0ull;
303 303
304 preempt_disable(); 304 preempt_disable_notrace();
305 scd = cpu_sdc(cpu); 305 scd = cpu_sdc(cpu);
306 306
307 if (cpu != smp_processor_id()) 307 if (cpu != smp_processor_id())
308 clock = sched_clock_remote(scd); 308 clock = sched_clock_remote(scd);
309 else 309 else
310 clock = sched_clock_local(scd); 310 clock = sched_clock_local(scd);
311 preempt_enable(); 311 preempt_enable_notrace();
312 312
313 return clock; 313 return clock;
314} 314}
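
sched_clock_cpu() can itself be called from tracing code, so the clock.c hunk above switches to preempt_disable_notrace()/preempt_enable_notrace(), which toggle preemption without going through the traced preempt-count helpers and thus avoid recursing into the tracer. The same pattern for any helper reachable from tracing, sketched with a hypothetical function:

static u64 traced_path_timestamp(void)
{
	u64 now;

	preempt_disable_notrace();	/* no tracer re-entry from here */
	now = sched_clock();
	preempt_enable_notrace();

	return now;
}
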
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b46131ef6aab..a47902c687ae 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1745,8 +1745,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1745 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1745 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1746 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1746 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1747 p->numa_work.next = &p->numa_work; 1747 p->numa_work.next = &p->numa_work;
1748 p->numa_faults = NULL; 1748 p->numa_faults_memory = NULL;
1749 p->numa_faults_buffer = NULL; 1749 p->numa_faults_buffer_memory = NULL;
1750 p->last_task_numa_placement = 0;
1751 p->last_sum_exec_runtime = 0;
1750 1752
1751 INIT_LIST_HEAD(&p->numa_entry); 1753 INIT_LIST_HEAD(&p->numa_entry);
1752 p->numa_group = NULL; 1754 p->numa_group = NULL;
@@ -1952,7 +1954,7 @@ static int dl_overflow(struct task_struct *p, int policy,
1952{ 1954{
1953 1955
1954 struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); 1956 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1955 u64 period = attr->sched_period; 1957 u64 period = attr->sched_period ?: attr->sched_deadline;
1956 u64 runtime = attr->sched_runtime; 1958 u64 runtime = attr->sched_runtime;
1957 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; 1959 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
1958 int cpus, err = -1; 1960 int cpus, err = -1;
@@ -2149,8 +2151,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2149 if (mm) 2151 if (mm)
2150 mmdrop(mm); 2152 mmdrop(mm);
2151 if (unlikely(prev_state == TASK_DEAD)) { 2153 if (unlikely(prev_state == TASK_DEAD)) {
2152 task_numa_free(prev);
2153
2154 if (prev->sched_class->task_dead) 2154 if (prev->sched_class->task_dead)
2155 prev->sched_class->task_dead(prev); 2155 prev->sched_class->task_dead(prev);
2156 2156
@@ -2167,13 +2167,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2167 2167
2168#ifdef CONFIG_SMP 2168#ifdef CONFIG_SMP
2169 2169
2170/* assumes rq->lock is held */
2171static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2172{
2173 if (prev->sched_class->pre_schedule)
2174 prev->sched_class->pre_schedule(rq, prev);
2175}
2176
2177/* rq->lock is NOT held, but preemption is disabled */ 2170/* rq->lock is NOT held, but preemption is disabled */
2178static inline void post_schedule(struct rq *rq) 2171static inline void post_schedule(struct rq *rq)
2179{ 2172{
@@ -2191,10 +2184,6 @@ static inline void post_schedule(struct rq *rq)
2191 2184
2192#else 2185#else
2193 2186
2194static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2195{
2196}
2197
2198static inline void post_schedule(struct rq *rq) 2187static inline void post_schedule(struct rq *rq)
2199{ 2188{
2200} 2189}
@@ -2510,8 +2499,13 @@ void __kprobes preempt_count_add(int val)
2510 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2499 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2511 PREEMPT_MASK - 10); 2500 PREEMPT_MASK - 10);
2512#endif 2501#endif
2513 if (preempt_count() == val) 2502 if (preempt_count() == val) {
2514 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2503 unsigned long ip = get_parent_ip(CALLER_ADDR1);
2504#ifdef CONFIG_DEBUG_PREEMPT
2505 current->preempt_disable_ip = ip;
2506#endif
2507 trace_preempt_off(CALLER_ADDR0, ip);
2508 }
2515} 2509}
2516EXPORT_SYMBOL(preempt_count_add); 2510EXPORT_SYMBOL(preempt_count_add);
2517 2511
@@ -2554,6 +2548,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
2554 print_modules(); 2548 print_modules();
2555 if (irqs_disabled()) 2549 if (irqs_disabled())
2556 print_irqtrace_events(prev); 2550 print_irqtrace_events(prev);
2551#ifdef CONFIG_DEBUG_PREEMPT
2552 if (in_atomic_preempt_off()) {
2553 pr_err("Preemption disabled at:");
2554 print_ip_sym(current->preempt_disable_ip);
2555 pr_cont("\n");
2556 }
2557#endif
2557 dump_stack(); 2558 dump_stack();
2558 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 2559 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2559} 2560}
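
With the hunk above (and the matching __might_sleep() hunk later in this diff), CONFIG_DEBUG_PREEMPT records the caller of the outermost preempt_disable() in ->preempt_disable_ip and prints it as "Preemption disabled at:" when a scheduling-while-atomic or might-sleep splat fires. A deliberately buggy sketch that such a report would pinpoint:

#include <linux/delay.h>
#include <linux/preempt.h>

static void buggy_sleep_in_atomic(void)
{
	preempt_disable();	/* this call site lands in the splat */
	msleep(10);		/* sleeping here triggers the warning */
	preempt_enable();
}
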
@@ -2577,36 +2578,34 @@ static inline void schedule_debug(struct task_struct *prev)
2577 schedstat_inc(this_rq(), sched_count); 2578 schedstat_inc(this_rq(), sched_count);
2578} 2579}
2579 2580
2580static void put_prev_task(struct rq *rq, struct task_struct *prev)
2581{
2582 if (prev->on_rq || rq->skip_clock_update < 0)
2583 update_rq_clock(rq);
2584 prev->sched_class->put_prev_task(rq, prev);
2585}
2586
2587/* 2581/*
2588 * Pick up the highest-prio task: 2582 * Pick up the highest-prio task:
2589 */ 2583 */
2590static inline struct task_struct * 2584static inline struct task_struct *
2591pick_next_task(struct rq *rq) 2585pick_next_task(struct rq *rq, struct task_struct *prev)
2592{ 2586{
2593 const struct sched_class *class; 2587 const struct sched_class *class = &fair_sched_class;
2594 struct task_struct *p; 2588 struct task_struct *p;
2595 2589
2596 /* 2590 /*
2597 * Optimization: we know that if all tasks are in 2591 * Optimization: we know that if all tasks are in
2598 * the fair class we can call that function directly: 2592 * the fair class we can call that function directly:
2599 */ 2593 */
2600 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { 2594 if (likely(prev->sched_class == class &&
2601 p = fair_sched_class.pick_next_task(rq); 2595 rq->nr_running == rq->cfs.h_nr_running)) {
2602 if (likely(p)) 2596 p = fair_sched_class.pick_next_task(rq, prev);
2597 if (likely(p && p != RETRY_TASK))
2603 return p; 2598 return p;
2604 } 2599 }
2605 2600
2601again:
2606 for_each_class(class) { 2602 for_each_class(class) {
2607 p = class->pick_next_task(rq); 2603 p = class->pick_next_task(rq, prev);
2608 if (p) 2604 if (p) {
2605 if (unlikely(p == RETRY_TASK))
2606 goto again;
2609 return p; 2607 return p;
2608 }
2610 } 2609 }
2611 2610
2612 BUG(); /* the idle class will always have a runnable task */ 2611 BUG(); /* the idle class will always have a runnable task */
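
The rework above changes the pick_next_task() contract: each class now receives prev (so it can do the put_prev_task() bookkeeping itself) and may return the special RETRY_TASK value when it had to drop rq->lock, for example to pull work from other CPUs, and a higher-priority class may have become runnable in the meantime; the caller's again: loop then restarts the class walk from the top. A hypothetical class method showing that contract (helper names are illustrative, not the kernel's):

static struct task_struct *
pick_next_task_example(struct rq *rq, struct task_struct *prev)
{
	if (example_need_pull(rq, prev)) {
		example_pull(rq);		/* may drop and retake rq->lock */
		if (example_higher_class_runnable(rq))
			return RETRY_TASK;	/* restart the class walk */
	}

	if (!example_has_runnable(rq))
		return NULL;			/* let a lower class pick */

	put_prev_task(rq, prev);		/* class takes over prev handling */
	return example_pick_highest(rq);
}
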
@@ -2700,13 +2699,10 @@ need_resched:
2700 switch_count = &prev->nvcsw; 2699 switch_count = &prev->nvcsw;
2701 } 2700 }
2702 2701
2703 pre_schedule(rq, prev); 2702 if (prev->on_rq || rq->skip_clock_update < 0)
2704 2703 update_rq_clock(rq);
2705 if (unlikely(!rq->nr_running))
2706 idle_balance(cpu, rq);
2707 2704
2708 put_prev_task(rq, prev); 2705 next = pick_next_task(rq, prev);
2709 next = pick_next_task(rq);
2710 clear_tsk_need_resched(prev); 2706 clear_tsk_need_resched(prev);
2711 clear_preempt_need_resched(); 2707 clear_preempt_need_resched();
2712 rq->skip_clock_update = 0; 2708 rq->skip_clock_update = 0;
@@ -2908,7 +2904,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
2908 * This function changes the 'effective' priority of a task. It does 2904 * This function changes the 'effective' priority of a task. It does
2909 * not touch ->normal_prio like __setscheduler(). 2905 * not touch ->normal_prio like __setscheduler().
2910 * 2906 *
2911 * Used by the rt_mutex code to implement priority inheritance logic. 2907 * Used by the rt_mutex code to implement priority inheritance
2908 * logic. Call site only calls if the priority of the task changed.
2912 */ 2909 */
2913void rt_mutex_setprio(struct task_struct *p, int prio) 2910void rt_mutex_setprio(struct task_struct *p, int prio)
2914{ 2911{
@@ -2998,7 +2995,7 @@ void set_user_nice(struct task_struct *p, long nice)
2998 unsigned long flags; 2995 unsigned long flags;
2999 struct rq *rq; 2996 struct rq *rq;
3000 2997
3001 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 2998 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
3002 return; 2999 return;
3003 /* 3000 /*
3004 * We have to be careful, if called from sys_setpriority(), 3001 * We have to be careful, if called from sys_setpriority(),
@@ -3076,11 +3073,11 @@ SYSCALL_DEFINE1(nice, int, increment)
3076 if (increment > 40) 3073 if (increment > 40)
3077 increment = 40; 3074 increment = 40;
3078 3075
3079 nice = TASK_NICE(current) + increment; 3076 nice = task_nice(current) + increment;
3080 if (nice < -20) 3077 if (nice < MIN_NICE)
3081 nice = -20; 3078 nice = MIN_NICE;
3082 if (nice > 19) 3079 if (nice > MAX_NICE)
3083 nice = 19; 3080 nice = MAX_NICE;
3084 3081
3085 if (increment < 0 && !can_nice(current, nice)) 3082 if (increment < 0 && !can_nice(current, nice))
3086 return -EPERM; 3083 return -EPERM;
@@ -3109,18 +3106,6 @@ int task_prio(const struct task_struct *p)
3109} 3106}
3110 3107
3111/** 3108/**
3112 * task_nice - return the nice value of a given task.
3113 * @p: the task in question.
3114 *
3115 * Return: The nice value [ -20 ... 0 ... 19 ].
3116 */
3117int task_nice(const struct task_struct *p)
3118{
3119 return TASK_NICE(p);
3120}
3121EXPORT_SYMBOL(task_nice);
3122
3123/**
3124 * idle_cpu - is a given cpu idle currently? 3109 * idle_cpu - is a given cpu idle currently?
3125 * @cpu: the processor in question. 3110 * @cpu: the processor in question.
3126 * 3111 *
@@ -3189,9 +3174,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3189 dl_se->dl_new = 1; 3174 dl_se->dl_new = 1;
3190} 3175}
3191 3176
3192/* Actually do priority change: must hold pi & rq lock. */ 3177static void __setscheduler_params(struct task_struct *p,
3193static void __setscheduler(struct rq *rq, struct task_struct *p, 3178 const struct sched_attr *attr)
3194 const struct sched_attr *attr)
3195{ 3179{
3196 int policy = attr->sched_policy; 3180 int policy = attr->sched_policy;
3197 3181
@@ -3211,9 +3195,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
3211 * getparam()/getattr() don't report silly values for !rt tasks. 3195 * getparam()/getattr() don't report silly values for !rt tasks.
3212 */ 3196 */
3213 p->rt_priority = attr->sched_priority; 3197 p->rt_priority = attr->sched_priority;
3214
3215 p->normal_prio = normal_prio(p); 3198 p->normal_prio = normal_prio(p);
3216 p->prio = rt_mutex_getprio(p); 3199 set_load_weight(p);
3200}
3201
3202/* Actually do priority change: must hold pi & rq lock. */
3203static void __setscheduler(struct rq *rq, struct task_struct *p,
3204 const struct sched_attr *attr)
3205{
3206 __setscheduler_params(p, attr);
3207
3208 /*
3209 * If we get here, there was no pi waiters boosting the
3210 * task. It is safe to use the normal prio.
3211 */
3212 p->prio = normal_prio(p);
3217 3213
3218 if (dl_prio(p->prio)) 3214 if (dl_prio(p->prio))
3219 p->sched_class = &dl_sched_class; 3215 p->sched_class = &dl_sched_class;
@@ -3221,8 +3217,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
3221 p->sched_class = &rt_sched_class; 3217 p->sched_class = &rt_sched_class;
3222 else 3218 else
3223 p->sched_class = &fair_sched_class; 3219 p->sched_class = &fair_sched_class;
3224
3225 set_load_weight(p);
3226} 3220}
3227 3221
3228static void 3222static void
@@ -3275,6 +3269,8 @@ static int __sched_setscheduler(struct task_struct *p,
3275 const struct sched_attr *attr, 3269 const struct sched_attr *attr,
3276 bool user) 3270 bool user)
3277{ 3271{
3272 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3273 MAX_RT_PRIO - 1 - attr->sched_priority;
3278 int retval, oldprio, oldpolicy = -1, on_rq, running; 3274 int retval, oldprio, oldpolicy = -1, on_rq, running;
3279 int policy = attr->sched_policy; 3275 int policy = attr->sched_policy;
3280 unsigned long flags; 3276 unsigned long flags;
@@ -3319,7 +3315,7 @@ recheck:
3319 */ 3315 */
3320 if (user && !capable(CAP_SYS_NICE)) { 3316 if (user && !capable(CAP_SYS_NICE)) {
3321 if (fair_policy(policy)) { 3317 if (fair_policy(policy)) {
3322 if (attr->sched_nice < TASK_NICE(p) && 3318 if (attr->sched_nice < task_nice(p) &&
3323 !can_nice(p, attr->sched_nice)) 3319 !can_nice(p, attr->sched_nice))
3324 return -EPERM; 3320 return -EPERM;
3325 } 3321 }
@@ -3338,12 +3334,21 @@ recheck:
3338 return -EPERM; 3334 return -EPERM;
3339 } 3335 }
3340 3336
3337 /*
3338 * Can't set/change SCHED_DEADLINE policy at all for now
3339 * (safest behavior); in the future we would like to allow
3340 * unprivileged DL tasks to increase their relative deadline
3341 * or reduce their runtime (both ways reducing utilization)
3342 */
3343 if (dl_policy(policy))
3344 return -EPERM;
3345
3341 /* 3346 /*
3342 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3347 * Treat SCHED_IDLE as nice 20. Only allow a switch to
3343 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3348 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3344 */ 3349 */
3345 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3350 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3346 if (!can_nice(p, TASK_NICE(p))) 3351 if (!can_nice(p, task_nice(p)))
3347 return -EPERM; 3352 return -EPERM;
3348 } 3353 }
3349 3354
@@ -3380,16 +3385,18 @@ recheck:
3380 } 3385 }
3381 3386
3382 /* 3387 /*
3383 * If not changing anything there's no need to proceed further: 3388 * If not changing anything there's no need to proceed further,
3389 * but store a possible modification of reset_on_fork.
3384 */ 3390 */
3385 if (unlikely(policy == p->policy)) { 3391 if (unlikely(policy == p->policy)) {
3386 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) 3392 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
3387 goto change; 3393 goto change;
3388 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3394 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3389 goto change; 3395 goto change;
3390 if (dl_policy(policy)) 3396 if (dl_policy(policy))
3391 goto change; 3397 goto change;
3392 3398
3399 p->sched_reset_on_fork = reset_on_fork;
3393 task_rq_unlock(rq, p, &flags); 3400 task_rq_unlock(rq, p, &flags);
3394 return 0; 3401 return 0;
3395 } 3402 }
@@ -3443,6 +3450,24 @@ change:
3443 return -EBUSY; 3450 return -EBUSY;
3444 } 3451 }
3445 3452
3453 p->sched_reset_on_fork = reset_on_fork;
3454 oldprio = p->prio;
3455
3456 /*
3457 * Special case for priority boosted tasks.
3458 *
3459 * If the new priority is lower or equal (user space view)
3460 * than the current (boosted) priority, we just store the new
3461 * normal parameters and do not touch the scheduler class and
 3462 * the runqueue. This will be done when the task deboosts
3463 * itself.
3464 */
3465 if (rt_mutex_check_prio(p, newprio)) {
3466 __setscheduler_params(p, attr);
3467 task_rq_unlock(rq, p, &flags);
3468 return 0;
3469 }
3470
3446 on_rq = p->on_rq; 3471 on_rq = p->on_rq;
3447 running = task_current(rq, p); 3472 running = task_current(rq, p);
3448 if (on_rq) 3473 if (on_rq)
@@ -3450,16 +3475,18 @@ change:
3450 if (running) 3475 if (running)
3451 p->sched_class->put_prev_task(rq, p); 3476 p->sched_class->put_prev_task(rq, p);
3452 3477
3453 p->sched_reset_on_fork = reset_on_fork;
3454
3455 oldprio = p->prio;
3456 prev_class = p->sched_class; 3478 prev_class = p->sched_class;
3457 __setscheduler(rq, p, attr); 3479 __setscheduler(rq, p, attr);
3458 3480
3459 if (running) 3481 if (running)
3460 p->sched_class->set_curr_task(rq); 3482 p->sched_class->set_curr_task(rq);
3461 if (on_rq) 3483 if (on_rq) {
3462 enqueue_task(rq, p, 0); 3484 /*
3485 * We enqueue to tail when the priority of a task is
3486 * increased (user space view).
3487 */
3488 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
3489 }
3463 3490
3464 check_class_changed(rq, p, prev_class, oldprio); 3491 check_class_changed(rq, p, prev_class, oldprio);
3465 task_rq_unlock(rq, p, &flags); 3492 task_rq_unlock(rq, p, &flags);
@@ -3615,7 +3642,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
3615 * XXX: do we want to be lenient like existing syscalls; or do we want 3642 * XXX: do we want to be lenient like existing syscalls; or do we want
3616 * to be strict and return an error on out-of-bounds values? 3643 * to be strict and return an error on out-of-bounds values?
3617 */ 3644 */
3618 attr->sched_nice = clamp(attr->sched_nice, -20, 19); 3645 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3619 3646
3620out: 3647out:
3621 return ret; 3648 return ret;
@@ -3661,13 +3688,14 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3661 * @pid: the pid in question. 3688 * @pid: the pid in question.
3662 * @uattr: structure containing the extended parameters. 3689 * @uattr: structure containing the extended parameters.
3663 */ 3690 */
3664SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) 3691SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3692 unsigned int, flags)
3665{ 3693{
3666 struct sched_attr attr; 3694 struct sched_attr attr;
3667 struct task_struct *p; 3695 struct task_struct *p;
3668 int retval; 3696 int retval;
3669 3697
3670 if (!uattr || pid < 0) 3698 if (!uattr || pid < 0 || flags)
3671 return -EINVAL; 3699 return -EINVAL;
3672 3700
3673 if (sched_copy_attr(uattr, &attr)) 3701 if (sched_copy_attr(uattr, &attr))
@@ -3786,7 +3814,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3786 attr->size = usize; 3814 attr->size = usize;
3787 } 3815 }
3788 3816
3789 ret = copy_to_user(uattr, attr, usize); 3817 ret = copy_to_user(uattr, attr, attr->size);
3790 if (ret) 3818 if (ret)
3791 return -EFAULT; 3819 return -EFAULT;
3792 3820
@@ -3804,8 +3832,8 @@ err_size:
3804 * @uattr: structure containing the extended parameters. 3832 * @uattr: structure containing the extended parameters.
3805 * @size: sizeof(attr) for fwd/bwd comp. 3833 * @size: sizeof(attr) for fwd/bwd comp.
3806 */ 3834 */
3807SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 3835SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3808 unsigned int, size) 3836 unsigned int, size, unsigned int, flags)
3809{ 3837{
3810 struct sched_attr attr = { 3838 struct sched_attr attr = {
3811 .size = sizeof(struct sched_attr), 3839 .size = sizeof(struct sched_attr),
@@ -3814,7 +3842,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3814 int retval; 3842 int retval;
3815 3843
3816 if (!uattr || pid < 0 || size > PAGE_SIZE || 3844 if (!uattr || pid < 0 || size > PAGE_SIZE ||
3817 size < SCHED_ATTR_SIZE_VER0) 3845 size < SCHED_ATTR_SIZE_VER0 || flags)
3818 return -EINVAL; 3846 return -EINVAL;
3819 3847
3820 rcu_read_lock(); 3848 rcu_read_lock();
@@ -3835,7 +3863,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3835 else if (task_has_rt_policy(p)) 3863 else if (task_has_rt_policy(p))
3836 attr.sched_priority = p->rt_priority; 3864 attr.sched_priority = p->rt_priority;
3837 else 3865 else
3838 attr.sched_nice = TASK_NICE(p); 3866 attr.sched_nice = task_nice(p);
3839 3867
3840 rcu_read_unlock(); 3868 rcu_read_unlock();
3841 3869
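
The two SYSCALL_DEFINE changes above give sched_setattr() and sched_getattr() a trailing flags argument that must currently be zero (anything else returns -EINVAL), reserving room for future extensions. A userspace sketch of invoking the new syscall directly, assuming the toolchain headers provide __NR_sched_setattr; the struct mirrors include/uapi/linux/sched.h:

#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

static int set_nice_attr(pid_t pid, int nice)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = 0;			/* SCHED_NORMAL */
	attr.sched_nice = nice;

	/* Final argument is the new flags word; non-zero fails. */
	return syscall(__NR_sched_setattr, pid, &attr, 0);
}
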
@@ -4473,6 +4501,7 @@ void init_idle(struct task_struct *idle, int cpu)
4473 rcu_read_unlock(); 4501 rcu_read_unlock();
4474 4502
4475 rq->curr = rq->idle = idle; 4503 rq->curr = rq->idle = idle;
4504 idle->on_rq = 1;
4476#if defined(CONFIG_SMP) 4505#if defined(CONFIG_SMP)
4477 idle->on_cpu = 1; 4506 idle->on_cpu = 1;
4478#endif 4507#endif
@@ -4692,8 +4721,10 @@ void idle_task_exit(void)
4692 4721
4693 BUG_ON(cpu_online(smp_processor_id())); 4722 BUG_ON(cpu_online(smp_processor_id()));
4694 4723
4695 if (mm != &init_mm) 4724 if (mm != &init_mm) {
4696 switch_mm(mm, &init_mm, current); 4725 switch_mm(mm, &init_mm, current);
4726 finish_arch_post_lock_switch();
4727 }
4697 mmdrop(mm); 4728 mmdrop(mm);
4698} 4729}
4699 4730
@@ -4711,6 +4742,22 @@ static void calc_load_migrate(struct rq *rq)
4711 atomic_long_add(delta, &calc_load_tasks); 4742 atomic_long_add(delta, &calc_load_tasks);
4712} 4743}
4713 4744
4745static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
4746{
4747}
4748
4749static const struct sched_class fake_sched_class = {
4750 .put_prev_task = put_prev_task_fake,
4751};
4752
4753static struct task_struct fake_task = {
4754 /*
4755 * Avoid pull_{rt,dl}_task()
4756 */
4757 .prio = MAX_PRIO + 1,
4758 .sched_class = &fake_sched_class,
4759};
4760
4714/* 4761/*
4715 * Migrate all tasks from the rq, sleeping tasks will be migrated by 4762 * Migrate all tasks from the rq, sleeping tasks will be migrated by
4716 * try_to_wake_up()->select_task_rq(). 4763 * try_to_wake_up()->select_task_rq().
@@ -4751,7 +4798,7 @@ static void migrate_tasks(unsigned int dead_cpu)
4751 if (rq->nr_running == 1) 4798 if (rq->nr_running == 1)
4752 break; 4799 break;
4753 4800
4754 next = pick_next_task(rq); 4801 next = pick_next_task(rq, &fake_task);
4755 BUG_ON(!next); 4802 BUG_ON(!next);
4756 next->sched_class->put_prev_task(rq, next); 4803 next->sched_class->put_prev_task(rq, next);
4757 4804
@@ -4841,7 +4888,7 @@ set_table_entry(struct ctl_table *entry,
4841static struct ctl_table * 4888static struct ctl_table *
4842sd_alloc_ctl_domain_table(struct sched_domain *sd) 4889sd_alloc_ctl_domain_table(struct sched_domain *sd)
4843{ 4890{
4844 struct ctl_table *table = sd_alloc_ctl_entry(13); 4891 struct ctl_table *table = sd_alloc_ctl_entry(14);
4845 4892
4846 if (table == NULL) 4893 if (table == NULL)
4847 return NULL; 4894 return NULL;
@@ -4869,9 +4916,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
4869 sizeof(int), 0644, proc_dointvec_minmax, false); 4916 sizeof(int), 0644, proc_dointvec_minmax, false);
4870 set_table_entry(&table[10], "flags", &sd->flags, 4917 set_table_entry(&table[10], "flags", &sd->flags,
4871 sizeof(int), 0644, proc_dointvec_minmax, false); 4918 sizeof(int), 0644, proc_dointvec_minmax, false);
4872 set_table_entry(&table[11], "name", sd->name, 4919 set_table_entry(&table[11], "max_newidle_lb_cost",
4920 &sd->max_newidle_lb_cost,
4921 sizeof(long), 0644, proc_doulongvec_minmax, false);
4922 set_table_entry(&table[12], "name", sd->name,
4873 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 4923 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
4874 /* &table[12] is terminator */ 4924 /* &table[13] is terminator */
4875 4925
4876 return table; 4926 return table;
4877} 4927}
@@ -6848,7 +6898,6 @@ void __init sched_init(void)
6848 6898
6849 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6899 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6850#ifdef CONFIG_RT_GROUP_SCHED 6900#ifdef CONFIG_RT_GROUP_SCHED
6851 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6852 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6901 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6853#endif 6902#endif
6854 6903
@@ -6937,7 +6986,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
6937 static unsigned long prev_jiffy; /* ratelimiting */ 6986 static unsigned long prev_jiffy; /* ratelimiting */
6938 6987
6939 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 6988 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
6940 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 6989 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6990 !is_idle_task(current)) ||
6941 system_state != SYSTEM_RUNNING || oops_in_progress) 6991 system_state != SYSTEM_RUNNING || oops_in_progress)
6942 return; 6992 return;
6943 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6993 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -6955,6 +7005,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
6955 debug_show_held_locks(current); 7005 debug_show_held_locks(current);
6956 if (irqs_disabled()) 7006 if (irqs_disabled())
6957 print_irqtrace_events(current); 7007 print_irqtrace_events(current);
7008#ifdef CONFIG_DEBUG_PREEMPT
7009 if (!preempt_count_equals(preempt_offset)) {
7010 pr_err("Preemption disabled at:");
7011 print_ip_sym(current->preempt_disable_ip);
7012 pr_cont("\n");
7013 }
7014#endif
6958 dump_stack(); 7015 dump_stack();
6959} 7016}
6960EXPORT_SYMBOL(__might_sleep); 7017EXPORT_SYMBOL(__might_sleep);
@@ -7008,7 +7065,7 @@ void normalize_rt_tasks(void)
7008 * Renice negative nice level userspace 7065 * Renice negative nice level userspace
7009 * tasks back to 0: 7066 * tasks back to 0:
7010 */ 7067 */
7011 if (TASK_NICE(p) < 0 && p->mm) 7068 if (task_nice(p) < 0 && p->mm)
7012 set_user_nice(p, 0); 7069 set_user_nice(p, 0);
7013 continue; 7070 continue;
7014 } 7071 }
@@ -7422,6 +7479,7 @@ static int sched_dl_global_constraints(void)
7422 u64 period = global_rt_period(); 7479 u64 period = global_rt_period();
7423 u64 new_bw = to_ratio(period, runtime); 7480 u64 new_bw = to_ratio(period, runtime);
7424 int cpu, ret = 0; 7481 int cpu, ret = 0;
7482 unsigned long flags;
7425 7483
7426 /* 7484 /*
7427 * Here we want to check the bandwidth not being set to some 7485 * Here we want to check the bandwidth not being set to some
@@ -7435,10 +7493,10 @@ static int sched_dl_global_constraints(void)
7435 for_each_possible_cpu(cpu) { 7493 for_each_possible_cpu(cpu) {
7436 struct dl_bw *dl_b = dl_bw_of(cpu); 7494 struct dl_bw *dl_b = dl_bw_of(cpu);
7437 7495
7438 raw_spin_lock(&dl_b->lock); 7496 raw_spin_lock_irqsave(&dl_b->lock, flags);
7439 if (new_bw < dl_b->total_bw) 7497 if (new_bw < dl_b->total_bw)
7440 ret = -EBUSY; 7498 ret = -EBUSY;
7441 raw_spin_unlock(&dl_b->lock); 7499 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7442 7500
7443 if (ret) 7501 if (ret)
7444 break; 7502 break;
@@ -7451,6 +7509,7 @@ static void sched_dl_do_global(void)
7451{ 7509{
7452 u64 new_bw = -1; 7510 u64 new_bw = -1;
7453 int cpu; 7511 int cpu;
7512 unsigned long flags;
7454 7513
7455 def_dl_bandwidth.dl_period = global_rt_period(); 7514 def_dl_bandwidth.dl_period = global_rt_period();
7456 def_dl_bandwidth.dl_runtime = global_rt_runtime(); 7515 def_dl_bandwidth.dl_runtime = global_rt_runtime();
@@ -7464,9 +7523,9 @@ static void sched_dl_do_global(void)
7464 for_each_possible_cpu(cpu) { 7523 for_each_possible_cpu(cpu) {
7465 struct dl_bw *dl_b = dl_bw_of(cpu); 7524 struct dl_bw *dl_b = dl_bw_of(cpu);
7466 7525
7467 raw_spin_lock(&dl_b->lock); 7526 raw_spin_lock_irqsave(&dl_b->lock, flags);
7468 dl_b->bw = new_bw; 7527 dl_b->bw = new_bw;
7469 raw_spin_unlock(&dl_b->lock); 7528 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7470 } 7529 }
7471} 7530}
7472 7531
@@ -7475,7 +7534,8 @@ static int sched_rt_global_validate(void)
7475 if (sysctl_sched_rt_period <= 0) 7534 if (sysctl_sched_rt_period <= 0)
7476 return -EINVAL; 7535 return -EINVAL;
7477 7536
7478 if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) 7537 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
7538 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
7479 return -EINVAL; 7539 return -EINVAL;
7480 7540
7481 return 0; 7541 return 0;
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 045fc74e3f09..5b9bb42b2d47 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx)
70 70
71static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) 71static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
72{ 72{
73 WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); 73 WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
74 74
75 if (dl_time_before(new_dl, cp->elements[idx].dl)) { 75 if (dl_time_before(new_dl, cp->elements[idx].dl)) {
76 cp->elements[idx].dl = new_dl; 76 cp->elements[idx].dl = new_dl;
@@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
117 } 117 }
118 118
119out: 119out:
120 WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); 120 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
121 121
122 return best_cpu; 122 return best_cpu;
123} 123}
@@ -137,7 +137,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
137 int old_idx, new_cpu; 137 int old_idx, new_cpu;
138 unsigned long flags; 138 unsigned long flags;
139 139
140 WARN_ON(cpu > num_present_cpus()); 140 WARN_ON(!cpu_present(cpu));
141 141
142 raw_spin_lock_irqsave(&cp->lock, flags); 142 raw_spin_lock_irqsave(&cp->lock, flags);
143 old_idx = cp->cpu_to_idx[cpu]; 143 old_idx = cp->cpu_to_idx[cpu];
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 99947919e30b..58624a65f124 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
142 p->utimescaled += cputime_scaled; 142 p->utimescaled += cputime_scaled;
143 account_group_user_time(p, cputime); 143 account_group_user_time(p, cputime);
144 144
145 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; 145 index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
146 146
147 /* Add user time to cpustat. */ 147 /* Add user time to cpustat. */
148 task_group_account_field(p, index, (__force u64) cputime); 148 task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
169 p->gtime += cputime; 169 p->gtime += cputime;
170 170
171 /* Add guest time to cpustat. */ 171 /* Add guest time to cpustat. */
172 if (TASK_NICE(p) > 0) { 172 if (task_nice(p) > 0) {
173 cpustat[CPUTIME_NICE] += (__force u64) cputime; 173 cpustat[CPUTIME_NICE] += (__force u64) cputime;
174 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; 174 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
175 } else { 175 } else {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0dd5e0971a07..27ef40925525 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -121,7 +121,7 @@ static inline void dl_clear_overload(struct rq *rq)
121 121
122static void update_dl_migration(struct dl_rq *dl_rq) 122static void update_dl_migration(struct dl_rq *dl_rq)
123{ 123{
124 if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) { 124 if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
125 if (!dl_rq->overloaded) { 125 if (!dl_rq->overloaded) {
126 dl_set_overload(rq_of_dl_rq(dl_rq)); 126 dl_set_overload(rq_of_dl_rq(dl_rq));
127 dl_rq->overloaded = 1; 127 dl_rq->overloaded = 1;
@@ -135,9 +135,7 @@ static void update_dl_migration(struct dl_rq *dl_rq)
135static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) 135static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
136{ 136{
137 struct task_struct *p = dl_task_of(dl_se); 137 struct task_struct *p = dl_task_of(dl_se);
138 dl_rq = &rq_of_dl_rq(dl_rq)->dl;
139 138
140 dl_rq->dl_nr_total++;
141 if (p->nr_cpus_allowed > 1) 139 if (p->nr_cpus_allowed > 1)
142 dl_rq->dl_nr_migratory++; 140 dl_rq->dl_nr_migratory++;
143 141
@@ -147,9 +145,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
147static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) 145static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
148{ 146{
149 struct task_struct *p = dl_task_of(dl_se); 147 struct task_struct *p = dl_task_of(dl_se);
150 dl_rq = &rq_of_dl_rq(dl_rq)->dl;
151 148
152 dl_rq->dl_nr_total--;
153 if (p->nr_cpus_allowed > 1) 149 if (p->nr_cpus_allowed > 1)
154 dl_rq->dl_nr_migratory--; 150 dl_rq->dl_nr_migratory--;
155 151
@@ -214,6 +210,16 @@ static inline int has_pushable_dl_tasks(struct rq *rq)
214 210
215static int push_dl_task(struct rq *rq); 211static int push_dl_task(struct rq *rq);
216 212
213static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
214{
215 return dl_task(prev);
216}
217
218static inline void set_post_schedule(struct rq *rq)
219{
220 rq->post_schedule = has_pushable_dl_tasks(rq);
221}
222
217#else 223#else
218 224
219static inline 225static inline
@@ -236,6 +242,19 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
236{ 242{
237} 243}
238 244
245static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
246{
247 return false;
248}
249
250static inline int pull_dl_task(struct rq *rq)
251{
252 return 0;
253}
254
255static inline void set_post_schedule(struct rq *rq)
256{
257}
239#endif /* CONFIG_SMP */ 258#endif /* CONFIG_SMP */
240 259
241static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); 260static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -566,6 +585,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
566 return 1; 585 return 1;
567} 586}
568 587
588extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
589
569/* 590/*
570 * Update the current task's runtime statistics (provided it is still 591 * Update the current task's runtime statistics (provided it is still
571 * a -deadline task and has not been removed from the dl_rq). 592 * a -deadline task and has not been removed from the dl_rq).
@@ -588,8 +609,8 @@ static void update_curr_dl(struct rq *rq)
588 * approach need further study. 609 * approach need further study.
589 */ 610 */
590 delta_exec = rq_clock_task(rq) - curr->se.exec_start; 611 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
591 if (unlikely((s64)delta_exec < 0)) 612 if (unlikely((s64)delta_exec <= 0))
592 delta_exec = 0; 613 return;
593 614
594 schedstat_set(curr->se.statistics.exec_max, 615 schedstat_set(curr->se.statistics.exec_max,
595 max(curr->se.statistics.exec_max, delta_exec)); 616 max(curr->se.statistics.exec_max, delta_exec));
@@ -629,11 +650,13 @@ static void update_curr_dl(struct rq *rq)
629 struct rt_rq *rt_rq = &rq->rt; 650 struct rt_rq *rt_rq = &rq->rt;
630 651
631 raw_spin_lock(&rt_rq->rt_runtime_lock); 652 raw_spin_lock(&rt_rq->rt_runtime_lock);
632 rt_rq->rt_time += delta_exec;
633 /* 653 /*
634 * We'll let actual RT tasks worry about the overflow here, we 654 * We'll let actual RT tasks worry about the overflow here, we
635 * have our own CBS to keep us inline -- see above. 655 * have our own CBS to keep us inline; only account when RT
656 * bandwidth is relevant.
636 */ 657 */
658 if (sched_rt_bandwidth_account(rt_rq))
659 rt_rq->rt_time += delta_exec;
637 raw_spin_unlock(&rt_rq->rt_runtime_lock); 660 raw_spin_unlock(&rt_rq->rt_runtime_lock);
638 } 661 }
639} 662}
@@ -717,6 +740,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
717 740
718 WARN_ON(!dl_prio(prio)); 741 WARN_ON(!dl_prio(prio));
719 dl_rq->dl_nr_running++; 742 dl_rq->dl_nr_running++;
743 inc_nr_running(rq_of_dl_rq(dl_rq));
720 744
721 inc_dl_deadline(dl_rq, deadline); 745 inc_dl_deadline(dl_rq, deadline);
722 inc_dl_migration(dl_se, dl_rq); 746 inc_dl_migration(dl_se, dl_rq);
@@ -730,6 +754,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
730 WARN_ON(!dl_prio(prio)); 754 WARN_ON(!dl_prio(prio));
731 WARN_ON(!dl_rq->dl_nr_running); 755 WARN_ON(!dl_rq->dl_nr_running);
732 dl_rq->dl_nr_running--; 756 dl_rq->dl_nr_running--;
757 dec_nr_running(rq_of_dl_rq(dl_rq));
733 758
734 dec_dl_deadline(dl_rq, dl_se->deadline); 759 dec_dl_deadline(dl_rq, dl_se->deadline);
735 dec_dl_migration(dl_se, dl_rq); 760 dec_dl_migration(dl_se, dl_rq);
@@ -836,8 +861,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
836 861
837 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 862 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
838 enqueue_pushable_dl_task(rq, p); 863 enqueue_pushable_dl_task(rq, p);
839
840 inc_nr_running(rq);
841} 864}
842 865
843static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) 866static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -850,8 +873,6 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
850{ 873{
851 update_curr_dl(rq); 874 update_curr_dl(rq);
852 __dequeue_task_dl(rq, p, flags); 875 __dequeue_task_dl(rq, p, flags);
853
854 dec_nr_running(rq);
855} 876}
856 877
857/* 878/*
@@ -944,6 +965,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
944 resched_task(rq->curr); 965 resched_task(rq->curr);
945} 966}
946 967
968static int pull_dl_task(struct rq *this_rq);
969
947#endif /* CONFIG_SMP */ 970#endif /* CONFIG_SMP */
948 971
949/* 972/*
@@ -990,7 +1013,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
990 return rb_entry(left, struct sched_dl_entity, rb_node); 1013 return rb_entry(left, struct sched_dl_entity, rb_node);
991} 1014}
992 1015
993struct task_struct *pick_next_task_dl(struct rq *rq) 1016struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
994{ 1017{
995 struct sched_dl_entity *dl_se; 1018 struct sched_dl_entity *dl_se;
996 struct task_struct *p; 1019 struct task_struct *p;
@@ -998,9 +1021,20 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
998 1021
999 dl_rq = &rq->dl; 1022 dl_rq = &rq->dl;
1000 1023
1024 if (need_pull_dl_task(rq, prev))
1025 pull_dl_task(rq);
1026 /*
1027 * When prev is DL, we may throttle it in put_prev_task().
1028 * So, we update time before we check for dl_nr_running.
1029 */
1030 if (prev->sched_class == &dl_sched_class)
1031 update_curr_dl(rq);
1032
1001 if (unlikely(!dl_rq->dl_nr_running)) 1033 if (unlikely(!dl_rq->dl_nr_running))
1002 return NULL; 1034 return NULL;
1003 1035
1036 put_prev_task(rq, prev);
1037
1004 dl_se = pick_next_dl_entity(rq, dl_rq); 1038 dl_se = pick_next_dl_entity(rq, dl_rq);
1005 BUG_ON(!dl_se); 1039 BUG_ON(!dl_se);
1006 1040
@@ -1015,9 +1049,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
1015 start_hrtick_dl(rq, p); 1049 start_hrtick_dl(rq, p);
1016#endif 1050#endif
1017 1051
1018#ifdef CONFIG_SMP 1052 set_post_schedule(rq);
1019 rq->post_schedule = has_pushable_dl_tasks(rq);
1020#endif /* CONFIG_SMP */
1021 1053
1022 return p; 1054 return p;
1023} 1055}
@@ -1426,13 +1458,6 @@ skip:
1426 return ret; 1458 return ret;
1427} 1459}
1428 1460
1429static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
1430{
1431 /* Try to pull other tasks here */
1432 if (dl_task(prev))
1433 pull_dl_task(rq);
1434}
1435
1436static void post_schedule_dl(struct rq *rq) 1461static void post_schedule_dl(struct rq *rq)
1437{ 1462{
1438 push_dl_tasks(rq); 1463 push_dl_tasks(rq);
@@ -1560,7 +1585,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1560 if (unlikely(p->dl.dl_throttled)) 1585 if (unlikely(p->dl.dl_throttled))
1561 return; 1586 return;
1562 1587
1563 if (p->on_rq || rq->curr != p) { 1588 if (p->on_rq && rq->curr != p) {
1564#ifdef CONFIG_SMP 1589#ifdef CONFIG_SMP
1565 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1590 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1566 /* Only reschedule if pushing failed */ 1591 /* Only reschedule if pushing failed */
@@ -1625,7 +1650,6 @@ const struct sched_class dl_sched_class = {
1625 .set_cpus_allowed = set_cpus_allowed_dl, 1650 .set_cpus_allowed = set_cpus_allowed_dl,
1626 .rq_online = rq_online_dl, 1651 .rq_online = rq_online_dl,
1627 .rq_offline = rq_offline_dl, 1652 .rq_offline = rq_offline_dl,
1628 .pre_schedule = pre_schedule_dl,
1629 .post_schedule = post_schedule_dl, 1653 .post_schedule = post_schedule_dl,
1630 .task_woken = task_woken_dl, 1654 .task_woken = task_woken_dl,
1631#endif 1655#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index dd52e7ffb10e..f3344c31632a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -321,6 +321,7 @@ do { \
321 P(sched_goidle); 321 P(sched_goidle);
322#ifdef CONFIG_SMP 322#ifdef CONFIG_SMP
323 P64(avg_idle); 323 P64(avg_idle);
324 P64(max_idle_balance_cost);
324#endif 325#endif
325 326
326 P(ttwu_count); 327 P(ttwu_count);
@@ -533,15 +534,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
533 unsigned long nr_faults = -1; 534 unsigned long nr_faults = -1;
534 int cpu_current, home_node; 535 int cpu_current, home_node;
535 536
536 if (p->numa_faults) 537 if (p->numa_faults_memory)
537 nr_faults = p->numa_faults[2*node + i]; 538 nr_faults = p->numa_faults_memory[2*node + i];
538 539
539 cpu_current = !i ? (task_node(p) == node) : 540 cpu_current = !i ? (task_node(p) == node) :
540 (pol && node_isset(node, pol->v.nodes)); 541 (pol && node_isset(node, pol->v.nodes));
541 542
542 home_node = (p->numa_preferred_nid == node); 543 home_node = (p->numa_preferred_nid == node);
543 544
544 SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", 545 SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
545 i, node, cpu_current, home_node, nr_faults); 546 i, node, cpu_current, home_node, nr_faults);
546 } 547 }
547 } 548 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 966cc2bfcb77..7e9bd0b1fa9e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
323 323
324/* Do the two (enqueued) entities belong to the same group ? */ 324/* Do the two (enqueued) entities belong to the same group ? */
325static inline int 325static inline struct cfs_rq *
326is_same_group(struct sched_entity *se, struct sched_entity *pse) 326is_same_group(struct sched_entity *se, struct sched_entity *pse)
327{ 327{
328 if (se->cfs_rq == pse->cfs_rq) 328 if (se->cfs_rq == pse->cfs_rq)
329 return 1; 329 return se->cfs_rq;
330 330
331 return 0; 331 return NULL;
332} 332}
333 333
334static inline struct sched_entity *parent_entity(struct sched_entity *se) 334static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
336 return se->parent; 336 return se->parent;
337} 337}
338 338
339/* return depth at which a sched entity is present in the hierarchy */
340static inline int depth_se(struct sched_entity *se)
341{
342 int depth = 0;
343
344 for_each_sched_entity(se)
345 depth++;
346
347 return depth;
348}
349
350static void 339static void
351find_matching_se(struct sched_entity **se, struct sched_entity **pse) 340find_matching_se(struct sched_entity **se, struct sched_entity **pse)
352{ 341{
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
360 */ 349 */
361 350
362 /* First walk up until both entities are at same depth */ 351 /* First walk up until both entities are at same depth */
363 se_depth = depth_se(*se); 352 se_depth = (*se)->depth;
364 pse_depth = depth_se(*pse); 353 pse_depth = (*pse)->depth;
365 354
366 while (se_depth > pse_depth) { 355 while (se_depth > pse_depth) {
367 se_depth--; 356 se_depth--;
@@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
426#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 415#define for_each_leaf_cfs_rq(rq, cfs_rq) \
427 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 416 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
428 417
429static inline int
430is_same_group(struct sched_entity *se, struct sched_entity *pse)
431{
432 return 1;
433}
434
435static inline struct sched_entity *parent_entity(struct sched_entity *se) 418static inline struct sched_entity *parent_entity(struct sched_entity *se)
436{ 419{
437 return NULL; 420 return NULL;
@@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
819/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 802/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
820unsigned int sysctl_numa_balancing_scan_delay = 1000; 803unsigned int sysctl_numa_balancing_scan_delay = 1000;
821 804
822/*
823 * After skipping a page migration on a shared page, skip N more numa page
824 * migrations unconditionally. This reduces the number of NUMA migrations
825 * in shared memory workloads, and has the effect of pulling tasks towards
826 * where their memory lives, over pulling the memory towards the task.
827 */
828unsigned int sysctl_numa_balancing_migrate_deferred = 16;
829
830static unsigned int task_nr_scan_windows(struct task_struct *p) 805static unsigned int task_nr_scan_windows(struct task_struct *p)
831{ 806{
832 unsigned long rss = 0; 807 unsigned long rss = 0;
@@ -893,10 +868,26 @@ struct numa_group {
893 struct list_head task_list; 868 struct list_head task_list;
894 869
895 struct rcu_head rcu; 870 struct rcu_head rcu;
871 nodemask_t active_nodes;
896 unsigned long total_faults; 872 unsigned long total_faults;
873 /*
874 * Faults_cpu is used to decide whether memory should move
875 * towards the CPU. As a consequence, these stats are weighted
876 * more by CPU use than by memory faults.
877 */
878 unsigned long *faults_cpu;
897 unsigned long faults[0]; 879 unsigned long faults[0];
898}; 880};
899 881
882/* Shared or private faults. */
883#define NR_NUMA_HINT_FAULT_TYPES 2
884
885/* Memory and CPU locality */
886#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
887
888/* Averaged statistics, and temporary buffers. */
889#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
890
900pid_t task_numa_group_id(struct task_struct *p) 891pid_t task_numa_group_id(struct task_struct *p)
901{ 892{
902 return p->numa_group ? p->numa_group->gid : 0; 893 return p->numa_group ? p->numa_group->gid : 0;
@@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p)
904 895
905static inline int task_faults_idx(int nid, int priv) 896static inline int task_faults_idx(int nid, int priv)
906{ 897{
907 return 2 * nid + priv; 898 return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
908} 899}
909 900
910static inline unsigned long task_faults(struct task_struct *p, int nid) 901static inline unsigned long task_faults(struct task_struct *p, int nid)
911{ 902{
912 if (!p->numa_faults) 903 if (!p->numa_faults_memory)
913 return 0; 904 return 0;
914 905
915 return p->numa_faults[task_faults_idx(nid, 0)] + 906 return p->numa_faults_memory[task_faults_idx(nid, 0)] +
916 p->numa_faults[task_faults_idx(nid, 1)]; 907 p->numa_faults_memory[task_faults_idx(nid, 1)];
917} 908}
918 909
919static inline unsigned long group_faults(struct task_struct *p, int nid) 910static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
925 p->numa_group->faults[task_faults_idx(nid, 1)]; 916 p->numa_group->faults[task_faults_idx(nid, 1)];
926} 917}
927 918
919static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
920{
921 return group->faults_cpu[task_faults_idx(nid, 0)] +
922 group->faults_cpu[task_faults_idx(nid, 1)];
923}
924
928/* 925/*
929 * These return the fraction of accesses done by a particular task, or 926 * These return the fraction of accesses done by a particular task, or
930 * task group, on a particular numa node. The group weight is given a 927 * task group, on a particular numa node. The group weight is given a
@@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
935{ 932{
936 unsigned long total_faults; 933 unsigned long total_faults;
937 934
938 if (!p->numa_faults) 935 if (!p->numa_faults_memory)
939 return 0; 936 return 0;
940 937
941 total_faults = p->total_numa_faults; 938 total_faults = p->total_numa_faults;
@@ -954,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
954 return 1000 * group_faults(p, nid) / p->numa_group->total_faults; 951 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
955} 952}
956 953
954bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
955 int src_nid, int dst_cpu)
956{
957 struct numa_group *ng = p->numa_group;
958 int dst_nid = cpu_to_node(dst_cpu);
959 int last_cpupid, this_cpupid;
960
961 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
962
963 /*
964 * Multi-stage node selection is used in conjunction with a periodic
965 * migration fault to build a temporal task<->page relation. By using
966 * a two-stage filter we remove short/unlikely relations.
967 *
968 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
969 * a task's usage of a particular page (n_p) per total usage of this
970 * page (n_t) (in a given time-span) to a probability.
971 *
972 * Our periodic faults will sample this probability and getting the
973 * same result twice in a row, given these samples are fully
974 * independent, is then given by P(n)^2, provided our sample period
975 * is sufficiently short compared to the usage pattern.
976 *
 977 * This quadratic squishes small probabilities, making it less likely we
978 * act on an unlikely task<->page relation.
979 */
980 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
981 if (!cpupid_pid_unset(last_cpupid) &&
982 cpupid_to_nid(last_cpupid) != dst_nid)
983 return false;
984
985 /* Always allow migrate on private faults */
986 if (cpupid_match_pid(p, last_cpupid))
987 return true;
988
989 /* A shared fault, but p->numa_group has not been set up yet. */
990 if (!ng)
991 return true;
992
993 /*
994 * Do not migrate if the destination is not a node that
995 * is actively used by this numa group.
996 */
997 if (!node_isset(dst_nid, ng->active_nodes))
998 return false;
999
1000 /*
1001 * Source is a node that is not actively used by this
1002 * numa group, while the destination is. Migrate.
1003 */
1004 if (!node_isset(src_nid, ng->active_nodes))
1005 return true;
1006
1007 /*
1008 * Both source and destination are nodes in active
1009 * use by this numa group. Maximize memory bandwidth
1010 * by migrating from more heavily used groups, to less
1011 * heavily used ones, spreading the load around.
1012 * Use a 1/4 hysteresis to avoid spurious page movement.
1013 */
1014 return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1015}
1016
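
To make the final hysteresis above concrete: between two nodes that are both in the group's active set, a page only migrates when the destination node has noticeably fewer group faults than the source. A minimal user-space sketch with invented fault counts (not kernel code, reduced to the one comparison):

/* Standalone model of the 3/4 hysteresis above; fault counts are invented. */
#include <stdio.h>
#include <stdbool.h>

static bool migrate_between_active_nodes(unsigned long src_faults,
                                         unsigned long dst_faults)
{
        /* Migrate only if the destination is markedly less loaded. */
        return dst_faults < src_faults * 3 / 4;
}

int main(void)
{
        /* 290 < 400*3/4 == 300: migrate; 310 is within the hysteresis: stay. */
        printf("%d\n", migrate_between_active_nodes(400, 290));  /* 1 */
        printf("%d\n", migrate_between_active_nodes(400, 310));  /* 0 */
        return 0;
}

The 1/4 margin is what keeps a roughly balanced workload from ping-ponging pages between two equally busy nodes.
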
957static unsigned long weighted_cpuload(const int cpu); 1017static unsigned long weighted_cpuload(const int cpu);
958static unsigned long source_load(int cpu, int type); 1018static unsigned long source_load(int cpu, int type);
959static unsigned long target_load(int cpu, int type); 1019static unsigned long target_load(int cpu, int type);
@@ -1267,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p)
1267static void numa_migrate_preferred(struct task_struct *p) 1327static void numa_migrate_preferred(struct task_struct *p)
1268{ 1328{
1269 /* This task has no NUMA fault statistics yet */ 1329 /* This task has no NUMA fault statistics yet */
1270 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1330 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1271 return; 1331 return;
1272 1332
1273 /* Periodically retry migrating the task to the preferred node */ 1333 /* Periodically retry migrating the task to the preferred node */
@@ -1282,6 +1342,38 @@ static void numa_migrate_preferred(struct task_struct *p)
1282} 1342}
1283 1343
1284/* 1344/*
1345 * Find the nodes on which the workload is actively running. We do this by
1346 * tracking the nodes from which NUMA hinting faults are triggered. This can
1347 * be different from the set of nodes where the workload's memory is currently
1348 * located.
1349 *
1350 * The bitmask is used to make smarter decisions on when to do NUMA page
 1351 * migrations. To prevent flip-flopping and excessive page migrations, nodes
1352 * are added when they cause over 6/16 of the maximum number of faults, but
1353 * only removed when they drop below 3/16.
1354 */
1355static void update_numa_active_node_mask(struct numa_group *numa_group)
1356{
1357 unsigned long faults, max_faults = 0;
1358 int nid;
1359
1360 for_each_online_node(nid) {
1361 faults = group_faults_cpu(numa_group, nid);
1362 if (faults > max_faults)
1363 max_faults = faults;
1364 }
1365
1366 for_each_online_node(nid) {
1367 faults = group_faults_cpu(numa_group, nid);
1368 if (!node_isset(nid, numa_group->active_nodes)) {
1369 if (faults > max_faults * 6 / 16)
1370 node_set(nid, numa_group->active_nodes);
1371 } else if (faults < max_faults * 3 / 16)
1372 node_clear(nid, numa_group->active_nodes);
1373 }
1374}
1375
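
The add/remove thresholds above are deliberately asymmetric. A small stand-alone model of the same pass, with an invented four-node system whose per-node CPU fault counts peak at 1600, so a node is added above 600 faults and only dropped below 300:

#include <stdio.h>

#define NR_NODES 4

int main(void)
{
        /* Invented per-node CPU fault counts; the maximum is 1600. */
        unsigned long faults[NR_NODES] = { 1600, 700, 250, 500 };
        int active[NR_NODES] = { 1, 0, 1, 0 };   /* current active mask */
        unsigned long max_faults = 0;
        int nid;

        for (nid = 0; nid < NR_NODES; nid++)
                if (faults[nid] > max_faults)
                        max_faults = faults[nid];

        for (nid = 0; nid < NR_NODES; nid++) {
                if (!active[nid]) {
                        if (faults[nid] > max_faults * 6 / 16)   /* > 600: add */
                                active[nid] = 1;
                } else if (faults[nid] < max_faults * 3 / 16)    /* < 300: drop */
                        active[nid] = 0;
        }

        /* node 1 (700 > 600) is added, node 2 (250 < 300) is dropped. */
        for (nid = 0; nid < NR_NODES; nid++)
                printf("node %d active=%d\n", nid, active[nid]);
        return 0;
}
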
1376/*
1285 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 1377 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1286 * increments. The more local the fault statistics are, the higher the scan 1378 * increments. The more local the fault statistics are, the higher the scan
1287 * period will be for the next scan window. If local/remote ratio is below 1379 * period will be for the next scan window. If local/remote ratio is below
@@ -1355,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p,
1355 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1447 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1356} 1448}
1357 1449
1450/*
1451 * Get the fraction of time the task has been running since the last
1452 * NUMA placement cycle. The scheduler keeps similar statistics, but
1453 * decays those on a 32ms period, which is orders of magnitude off
1454 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1455 * stats only if the task is so new there are no NUMA statistics yet.
1456 */
1457static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1458{
1459 u64 runtime, delta, now;
1460 /* Use the start of this time slice to avoid calculations. */
1461 now = p->se.exec_start;
1462 runtime = p->se.sum_exec_runtime;
1463
1464 if (p->last_task_numa_placement) {
1465 delta = runtime - p->last_sum_exec_runtime;
1466 *period = now - p->last_task_numa_placement;
1467 } else {
1468 delta = p->se.avg.runnable_avg_sum;
1469 *period = p->se.avg.runnable_avg_period;
1470 }
1471
1472 p->last_sum_exec_runtime = runtime;
1473 p->last_task_numa_placement = now;
1474
1475 return delta;
1476}
1477
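
Roughly, numa_get_avg_runtime() returns how much CPU time the task consumed and, via *period, the wall-clock span it was measured over, falling back to the decayed runnable averages for tasks with no placement history. A simplified stand-alone sketch (illustrative struct and nanosecond values, not the kernel types; the new-task fallback is stubbed out):

#include <stdio.h>

/* Illustrative stand-in for the handful of task fields used above. */
struct task_sample {
        unsigned long long sum_exec_runtime;          /* total CPU time, ns */
        unsigned long long exec_start;                /* start of current slice, ns */
        unsigned long long last_sum_exec_runtime;
        unsigned long long last_task_numa_placement;
};

static unsigned long long avg_runtime(struct task_sample *p,
                                      unsigned long long *period)
{
        unsigned long long delta;

        if (p->last_task_numa_placement) {
                delta = p->sum_exec_runtime - p->last_sum_exec_runtime;
                *period = p->exec_start - p->last_task_numa_placement;
        } else {
                /* New task: no history yet; the kernel uses the decayed
                 * runnable averages here instead (omitted in this sketch). */
                delta = 0;
                *period = 1;
        }
        p->last_sum_exec_runtime = p->sum_exec_runtime;
        p->last_task_numa_placement = p->exec_start;
        return delta;
}

int main(void)
{
        struct task_sample t = { 5000000000ULL, 12000000000ULL,
                                 3000000000ULL,  4000000000ULL };
        unsigned long long period;
        unsigned long long runtime = avg_runtime(&t, &period);

        /* Ran 2s of CPU time over an 8s window since the last placement. */
        printf("runtime=%llu period=%llu\n", runtime, period);
        return 0;
}
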
1358static void task_numa_placement(struct task_struct *p) 1478static void task_numa_placement(struct task_struct *p)
1359{ 1479{
1360 int seq, nid, max_nid = -1, max_group_nid = -1; 1480 int seq, nid, max_nid = -1, max_group_nid = -1;
1361 unsigned long max_faults = 0, max_group_faults = 0; 1481 unsigned long max_faults = 0, max_group_faults = 0;
1362 unsigned long fault_types[2] = { 0, 0 }; 1482 unsigned long fault_types[2] = { 0, 0 };
1483 unsigned long total_faults;
1484 u64 runtime, period;
1363 spinlock_t *group_lock = NULL; 1485 spinlock_t *group_lock = NULL;
1364 1486
1365 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1487 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1368,6 +1490,10 @@ static void task_numa_placement(struct task_struct *p)
1368 p->numa_scan_seq = seq; 1490 p->numa_scan_seq = seq;
1369 p->numa_scan_period_max = task_scan_max(p); 1491 p->numa_scan_period_max = task_scan_max(p);
1370 1492
1493 total_faults = p->numa_faults_locality[0] +
1494 p->numa_faults_locality[1];
1495 runtime = numa_get_avg_runtime(p, &period);
1496
1371 /* If the task is part of a group prevent parallel updates to group stats */ 1497 /* If the task is part of a group prevent parallel updates to group stats */
1372 if (p->numa_group) { 1498 if (p->numa_group) {
1373 group_lock = &p->numa_group->lock; 1499 group_lock = &p->numa_group->lock;
@@ -1379,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p)
1379 unsigned long faults = 0, group_faults = 0; 1505 unsigned long faults = 0, group_faults = 0;
1380 int priv, i; 1506 int priv, i;
1381 1507
1382 for (priv = 0; priv < 2; priv++) { 1508 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1383 long diff; 1509 long diff, f_diff, f_weight;
1384 1510
1385 i = task_faults_idx(nid, priv); 1511 i = task_faults_idx(nid, priv);
1386 diff = -p->numa_faults[i];
1387 1512
1388 /* Decay existing window, copy faults since last scan */ 1513 /* Decay existing window, copy faults since last scan */
1389 p->numa_faults[i] >>= 1; 1514 diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
1390 p->numa_faults[i] += p->numa_faults_buffer[i]; 1515 fault_types[priv] += p->numa_faults_buffer_memory[i];
1391 fault_types[priv] += p->numa_faults_buffer[i]; 1516 p->numa_faults_buffer_memory[i] = 0;
1392 p->numa_faults_buffer[i] = 0;
1393 1517
1394 faults += p->numa_faults[i]; 1518 /*
 1395 diff += p->numa_faults[i]; 1519 * Normalize the CPU faults, so all tasks in a group
1520 * count according to CPU use, instead of by the raw
1521 * number of faults. Tasks with little runtime have
1522 * little over-all impact on throughput, and thus their
1523 * faults are less important.
1524 */
1525 f_weight = div64_u64(runtime << 16, period + 1);
1526 f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
1527 (total_faults + 1);
1528 f_diff = f_weight - p->numa_faults_cpu[i] / 2;
1529 p->numa_faults_buffer_cpu[i] = 0;
1530
1531 p->numa_faults_memory[i] += diff;
1532 p->numa_faults_cpu[i] += f_diff;
1533 faults += p->numa_faults_memory[i];
1396 p->total_numa_faults += diff; 1534 p->total_numa_faults += diff;
1397 if (p->numa_group) { 1535 if (p->numa_group) {
1398 /* safe because we can only change our own group */ 1536 /* safe because we can only change our own group */
1399 p->numa_group->faults[i] += diff; 1537 p->numa_group->faults[i] += diff;
1538 p->numa_group->faults_cpu[i] += f_diff;
1400 p->numa_group->total_faults += diff; 1539 p->numa_group->total_faults += diff;
1401 group_faults += p->numa_group->faults[i]; 1540 group_faults += p->numa_group->faults[i];
1402 } 1541 }
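
The normalization introduced above scales a task's buffered CPU faults by the fraction of the period it actually ran, in 16.16 fixed point. Back-of-the-envelope with invented numbers: 2s of runtime over an 8s window gives an f_weight of roughly a quarter (about 16383), so a mostly idle task contributes only a fraction of its raw CPU faults:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Invented sample: 2s of runtime in an 8s window, 200 buffered
         * CPU faults on this node out of 1000 total hinting faults. */
        uint64_t runtime = 2000000000ULL, period = 8000000000ULL;
        uint64_t buffer_cpu = 200, total_faults = 1000;

        uint64_t f_weight = (runtime << 16) / (period + 1);     /* ~0.25 in 16.16 */
        f_weight = f_weight * buffer_cpu / (total_faults + 1);  /* ~3273 */

        printf("f_weight=%llu\n", (unsigned long long)f_weight);
        return 0;
}
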
@@ -1416,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p)
1416 update_task_scan_period(p, fault_types[0], fault_types[1]); 1555 update_task_scan_period(p, fault_types[0], fault_types[1]);
1417 1556
1418 if (p->numa_group) { 1557 if (p->numa_group) {
1558 update_numa_active_node_mask(p->numa_group);
1419 /* 1559 /*
1420 * If the preferred task and group nids are different, 1560 * If the preferred task and group nids are different,
1421 * iterate over the nodes again to find the best place. 1561 * iterate over the nodes again to find the best place.
@@ -1465,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1465 1605
1466 if (unlikely(!p->numa_group)) { 1606 if (unlikely(!p->numa_group)) {
1467 unsigned int size = sizeof(struct numa_group) + 1607 unsigned int size = sizeof(struct numa_group) +
1468 2*nr_node_ids*sizeof(unsigned long); 1608 4*nr_node_ids*sizeof(unsigned long);
1469 1609
1470 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 1610 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1471 if (!grp) 1611 if (!grp)
@@ -1475,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1475 spin_lock_init(&grp->lock); 1615 spin_lock_init(&grp->lock);
1476 INIT_LIST_HEAD(&grp->task_list); 1616 INIT_LIST_HEAD(&grp->task_list);
1477 grp->gid = p->pid; 1617 grp->gid = p->pid;
1618 /* Second half of the array tracks nids where faults happen */
1619 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1620 nr_node_ids;
1621
1622 node_set(task_node(current), grp->active_nodes);
1478 1623
1479 for (i = 0; i < 2*nr_node_ids; i++) 1624 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1480 grp->faults[i] = p->numa_faults[i]; 1625 grp->faults[i] = p->numa_faults_memory[i];
1481 1626
1482 grp->total_faults = p->total_numa_faults; 1627 grp->total_faults = p->total_numa_faults;
1483 1628
@@ -1534,9 +1679,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1534 1679
1535 double_lock(&my_grp->lock, &grp->lock); 1680 double_lock(&my_grp->lock, &grp->lock);
1536 1681
1537 for (i = 0; i < 2*nr_node_ids; i++) { 1682 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1538 my_grp->faults[i] -= p->numa_faults[i]; 1683 my_grp->faults[i] -= p->numa_faults_memory[i];
1539 grp->faults[i] += p->numa_faults[i]; 1684 grp->faults[i] += p->numa_faults_memory[i];
1540 } 1685 }
1541 my_grp->total_faults -= p->total_numa_faults; 1686 my_grp->total_faults -= p->total_numa_faults;
1542 grp->total_faults += p->total_numa_faults; 1687 grp->total_faults += p->total_numa_faults;
@@ -1562,12 +1707,12 @@ void task_numa_free(struct task_struct *p)
1562{ 1707{
1563 struct numa_group *grp = p->numa_group; 1708 struct numa_group *grp = p->numa_group;
1564 int i; 1709 int i;
1565 void *numa_faults = p->numa_faults; 1710 void *numa_faults = p->numa_faults_memory;
1566 1711
1567 if (grp) { 1712 if (grp) {
1568 spin_lock(&grp->lock); 1713 spin_lock(&grp->lock);
1569 for (i = 0; i < 2*nr_node_ids; i++) 1714 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1570 grp->faults[i] -= p->numa_faults[i]; 1715 grp->faults[i] -= p->numa_faults_memory[i];
1571 grp->total_faults -= p->total_numa_faults; 1716 grp->total_faults -= p->total_numa_faults;
1572 1717
1573 list_del(&p->numa_entry); 1718 list_del(&p->numa_entry);
@@ -1577,18 +1722,21 @@ void task_numa_free(struct task_struct *p)
1577 put_numa_group(grp); 1722 put_numa_group(grp);
1578 } 1723 }
1579 1724
1580 p->numa_faults = NULL; 1725 p->numa_faults_memory = NULL;
1581 p->numa_faults_buffer = NULL; 1726 p->numa_faults_buffer_memory = NULL;
 1727 p->numa_faults_cpu = NULL;
1728 p->numa_faults_buffer_cpu = NULL;
1582 kfree(numa_faults); 1729 kfree(numa_faults);
1583} 1730}
1584 1731
1585/* 1732/*
1586 * Got a PROT_NONE fault for a page on @node. 1733 * Got a PROT_NONE fault for a page on @node.
1587 */ 1734 */
1588void task_numa_fault(int last_cpupid, int node, int pages, int flags) 1735void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1589{ 1736{
1590 struct task_struct *p = current; 1737 struct task_struct *p = current;
1591 bool migrated = flags & TNF_MIGRATED; 1738 bool migrated = flags & TNF_MIGRATED;
1739 int cpu_node = task_node(current);
1592 int priv; 1740 int priv;
1593 1741
1594 if (!numabalancing_enabled) 1742 if (!numabalancing_enabled)
@@ -1603,16 +1751,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1603 return; 1751 return;
1604 1752
1605 /* Allocate buffer to track faults on a per-node basis */ 1753 /* Allocate buffer to track faults on a per-node basis */
1606 if (unlikely(!p->numa_faults)) { 1754 if (unlikely(!p->numa_faults_memory)) {
1607 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; 1755 int size = sizeof(*p->numa_faults_memory) *
1756 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1608 1757
1609 /* numa_faults and numa_faults_buffer share the allocation */ 1758 p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1610 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); 1759 if (!p->numa_faults_memory)
1611 if (!p->numa_faults)
1612 return; 1760 return;
1613 1761
1614 BUG_ON(p->numa_faults_buffer); 1762 BUG_ON(p->numa_faults_buffer_memory);
1615 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); 1763 /*
1764 * The averaged statistics, shared & private, memory & cpu,
1765 * occupy the first half of the array. The second half of the
1766 * array is for current counters, which are averaged into the
1767 * first set by task_numa_placement.
1768 */
1769 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1770 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1771 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1616 p->total_numa_faults = 0; 1772 p->total_numa_faults = 0;
1617 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1773 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1618 } 1774 }
@@ -1641,7 +1797,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1641 if (migrated) 1797 if (migrated)
1642 p->numa_pages_migrated += pages; 1798 p->numa_pages_migrated += pages;
1643 1799
1644 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; 1800 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1801 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1645 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1802 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
1646} 1803}
1647 1804
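
The single allocation above now holds four equal quarters: averaged memory faults, averaged CPU faults, and the two corresponding per-scan buffers. A short sketch of the pointer arithmetic (nr_node_ids is an invented value here; the kernel derives it from the node map):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        int nr_node_ids = 4;    /* invented node count */
        /* 2 fault types (priv/shared) * 2 stats (mem/cpu) * 2 (avg/buffer) */
        size_t entries = 8UL * nr_node_ids;
        unsigned long *numa_faults_memory = calloc(entries, sizeof(unsigned long));
        unsigned long *numa_faults_cpu, *buf_memory, *buf_cpu;

        if (!numa_faults_memory)
                return 1;

        numa_faults_cpu = numa_faults_memory + 2 * nr_node_ids;
        buf_memory      = numa_faults_memory + 4 * nr_node_ids;
        buf_cpu         = numa_faults_memory + 6 * nr_node_ids;

        printf("quarter size = %d entries each\n", 2 * nr_node_ids);
        printf("cpu offset %td, buf_mem %td, buf_cpu %td\n",
               numa_faults_cpu - numa_faults_memory,
               buf_memory - numa_faults_memory,
               buf_cpu - numa_faults_memory);
        free(numa_faults_memory);
        return 0;
}
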
@@ -1757,6 +1914,8 @@ void task_numa_work(struct callback_head *work)
1757 start = end; 1914 start = end;
1758 if (pages <= 0) 1915 if (pages <= 0)
1759 goto out; 1916 goto out;
1917
1918 cond_resched();
1760 } while (end != vma->vm_end); 1919 } while (end != vma->vm_end);
1761 } 1920 }
1762 1921
@@ -2217,13 +2376,20 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
2217 se->avg.load_avg_contrib >>= NICE_0_SHIFT; 2376 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2218 } 2377 }
2219} 2378}
2220#else 2379
2380static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2381{
2382 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2383 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2384}
2385#else /* CONFIG_FAIR_GROUP_SCHED */
2221static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, 2386static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2222 int force_update) {} 2387 int force_update) {}
2223static inline void __update_tg_runnable_avg(struct sched_avg *sa, 2388static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2224 struct cfs_rq *cfs_rq) {} 2389 struct cfs_rq *cfs_rq) {}
2225static inline void __update_group_entity_contrib(struct sched_entity *se) {} 2390static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2226#endif 2391static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2392#endif /* CONFIG_FAIR_GROUP_SCHED */
2227 2393
2228static inline void __update_task_entity_contrib(struct sched_entity *se) 2394static inline void __update_task_entity_contrib(struct sched_entity *se)
2229{ 2395{
@@ -2321,12 +2487,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2321 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); 2487 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
2322} 2488}
2323 2489
2324static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2325{
2326 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2327 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2328}
2329
2330/* Add the load generated by se into cfs_rq's child load-average */ 2490/* Add the load generated by se into cfs_rq's child load-average */
2331static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, 2491static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2332 struct sched_entity *se, 2492 struct sched_entity *se,
@@ -2414,7 +2574,10 @@ void idle_exit_fair(struct rq *this_rq)
2414 update_rq_runnable_avg(this_rq, 0); 2574 update_rq_runnable_avg(this_rq, 0);
2415} 2575}
2416 2576
2417#else 2577static int idle_balance(struct rq *this_rq);
2578
2579#else /* CONFIG_SMP */
2580
2418static inline void update_entity_load_avg(struct sched_entity *se, 2581static inline void update_entity_load_avg(struct sched_entity *se,
2419 int update_cfs_rq) {} 2582 int update_cfs_rq) {}
2420static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} 2583static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
@@ -2426,7 +2589,13 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2426 int sleep) {} 2589 int sleep) {}
2427static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, 2590static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2428 int force_update) {} 2591 int force_update) {}
2429#endif 2592
2593static inline int idle_balance(struct rq *rq)
2594{
2595 return 0;
2596}
2597
2598#endif /* CONFIG_SMP */
2430 2599
2431static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 2600static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2432{ 2601{
@@ -2576,10 +2745,10 @@ static void __clear_buddies_last(struct sched_entity *se)
2576{ 2745{
2577 for_each_sched_entity(se) { 2746 for_each_sched_entity(se) {
2578 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2747 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2579 if (cfs_rq->last == se) 2748 if (cfs_rq->last != se)
2580 cfs_rq->last = NULL;
2581 else
2582 break; 2749 break;
2750
2751 cfs_rq->last = NULL;
2583 } 2752 }
2584} 2753}
2585 2754
@@ -2587,10 +2756,10 @@ static void __clear_buddies_next(struct sched_entity *se)
2587{ 2756{
2588 for_each_sched_entity(se) { 2757 for_each_sched_entity(se) {
2589 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2758 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2590 if (cfs_rq->next == se) 2759 if (cfs_rq->next != se)
2591 cfs_rq->next = NULL;
2592 else
2593 break; 2760 break;
2761
2762 cfs_rq->next = NULL;
2594 } 2763 }
2595} 2764}
2596 2765
@@ -2598,10 +2767,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
2598{ 2767{
2599 for_each_sched_entity(se) { 2768 for_each_sched_entity(se) {
2600 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2769 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2601 if (cfs_rq->skip == se) 2770 if (cfs_rq->skip != se)
2602 cfs_rq->skip = NULL;
2603 else
2604 break; 2771 break;
2772
2773 cfs_rq->skip = NULL;
2605 } 2774 }
2606} 2775}
2607 2776
@@ -2744,17 +2913,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2744 * 3) pick the "last" process, for cache locality 2913 * 3) pick the "last" process, for cache locality
2745 * 4) do not run the "skip" process, if something else is available 2914 * 4) do not run the "skip" process, if something else is available
2746 */ 2915 */
2747static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 2916static struct sched_entity *
2917pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2748{ 2918{
2749 struct sched_entity *se = __pick_first_entity(cfs_rq); 2919 struct sched_entity *left = __pick_first_entity(cfs_rq);
2750 struct sched_entity *left = se; 2920 struct sched_entity *se;
2921
2922 /*
2923 * If curr is set we have to see if its left of the leftmost entity
2924 * still in the tree, provided there was anything in the tree at all.
2925 */
2926 if (!left || (curr && entity_before(curr, left)))
2927 left = curr;
2928
2929 se = left; /* ideally we run the leftmost entity */
2751 2930
2752 /* 2931 /*
2753 * Avoid running the skip buddy, if running something else can 2932 * Avoid running the skip buddy, if running something else can
2754 * be done without getting too unfair. 2933 * be done without getting too unfair.
2755 */ 2934 */
2756 if (cfs_rq->skip == se) { 2935 if (cfs_rq->skip == se) {
2757 struct sched_entity *second = __pick_next_entity(se); 2936 struct sched_entity *second;
2937
2938 if (se == curr) {
2939 second = __pick_first_entity(cfs_rq);
2940 } else {
2941 second = __pick_next_entity(se);
2942 if (!second || (curr && entity_before(curr, second)))
2943 second = curr;
2944 }
2945
2758 if (second && wakeup_preempt_entity(second, left) < 1) 2946 if (second && wakeup_preempt_entity(second, left) < 1)
2759 se = second; 2947 se = second;
2760 } 2948 }
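
Because the currently running entity is no longer kept in the rbtree, the reworked pick_next_entity() above has to weigh cfs_rq->curr against the leftmost queued entity by hand. A toy comparison using the same wrap-safe vruntime ordering (numbers invented):

#include <stdio.h>

struct ent { const char *name; unsigned long long vruntime; };

/* entity_before(): the smaller vruntime runs first, wrap-safe. */
static int before(const struct ent *a, const struct ent *b)
{
        return (long long)(a->vruntime - b->vruntime) < 0;
}

int main(void)
{
        struct ent leftmost = { "queued-left", 1000 };
        struct ent curr     = { "curr",         940 };
        const struct ent *left = &leftmost;

        /* If curr is still runnable and further left, prefer it. */
        if (before(&curr, left))
                left = &curr;

        printf("pick %s\n", left->name);        /* curr: 940 < 1000 */
        return 0;
}
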
@@ -2776,7 +2964,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2776 return se; 2964 return se;
2777} 2965}
2778 2966
2779static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); 2967static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2780 2968
2781static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 2969static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2782{ 2970{
@@ -3431,22 +3619,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3431} 3619}
3432 3620
3433/* conditionally throttle active cfs_rq's from put_prev_entity() */ 3621/* conditionally throttle active cfs_rq's from put_prev_entity() */
3434static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 3622static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3435{ 3623{
3436 if (!cfs_bandwidth_used()) 3624 if (!cfs_bandwidth_used())
3437 return; 3625 return false;
3438 3626
3439 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 3627 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3440 return; 3628 return false;
3441 3629
3442 /* 3630 /*
3443 * it's possible for a throttled entity to be forced into a running 3631 * it's possible for a throttled entity to be forced into a running
3444 * state (e.g. set_curr_task), in this case we're finished. 3632 * state (e.g. set_curr_task), in this case we're finished.
3445 */ 3633 */
3446 if (cfs_rq_throttled(cfs_rq)) 3634 if (cfs_rq_throttled(cfs_rq))
3447 return; 3635 return true;
3448 3636
3449 throttle_cfs_rq(cfs_rq); 3637 throttle_cfs_rq(cfs_rq);
3638 return true;
3450} 3639}
3451 3640
3452static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 3641static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3556,7 +3745,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3556} 3745}
3557 3746
3558static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 3747static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3559static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3748static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
3560static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 3749static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3561static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3750static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3562 3751
@@ -4211,13 +4400,14 @@ done:
4211} 4400}
4212 4401
4213/* 4402/*
4214 * sched_balance_self: balance the current task (running on cpu) in domains 4403 * select_task_rq_fair: Select target runqueue for the waking task in domains
4215 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 4404 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
4216 * SD_BALANCE_EXEC. 4405 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
4217 * 4406 *
4218 * Balance, ie. select the least loaded group. 4407 * Balances load by selecting the idlest cpu in the idlest group, or under
4408 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
4219 * 4409 *
4220 * Returns the target CPU number, or the same CPU if no balancing is needed. 4410 * Returns the target cpu number.
4221 * 4411 *
4222 * preempt must be disabled. 4412 * preempt must be disabled.
4223 */ 4413 */
@@ -4492,26 +4682,124 @@ preempt:
4492 set_last_buddy(se); 4682 set_last_buddy(se);
4493} 4683}
4494 4684
4495static struct task_struct *pick_next_task_fair(struct rq *rq) 4685static struct task_struct *
4686pick_next_task_fair(struct rq *rq, struct task_struct *prev)
4496{ 4687{
4497 struct task_struct *p;
4498 struct cfs_rq *cfs_rq = &rq->cfs; 4688 struct cfs_rq *cfs_rq = &rq->cfs;
4499 struct sched_entity *se; 4689 struct sched_entity *se;
4690 struct task_struct *p;
4691 int new_tasks;
4500 4692
4693again:
4694#ifdef CONFIG_FAIR_GROUP_SCHED
4501 if (!cfs_rq->nr_running) 4695 if (!cfs_rq->nr_running)
4502 return NULL; 4696 goto idle;
4697
4698 if (prev->sched_class != &fair_sched_class)
4699 goto simple;
4700
4701 /*
4702 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
4703 * likely that a next task is from the same cgroup as the current.
4704 *
4705 * Therefore attempt to avoid putting and setting the entire cgroup
4706 * hierarchy, only change the part that actually changes.
4707 */
4503 4708
4504 do { 4709 do {
4505 se = pick_next_entity(cfs_rq); 4710 struct sched_entity *curr = cfs_rq->curr;
4711
4712 /*
4713 * Since we got here without doing put_prev_entity() we also
4714 * have to consider cfs_rq->curr. If it is still a runnable
4715 * entity, update_curr() will update its vruntime, otherwise
4716 * forget we've ever seen it.
4717 */
4718 if (curr && curr->on_rq)
4719 update_curr(cfs_rq);
4720 else
4721 curr = NULL;
4722
4723 /*
4724 * This call to check_cfs_rq_runtime() will do the throttle and
4725 * dequeue its entity in the parent(s). Therefore the 'simple'
4726 * nr_running test will indeed be correct.
4727 */
4728 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
4729 goto simple;
4730
4731 se = pick_next_entity(cfs_rq, curr);
4732 cfs_rq = group_cfs_rq(se);
4733 } while (cfs_rq);
4734
4735 p = task_of(se);
4736
4737 /*
 4738 * Since we haven't yet done put_prev_entity(), if the selected task
 4739 * differs from the one we started out with, try to touch the
 4740 * fewest cfs_rqs possible.
4741 */
4742 if (prev != p) {
4743 struct sched_entity *pse = &prev->se;
4744
4745 while (!(cfs_rq = is_same_group(se, pse))) {
4746 int se_depth = se->depth;
4747 int pse_depth = pse->depth;
4748
4749 if (se_depth <= pse_depth) {
4750 put_prev_entity(cfs_rq_of(pse), pse);
4751 pse = parent_entity(pse);
4752 }
4753 if (se_depth >= pse_depth) {
4754 set_next_entity(cfs_rq_of(se), se);
4755 se = parent_entity(se);
4756 }
4757 }
4758
4759 put_prev_entity(cfs_rq, pse);
4760 set_next_entity(cfs_rq, se);
4761 }
4762
4763 if (hrtick_enabled(rq))
4764 hrtick_start_fair(rq, p);
4765
4766 return p;
4767simple:
4768 cfs_rq = &rq->cfs;
4769#endif
4770
4771 if (!cfs_rq->nr_running)
4772 goto idle;
4773
4774 put_prev_task(rq, prev);
4775
4776 do {
4777 se = pick_next_entity(cfs_rq, NULL);
4506 set_next_entity(cfs_rq, se); 4778 set_next_entity(cfs_rq, se);
4507 cfs_rq = group_cfs_rq(se); 4779 cfs_rq = group_cfs_rq(se);
4508 } while (cfs_rq); 4780 } while (cfs_rq);
4509 4781
4510 p = task_of(se); 4782 p = task_of(se);
4783
4511 if (hrtick_enabled(rq)) 4784 if (hrtick_enabled(rq))
4512 hrtick_start_fair(rq, p); 4785 hrtick_start_fair(rq, p);
4513 4786
4514 return p; 4787 return p;
4788
4789idle:
4790 new_tasks = idle_balance(rq);
4791 /*
4792 * Because idle_balance() releases (and re-acquires) rq->lock, it is
4793 * possible for any higher priority task to appear. In that case we
4794 * must re-start the pick_next_entity() loop.
4795 */
4796 if (new_tasks < 0)
4797 return RETRY_TASK;
4798
4799 if (new_tasks > 0)
4800 goto again;
4801
4802 return NULL;
4515} 4803}
4516 4804
4517/* 4805/*
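
The while (!(cfs_rq = is_same_group(se, pse))) loop above walks the incoming and outgoing entities up the hierarchy until they share a cfs_rq, so only the differing part of the cgroup tree gets put/set. A toy model with explicit parent pointers and a made-up three-entity hierarchy (printf stands in for put_prev_entity()/set_next_entity()):

#include <stdio.h>

struct se { const char *name; int depth; struct se *parent; };

/* In this toy, "same group" means the two entities share a parent. */
static int same_group(struct se *a, struct se *b)
{
        return a->parent == b->parent;
}

int main(void)
{
        /* Invented hierarchy: A1 sits under A; B is top-level. */
        struct se A  = { "A",  0, NULL };
        struct se A1 = { "A1", 1, &A };
        struct se B  = { "B",  0, NULL };
        struct se *se = &A1, *pse = &B;         /* next task vs. prev task */

        while (!same_group(se, pse)) {
                int sd = se->depth, pd = pse->depth;

                if (sd <= pd) {                 /* put prev's entity, go up */
                        printf("put_prev_entity(%s)\n", pse->name);
                        pse = pse->parent;
                }
                if (sd >= pd) {                 /* set next's entity, go up */
                        printf("set_next_entity(%s)\n", se->name);
                        se = se->parent;
                }
        }
        printf("put_prev_entity(%s); set_next_entity(%s)\n",
               pse->name, se->name);
        return 0;
}
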
@@ -4749,7 +5037,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
4749 * Is this task likely cache-hot: 5037 * Is this task likely cache-hot:
4750 */ 5038 */
4751static int 5039static int
4752task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 5040task_hot(struct task_struct *p, u64 now)
4753{ 5041{
4754 s64 delta; 5042 s64 delta;
4755 5043
@@ -4783,7 +5071,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4783{ 5071{
4784 int src_nid, dst_nid; 5072 int src_nid, dst_nid;
4785 5073
4786 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || 5074 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
4787 !(env->sd->flags & SD_NUMA)) { 5075 !(env->sd->flags & SD_NUMA)) {
4788 return false; 5076 return false;
4789 } 5077 }
@@ -4814,7 +5102,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4814 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5102 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4815 return false; 5103 return false;
4816 5104
4817 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 5105 if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
4818 return false; 5106 return false;
4819 5107
4820 src_nid = cpu_to_node(env->src_cpu); 5108 src_nid = cpu_to_node(env->src_cpu);
@@ -4910,7 +5198,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4910 * 2) task is cache cold, or 5198 * 2) task is cache cold, or
4911 * 3) too many balance attempts have failed. 5199 * 3) too many balance attempts have failed.
4912 */ 5200 */
4913 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 5201 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
4914 if (!tsk_cache_hot) 5202 if (!tsk_cache_hot)
4915 tsk_cache_hot = migrate_degrades_locality(p, env); 5203 tsk_cache_hot = migrate_degrades_locality(p, env);
4916 5204
@@ -5773,12 +6061,10 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
5773 pwr_now /= SCHED_POWER_SCALE; 6061 pwr_now /= SCHED_POWER_SCALE;
5774 6062
5775 /* Amount of load we'd subtract */ 6063 /* Amount of load we'd subtract */
5776 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / 6064 if (busiest->avg_load > scaled_busy_load_per_task) {
5777 busiest->group_power;
5778 if (busiest->avg_load > tmp) {
5779 pwr_move += busiest->group_power * 6065 pwr_move += busiest->group_power *
5780 min(busiest->load_per_task, 6066 min(busiest->load_per_task,
5781 busiest->avg_load - tmp); 6067 busiest->avg_load - scaled_busy_load_per_task);
5782 } 6068 }
5783 6069
5784 /* Amount of load we'd add */ 6070 /* Amount of load we'd add */
@@ -6357,17 +6643,23 @@ out:
6357 * idle_balance is called by schedule() if this_cpu is about to become 6643 * idle_balance is called by schedule() if this_cpu is about to become
6358 * idle. Attempts to pull tasks from other CPUs. 6644 * idle. Attempts to pull tasks from other CPUs.
6359 */ 6645 */
6360void idle_balance(int this_cpu, struct rq *this_rq) 6646static int idle_balance(struct rq *this_rq)
6361{ 6647{
6362 struct sched_domain *sd; 6648 struct sched_domain *sd;
6363 int pulled_task = 0; 6649 int pulled_task = 0;
6364 unsigned long next_balance = jiffies + HZ; 6650 unsigned long next_balance = jiffies + HZ;
6365 u64 curr_cost = 0; 6651 u64 curr_cost = 0;
6652 int this_cpu = this_rq->cpu;
6366 6653
6654 idle_enter_fair(this_rq);
6655 /*
6656 * We must set idle_stamp _before_ calling idle_balance(), such that we
6657 * measure the duration of idle_balance() as idle time.
6658 */
6367 this_rq->idle_stamp = rq_clock(this_rq); 6659 this_rq->idle_stamp = rq_clock(this_rq);
6368 6660
6369 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6661 if (this_rq->avg_idle < sysctl_sched_migration_cost)
6370 return; 6662 goto out;
6371 6663
6372 /* 6664 /*
6373 * Drop the rq->lock, but keep IRQ/preempt disabled. 6665 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6405,15 +6697,22 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6405 interval = msecs_to_jiffies(sd->balance_interval); 6697 interval = msecs_to_jiffies(sd->balance_interval);
6406 if (time_after(next_balance, sd->last_balance + interval)) 6698 if (time_after(next_balance, sd->last_balance + interval))
6407 next_balance = sd->last_balance + interval; 6699 next_balance = sd->last_balance + interval;
6408 if (pulled_task) { 6700 if (pulled_task)
6409 this_rq->idle_stamp = 0;
6410 break; 6701 break;
6411 }
6412 } 6702 }
6413 rcu_read_unlock(); 6703 rcu_read_unlock();
6414 6704
6415 raw_spin_lock(&this_rq->lock); 6705 raw_spin_lock(&this_rq->lock);
6416 6706
6707 /*
6708 * While browsing the domains, we released the rq lock.
 6709 * A task could have been enqueued in the meantime
6710 */
6711 if (this_rq->cfs.h_nr_running && !pulled_task) {
6712 pulled_task = 1;
6713 goto out;
6714 }
6715
6417 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6716 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6418 /* 6717 /*
6419 * We are going idle. next_balance may be set based on 6718 * We are going idle. next_balance may be set based on
@@ -6424,6 +6723,20 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6424 6723
6425 if (curr_cost > this_rq->max_idle_balance_cost) 6724 if (curr_cost > this_rq->max_idle_balance_cost)
6426 this_rq->max_idle_balance_cost = curr_cost; 6725 this_rq->max_idle_balance_cost = curr_cost;
6726
6727out:
6728 /* Is there a task of a high priority class? */
6729 if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
6730 (this_rq->dl.dl_nr_running ||
6731 (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
6732 pulled_task = -1;
6733
6734 if (pulled_task) {
6735 idle_exit_fair(this_rq);
6736 this_rq->idle_stamp = 0;
6737 }
6738
6739 return pulled_task;
6427} 6740}
6428 6741
6429/* 6742/*
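
With this change idle_balance() feeds a tri-state result back into the fair pick path: positive means fair tasks were pulled and the pick should be retried, negative means a deadline or RT task appeared while the lock was dropped (the caller returns RETRY_TASK so the class loop restarts), and zero means the CPU really goes idle. A compact sketch of that dispatch, using the RETRY_TASK convention from the pick_next_task_fair() hunk earlier in this patch:

#include <stdio.h>

enum pick_result { PICK_IDLE, PICK_RETRY_FAIR, PICK_RETRY_TASK };

/* Toy stand-in for the idle: label at the end of pick_next_task_fair(). */
static enum pick_result handle_idle(int new_tasks)
{
        if (new_tasks < 0)
                return PICK_RETRY_TASK;  /* higher class woke: restart classes */
        if (new_tasks > 0)
                return PICK_RETRY_FAIR;  /* pulled fair tasks: goto again */
        return PICK_IDLE;                /* nothing to run */
}

int main(void)
{
        printf("%d %d %d\n", handle_idle(2), handle_idle(-1), handle_idle(0));
        return 0;
}
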
@@ -6494,6 +6807,11 @@ out_unlock:
6494 return 0; 6807 return 0;
6495} 6808}
6496 6809
6810static inline int on_null_domain(struct rq *rq)
6811{
6812 return unlikely(!rcu_dereference_sched(rq->sd));
6813}
6814
6497#ifdef CONFIG_NO_HZ_COMMON 6815#ifdef CONFIG_NO_HZ_COMMON
6498/* 6816/*
6499 * idle load balancing details 6817 * idle load balancing details
@@ -6548,8 +6866,13 @@ static void nohz_balancer_kick(void)
6548static inline void nohz_balance_exit_idle(int cpu) 6866static inline void nohz_balance_exit_idle(int cpu)
6549{ 6867{
6550 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 6868 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
6551 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 6869 /*
 6552 atomic_dec(&nohz.nr_cpus); 6870 * Completely isolated CPUs never set the idle_cpus_mask bit, so we must test.
6871 */
6872 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
6873 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
6874 atomic_dec(&nohz.nr_cpus);
6875 }
6553 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 6876 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6554 } 6877 }
6555} 6878}
@@ -6603,6 +6926,12 @@ void nohz_balance_enter_idle(int cpu)
6603 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 6926 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
6604 return; 6927 return;
6605 6928
6929 /*
6930 * If we're a completely isolated CPU, we don't play.
6931 */
6932 if (on_null_domain(cpu_rq(cpu)))
6933 return;
6934
6606 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 6935 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
6607 atomic_inc(&nohz.nr_cpus); 6936 atomic_inc(&nohz.nr_cpus);
6608 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 6937 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
@@ -6865,11 +7194,6 @@ static void run_rebalance_domains(struct softirq_action *h)
6865 nohz_idle_balance(this_rq, idle); 7194 nohz_idle_balance(this_rq, idle);
6866} 7195}
6867 7196
6868static inline int on_null_domain(struct rq *rq)
6869{
6870 return !rcu_dereference_sched(rq->sd);
6871}
6872
6873/* 7197/*
6874 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 7198 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
6875 */ 7199 */
@@ -6999,15 +7323,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
6999 struct cfs_rq *cfs_rq = cfs_rq_of(se); 7323 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7000 7324
7001 /* 7325 /*
7002 * Ensure the task's vruntime is normalized, so that when its 7326 * Ensure the task's vruntime is normalized, so that when it's
7003 * switched back to the fair class the enqueue_entity(.flags=0) will 7327 * switched back to the fair class the enqueue_entity(.flags=0) will
7004 * do the right thing. 7328 * do the right thing.
7005 * 7329 *
7006 * If it was on_rq, then the dequeue_entity(.flags=0) will already 7330 * If it's on_rq, then the dequeue_entity(.flags=0) will already
7007 * have normalized the vruntime, if it was !on_rq, then only when 7331 * have normalized the vruntime, if it's !on_rq, then only when
7008 * the task is sleeping will it still have non-normalized vruntime. 7332 * the task is sleeping will it still have non-normalized vruntime.
7009 */ 7333 */
7010 if (!se->on_rq && p->state != TASK_RUNNING) { 7334 if (!p->on_rq && p->state != TASK_RUNNING) {
7011 /* 7335 /*
7012 * Fix up our vruntime so that the current sleep doesn't 7336 * Fix up our vruntime so that the current sleep doesn't
7013 * cause 'unlimited' sleep bonus. 7337 * cause 'unlimited' sleep bonus.
@@ -7034,7 +7358,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7034 */ 7358 */
7035static void switched_to_fair(struct rq *rq, struct task_struct *p) 7359static void switched_to_fair(struct rq *rq, struct task_struct *p)
7036{ 7360{
7037 if (!p->se.on_rq) 7361 struct sched_entity *se = &p->se;
7362#ifdef CONFIG_FAIR_GROUP_SCHED
7363 /*
7364 * Since the real-depth could have been changed (only FAIR
7365 * class maintain depth value), reset depth properly.
7366 */
7367 se->depth = se->parent ? se->parent->depth + 1 : 0;
7368#endif
7369 if (!se->on_rq)
7038 return; 7370 return;
7039 7371
7040 /* 7372 /*
@@ -7082,7 +7414,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7082#ifdef CONFIG_FAIR_GROUP_SCHED 7414#ifdef CONFIG_FAIR_GROUP_SCHED
7083static void task_move_group_fair(struct task_struct *p, int on_rq) 7415static void task_move_group_fair(struct task_struct *p, int on_rq)
7084{ 7416{
7417 struct sched_entity *se = &p->se;
7085 struct cfs_rq *cfs_rq; 7418 struct cfs_rq *cfs_rq;
7419
7086 /* 7420 /*
7087 * If the task was not on the rq at the time of this cgroup movement 7421 * If the task was not on the rq at the time of this cgroup movement
7088 * it must have been asleep, sleeping tasks keep their ->vruntime 7422 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7108,23 +7442,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7108 * To prevent boost or penalty in the new cfs_rq caused by delta 7442 * To prevent boost or penalty in the new cfs_rq caused by delta
7109 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7443 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7110 */ 7444 */
7111 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) 7445 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7112 on_rq = 1; 7446 on_rq = 1;
7113 7447
7114 if (!on_rq) 7448 if (!on_rq)
7115 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 7449 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7116 set_task_rq(p, task_cpu(p)); 7450 set_task_rq(p, task_cpu(p));
7451 se->depth = se->parent ? se->parent->depth + 1 : 0;
7117 if (!on_rq) { 7452 if (!on_rq) {
7118 cfs_rq = cfs_rq_of(&p->se); 7453 cfs_rq = cfs_rq_of(se);
7119 p->se.vruntime += cfs_rq->min_vruntime; 7454 se->vruntime += cfs_rq->min_vruntime;
7120#ifdef CONFIG_SMP 7455#ifdef CONFIG_SMP
7121 /* 7456 /*
7122 * migrate_task_rq_fair() will have removed our previous 7457 * migrate_task_rq_fair() will have removed our previous
7123 * contribution, but we must synchronize for ongoing future 7458 * contribution, but we must synchronize for ongoing future
7124 * decay. 7459 * decay.
7125 */ 7460 */
7126 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 7461 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7127 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; 7462 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
7128#endif 7463#endif
7129 } 7464 }
7130} 7465}
@@ -7220,10 +7555,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7220 if (!se) 7555 if (!se)
7221 return; 7556 return;
7222 7557
7223 if (!parent) 7558 if (!parent) {
7224 se->cfs_rq = &rq->cfs; 7559 se->cfs_rq = &rq->cfs;
7225 else 7560 se->depth = 0;
7561 } else {
7226 se->cfs_rq = parent->my_q; 7562 se->cfs_rq = parent->my_q;
7563 se->depth = parent->depth + 1;
7564 }
7227 7565
7228 se->my_q = cfs_rq; 7566 se->my_q = cfs_rq;
7229 /* guarantee group entities always have weight */ 7567 /* guarantee group entities always have weight */
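
The new se->depth bookkeeping above exists so that hierarchy walks can line up two entities from different cgroups without recomputing their nesting level each time. A minimal sketch of that consumer, assuming fair.c's existing parent_entity() and is_same_group() helpers (this mirrors find_matching_se(); it is not the in-tree code itself):

static void equalize_entities(struct sched_entity **se, struct sched_entity **pse)
{
	/* First bring both entities to the same nesting depth... */
	while ((*se)->depth > (*pse)->depth)
		*se = parent_entity(*se);
	while ((*pse)->depth > (*se)->depth)
		*pse = parent_entity(*pse);

	/* ...then walk them up in lockstep until they share a cfs_rq. */
	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}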
diff --git a/kernel/cpu/idle.c b/kernel/sched/idle.c
index 277f494c2a9a..b7976a127178 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/sched/idle.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/cpu.h> 5#include <linux/cpu.h>
6#include <linux/cpuidle.h>
6#include <linux/tick.h> 7#include <linux/tick.h>
7#include <linux/mm.h> 8#include <linux/mm.h>
8#include <linux/stackprotector.h> 9#include <linux/stackprotector.h>
@@ -95,8 +96,10 @@ static void cpu_idle_loop(void)
95 if (!current_clr_polling_and_test()) { 96 if (!current_clr_polling_and_test()) {
96 stop_critical_timings(); 97 stop_critical_timings();
97 rcu_idle_enter(); 98 rcu_idle_enter();
98 arch_cpu_idle(); 99 if (cpuidle_idle_call())
99 WARN_ON_ONCE(irqs_disabled()); 100 arch_cpu_idle();
101 if (WARN_ON_ONCE(irqs_disabled()))
102 local_irq_enable();
100 rcu_idle_exit(); 103 rcu_idle_exit();
101 start_critical_timings(); 104 start_critical_timings();
102 } else { 105 } else {
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9ceea1..879f2b75266a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
 14	return task_cpu(p); /* IDLE tasks are never migrated */ 14	return task_cpu(p); /* IDLE tasks are never migrated */
15} 15}
16
17static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
18{
19 idle_exit_fair(rq);
20 rq_last_tick_reset(rq);
21}
22
23static void post_schedule_idle(struct rq *rq)
24{
25 idle_enter_fair(rq);
26}
27#endif /* CONFIG_SMP */ 16#endif /* CONFIG_SMP */
17
28/* 18/*
29 * Idle tasks are unconditionally rescheduled: 19 * Idle tasks are unconditionally rescheduled:
30 */ 20 */
@@ -33,13 +23,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
33 resched_task(rq->idle); 23 resched_task(rq->idle);
34} 24}
35 25
36static struct task_struct *pick_next_task_idle(struct rq *rq) 26static struct task_struct *
27pick_next_task_idle(struct rq *rq, struct task_struct *prev)
37{ 28{
29 put_prev_task(rq, prev);
30
38 schedstat_inc(rq, sched_goidle); 31 schedstat_inc(rq, sched_goidle);
39#ifdef CONFIG_SMP
40 /* Trigger the post schedule to do an idle_enter for CFS */
41 rq->post_schedule = 1;
42#endif
43 return rq->idle; 32 return rq->idle;
44} 33}
45 34
@@ -58,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
58 47
59static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 48static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
60{ 49{
50 idle_exit_fair(rq);
51 rq_last_tick_reset(rq);
61} 52}
62 53
63static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 54static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
@@ -101,8 +92,6 @@ const struct sched_class idle_sched_class = {
101 92
102#ifdef CONFIG_SMP 93#ifdef CONFIG_SMP
103 .select_task_rq = select_task_rq_idle, 94 .select_task_rq = select_task_rq_idle,
104 .pre_schedule = pre_schedule_idle,
105 .post_schedule = post_schedule_idle,
106#endif 95#endif
107 96
108 .set_curr_task = set_curr_task_idle, 97 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a2740b775b45..d8cdf1618551 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -229,6 +229,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
229 229
230#ifdef CONFIG_SMP 230#ifdef CONFIG_SMP
231 231
232static int pull_rt_task(struct rq *this_rq);
233
234static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
235{
236 /* Try to pull RT tasks here if we lower this rq's prio */
237 return rq->rt.highest_prio.curr > prev->prio;
238}
239
232static inline int rt_overloaded(struct rq *rq) 240static inline int rt_overloaded(struct rq *rq)
233{ 241{
234 return atomic_read(&rq->rd->rto_count); 242 return atomic_read(&rq->rd->rto_count);
@@ -315,6 +323,15 @@ static inline int has_pushable_tasks(struct rq *rq)
315 return !plist_head_empty(&rq->rt.pushable_tasks); 323 return !plist_head_empty(&rq->rt.pushable_tasks);
316} 324}
317 325
326static inline void set_post_schedule(struct rq *rq)
327{
328 /*
329 * We detect this state here so that we can avoid taking the RQ
330 * lock again later if there is no need to push
331 */
332 rq->post_schedule = has_pushable_tasks(rq);
333}
334
318static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 335static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
319{ 336{
320 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 337 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
@@ -359,6 +376,19 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
359{ 376{
360} 377}
361 378
379static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
380{
381 return false;
382}
383
384static inline int pull_rt_task(struct rq *this_rq)
385{
386 return 0;
387}
388
389static inline void set_post_schedule(struct rq *rq)
390{
391}
362#endif /* CONFIG_SMP */ 392#endif /* CONFIG_SMP */
363 393
364static inline int on_rt_rq(struct sched_rt_entity *rt_se) 394static inline int on_rt_rq(struct sched_rt_entity *rt_se)
@@ -440,11 +470,6 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
440 dequeue_rt_entity(rt_se); 470 dequeue_rt_entity(rt_se);
441} 471}
442 472
443static inline int rt_rq_throttled(struct rt_rq *rt_rq)
444{
445 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
446}
447
448static int rt_se_boosted(struct sched_rt_entity *rt_se) 473static int rt_se_boosted(struct sched_rt_entity *rt_se)
449{ 474{
450 struct rt_rq *rt_rq = group_rt_rq(rt_se); 475 struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -515,11 +540,6 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
515{ 540{
516} 541}
517 542
518static inline int rt_rq_throttled(struct rt_rq *rt_rq)
519{
520 return rt_rq->rt_throttled;
521}
522
523static inline const struct cpumask *sched_rt_period_mask(void) 543static inline const struct cpumask *sched_rt_period_mask(void)
524{ 544{
525 return cpu_online_mask; 545 return cpu_online_mask;
@@ -538,6 +558,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
538 558
539#endif /* CONFIG_RT_GROUP_SCHED */ 559#endif /* CONFIG_RT_GROUP_SCHED */
540 560
561bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
562{
563 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
564
565 return (hrtimer_active(&rt_b->rt_period_timer) ||
566 rt_rq->rt_time < rt_b->rt_runtime);
567}
568
541#ifdef CONFIG_SMP 569#ifdef CONFIG_SMP
542/* 570/*
543 * We ran out of runtime, see if we can borrow some from our neighbours. 571 * We ran out of runtime, see if we can borrow some from our neighbours.
@@ -1310,15 +1338,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1310{ 1338{
1311 struct sched_rt_entity *rt_se; 1339 struct sched_rt_entity *rt_se;
1312 struct task_struct *p; 1340 struct task_struct *p;
1313 struct rt_rq *rt_rq; 1341 struct rt_rq *rt_rq = &rq->rt;
1314
1315 rt_rq = &rq->rt;
1316
1317 if (!rt_rq->rt_nr_running)
1318 return NULL;
1319
1320 if (rt_rq_throttled(rt_rq))
1321 return NULL;
1322 1342
1323 do { 1343 do {
1324 rt_se = pick_next_rt_entity(rq, rt_rq); 1344 rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1332,21 +1352,45 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1332 return p; 1352 return p;
1333} 1353}
1334 1354
1335static struct task_struct *pick_next_task_rt(struct rq *rq) 1355static struct task_struct *
1356pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1336{ 1357{
1337 struct task_struct *p = _pick_next_task_rt(rq); 1358 struct task_struct *p;
1359 struct rt_rq *rt_rq = &rq->rt;
1360
1361 if (need_pull_rt_task(rq, prev)) {
1362 pull_rt_task(rq);
1363 /*
1364 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1365 * means a dl task can slip in, in which case we need to
1366 * re-start task selection.
1367 */
1368 if (unlikely(rq->dl.dl_nr_running))
1369 return RETRY_TASK;
1370 }
1371
1372 /*
1373 * We may dequeue prev's rt_rq in put_prev_task().
1374 * So, we update time before rt_nr_running check.
1375 */
1376 if (prev->sched_class == &rt_sched_class)
1377 update_curr_rt(rq);
1378
1379 if (!rt_rq->rt_nr_running)
1380 return NULL;
1381
1382 if (rt_rq_throttled(rt_rq))
1383 return NULL;
1384
1385 put_prev_task(rq, prev);
1386
1387 p = _pick_next_task_rt(rq);
1338 1388
1339 /* The running task is never eligible for pushing */ 1389 /* The running task is never eligible for pushing */
1340 if (p) 1390 if (p)
1341 dequeue_pushable_task(rq, p); 1391 dequeue_pushable_task(rq, p);
1342 1392
1343#ifdef CONFIG_SMP 1393 set_post_schedule(rq);
1344 /*
1345 * We detect this state here so that we can avoid taking the RQ
1346 * lock again later if there is no need to push
1347 */
1348 rq->post_schedule = has_pushable_tasks(rq);
1349#endif
1350 1394
1351 return p; 1395 return p;
1352} 1396}
@@ -1716,13 +1760,6 @@ skip:
1716 return ret; 1760 return ret;
1717} 1761}
1718 1762
1719static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1720{
1721 /* Try to pull RT tasks here if we lower this rq's prio */
1722 if (rq->rt.highest_prio.curr > prev->prio)
1723 pull_rt_task(rq);
1724}
1725
1726static void post_schedule_rt(struct rq *rq) 1763static void post_schedule_rt(struct rq *rq)
1727{ 1764{
1728 push_rt_tasks(rq); 1765 push_rt_tasks(rq);
@@ -1825,7 +1862,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1825 resched_task(rq->curr); 1862 resched_task(rq->curr);
1826} 1863}
1827 1864
1828void init_sched_rt_class(void) 1865void __init init_sched_rt_class(void)
1829{ 1866{
1830 unsigned int i; 1867 unsigned int i;
1831 1868
@@ -1999,7 +2036,6 @@ const struct sched_class rt_sched_class = {
1999 .set_cpus_allowed = set_cpus_allowed_rt, 2036 .set_cpus_allowed = set_cpus_allowed_rt,
2000 .rq_online = rq_online_rt, 2037 .rq_online = rq_online_rt,
2001 .rq_offline = rq_offline_rt, 2038 .rq_offline = rq_offline_rt,
2002 .pre_schedule = pre_schedule_rt,
2003 .post_schedule = post_schedule_rt, 2039 .post_schedule = post_schedule_rt,
2004 .task_woken = task_woken_rt, 2040 .task_woken = task_woken_rt,
2005 .switched_from = switched_from_rt, 2041 .switched_from = switched_from_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c2119fd20f8b..f2de7a175620 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq);
24extern void update_cpu_load_active(struct rq *this_rq); 24extern void update_cpu_load_active(struct rq *this_rq);
25 25
26/* 26/*
27 * Convert user-nice values [ -20 ... 0 ... 19 ]
28 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
29 * and back.
30 */
31#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
32#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
33#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
34
35/*
36 * 'User priority' is the nice value converted to something we
37 * can work with better when scaling various scheduler parameters,
38 * it's a [ 0 ... 39 ] range.
39 */
40#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
41#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
42#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
43
44/*
45 * Helpers for converting nanosecond timing to jiffy resolution 27 * Helpers for converting nanosecond timing to jiffy resolution
46 */ 28 */
47#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 29#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
@@ -441,6 +423,18 @@ struct rt_rq {
441#endif 423#endif
442}; 424};
443 425
426#ifdef CONFIG_RT_GROUP_SCHED
427static inline int rt_rq_throttled(struct rt_rq *rt_rq)
428{
429 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
430}
431#else
432static inline int rt_rq_throttled(struct rt_rq *rt_rq)
433{
434 return rt_rq->rt_throttled;
435}
436#endif
437
444/* Deadline class' related fields in a runqueue */ 438/* Deadline class' related fields in a runqueue */
445struct dl_rq { 439struct dl_rq {
446 /* runqueue is an rbtree, ordered by deadline */ 440 /* runqueue is an rbtree, ordered by deadline */
@@ -462,7 +456,6 @@ struct dl_rq {
462 } earliest_dl; 456 } earliest_dl;
463 457
464 unsigned long dl_nr_migratory; 458 unsigned long dl_nr_migratory;
465 unsigned long dl_nr_total;
466 int overloaded; 459 int overloaded;
467 460
468 /* 461 /*
@@ -559,11 +552,9 @@ struct rq {
559#ifdef CONFIG_FAIR_GROUP_SCHED 552#ifdef CONFIG_FAIR_GROUP_SCHED
560 /* list of leaf cfs_rq on this cpu: */ 553 /* list of leaf cfs_rq on this cpu: */
561 struct list_head leaf_cfs_rq_list; 554 struct list_head leaf_cfs_rq_list;
562#endif /* CONFIG_FAIR_GROUP_SCHED */
563 555
564#ifdef CONFIG_RT_GROUP_SCHED 556 struct sched_avg avg;
565 struct list_head leaf_rt_rq_list; 557#endif /* CONFIG_FAIR_GROUP_SCHED */
566#endif
567 558
568 /* 559 /*
569 * This is part of a global counter where only the total sum 560 * This is part of a global counter where only the total sum
@@ -652,8 +643,6 @@ struct rq {
652#ifdef CONFIG_SMP 643#ifdef CONFIG_SMP
653 struct llist_head wake_list; 644 struct llist_head wake_list;
654#endif 645#endif
655
656 struct sched_avg avg;
657}; 646};
658 647
659static inline int cpu_of(struct rq *rq) 648static inline int cpu_of(struct rq *rq)
@@ -1113,6 +1102,8 @@ static const u32 prio_to_wmult[40] = {
1113 1102
1114#define DEQUEUE_SLEEP 1 1103#define DEQUEUE_SLEEP 1
1115 1104
1105#define RETRY_TASK ((void *)-1UL)
1106
1116struct sched_class { 1107struct sched_class {
1117 const struct sched_class *next; 1108 const struct sched_class *next;
1118 1109
@@ -1123,14 +1114,22 @@ struct sched_class {
1123 1114
1124 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1115 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1125 1116
1126 struct task_struct * (*pick_next_task) (struct rq *rq); 1117 /*
1118 * It is the responsibility of the pick_next_task() method that will
1119 * return the next task to call put_prev_task() on the @prev task or
1120 * something equivalent.
1121 *
1122 * May return RETRY_TASK when it finds a higher prio class has runnable
1123 * tasks.
1124 */
1125 struct task_struct * (*pick_next_task) (struct rq *rq,
1126 struct task_struct *prev);
1127 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1127 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1128 1128
1129#ifdef CONFIG_SMP 1129#ifdef CONFIG_SMP
1130 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1130 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1131 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1131 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
1132 1132
1133 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1134 void (*post_schedule) (struct rq *this_rq); 1133 void (*post_schedule) (struct rq *this_rq);
1135 void (*task_waking) (struct task_struct *task); 1134 void (*task_waking) (struct task_struct *task);
1136 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1135 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1160,6 +1159,11 @@ struct sched_class {
1160#endif 1159#endif
1161}; 1160};
1162 1161
1162static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
1163{
1164 prev->sched_class->put_prev_task(rq, prev);
1165}
1166
1163#define sched_class_highest (&stop_sched_class) 1167#define sched_class_highest (&stop_sched_class)
1164#define for_each_class(class) \ 1168#define for_each_class(class) \
1165 for (class = sched_class_highest; class; class = class->next) 1169 for (class = sched_class_highest; class; class = class->next)
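
The comment above redefines the pick_next_task() contract: the class itself is now responsible for put_prev_task(), and it may return RETRY_TASK when it dropped rq->lock and a higher-priority class became runnable in the meantime. A minimal sketch of how a caller might honor that contract, using for_each_class() from this header (illustrative only, not the actual core.c picker):

static struct task_struct *
__pick_next(struct rq *rq, struct task_struct *prev)
{
	const struct sched_class *class;
	struct task_struct *p;

again:
	for_each_class(class) {
		p = class->pick_next_task(rq, prev);
		if (p) {
			/*
			 * The class dropped rq->lock and higher-priority
			 * work appeared: restart from the top.
			 */
			if (unlikely(p == RETRY_TASK))
				goto again;
			return p;
		}
	}
	BUG();	/* The idle class always has a task to return. */
}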
@@ -1176,16 +1180,14 @@ extern const struct sched_class idle_sched_class;
1176extern void update_group_power(struct sched_domain *sd, int cpu); 1180extern void update_group_power(struct sched_domain *sd, int cpu);
1177 1181
1178extern void trigger_load_balance(struct rq *rq); 1182extern void trigger_load_balance(struct rq *rq);
1179extern void idle_balance(int this_cpu, struct rq *this_rq);
1180 1183
1181extern void idle_enter_fair(struct rq *this_rq); 1184extern void idle_enter_fair(struct rq *this_rq);
1182extern void idle_exit_fair(struct rq *this_rq); 1185extern void idle_exit_fair(struct rq *this_rq);
1183 1186
1184#else /* CONFIG_SMP */ 1187#else
1185 1188
1186static inline void idle_balance(int cpu, struct rq *rq) 1189static inline void idle_enter_fair(struct rq *rq) { }
1187{ 1190static inline void idle_exit_fair(struct rq *rq) { }
1188}
1189 1191
1190#endif 1192#endif
1191 1193
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fdb6bb0b3356..d6ce65dde541 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,16 +23,19 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
23 /* we're never preempted */ 23 /* we're never preempted */
24} 24}
25 25
26static struct task_struct *pick_next_task_stop(struct rq *rq) 26static struct task_struct *
27pick_next_task_stop(struct rq *rq, struct task_struct *prev)
27{ 28{
28 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
29 30
30 if (stop && stop->on_rq) { 31 if (!stop || !stop->on_rq)
31 stop->se.exec_start = rq_clock_task(rq); 32 return NULL;
32 return stop;
33 }
34 33
35 return NULL; 34 put_prev_task(rq, prev);
35
36 stop->se.exec_start = rq_clock_task(rq);
37
38 return stop;
36} 39}
37 40
38static void 41static void
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 84571e09c907..01fbae5b97b7 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -293,7 +293,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
293 */ 293 */
294 smp_call_function_single(min(cpu1, cpu2), 294 smp_call_function_single(min(cpu1, cpu2),
295 &irq_cpu_stop_queue_work, 295 &irq_cpu_stop_queue_work,
296 &call_args, 0); 296 &call_args, 1);
297 lg_local_unlock(&stop_cpus_lock); 297 lg_local_unlock(&stop_cpus_lock);
298 preempt_enable(); 298 preempt_enable();
299 299
diff --git a/kernel/sys.c b/kernel/sys.c
index c0a58be780a4..adaeab6f7a87 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -174,10 +174,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
174 174
175 /* normalize: avoid signed division (rounding problems) */ 175 /* normalize: avoid signed division (rounding problems) */
176 error = -ESRCH; 176 error = -ESRCH;
177 if (niceval < -20) 177 if (niceval < MIN_NICE)
178 niceval = -20; 178 niceval = MIN_NICE;
179 if (niceval > 19) 179 if (niceval > MAX_NICE)
180 niceval = 19; 180 niceval = MAX_NICE;
181 181
182 rcu_read_lock(); 182 rcu_read_lock();
183 read_lock(&tasklist_lock); 183 read_lock(&tasklist_lock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 49e13e1f8fe6..7754ff16f334 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -386,13 +386,6 @@ static struct ctl_table kern_table[] = {
386 .proc_handler = proc_dointvec, 386 .proc_handler = proc_dointvec,
387 }, 387 },
388 { 388 {
389 .procname = "numa_balancing_migrate_deferred",
390 .data = &sysctl_numa_balancing_migrate_deferred,
391 .maxlen = sizeof(unsigned int),
392 .mode = 0644,
393 .proc_handler = proc_dointvec,
394 },
395 {
396 .procname = "numa_balancing", 389 .procname = "numa_balancing",
397 .data = NULL, /* filled in by handler */ 390 .data = NULL, /* filled in by handler */
398 .maxlen = sizeof(unsigned int), 391 .maxlen = sizeof(unsigned int),
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0abb36464281..4d23dc4d8139 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -116,20 +116,42 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
116void __init sched_clock_register(u64 (*read)(void), int bits, 116void __init sched_clock_register(u64 (*read)(void), int bits,
117 unsigned long rate) 117 unsigned long rate)
118{ 118{
119 u64 res, wrap, new_mask, new_epoch, cyc, ns;
120 u32 new_mult, new_shift;
121 ktime_t new_wrap_kt;
119 unsigned long r; 122 unsigned long r;
120 u64 res, wrap;
121 char r_unit; 123 char r_unit;
122 124
123 if (cd.rate > rate) 125 if (cd.rate > rate)
124 return; 126 return;
125 127
126 WARN_ON(!irqs_disabled()); 128 WARN_ON(!irqs_disabled());
127 read_sched_clock = read;
128 sched_clock_mask = CLOCKSOURCE_MASK(bits);
129 cd.rate = rate;
130 129
131 /* calculate the mult/shift to convert counter ticks to ns. */ 130 /* calculate the mult/shift to convert counter ticks to ns. */
132 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); 131 clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
132
133 new_mask = CLOCKSOURCE_MASK(bits);
134
135 /* calculate how many ns until we wrap */
136 wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
137 new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
138
139 /* update epoch for new counter and update epoch_ns from old counter*/
140 new_epoch = read();
141 cyc = read_sched_clock();
142 ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
143 cd.mult, cd.shift);
144
145 raw_write_seqcount_begin(&cd.seq);
146 read_sched_clock = read;
147 sched_clock_mask = new_mask;
148 cd.rate = rate;
149 cd.wrap_kt = new_wrap_kt;
150 cd.mult = new_mult;
151 cd.shift = new_shift;
152 cd.epoch_cyc = new_epoch;
153 cd.epoch_ns = ns;
154 raw_write_seqcount_end(&cd.seq);
133 155
134 r = rate; 156 r = rate;
135 if (r >= 4000000) { 157 if (r >= 4000000) {
@@ -141,22 +163,12 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
141 } else 163 } else
142 r_unit = ' '; 164 r_unit = ' ';
143 165
144 /* calculate how many ns until we wrap */
145 wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask);
146 cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
147
148 /* calculate the ns resolution of this counter */ 166 /* calculate the ns resolution of this counter */
149 res = cyc_to_ns(1ULL, cd.mult, cd.shift); 167 res = cyc_to_ns(1ULL, new_mult, new_shift);
168
150 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", 169 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
151 bits, r, r_unit, res, wrap); 170 bits, r, r_unit, res, wrap);
152 171
153 update_sched_clock();
154
155 /*
156 * Ensure that sched_clock() starts off at 0ns
157 */
158 cd.epoch_ns = 0;
159
160 /* Enable IRQ time accounting if we have a fast enough sched_clock */ 172 /* Enable IRQ time accounting if we have a fast enough sched_clock */
161 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) 173 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
162 enable_sched_clock_irqtime(); 174 enable_sched_clock_irqtime();
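
For reference, a platform clocksource driver still registers its counter the same way after this rework; only the internals changed. A hedged sketch of a typical caller (the counter base, width, and rate are made up for illustration; assumes the usual <linux/sched_clock.h> and <linux/io.h> includes):

static void __iomem *example_counter_base;	/* mapped by the driver elsewhere */

static u64 notrace example_read_counter(void)
{
	/* 32-bit free-running up-counter. */
	return readl_relaxed(example_counter_base);
}

static void __init example_sched_clock_init(void)
{
	/* 32 bits wide, ticking at 24 MHz -- illustrative values only. */
	sched_clock_register(example_read_counter, 32, 24000000);
}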
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0aa4ce81bc16..5b40279ecd71 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1435,7 +1435,8 @@ void update_wall_time(void)
1435out: 1435out:
1436 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1436 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1437 if (clock_set) 1437 if (clock_set)
 1438	 clock_was_set(); 1438	 /* Have to call _delayed version, since in irq context */
1439 clock_was_set_delayed();
1439} 1440}
1440 1441
1441/** 1442/**
diff --git a/kernel/timer.c b/kernel/timer.c
index accfd241b9e5..d78de047599b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -52,7 +52,7 @@
52#define CREATE_TRACE_POINTS 52#define CREATE_TRACE_POINTS
53#include <trace/events/timer.h> 53#include <trace/events/timer.h>
54 54
55u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; 55__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
56 56
57EXPORT_SYMBOL(jiffies_64); 57EXPORT_SYMBOL(jiffies_64);
58 58
diff --git a/kernel/torture.c b/kernel/torture.c
new file mode 100644
index 000000000000..acc9afc2f26e
--- /dev/null
+++ b/kernel/torture.c
@@ -0,0 +1,719 @@
1/*
2 * Common functions for in-kernel torture tests.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright (C) IBM Corporation, 2014
19 *
20 * Author: Paul E. McKenney <paulmck@us.ibm.com>
21 * Based on kernel/rcu/torture.c.
22 */
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/init.h>
26#include <linux/module.h>
27#include <linux/kthread.h>
28#include <linux/err.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/interrupt.h>
32#include <linux/sched.h>
33#include <linux/atomic.h>
34#include <linux/bitops.h>
35#include <linux/completion.h>
36#include <linux/moduleparam.h>
37#include <linux/percpu.h>
38#include <linux/notifier.h>
39#include <linux/reboot.h>
40#include <linux/freezer.h>
41#include <linux/cpu.h>
42#include <linux/delay.h>
43#include <linux/stat.h>
44#include <linux/slab.h>
45#include <linux/trace_clock.h>
46#include <asm/byteorder.h>
47#include <linux/torture.h>
48
49MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
51
52static char *torture_type;
53static bool verbose;
54
55/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
56#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
57#define FULLSTOP_SHUTDOWN 1 /* System shutdown with torture running. */
58#define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */
59static int fullstop = FULLSTOP_RMMOD;
60static DEFINE_MUTEX(fullstop_mutex);
61static int *torture_runnable;
62
63#ifdef CONFIG_HOTPLUG_CPU
64
65/*
66 * Variables for online-offline handling. Only present if CPU hotplug
67 * is enabled, otherwise does nothing.
68 */
69
70static struct task_struct *onoff_task;
71static long onoff_holdoff;
72static long onoff_interval;
73static long n_offline_attempts;
74static long n_offline_successes;
75static unsigned long sum_offline;
76static int min_offline = -1;
77static int max_offline;
78static long n_online_attempts;
79static long n_online_successes;
80static unsigned long sum_online;
81static int min_online = -1;
82static int max_online;
83
84/*
85 * Execute random CPU-hotplug operations at the interval specified
86 * by the onoff_interval.
87 */
88static int
89torture_onoff(void *arg)
90{
91 int cpu;
92 unsigned long delta;
93 int maxcpu = -1;
94 DEFINE_TORTURE_RANDOM(rand);
95 int ret;
96 unsigned long starttime;
97
98 VERBOSE_TOROUT_STRING("torture_onoff task started");
99 for_each_online_cpu(cpu)
100 maxcpu = cpu;
101 WARN_ON(maxcpu < 0);
102 if (onoff_holdoff > 0) {
103 VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
104 schedule_timeout_interruptible(onoff_holdoff);
105 VERBOSE_TOROUT_STRING("torture_onoff end holdoff");
106 }
107 while (!torture_must_stop()) {
108 cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
109 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
110 if (verbose)
111 pr_alert("%s" TORTURE_FLAG
112 "torture_onoff task: offlining %d\n",
113 torture_type, cpu);
114 starttime = jiffies;
115 n_offline_attempts++;
116 ret = cpu_down(cpu);
117 if (ret) {
118 if (verbose)
119 pr_alert("%s" TORTURE_FLAG
120 "torture_onoff task: offline %d failed: errno %d\n",
121 torture_type, cpu, ret);
122 } else {
123 if (verbose)
124 pr_alert("%s" TORTURE_FLAG
125 "torture_onoff task: offlined %d\n",
126 torture_type, cpu);
127 n_offline_successes++;
128 delta = jiffies - starttime;
129 sum_offline += delta;
130 if (min_offline < 0) {
131 min_offline = delta;
132 max_offline = delta;
133 }
134 if (min_offline > delta)
135 min_offline = delta;
136 if (max_offline < delta)
137 max_offline = delta;
138 }
139 } else if (cpu_is_hotpluggable(cpu)) {
140 if (verbose)
141 pr_alert("%s" TORTURE_FLAG
142 "torture_onoff task: onlining %d\n",
143 torture_type, cpu);
144 starttime = jiffies;
145 n_online_attempts++;
146 ret = cpu_up(cpu);
147 if (ret) {
148 if (verbose)
149 pr_alert("%s" TORTURE_FLAG
150 "torture_onoff task: online %d failed: errno %d\n",
151 torture_type, cpu, ret);
152 } else {
153 if (verbose)
154 pr_alert("%s" TORTURE_FLAG
155 "torture_onoff task: onlined %d\n",
156 torture_type, cpu);
157 n_online_successes++;
158 delta = jiffies - starttime;
159 sum_online += delta;
160 if (min_online < 0) {
161 min_online = delta;
162 max_online = delta;
163 }
164 if (min_online > delta)
165 min_online = delta;
166 if (max_online < delta)
167 max_online = delta;
168 }
169 }
170 schedule_timeout_interruptible(onoff_interval);
171 }
172 torture_kthread_stopping("torture_onoff");
173 return 0;
174}
175
176#endif /* #ifdef CONFIG_HOTPLUG_CPU */
177
178/*
179 * Initiate online-offline handling.
180 */
181int torture_onoff_init(long ooholdoff, long oointerval)
182{
183 int ret = 0;
184
185#ifdef CONFIG_HOTPLUG_CPU
186 onoff_holdoff = ooholdoff;
187 onoff_interval = oointerval;
188 if (onoff_interval <= 0)
189 return 0;
190 ret = torture_create_kthread(torture_onoff, NULL, onoff_task);
191#endif /* #ifdef CONFIG_HOTPLUG_CPU */
192 return ret;
193}
194EXPORT_SYMBOL_GPL(torture_onoff_init);
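
A client torture module would typically start the hotplug stressor from its init path, converting module parameters given in seconds into jiffies. A minimal sketch under that assumption (parameter names are illustrative, not taken from any particular client):

static int ex_onoff_holdoff;	/* seconds before the first hotplug attempt */
static int ex_onoff_interval;	/* seconds between attempts; 0 disables */

static int __init example_torture_init(void)
{
	int ret;

	/* No-op when the interval is 0 or CPU hotplug is not configured. */
	ret = torture_onoff_init(ex_onoff_holdoff * HZ, ex_onoff_interval * HZ);
	if (ret)
		return ret;

	/* ... create the module's own reader/writer kthreads here ... */
	return 0;
}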
195
196/*
197 * Clean up after online/offline testing.
198 */
199static void torture_onoff_cleanup(void)
200{
201#ifdef CONFIG_HOTPLUG_CPU
202 if (onoff_task == NULL)
203 return;
204 VERBOSE_TOROUT_STRING("Stopping torture_onoff task");
205 kthread_stop(onoff_task);
206 onoff_task = NULL;
207#endif /* #ifdef CONFIG_HOTPLUG_CPU */
208}
209EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
210
211/*
212 * Print online/offline testing statistics.
213 */
214char *torture_onoff_stats(char *page)
215{
216#ifdef CONFIG_HOTPLUG_CPU
217 page += sprintf(page,
218 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
219 n_online_successes, n_online_attempts,
220 n_offline_successes, n_offline_attempts,
221 min_online, max_online,
222 min_offline, max_offline,
223 sum_online, sum_offline, HZ);
224#endif /* #ifdef CONFIG_HOTPLUG_CPU */
225 return page;
226}
227EXPORT_SYMBOL_GPL(torture_onoff_stats);
228
229/*
230 * Were all the online/offline operations successful?
231 */
232bool torture_onoff_failures(void)
233{
234#ifdef CONFIG_HOTPLUG_CPU
235 return n_online_successes != n_online_attempts ||
236 n_offline_successes != n_offline_attempts;
237#else /* #ifdef CONFIG_HOTPLUG_CPU */
238 return false;
239#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
240}
241EXPORT_SYMBOL_GPL(torture_onoff_failures);
242
243#define TORTURE_RANDOM_MULT 39916801 /* prime */
244#define TORTURE_RANDOM_ADD 479001701 /* prime */
245#define TORTURE_RANDOM_REFRESH 10000
246
247/*
248 * Crude but fast random-number generator. Uses a linear congruential
249 * generator, with occasional help from cpu_clock().
250 */
251unsigned long
252torture_random(struct torture_random_state *trsp)
253{
254 if (--trsp->trs_count < 0) {
255 trsp->trs_state += (unsigned long)local_clock();
256 trsp->trs_count = TORTURE_RANDOM_REFRESH;
257 }
258 trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT +
259 TORTURE_RANDOM_ADD;
260 return swahw32(trsp->trs_state);
261}
262EXPORT_SYMBOL_GPL(torture_random);
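
torture_random() is meant for cheap, non-cryptographic decisions inside tight test loops, not for anything needing real entropy. A short usage sketch following the DEFINE_TORTURE_RANDOM() pattern already used by torture_onoff() above:

static int example_loop(void *arg)
{
	DEFINE_TORTURE_RANDOM(rand);

	while (!torture_must_stop()) {
		/* Roughly one iteration in four takes a short nap. */
		if ((torture_random(&rand) & 0x3) == 0)
			schedule_timeout_interruptible(1);

		/* ... exercise the code under test ... */
	}
	torture_kthread_stopping("example_loop");
	return 0;
}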
263
264/*
265 * Variables for shuffling. The idea is to ensure that each CPU stays
266 * idle for an extended period to test interactions with dyntick idle,
 267 * as well as interactions with any per-CPU variables.
268 */
269struct shuffle_task {
270 struct list_head st_l;
271 struct task_struct *st_t;
272};
273
274static long shuffle_interval; /* In jiffies. */
275static struct task_struct *shuffler_task;
276static cpumask_var_t shuffle_tmp_mask;
277static int shuffle_idle_cpu; /* Force all torture tasks off this CPU */
278static struct list_head shuffle_task_list = LIST_HEAD_INIT(shuffle_task_list);
279static DEFINE_MUTEX(shuffle_task_mutex);
280
281/*
282 * Register a task to be shuffled. If there is no memory, just splat
283 * and don't bother registering.
284 */
285void torture_shuffle_task_register(struct task_struct *tp)
286{
287 struct shuffle_task *stp;
288
289 if (WARN_ON_ONCE(tp == NULL))
290 return;
291 stp = kmalloc(sizeof(*stp), GFP_KERNEL);
292 if (WARN_ON_ONCE(stp == NULL))
293 return;
294 stp->st_t = tp;
295 mutex_lock(&shuffle_task_mutex);
296 list_add(&stp->st_l, &shuffle_task_list);
297 mutex_unlock(&shuffle_task_mutex);
298}
299EXPORT_SYMBOL_GPL(torture_shuffle_task_register);
300
301/*
302 * Unregister all tasks, for example, at the end of the torture run.
303 */
304static void torture_shuffle_task_unregister_all(void)
305{
306 struct shuffle_task *stp;
307 struct shuffle_task *p;
308
309 mutex_lock(&shuffle_task_mutex);
310 list_for_each_entry_safe(stp, p, &shuffle_task_list, st_l) {
311 list_del(&stp->st_l);
312 kfree(stp);
313 }
314 mutex_unlock(&shuffle_task_mutex);
315}
316
317/* Shuffle tasks such that we allow shuffle_idle_cpu to become idle.
318 * A special case is when shuffle_idle_cpu = -1, in which case we allow
319 * the tasks to run on all CPUs.
320 */
321static void torture_shuffle_tasks(void)
322{
323 struct shuffle_task *stp;
324
325 cpumask_setall(shuffle_tmp_mask);
326 get_online_cpus();
327
328 /* No point in shuffling if there is only one online CPU (ex: UP) */
329 if (num_online_cpus() == 1) {
330 put_online_cpus();
331 return;
332 }
333
334 /* Advance to the next CPU. Upon overflow, don't idle any CPUs. */
335 shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask);
336 if (shuffle_idle_cpu >= nr_cpu_ids)
337 shuffle_idle_cpu = -1;
338 if (shuffle_idle_cpu != -1) {
339 cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
340 if (cpumask_empty(shuffle_tmp_mask)) {
341 put_online_cpus();
342 return;
343 }
344 }
345
346 mutex_lock(&shuffle_task_mutex);
347 list_for_each_entry(stp, &shuffle_task_list, st_l)
348 set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
349 mutex_unlock(&shuffle_task_mutex);
350
351 put_online_cpus();
352}
353
354/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
355 * system to become idle at a time and cut off its timer ticks. This is meant
356 * to test the support for such tickless idle CPU in RCU.
357 */
358static int torture_shuffle(void *arg)
359{
360 VERBOSE_TOROUT_STRING("torture_shuffle task started");
361 do {
362 schedule_timeout_interruptible(shuffle_interval);
363 torture_shuffle_tasks();
364 torture_shutdown_absorb("torture_shuffle");
365 } while (!torture_must_stop());
366 torture_kthread_stopping("torture_shuffle");
367 return 0;
368}
369
370/*
371 * Start the shuffler, with shuffint in jiffies.
372 */
373int torture_shuffle_init(long shuffint)
374{
375 shuffle_interval = shuffint;
376
377 shuffle_idle_cpu = -1;
378
379 if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
380 VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask");
381 return -ENOMEM;
382 }
383
384 /* Create the shuffler thread */
385 return torture_create_kthread(torture_shuffle, NULL, shuffler_task);
386}
387EXPORT_SYMBOL_GPL(torture_shuffle_init);
388
389/*
390 * Stop the shuffling.
391 */
392static void torture_shuffle_cleanup(void)
393{
394 torture_shuffle_task_unregister_all();
395 if (shuffler_task) {
396 VERBOSE_TOROUT_STRING("Stopping torture_shuffle task");
397 kthread_stop(shuffler_task);
398 free_cpumask_var(shuffle_tmp_mask);
399 }
400 shuffler_task = NULL;
401}
402EXPORT_SYMBOL_GPL(torture_shuffle_cleanup);
403
404/*
405 * Variables for auto-shutdown. This allows "lights out" torture runs
406 * to be fully scripted.
407 */
408static int shutdown_secs; /* desired test duration in seconds. */
409static struct task_struct *shutdown_task;
410static unsigned long shutdown_time; /* jiffies to system shutdown. */
411static void (*torture_shutdown_hook)(void);
412
413/*
414 * Absorb kthreads into a kernel function that won't return, so that
415 * they won't ever access module text or data again.
416 */
417void torture_shutdown_absorb(const char *title)
418{
419 while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
420 pr_notice("torture thread %s parking due to system shutdown\n",
421 title);
422 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
423 }
424}
425EXPORT_SYMBOL_GPL(torture_shutdown_absorb);
426
427/*
 428 * Cause the torture test to shut down the system after the test has
429 * run for the time specified by the shutdown_secs parameter.
430 */
431static int torture_shutdown(void *arg)
432{
433 long delta;
434 unsigned long jiffies_snap;
435
436 VERBOSE_TOROUT_STRING("torture_shutdown task started");
437 jiffies_snap = jiffies;
438 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
439 !torture_must_stop()) {
440 delta = shutdown_time - jiffies_snap;
441 if (verbose)
442 pr_alert("%s" TORTURE_FLAG
443 "torture_shutdown task: %lu jiffies remaining\n",
444 torture_type, delta);
445 schedule_timeout_interruptible(delta);
446 jiffies_snap = jiffies;
447 }
448 if (torture_must_stop()) {
449 torture_kthread_stopping("torture_shutdown");
450 return 0;
451 }
452
453 /* OK, shut down the system. */
454
455 VERBOSE_TOROUT_STRING("torture_shutdown task shutting down system");
456 shutdown_task = NULL; /* Avoid self-kill deadlock. */
457 if (torture_shutdown_hook)
458 torture_shutdown_hook();
459 else
460 VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
461 kernel_power_off(); /* Shut down the system. */
462 return 0;
463}
464
465/*
466 * Start up the shutdown task.
467 */
468int torture_shutdown_init(int ssecs, void (*cleanup)(void))
469{
470 int ret = 0;
471
472 shutdown_secs = ssecs;
473 torture_shutdown_hook = cleanup;
474 if (shutdown_secs > 0) {
475 shutdown_time = jiffies + shutdown_secs * HZ;
476 ret = torture_create_kthread(torture_shutdown, NULL,
477 shutdown_task);
478 }
479 return ret;
480}
481EXPORT_SYMBOL_GPL(torture_shutdown_init);
482
483/*
484 * Detect and respond to a system shutdown.
485 */
486static int torture_shutdown_notify(struct notifier_block *unused1,
487 unsigned long unused2, void *unused3)
488{
489 mutex_lock(&fullstop_mutex);
490 if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
491 VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected");
492 ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;
493 } else {
494 pr_warn("Concurrent rmmod and shutdown illegal!\n");
495 }
496 mutex_unlock(&fullstop_mutex);
497 return NOTIFY_DONE;
498}
499
500static struct notifier_block torture_shutdown_nb = {
501 .notifier_call = torture_shutdown_notify,
502};
503
504/*
505 * Shut down the shutdown task. Say what??? Heh! This can happen if
506 * the torture module gets an rmmod before the shutdown time arrives. ;-)
507 */
508static void torture_shutdown_cleanup(void)
509{
510 unregister_reboot_notifier(&torture_shutdown_nb);
511 if (shutdown_task != NULL) {
512 VERBOSE_TOROUT_STRING("Stopping torture_shutdown task");
513 kthread_stop(shutdown_task);
514 }
515 shutdown_task = NULL;
516}
517
518/*
519 * Variables for stuttering, which means to periodically pause and
520 * restart testing in order to catch bugs that appear when load is
521 * suddenly applied to or removed from the system.
522 */
523static struct task_struct *stutter_task;
524static int stutter_pause_test;
525static int stutter;
526
527/*
528 * Block until the stutter interval ends. This must be called periodically
529 * by all running kthreads that need to be subject to stuttering.
530 */
531void stutter_wait(const char *title)
532{
533 while (ACCESS_ONCE(stutter_pause_test) ||
534 (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
535 if (stutter_pause_test)
536 schedule_timeout_interruptible(1);
537 else
538 schedule_timeout_interruptible(round_jiffies_relative(HZ));
539 torture_shutdown_absorb(title);
540 }
541}
542EXPORT_SYMBOL_GPL(stutter_wait);
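
Each torture kthread is expected to poll stutter_wait() once per iteration so the stutter task can pause and resume the whole test in lockstep. A minimal sketch of that call pattern in a client kthread:

static int example_stress(void *arg)
{
	do {
		/* ... one unit of work against the code under test ... */
		stutter_wait("example_stress");
	} while (!torture_must_stop());
	torture_kthread_stopping("example_stress");
	return 0;
}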
543
544/*
545 * Cause the torture test to "stutter", starting and stopping all
546 * threads periodically.
547 */
548static int torture_stutter(void *arg)
549{
550 VERBOSE_TOROUT_STRING("torture_stutter task started");
551 do {
552 if (!torture_must_stop()) {
553 schedule_timeout_interruptible(stutter);
554 ACCESS_ONCE(stutter_pause_test) = 1;
555 }
556 if (!torture_must_stop())
557 schedule_timeout_interruptible(stutter);
558 ACCESS_ONCE(stutter_pause_test) = 0;
559 torture_shutdown_absorb("torture_stutter");
560 } while (!torture_must_stop());
561 torture_kthread_stopping("torture_stutter");
562 return 0;
563}
564
565/*
566 * Initialize and kick off the torture_stutter kthread.
567 */
568int torture_stutter_init(int s)
569{
570 int ret;
571
572 stutter = s;
573 ret = torture_create_kthread(torture_stutter, NULL, stutter_task);
574 return ret;
575}
576EXPORT_SYMBOL_GPL(torture_stutter_init);
577
578/*
579 * Cleanup after the torture_stutter kthread.
580 */
581static void torture_stutter_cleanup(void)
582{
583 if (!stutter_task)
584 return;
585 VERBOSE_TOROUT_STRING("Stopping torture_stutter task");
586 kthread_stop(stutter_task);
587 stutter_task = NULL;
588}
589
590/*
591 * Initialize torture module. Please note that this is -not- invoked via
592 * the usual module_init() mechanism, but rather by an explicit call from
593 * the client torture module. This call must be paired with a later
594 * torture_init_end().
595 *
596 * The runnable parameter points to a flag that controls whether or not
597 * the test is currently runnable. If there is no such flag, pass in NULL.
598 */
599void __init torture_init_begin(char *ttype, bool v, int *runnable)
600{
601 mutex_lock(&fullstop_mutex);
602 torture_type = ttype;
603 verbose = v;
604 torture_runnable = runnable;
605 fullstop = FULLSTOP_DONTSTOP;
606
607}
608EXPORT_SYMBOL_GPL(torture_init_begin);
609
610/*
611 * Tell the torture module that initialization is complete.
612 */
613void __init torture_init_end(void)
614{
615 mutex_unlock(&fullstop_mutex);
616 register_reboot_notifier(&torture_shutdown_nb);
617}
618EXPORT_SYMBOL_GPL(torture_init_end);
619
620/*
621 * Clean up torture module. Please note that this is -not- invoked via
622 * the usual module_exit() mechanism, but rather by an explicit call from
623 * the client torture module. Returns true if a race with system shutdown
624 * is detected, otherwise, all kthreads started by functions in this file
625 * will be shut down.
626 *
627 * This must be called before the caller starts shutting down its own
628 * kthreads.
629 */
630bool torture_cleanup(void)
631{
632 mutex_lock(&fullstop_mutex);
633 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
634 pr_warn("Concurrent rmmod and shutdown illegal!\n");
635 mutex_unlock(&fullstop_mutex);
636 schedule_timeout_uninterruptible(10);
637 return true;
638 }
639 ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD;
640 mutex_unlock(&fullstop_mutex);
641 torture_shutdown_cleanup();
642 torture_shuffle_cleanup();
643 torture_stutter_cleanup();
644 torture_onoff_cleanup();
645 return false;
646}
647EXPORT_SYMBOL_GPL(torture_cleanup);
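
Putting the pieces together, a client module brackets its own setup with torture_init_begin()/torture_init_end() and calls torture_cleanup() first on the way out. A hedged sketch of that lifecycle (the "example" type string, verbose flag, and runnable flag are illustrative; assumes the usual <linux/module.h> and <linux/torture.h> includes):

static char example_type[] = "example";
static bool example_verbose;
static int example_runnable = 1;

static int __init example_init(void)
{
	torture_init_begin(example_type, example_verbose, &example_runnable);
	/* ... allocate state, start kthreads, torture_*_init() calls ... */
	torture_init_end();
	return 0;
}

static void example_exit(void)
{
	if (torture_cleanup())
		return;	/* Raced with system shutdown; kthreads already parked. */
	/* ... stop this module's own kthreads, then free its state ... */
}

module_init(example_init);
module_exit(example_exit);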
648
649/*
650 * Is it time for the current torture test to stop?
651 */
652bool torture_must_stop(void)
653{
654 return torture_must_stop_irq() || kthread_should_stop();
655}
656EXPORT_SYMBOL_GPL(torture_must_stop);
657
658/*
659 * Is it time for the current torture test to stop? This is the irq-safe
660 * version, hence no check for kthread_should_stop().
661 */
662bool torture_must_stop_irq(void)
663{
664 return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
665}
666EXPORT_SYMBOL_GPL(torture_must_stop_irq);
667
668/*
669 * Each kthread must wait for kthread_should_stop() before returning from
670 * its top-level function, otherwise segfaults ensue. This function
671 * prints a "stopping" message and waits for kthread_should_stop(), and
672 * should be called from all torture kthreads immediately prior to
673 * returning.
674 */
675void torture_kthread_stopping(char *title)
676{
677 if (verbose)
678 VERBOSE_TOROUT_STRING(title);
679 while (!kthread_should_stop()) {
680 torture_shutdown_absorb(title);
681 schedule_timeout_uninterruptible(1);
682 }
683}
684EXPORT_SYMBOL_GPL(torture_kthread_stopping);
685
686/*
687 * Create a generic torture kthread that is immediately runnable. If you
688 * need the kthread to be stopped so that you can do something to it before
689 * it starts, you will need to open-code your own.
690 */
691int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
692 char *f, struct task_struct **tp)
693{
694 int ret = 0;
695
696 VERBOSE_TOROUT_STRING(m);
697 *tp = kthread_run(fn, arg, s);
698 if (IS_ERR(*tp)) {
699 ret = PTR_ERR(*tp);
700 VERBOSE_TOROUT_ERRSTRING(f);
701 *tp = NULL;
702 }
703 torture_shuffle_task_register(*tp);
704 return ret;
705}
706EXPORT_SYMBOL_GPL(_torture_create_kthread);
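
Callers normally go through the torture_create_kthread() wrapper seen in torture_onoff_init() above, which is assumed to live in linux/torture.h and to stringify the function name into the messages; a matching torture_stop_kthread() wrapper around _torture_stop_kthread() is assumed as well. A minimal start/stop pair under those assumptions:

static struct task_struct *example_task;

static int example_kthread(void *arg)
{
	do {
		schedule_timeout_interruptible(HZ);
	} while (!torture_must_stop());
	torture_kthread_stopping("example_kthread");
	return 0;
}

static int example_start(void)
{
	/* Creates and wakes the kthread, registers it for shuffling. */
	return torture_create_kthread(example_kthread, NULL, example_task);
}

static void example_stop(void)
{
	/* Assumed companion wrapper; stops the kthread and NULLs the pointer. */
	torture_stop_kthread(example_kthread, example_task);
}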
707
708/*
709 * Stop a generic kthread, emitting a message.
710 */
711void _torture_stop_kthread(char *m, struct task_struct **tp)
712{
713 if (*tp == NULL)
714 return;
715 VERBOSE_TOROUT_STRING(m);
716 kthread_stop(*tp);
717 *tp = NULL;
718}
719EXPORT_SYMBOL_GPL(_torture_stop_kthread);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a5457d577b98..0434ff1b808e 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -40,8 +40,8 @@ static int write_iteration = 50;
40module_param(write_iteration, uint, 0644); 40module_param(write_iteration, uint, 0644);
41MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); 41MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
42 42
43static int producer_nice = 19; 43static int producer_nice = MAX_NICE;
44static int consumer_nice = 19; 44static int consumer_nice = MAX_NICE;
45 45
46static int producer_fifo = -1; 46static int producer_fifo = -1;
47static int consumer_fifo = -1; 47static int consumer_fifo = -1;
@@ -308,7 +308,7 @@ static void ring_buffer_producer(void)
308 308
309 /* Let the user know that the test is running at low priority */ 309 /* Let the user know that the test is running at low priority */
310 if (producer_fifo < 0 && consumer_fifo < 0 && 310 if (producer_fifo < 0 && consumer_fifo < 0 &&
311 producer_nice == 19 && consumer_nice == 19) 311 producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
312 trace_printk("WARNING!!! This test is running at lowest priority.\n"); 312 trace_printk("WARNING!!! This test is running at lowest priority.\n");
313 313
314 trace_printk("Time: %lld (usecs)\n", time); 314 trace_printk("Time: %lld (usecs)\n", time);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 815c878f409b..24c1f2382557 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1600,15 +1600,31 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,
1600} 1600}
1601EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); 1601EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1602 1602
1603static struct ring_buffer *temp_buffer;
1604
1603struct ring_buffer_event * 1605struct ring_buffer_event *
1604trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, 1606trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
1605 struct ftrace_event_file *ftrace_file, 1607 struct ftrace_event_file *ftrace_file,
1606 int type, unsigned long len, 1608 int type, unsigned long len,
1607 unsigned long flags, int pc) 1609 unsigned long flags, int pc)
1608{ 1610{
1611 struct ring_buffer_event *entry;
1612
1609 *current_rb = ftrace_file->tr->trace_buffer.buffer; 1613 *current_rb = ftrace_file->tr->trace_buffer.buffer;
1610 return trace_buffer_lock_reserve(*current_rb, 1614 entry = trace_buffer_lock_reserve(*current_rb,
1611 type, len, flags, pc); 1615 type, len, flags, pc);
1616 /*
1617 * If tracing is off, but we have triggers enabled
1618 * we still need to look at the event data. Use the temp_buffer
 1619	 * to store the trace event for the trigger to use. It's recursion
1620 * safe and will not be recorded anywhere.
1621 */
1622 if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) {
1623 *current_rb = temp_buffer;
1624 entry = trace_buffer_lock_reserve(*current_rb,
1625 type, len, flags, pc);
1626 }
1627 return entry;
1612} 1628}
1613EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); 1629EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
1614 1630
@@ -6494,11 +6510,16 @@ __init static int tracer_alloc_buffers(void)
6494 6510
6495 raw_spin_lock_init(&global_trace.start_lock); 6511 raw_spin_lock_init(&global_trace.start_lock);
6496 6512
6513 /* Used for event triggers */
6514 temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
6515 if (!temp_buffer)
6516 goto out_free_cpumask;
6517
6497 /* TODO: make the number of buffers hot pluggable with CPUS */ 6518 /* TODO: make the number of buffers hot pluggable with CPUS */
6498 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { 6519 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
6499 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 6520 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
6500 WARN_ON(1); 6521 WARN_ON(1);
6501 goto out_free_cpumask; 6522 goto out_free_temp_buffer;
6502 } 6523 }
6503 6524
6504 if (global_trace.buffer_disabled) 6525 if (global_trace.buffer_disabled)
@@ -6540,6 +6561,8 @@ __init static int tracer_alloc_buffers(void)
6540 6561
6541 return 0; 6562 return 0;
6542 6563
6564out_free_temp_buffer:
6565 ring_buffer_free(temp_buffer);
6543out_free_cpumask: 6566out_free_cpumask:
6544 free_percpu(global_trace.trace_buffer.data); 6567 free_percpu(global_trace.trace_buffer.data);
6545#ifdef CONFIG_TRACER_MAX_TRACE 6568#ifdef CONFIG_TRACER_MAX_TRACE
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index e854f420e033..c894614de14d 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -31,9 +31,25 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
31 } 31 }
32 32
33 /* The ftrace function trace is allowed only for root. */ 33 /* The ftrace function trace is allowed only for root. */
34 if (ftrace_event_is_function(tp_event) && 34 if (ftrace_event_is_function(tp_event)) {
35 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) 35 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
36 return -EPERM; 36 return -EPERM;
37
38 /*
39 * We don't allow user space callchains for function trace
40 * event, due to issues with page faults while tracing page
 41	 * fault handler and its overall tricky nature.
42 */
43 if (!p_event->attr.exclude_callchain_user)
44 return -EINVAL;
45
46 /*
47 * Same reason to disable user stack dump as for user space
48 * callchains above.
49 */
50 if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
51 return -EINVAL;
52 }
37 53
38 /* No tracing, just counting, so no obvious leak */ 54 /* No tracing, just counting, so no obvious leak */
39 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) 55 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e71ffd4eccb5..7b16d40bd64d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,12 +27,6 @@
27 27
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30DEFINE_MUTEX(event_storage_mutex);
31EXPORT_SYMBOL_GPL(event_storage_mutex);
32
33char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage);
35
36LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
37static LIST_HEAD(ftrace_common_fields); 31static LIST_HEAD(ftrace_common_fields);
38 32
@@ -1777,6 +1771,16 @@ static void trace_module_add_events(struct module *mod)
1777{ 1771{
1778 struct ftrace_event_call **call, **start, **end; 1772 struct ftrace_event_call **call, **start, **end;
1779 1773
1774 if (!mod->num_trace_events)
1775 return;
1776
1777 /* Don't add infrastructure for mods without tracepoints */
1778 if (trace_module_has_bad_taint(mod)) {
1779 pr_err("%s: module has bad taint, not creating trace events\n",
1780 mod->name);
1781 return;
1782 }
1783
1780 start = mod->trace_events; 1784 start = mod->trace_events;
1781 end = mod->trace_events + mod->num_trace_events; 1785 end = mod->trace_events + mod->num_trace_events;
1782 1786
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 7c3e3e72e2b6..ee0a5098ac43 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \
95#undef __array 95#undef __array
96#define __array(type, item, len) \ 96#define __array(type, item, len) \
97 do { \ 97 do { \
98 char *type_str = #type"["__stringify(len)"]"; \
98 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 99 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
99 mutex_lock(&event_storage_mutex); \ 100 ret = trace_define_field(event_call, type_str, #item, \
100 snprintf(event_storage, sizeof(event_storage), \
101 "%s[%d]", #type, len); \
102 ret = trace_define_field(event_call, event_storage, #item, \
103 offsetof(typeof(field), item), \ 101 offsetof(typeof(field), item), \
104 sizeof(field.item), \ 102 sizeof(field.item), \
105 is_signed_type(type), filter_type); \ 103 is_signed_type(type), filter_type); \
106 mutex_unlock(&event_storage_mutex); \
107 if (ret) \ 104 if (ret) \
108 return ret; \ 105 return ret; \
109 } while (0); 106 } while (0);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2aefbee93a6d..887ef88b0bc7 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -498,14 +498,14 @@ void trace_hardirqs_off(void)
498} 498}
499EXPORT_SYMBOL(trace_hardirqs_off); 499EXPORT_SYMBOL(trace_hardirqs_off);
500 500
501void trace_hardirqs_on_caller(unsigned long caller_addr) 501__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
502{ 502{
503 if (!preempt_trace() && irq_trace()) 503 if (!preempt_trace() && irq_trace())
504 stop_critical_timing(CALLER_ADDR0, caller_addr); 504 stop_critical_timing(CALLER_ADDR0, caller_addr);
505} 505}
506EXPORT_SYMBOL(trace_hardirqs_on_caller); 506EXPORT_SYMBOL(trace_hardirqs_on_caller);
507 507
508void trace_hardirqs_off_caller(unsigned long caller_addr) 508__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
509{ 509{
510 if (!preempt_trace() && irq_trace()) 510 if (!preempt_trace() && irq_trace())
511 start_critical_timing(CALLER_ADDR0, caller_addr); 511 start_critical_timing(CALLER_ADDR0, caller_addr);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 29f26540e9c9..031cc5655a51 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -631,6 +631,11 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
631EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 631EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
632 632
633#ifdef CONFIG_MODULES 633#ifdef CONFIG_MODULES
634bool trace_module_has_bad_taint(struct module *mod)
635{
636 return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP));
637}
638
634static int tracepoint_module_coming(struct module *mod) 639static int tracepoint_module_coming(struct module *mod)
635{ 640{
636 struct tp_module *tp_mod, *iter; 641 struct tp_module *tp_mod, *iter;
@@ -641,7 +646,7 @@ static int tracepoint_module_coming(struct module *mod)
641 * module headers (for forced load), to make sure we don't cause a crash. 646 * module headers (for forced load), to make sure we don't cause a crash.
642 * Staging and out-of-tree GPL modules are fine. 647 * Staging and out-of-tree GPL modules are fine.
643 */ 648 */
644 if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) 649 if (trace_module_has_bad_taint(mod))
645 return 0; 650 return 0;
646 mutex_lock(&tracepoints_mutex); 651 mutex_lock(&tracepoints_mutex);
647 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); 652 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 240fb62cf394..dd06439b9c84 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -225,7 +225,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id)
225 * 225 *
226 * When there is no mapping defined for the user-namespace uid 226 * When there is no mapping defined for the user-namespace uid
227 * pair INVALID_UID is returned. Callers are expected to test 227 * pair INVALID_UID is returned. Callers are expected to test
228 * for and handle handle INVALID_UID being returned. INVALID_UID 228 * for and handle INVALID_UID being returned. INVALID_UID
229 * may be tested for using uid_valid(). 229 * may be tested for using uid_valid().
230 */ 230 */
231kuid_t make_kuid(struct user_namespace *ns, uid_t uid) 231kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 82ef9f3b7473..3fa5b8f3aae3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1851,6 +1851,12 @@ static void destroy_worker(struct worker *worker)
1851 if (worker->flags & WORKER_IDLE) 1851 if (worker->flags & WORKER_IDLE)
1852 pool->nr_idle--; 1852 pool->nr_idle--;
1853 1853
1854 /*
1855 * Once WORKER_DIE is set, the kworker may destroy itself at any
1856 * point. Pin to ensure the task stays until we're done with it.
1857 */
1858 get_task_struct(worker->task);
1859
1854 list_del_init(&worker->entry); 1860 list_del_init(&worker->entry);
1855 worker->flags |= WORKER_DIE; 1861 worker->flags |= WORKER_DIE;
1856 1862
@@ -1859,6 +1865,7 @@ static void destroy_worker(struct worker *worker)
1859 spin_unlock_irq(&pool->lock); 1865 spin_unlock_irq(&pool->lock);
1860 1866
1861 kthread_stop(worker->task); 1867 kthread_stop(worker->task);
1868 put_task_struct(worker->task);
1862 kfree(worker); 1869 kfree(worker);
1863 1870
1864 spin_lock_irq(&pool->lock); 1871 spin_lock_irq(&pool->lock);
@@ -3218,7 +3225,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
3218 return -ENOMEM; 3225 return -ENOMEM;
3219 3226
3220 if (sscanf(buf, "%d", &attrs->nice) == 1 && 3227 if (sscanf(buf, "%d", &attrs->nice) == 1 &&
3221 attrs->nice >= -20 && attrs->nice <= 19) 3228 attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
3222 ret = apply_workqueue_attrs(wq, attrs); 3229 ret = apply_workqueue_attrs(wq, attrs);
3223 else 3230 else
3224 ret = -EINVAL; 3231 ret = -EINVAL;