Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 9
-rw-r--r--  kernel/cgroup.c | 461
-rw-r--r--  kernel/cgroup_freezer.c | 2
-rw-r--r--  kernel/cpu.c | 33
-rw-r--r--  kernel/cpuset.c | 500
-rw-r--r--  kernel/events/core.c | 42
-rw-r--r--  kernel/fork.c | 1
-rw-r--r--  kernel/futex.c | 402
-rw-r--r--  kernel/irq/generic-chip.c | 5
-rw-r--r--  kernel/irq/irqdomain.c | 2
-rw-r--r--  kernel/irq_work.c | 110
-rw-r--r--  kernel/kexec.c | 4
-rw-r--r--  kernel/kprobes.c | 14
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/locking/lockdep.c | 2
-rw-r--r--  kernel/locking/mcs_spinlock.c | 72
-rw-r--r--  kernel/locking/mcs_spinlock.h | 13
-rw-r--r--  kernel/locking/mutex.c | 41
-rw-r--r--  kernel/locking/qrwlock.c | 9
-rw-r--r--  kernel/locking/rtmutex-debug.c | 5
-rw-r--r--  kernel/locking/rtmutex-debug.h | 7
-rw-r--r--  kernel/locking/rtmutex.c | 562
-rw-r--r--  kernel/locking/rtmutex.h | 7
-rw-r--r--  kernel/locking/rtmutex_common.h | 22
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 28
-rw-r--r--  kernel/locking/rwsem-xadd.c | 20
-rw-r--r--  kernel/locking/rwsem.c | 2
-rw-r--r--  kernel/module.c | 4
-rw-r--r--  kernel/power/hibernate.c | 6
-rw-r--r--  kernel/power/process.c | 1
-rw-r--r--  kernel/power/suspend.c | 6
-rw-r--r--  kernel/ptrace.c | 8
-rw-r--r--  kernel/rcu/rcu.h | 8
-rw-r--r--  kernel/rcu/rcutorture.c | 4
-rw-r--r--  kernel/rcu/srcu.c | 4
-rw-r--r--  kernel/rcu/tree.c | 199
-rw-r--r--  kernel/rcu/tree.h | 42
-rw-r--r--  kernel/rcu/tree_plugin.h | 304
-rw-r--r--  kernel/rcu/update.c | 25
-rw-r--r--  kernel/sched/core.c | 128
-rw-r--r--  kernel/sched/cpuacct.c | 2
-rw-r--r--  kernel/sched/deadline.c | 18
-rw-r--r--  kernel/sched/debug.c | 2
-rw-r--r--  kernel/sched/fair.c | 244
-rw-r--r--  kernel/sched/idle.c | 4
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 30
-rw-r--r--  kernel/sched/sched.h | 38
-rw-r--r--  kernel/sched/wait.c | 30
-rw-r--r--  kernel/signal.c | 4
-rw-r--r--  kernel/smp.c | 9
-rw-r--r--  kernel/time/alarmtimer.c | 22
-rw-r--r--  kernel/time/clockevents.c | 10
-rw-r--r--  kernel/time/sched_clock.c | 4
-rw-r--r--  kernel/time/tick-sched.c | 20
-rw-r--r--  kernel/torture.c | 2
-rw-r--r--  kernel/trace/Kconfig | 5
-rw-r--r--  kernel/trace/Makefile | 1
-rw-r--r--  kernel/trace/ftrace.c | 449
-rw-r--r--  kernel/trace/ring_buffer.c | 30
-rw-r--r--  kernel/trace/trace.c | 116
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_clock.c | 9
-rw-r--r--  kernel/trace/trace_event_perf.c | 12
-rw-r--r--  kernel/trace/trace_events.c | 61
-rw-r--r--  kernel/trace/trace_events_filter.c | 73
-rw-r--r--  kernel/trace/trace_functions_graph.c | 43
-rw-r--r--  kernel/trace/trace_output.c | 282
-rw-r--r--  kernel/trace/trace_output.h | 4
-rw-r--r--  kernel/trace/trace_seq.c | 428
-rw-r--r--  kernel/trace/trace_uprobe.c | 3
-rw-r--r--  kernel/workqueue.c | 206
72 files changed, 3355 insertions(+), 1926 deletions(-)
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 35536d9c0964..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -220,9 +220,16 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
220 220
221endif 221endif
222 222
223config ARCH_SUPPORTS_ATOMIC_RMW
224 bool
225
223config MUTEX_SPIN_ON_OWNER 226config MUTEX_SPIN_ON_OWNER
224 def_bool y 227 def_bool y
225 depends on SMP && !DEBUG_MUTEXES 228 depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
229
230config RWSEM_SPIN_ON_OWNER
231 def_bool y
232 depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
226 233
227config ARCH_USE_QUEUE_RWLOCK 234config ARCH_USE_QUEUE_RWLOCK
228 bool 235 bool
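The hunk above introduces ARCH_SUPPORTS_ATOMIC_RMW and gates optimistic spinning (MUTEX_SPIN_ON_OWNER, plus the new RWSEM_SPIN_ON_OWNER) on it, so the spinning fast paths are only built where the architecture has opted in. A minimal sketch of that opt-in, assuming a hypothetical architecture symbol; this fragment is illustrative and not part of the diff:

# arch/example/Kconfig (hypothetical)
config EXAMPLE_ARCH
	def_bool y
	# Native atomic RMW operations (cmpxchg and friends) make
	# spinning on a lock owner safe and worthwhile, so advertise
	# the capability; with SMP this enables MUTEX_SPIN_ON_OWNER
	# and RWSEM_SPIN_ON_OWNER.
	select ARCH_SUPPORTS_ATOMIC_RMW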
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 70776aec2562..7dc8788cfd52 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root;
149 */ 149 */
150static bool cgrp_dfl_root_visible; 150static bool cgrp_dfl_root_visible;
151 151
152/*
153 * Set by the boot param of the same name and makes subsystems with NULL
154 * ->dfl_files to use ->legacy_files on the default hierarchy.
155 */
156static bool cgroup_legacy_files_on_dfl;
157
152/* some controllers are not supported in the default hierarchy */ 158/* some controllers are not supported in the default hierarchy */
153static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 159static unsigned int cgrp_dfl_root_inhibit_ss_mask;
154#ifdef CONFIG_CGROUP_DEBUG
155 | (1 << debug_cgrp_id)
156#endif
157 ;
158 160
159/* The list of hierarchy roots */ 161/* The list of hierarchy roots */
160 162
@@ -180,13 +182,15 @@ static u64 css_serial_nr_next = 1;
180 */ 182 */
181static int need_forkexit_callback __read_mostly; 183static int need_forkexit_callback __read_mostly;
182 184
183static struct cftype cgroup_base_files[]; 185static struct cftype cgroup_dfl_base_files[];
186static struct cftype cgroup_legacy_base_files[];
184 187
185static void cgroup_put(struct cgroup *cgrp); 188static void cgroup_put(struct cgroup *cgrp);
186static int rebind_subsystems(struct cgroup_root *dst_root, 189static int rebind_subsystems(struct cgroup_root *dst_root,
187 unsigned int ss_mask); 190 unsigned int ss_mask);
188static int cgroup_destroy_locked(struct cgroup *cgrp); 191static int cgroup_destroy_locked(struct cgroup *cgrp);
189static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); 192static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
193 bool visible);
190static void css_release(struct percpu_ref *ref); 194static void css_release(struct percpu_ref *ref);
191static void kill_css(struct cgroup_subsys_state *css); 195static void kill_css(struct cgroup_subsys_state *css);
192static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 196static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
@@ -1037,6 +1041,58 @@ static void cgroup_put(struct cgroup *cgrp)
1037} 1041}
1038 1042
1039/** 1043/**
1044 * cgroup_refresh_child_subsys_mask - update child_subsys_mask
1045 * @cgrp: the target cgroup
1046 *
1047 * On the default hierarchy, a subsystem may request other subsystems to be
1048 * enabled together through its ->depends_on mask. In such cases, more
1049 * subsystems than specified in "cgroup.subtree_control" may be enabled.
1050 *
1051 * This function determines which subsystems need to be enabled given the
1052 * current @cgrp->subtree_control and records it in
1053 * @cgrp->child_subsys_mask. The resulting mask is always a superset of
1054 * @cgrp->subtree_control and follows the usual hierarchy rules.
1055 */
1056static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
1057{
1058 struct cgroup *parent = cgroup_parent(cgrp);
1059 unsigned int cur_ss_mask = cgrp->subtree_control;
1060 struct cgroup_subsys *ss;
1061 int ssid;
1062
1063 lockdep_assert_held(&cgroup_mutex);
1064
1065 if (!cgroup_on_dfl(cgrp)) {
1066 cgrp->child_subsys_mask = cur_ss_mask;
1067 return;
1068 }
1069
1070 while (true) {
1071 unsigned int new_ss_mask = cur_ss_mask;
1072
1073 for_each_subsys(ss, ssid)
1074 if (cur_ss_mask & (1 << ssid))
1075 new_ss_mask |= ss->depends_on;
1076
1077 /*
1078 * Mask out subsystems which aren't available. This can
1079 * happen only if some depended-upon subsystems were bound
1080 * to non-default hierarchies.
1081 */
1082 if (parent)
1083 new_ss_mask &= parent->child_subsys_mask;
1084 else
1085 new_ss_mask &= cgrp->root->subsys_mask;
1086
1087 if (new_ss_mask == cur_ss_mask)
1088 break;
1089 cur_ss_mask = new_ss_mask;
1090 }
1091
1092 cgrp->child_subsys_mask = cur_ss_mask;
1093}
1094
1095/**
1040 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods 1096 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1041 * @kn: the kernfs_node being serviced 1097 * @kn: the kernfs_node being serviced
1042 * 1098 *
@@ -1208,12 +1264,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
1208 up_write(&css_set_rwsem); 1264 up_write(&css_set_rwsem);
1209 1265
1210 src_root->subsys_mask &= ~(1 << ssid); 1266 src_root->subsys_mask &= ~(1 << ssid);
1211 src_root->cgrp.child_subsys_mask &= ~(1 << ssid); 1267 src_root->cgrp.subtree_control &= ~(1 << ssid);
1268 cgroup_refresh_child_subsys_mask(&src_root->cgrp);
1212 1269
1213 /* default hierarchy doesn't enable controllers by default */ 1270 /* default hierarchy doesn't enable controllers by default */
1214 dst_root->subsys_mask |= 1 << ssid; 1271 dst_root->subsys_mask |= 1 << ssid;
1215 if (dst_root != &cgrp_dfl_root) 1272 if (dst_root != &cgrp_dfl_root) {
1216 dst_root->cgrp.child_subsys_mask |= 1 << ssid; 1273 dst_root->cgrp.subtree_control |= 1 << ssid;
1274 cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
1275 }
1217 1276
1218 if (ss->bind) 1277 if (ss->bind)
1219 ss->bind(css); 1278 ss->bind(css);
@@ -1233,8 +1292,6 @@ static int cgroup_show_options(struct seq_file *seq,
1233 for_each_subsys(ss, ssid) 1292 for_each_subsys(ss, ssid)
1234 if (root->subsys_mask & (1 << ssid)) 1293 if (root->subsys_mask & (1 << ssid))
1235 seq_printf(seq, ",%s", ss->name); 1294 seq_printf(seq, ",%s", ss->name);
1236 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1237 seq_puts(seq, ",sane_behavior");
1238 if (root->flags & CGRP_ROOT_NOPREFIX) 1295 if (root->flags & CGRP_ROOT_NOPREFIX)
1239 seq_puts(seq, ",noprefix"); 1296 seq_puts(seq, ",noprefix");
1240 if (root->flags & CGRP_ROOT_XATTR) 1297 if (root->flags & CGRP_ROOT_XATTR)
@@ -1268,6 +1325,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1268 bool all_ss = false, one_ss = false; 1325 bool all_ss = false, one_ss = false;
1269 unsigned int mask = -1U; 1326 unsigned int mask = -1U;
1270 struct cgroup_subsys *ss; 1327 struct cgroup_subsys *ss;
1328 int nr_opts = 0;
1271 int i; 1329 int i;
1272 1330
1273#ifdef CONFIG_CPUSETS 1331#ifdef CONFIG_CPUSETS
@@ -1277,6 +1335,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1277 memset(opts, 0, sizeof(*opts)); 1335 memset(opts, 0, sizeof(*opts));
1278 1336
1279 while ((token = strsep(&o, ",")) != NULL) { 1337 while ((token = strsep(&o, ",")) != NULL) {
1338 nr_opts++;
1339
1280 if (!*token) 1340 if (!*token)
1281 return -EINVAL; 1341 return -EINVAL;
1282 if (!strcmp(token, "none")) { 1342 if (!strcmp(token, "none")) {
@@ -1361,37 +1421,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1361 return -ENOENT; 1421 return -ENOENT;
1362 } 1422 }
1363 1423
1364 /* Consistency checks */
1365
1366 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1424 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1367 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1425 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1368 1426 if (nr_opts != 1) {
1369 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || 1427 pr_err("sane_behavior: no other mount options allowed\n");
1370 opts->cpuset_clone_children || opts->release_agent ||
1371 opts->name) {
1372 pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1373 return -EINVAL; 1428 return -EINVAL;
1374 } 1429 }
1375 } else { 1430 return 0;
1376 /*
1377 * If the 'all' option was specified select all the
1378 * subsystems, otherwise if 'none', 'name=' and a subsystem
1379 * name options were not specified, let's default to 'all'
1380 */
1381 if (all_ss || (!one_ss && !opts->none && !opts->name))
1382 for_each_subsys(ss, i)
1383 if (!ss->disabled)
1384 opts->subsys_mask |= (1 << i);
1385
1386 /*
1387 * We either have to specify by name or by subsystems. (So
1388 * all empty hierarchies must have a name).
1389 */
1390 if (!opts->subsys_mask && !opts->name)
1391 return -EINVAL;
1392 } 1431 }
1393 1432
1394 /* 1433 /*
1434 * If the 'all' option was specified select all the subsystems,
1435 * otherwise if 'none', 'name=' and a subsystem name options were
1436 * not specified, let's default to 'all'
1437 */
1438 if (all_ss || (!one_ss && !opts->none && !opts->name))
1439 for_each_subsys(ss, i)
1440 if (!ss->disabled)
1441 opts->subsys_mask |= (1 << i);
1442
1443 /*
1444 * We either have to specify by name or by subsystems. (So all
1445 * empty hierarchies must have a name).
1446 */
1447 if (!opts->subsys_mask && !opts->name)
1448 return -EINVAL;
1449
1450 /*
1395 * Option noprefix was introduced just for backward compatibility 1451 * Option noprefix was introduced just for backward compatibility
1396 * with the old cpuset, so we allow noprefix only if mounting just 1452 * with the old cpuset, so we allow noprefix only if mounting just
1397 * the cpuset subsystem. 1453 * the cpuset subsystem.
@@ -1399,7 +1455,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1399 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) 1455 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1400 return -EINVAL; 1456 return -EINVAL;
1401 1457
1402
1403 /* Can't specify "none" and some subsystems */ 1458 /* Can't specify "none" and some subsystems */
1404 if (opts->subsys_mask && opts->none) 1459 if (opts->subsys_mask && opts->none)
1405 return -EINVAL; 1460 return -EINVAL;
@@ -1414,8 +1469,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1414 struct cgroup_sb_opts opts; 1469 struct cgroup_sb_opts opts;
1415 unsigned int added_mask, removed_mask; 1470 unsigned int added_mask, removed_mask;
1416 1471
1417 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1472 if (root == &cgrp_dfl_root) {
1418 pr_err("sane_behavior: remount is not allowed\n"); 1473 pr_err("remount is not allowed\n");
1419 return -EINVAL; 1474 return -EINVAL;
1420 } 1475 }
1421 1476
@@ -1434,11 +1489,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1434 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1489 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1435 1490
1436 /* Don't allow flags or name to change at remount */ 1491 /* Don't allow flags or name to change at remount */
1437 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1492 if ((opts.flags ^ root->flags) ||
1438 (opts.name && strcmp(opts.name, root->name))) { 1493 (opts.name && strcmp(opts.name, root->name))) {
1439 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", 1494 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1440 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", 1495 opts.flags, opts.name ?: "", root->flags, root->name);
1441 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1442 ret = -EINVAL; 1496 ret = -EINVAL;
1443 goto out_unlock; 1497 goto out_unlock;
1444 } 1498 }
@@ -1563,6 +1617,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1563{ 1617{
1564 LIST_HEAD(tmp_links); 1618 LIST_HEAD(tmp_links);
1565 struct cgroup *root_cgrp = &root->cgrp; 1619 struct cgroup *root_cgrp = &root->cgrp;
1620 struct cftype *base_files;
1566 struct css_set *cset; 1621 struct css_set *cset;
1567 int i, ret; 1622 int i, ret;
1568 1623
@@ -1600,7 +1655,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1600 } 1655 }
1601 root_cgrp->kn = root->kf_root->kn; 1656 root_cgrp->kn = root->kf_root->kn;
1602 1657
1603 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); 1658 if (root == &cgrp_dfl_root)
1659 base_files = cgroup_dfl_base_files;
1660 else
1661 base_files = cgroup_legacy_base_files;
1662
1663 ret = cgroup_addrm_files(root_cgrp, base_files, true);
1604 if (ret) 1664 if (ret)
1605 goto destroy_root; 1665 goto destroy_root;
1606 1666
@@ -1638,7 +1698,7 @@ destroy_root:
1638exit_root_id: 1698exit_root_id:
1639 cgroup_exit_root_id(root); 1699 cgroup_exit_root_id(root);
1640cancel_ref: 1700cancel_ref:
1641 percpu_ref_cancel_init(&root_cgrp->self.refcnt); 1701 percpu_ref_exit(&root_cgrp->self.refcnt);
1642out: 1702out:
1643 free_cgrp_cset_links(&tmp_links); 1703 free_cgrp_cset_links(&tmp_links);
1644 return ret; 1704 return ret;
@@ -1672,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1672 goto out_unlock; 1732 goto out_unlock;
1673 1733
1674 /* look for a matching existing root */ 1734 /* look for a matching existing root */
1675 if (!opts.subsys_mask && !opts.none && !opts.name) { 1735 if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
1676 cgrp_dfl_root_visible = true; 1736 cgrp_dfl_root_visible = true;
1677 root = &cgrp_dfl_root; 1737 root = &cgrp_dfl_root;
1678 cgroup_get(&root->cgrp); 1738 cgroup_get(&root->cgrp);
@@ -1730,15 +1790,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1730 goto out_unlock; 1790 goto out_unlock;
1731 } 1791 }
1732 1792
1733 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1793 if (root->flags ^ opts.flags)
1734 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1794 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1735 pr_err("sane_behavior: new mount options should match the existing superblock\n");
1736 ret = -EINVAL;
1737 goto out_unlock;
1738 } else {
1739 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1740 }
1741 }
1742 1795
1743 /* 1796 /*
1744 * We want to reuse @root whose lifetime is governed by its 1797 * We want to reuse @root whose lifetime is governed by its
@@ -2457,9 +2510,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2457 2510
2458static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) 2511static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2459{ 2512{
2460 struct cgroup *cgrp = seq_css(seq)->cgroup; 2513 seq_puts(seq, "0\n");
2461
2462 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2463 return 0; 2514 return 0;
2464} 2515}
2465 2516
@@ -2496,7 +2547,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v)
2496{ 2547{
2497 struct cgroup *cgrp = seq_css(seq)->cgroup; 2548 struct cgroup *cgrp = seq_css(seq)->cgroup;
2498 2549
2499 cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); 2550 cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
2500 return 0; 2551 return 0;
2501} 2552}
2502 2553
@@ -2505,7 +2556,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2505{ 2556{
2506 struct cgroup *cgrp = seq_css(seq)->cgroup; 2557 struct cgroup *cgrp = seq_css(seq)->cgroup;
2507 2558
2508 cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); 2559 cgroup_print_ss_mask(seq, cgrp->subtree_control);
2509 return 0; 2560 return 0;
2510} 2561}
2511 2562
@@ -2611,6 +2662,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2611 loff_t off) 2662 loff_t off)
2612{ 2663{
2613 unsigned int enable = 0, disable = 0; 2664 unsigned int enable = 0, disable = 0;
2665 unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
2614 struct cgroup *cgrp, *child; 2666 struct cgroup *cgrp, *child;
2615 struct cgroup_subsys *ss; 2667 struct cgroup_subsys *ss;
2616 char *tok; 2668 char *tok;
@@ -2650,11 +2702,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2650 2702
2651 for_each_subsys(ss, ssid) { 2703 for_each_subsys(ss, ssid) {
2652 if (enable & (1 << ssid)) { 2704 if (enable & (1 << ssid)) {
2653 if (cgrp->child_subsys_mask & (1 << ssid)) { 2705 if (cgrp->subtree_control & (1 << ssid)) {
2654 enable &= ~(1 << ssid); 2706 enable &= ~(1 << ssid);
2655 continue; 2707 continue;
2656 } 2708 }
2657 2709
2710 /* unavailable or not enabled on the parent? */
2711 if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2712 (cgroup_parent(cgrp) &&
2713 !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
2714 ret = -ENOENT;
2715 goto out_unlock;
2716 }
2717
2718 /*
2719 * @ss is already enabled through dependency and
2720 * we'll just make it visible. Skip draining.
2721 */
2722 if (cgrp->child_subsys_mask & (1 << ssid))
2723 continue;
2724
2658 /* 2725 /*
2659 * Because css offlining is asynchronous, userland 2726 * Because css offlining is asynchronous, userland
2660 * might try to re-enable the same controller while 2727 * might try to re-enable the same controller while
@@ -2677,23 +2744,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2677 2744
2678 return restart_syscall(); 2745 return restart_syscall();
2679 } 2746 }
2680
2681 /* unavailable or not enabled on the parent? */
2682 if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2683 (cgroup_parent(cgrp) &&
2684 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
2685 ret = -ENOENT;
2686 goto out_unlock;
2687 }
2688 } else if (disable & (1 << ssid)) { 2747 } else if (disable & (1 << ssid)) {
2689 if (!(cgrp->child_subsys_mask & (1 << ssid))) { 2748 if (!(cgrp->subtree_control & (1 << ssid))) {
2690 disable &= ~(1 << ssid); 2749 disable &= ~(1 << ssid);
2691 continue; 2750 continue;
2692 } 2751 }
2693 2752
2694 /* a child has it enabled? */ 2753 /* a child has it enabled? */
2695 cgroup_for_each_live_child(child, cgrp) { 2754 cgroup_for_each_live_child(child, cgrp) {
2696 if (child->child_subsys_mask & (1 << ssid)) { 2755 if (child->subtree_control & (1 << ssid)) {
2697 ret = -EBUSY; 2756 ret = -EBUSY;
2698 goto out_unlock; 2757 goto out_unlock;
2699 } 2758 }
@@ -2707,7 +2766,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2707 } 2766 }
2708 2767
2709 /* 2768 /*
2710 * Except for the root, child_subsys_mask must be zero for a cgroup 2769 * Except for the root, subtree_control must be zero for a cgroup
2711 * with tasks so that child cgroups don't compete against tasks. 2770 * with tasks so that child cgroups don't compete against tasks.
2712 */ 2771 */
2713 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { 2772 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
@@ -2716,36 +2775,75 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2716 } 2775 }
2717 2776
2718 /* 2777 /*
2719 * Create csses for enables and update child_subsys_mask. This 2778 * Update subsys masks and calculate what needs to be done. More
2720 * changes cgroup_e_css() results which in turn makes the 2779 * subsystems than specified may need to be enabled or disabled
2721 * subsequent cgroup_update_dfl_csses() associate all tasks in the 2780 * depending on subsystem dependencies.
2722 * subtree to the updated csses. 2781 */
2782 cgrp->subtree_control |= enable;
2783 cgrp->subtree_control &= ~disable;
2784
2785 old_ctrl = cgrp->child_subsys_mask;
2786 cgroup_refresh_child_subsys_mask(cgrp);
2787 new_ctrl = cgrp->child_subsys_mask;
2788
2789 css_enable = ~old_ctrl & new_ctrl;
2790 css_disable = old_ctrl & ~new_ctrl;
2791 enable |= css_enable;
2792 disable |= css_disable;
2793
2794 /*
2795 * Create new csses or make the existing ones visible. A css is
2796 * created invisible if it's being implicitly enabled through
2797 * dependency. An invisible css is made visible when the userland
2798 * explicitly enables it.
2723 */ 2799 */
2724 for_each_subsys(ss, ssid) { 2800 for_each_subsys(ss, ssid) {
2725 if (!(enable & (1 << ssid))) 2801 if (!(enable & (1 << ssid)))
2726 continue; 2802 continue;
2727 2803
2728 cgroup_for_each_live_child(child, cgrp) { 2804 cgroup_for_each_live_child(child, cgrp) {
2729 ret = create_css(child, ss); 2805 if (css_enable & (1 << ssid))
2806 ret = create_css(child, ss,
2807 cgrp->subtree_control & (1 << ssid));
2808 else
2809 ret = cgroup_populate_dir(child, 1 << ssid);
2730 if (ret) 2810 if (ret)
2731 goto err_undo_css; 2811 goto err_undo_css;
2732 } 2812 }
2733 } 2813 }
2734 2814
2735 cgrp->child_subsys_mask |= enable; 2815 /*
2736 cgrp->child_subsys_mask &= ~disable; 2816 * At this point, cgroup_e_css() results reflect the new csses
2737 2817 * making the following cgroup_update_dfl_csses() properly update
2818 * css associations of all tasks in the subtree.
2819 */
2738 ret = cgroup_update_dfl_csses(cgrp); 2820 ret = cgroup_update_dfl_csses(cgrp);
2739 if (ret) 2821 if (ret)
2740 goto err_undo_css; 2822 goto err_undo_css;
2741 2823
2742 /* all tasks are now migrated away from the old csses, kill them */ 2824 /*
2825 * All tasks are migrated out of disabled csses. Kill or hide
2826 * them. A css is hidden when the userland requests it to be
2827 * disabled while other subsystems are still depending on it. The
2828 * css must not actively control resources and be in the vanilla
2829 * state if it's made visible again later. Controllers which may
2830 * be depended upon should provide ->css_reset() for this purpose.
2831 */
2743 for_each_subsys(ss, ssid) { 2832 for_each_subsys(ss, ssid) {
2744 if (!(disable & (1 << ssid))) 2833 if (!(disable & (1 << ssid)))
2745 continue; 2834 continue;
2746 2835
2747 cgroup_for_each_live_child(child, cgrp) 2836 cgroup_for_each_live_child(child, cgrp) {
2748 kill_css(cgroup_css(child, ss)); 2837 struct cgroup_subsys_state *css = cgroup_css(child, ss);
2838
2839 if (css_disable & (1 << ssid)) {
2840 kill_css(css);
2841 } else {
2842 cgroup_clear_dir(child, 1 << ssid);
2843 if (ss->css_reset)
2844 ss->css_reset(css);
2845 }
2846 }
2749 } 2847 }
2750 2848
2751 kernfs_activate(cgrp->kn); 2849 kernfs_activate(cgrp->kn);
@@ -2755,8 +2853,9 @@ out_unlock:
2755 return ret ?: nbytes; 2853 return ret ?: nbytes;
2756 2854
2757err_undo_css: 2855err_undo_css:
2758 cgrp->child_subsys_mask &= ~enable; 2856 cgrp->subtree_control &= ~enable;
2759 cgrp->child_subsys_mask |= disable; 2857 cgrp->subtree_control |= disable;
2858 cgroup_refresh_child_subsys_mask(cgrp);
2760 2859
2761 for_each_subsys(ss, ssid) { 2860 for_each_subsys(ss, ssid) {
2762 if (!(enable & (1 << ssid))) 2861 if (!(enable & (1 << ssid)))
@@ -2764,8 +2863,14 @@ err_undo_css:
2764 2863
2765 cgroup_for_each_live_child(child, cgrp) { 2864 cgroup_for_each_live_child(child, cgrp) {
2766 struct cgroup_subsys_state *css = cgroup_css(child, ss); 2865 struct cgroup_subsys_state *css = cgroup_css(child, ss);
2767 if (css) 2866
2867 if (!css)
2868 continue;
2869
2870 if (css_enable & (1 << ssid))
2768 kill_css(css); 2871 kill_css(css);
2872 else
2873 cgroup_clear_dir(child, 1 << ssid);
2769 } 2874 }
2770 } 2875 }
2771 goto out_unlock; 2876 goto out_unlock;
@@ -2878,9 +2983,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2878 2983
2879 /* 2984 /*
2880 * This isn't a proper migration and its usefulness is very 2985 * This isn't a proper migration and its usefulness is very
2881 * limited. Disallow if sane_behavior. 2986 * limited. Disallow on the default hierarchy.
2882 */ 2987 */
2883 if (cgroup_sane_behavior(cgrp)) 2988 if (cgroup_on_dfl(cgrp))
2884 return -EPERM; 2989 return -EPERM;
2885 2990
2886 /* 2991 /*
@@ -2964,9 +3069,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2964 3069
2965 for (cft = cfts; cft->name[0] != '\0'; cft++) { 3070 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2966 /* does cft->flags tell us to skip this file on @cgrp? */ 3071 /* does cft->flags tell us to skip this file on @cgrp? */
2967 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) 3072 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2968 continue; 3073 continue;
2969 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 3074 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
2970 continue; 3075 continue;
2971 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) 3076 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
2972 continue; 3077 continue;
@@ -3024,6 +3129,9 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
3024 kfree(cft->kf_ops); 3129 kfree(cft->kf_ops);
3025 cft->kf_ops = NULL; 3130 cft->kf_ops = NULL;
3026 cft->ss = NULL; 3131 cft->ss = NULL;
3132
3133 /* revert flags set by cgroup core while adding @cfts */
3134 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3027 } 3135 }
3028} 3136}
3029 3137
@@ -3109,7 +3217,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)
3109 * function currently returns 0 as long as @cfts registration is successful 3217 * function currently returns 0 as long as @cfts registration is successful
3110 * even if some file creation attempts on existing cgroups fail. 3218 * even if some file creation attempts on existing cgroups fail.
3111 */ 3219 */
3112int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 3220static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3113{ 3221{
3114 int ret; 3222 int ret;
3115 3223
@@ -3135,6 +3243,40 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3135} 3243}
3136 3244
3137/** 3245/**
3246 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
3247 * @ss: target cgroup subsystem
3248 * @cfts: zero-length name terminated array of cftypes
3249 *
3250 * Similar to cgroup_add_cftypes() but the added files are only used for
3251 * the default hierarchy.
3252 */
3253int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3254{
3255 struct cftype *cft;
3256
3257 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3258 cft->flags |= __CFTYPE_ONLY_ON_DFL;
3259 return cgroup_add_cftypes(ss, cfts);
3260}
3261
3262/**
3263 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
3264 * @ss: target cgroup subsystem
3265 * @cfts: zero-length name terminated array of cftypes
3266 *
3267 * Similar to cgroup_add_cftypes() but the added files are only used for
3268 * the legacy hierarchies.
3269 */
3270int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3271{
3272 struct cftype *cft;
3273
3274 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3275 cft->flags |= __CFTYPE_NOT_ON_DFL;
3276 return cgroup_add_cftypes(ss, cfts);
3277}
3278
3279/**
3138 * cgroup_task_count - count the number of tasks in a cgroup. 3280 * cgroup_task_count - count the number of tasks in a cgroup.
3139 * @cgrp: the cgroup in question 3281 * @cgrp: the cgroup in question
3140 * 3282 *
@@ -3699,8 +3841,9 @@ after:
3699 * 3841 *
3700 * All this extra complexity was caused by the original implementation 3842 * All this extra complexity was caused by the original implementation
3701 * committing to an entirely unnecessary property. In the long term, we 3843 * committing to an entirely unnecessary property. In the long term, we
3702 * want to do away with it. Explicitly scramble sort order if 3844 * want to do away with it. Explicitly scramble sort order if on the
3703 * sane_behavior so that no such expectation exists in the new interface. 3845 * default hierarchy so that no such expectation exists in the new
3846 * interface.
3704 * 3847 *
3705 * Scrambling is done by swapping every two consecutive bits, which is 3848 * Scrambling is done by swapping every two consecutive bits, which is
3706 * non-identity one-to-one mapping which disturbs sort order sufficiently. 3849 * non-identity one-to-one mapping which disturbs sort order sufficiently.
@@ -3715,7 +3858,7 @@ static pid_t pid_fry(pid_t pid)
3715 3858
3716static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) 3859static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3717{ 3860{
3718 if (cgroup_sane_behavior(cgrp)) 3861 if (cgroup_on_dfl(cgrp))
3719 return pid_fry(pid); 3862 return pid_fry(pid);
3720 else 3863 else
3721 return pid; 3864 return pid;
@@ -3818,7 +3961,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3818 css_task_iter_end(&it); 3961 css_task_iter_end(&it);
3819 length = n; 3962 length = n;
3820 /* now sort & (if procs) strip out duplicates */ 3963 /* now sort & (if procs) strip out duplicates */
3821 if (cgroup_sane_behavior(cgrp)) 3964 if (cgroup_on_dfl(cgrp))
3822 sort(array, length, sizeof(pid_t), fried_cmppid, NULL); 3965 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3823 else 3966 else
3824 sort(array, length, sizeof(pid_t), cmppid, NULL); 3967 sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -4040,7 +4183,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4040 return 0; 4183 return 0;
4041} 4184}
4042 4185
4043static struct cftype cgroup_base_files[] = { 4186/* cgroup core interface files for the default hierarchy */
4187static struct cftype cgroup_dfl_base_files[] = {
4044 { 4188 {
4045 .name = "cgroup.procs", 4189 .name = "cgroup.procs",
4046 .seq_start = cgroup_pidlist_start, 4190 .seq_start = cgroup_pidlist_start,
@@ -4052,46 +4196,52 @@ static struct cftype cgroup_base_files[] = {
4052 .mode = S_IRUGO | S_IWUSR, 4196 .mode = S_IRUGO | S_IWUSR,
4053 }, 4197 },
4054 { 4198 {
4055 .name = "cgroup.clone_children",
4056 .flags = CFTYPE_INSANE,
4057 .read_u64 = cgroup_clone_children_read,
4058 .write_u64 = cgroup_clone_children_write,
4059 },
4060 {
4061 .name = "cgroup.sane_behavior",
4062 .flags = CFTYPE_ONLY_ON_ROOT,
4063 .seq_show = cgroup_sane_behavior_show,
4064 },
4065 {
4066 .name = "cgroup.controllers", 4199 .name = "cgroup.controllers",
4067 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, 4200 .flags = CFTYPE_ONLY_ON_ROOT,
4068 .seq_show = cgroup_root_controllers_show, 4201 .seq_show = cgroup_root_controllers_show,
4069 }, 4202 },
4070 { 4203 {
4071 .name = "cgroup.controllers", 4204 .name = "cgroup.controllers",
4072 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, 4205 .flags = CFTYPE_NOT_ON_ROOT,
4073 .seq_show = cgroup_controllers_show, 4206 .seq_show = cgroup_controllers_show,
4074 }, 4207 },
4075 { 4208 {
4076 .name = "cgroup.subtree_control", 4209 .name = "cgroup.subtree_control",
4077 .flags = CFTYPE_ONLY_ON_DFL,
4078 .seq_show = cgroup_subtree_control_show, 4210 .seq_show = cgroup_subtree_control_show,
4079 .write = cgroup_subtree_control_write, 4211 .write = cgroup_subtree_control_write,
4080 }, 4212 },
4081 { 4213 {
4082 .name = "cgroup.populated", 4214 .name = "cgroup.populated",
4083 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, 4215 .flags = CFTYPE_NOT_ON_ROOT,
4084 .seq_show = cgroup_populated_show, 4216 .seq_show = cgroup_populated_show,
4085 }, 4217 },
4218 { } /* terminate */
4219};
4086 4220
4087 /* 4221/* cgroup core interface files for the legacy hierarchies */
4088 * Historical crazy stuff. These don't have "cgroup." prefix and 4222static struct cftype cgroup_legacy_base_files[] = {
4089 * don't exist if sane_behavior. If you're depending on these, be 4223 {
4090 * prepared to be burned. 4224 .name = "cgroup.procs",
4091 */ 4225 .seq_start = cgroup_pidlist_start,
4226 .seq_next = cgroup_pidlist_next,
4227 .seq_stop = cgroup_pidlist_stop,
4228 .seq_show = cgroup_pidlist_show,
4229 .private = CGROUP_FILE_PROCS,
4230 .write = cgroup_procs_write,
4231 .mode = S_IRUGO | S_IWUSR,
4232 },
4233 {
4234 .name = "cgroup.clone_children",
4235 .read_u64 = cgroup_clone_children_read,
4236 .write_u64 = cgroup_clone_children_write,
4237 },
4238 {
4239 .name = "cgroup.sane_behavior",
4240 .flags = CFTYPE_ONLY_ON_ROOT,
4241 .seq_show = cgroup_sane_behavior_show,
4242 },
4092 { 4243 {
4093 .name = "tasks", 4244 .name = "tasks",
4094 .flags = CFTYPE_INSANE, /* use "procs" instead */
4095 .seq_start = cgroup_pidlist_start, 4245 .seq_start = cgroup_pidlist_start,
4096 .seq_next = cgroup_pidlist_next, 4246 .seq_next = cgroup_pidlist_next,
4097 .seq_stop = cgroup_pidlist_stop, 4247 .seq_stop = cgroup_pidlist_stop,
@@ -4102,13 +4252,12 @@ static struct cftype cgroup_base_files[] = {
4102 }, 4252 },
4103 { 4253 {
4104 .name = "notify_on_release", 4254 .name = "notify_on_release",
4105 .flags = CFTYPE_INSANE,
4106 .read_u64 = cgroup_read_notify_on_release, 4255 .read_u64 = cgroup_read_notify_on_release,
4107 .write_u64 = cgroup_write_notify_on_release, 4256 .write_u64 = cgroup_write_notify_on_release,
4108 }, 4257 },
4109 { 4258 {
4110 .name = "release_agent", 4259 .name = "release_agent",
4111 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 4260 .flags = CFTYPE_ONLY_ON_ROOT,
4112 .seq_show = cgroup_release_agent_show, 4261 .seq_show = cgroup_release_agent_show,
4113 .write = cgroup_release_agent_write, 4262 .write = cgroup_release_agent_write,
4114 .max_write_len = PATH_MAX - 1, 4263 .max_write_len = PATH_MAX - 1,
@@ -4175,6 +4324,8 @@ static void css_free_work_fn(struct work_struct *work)
4175 container_of(work, struct cgroup_subsys_state, destroy_work); 4324 container_of(work, struct cgroup_subsys_state, destroy_work);
4176 struct cgroup *cgrp = css->cgroup; 4325 struct cgroup *cgrp = css->cgroup;
4177 4326
4327 percpu_ref_exit(&css->refcnt);
4328
4178 if (css->ss) { 4329 if (css->ss) {
4179 /* css free path */ 4330 /* css free path */
4180 if (css->parent) 4331 if (css->parent)
@@ -4314,12 +4465,14 @@ static void offline_css(struct cgroup_subsys_state *css)
4314 * create_css - create a cgroup_subsys_state 4465 * create_css - create a cgroup_subsys_state
4315 * @cgrp: the cgroup new css will be associated with 4466 * @cgrp: the cgroup new css will be associated with
4316 * @ss: the subsys of new css 4467 * @ss: the subsys of new css
4468 * @visible: whether to create control knobs for the new css or not
4317 * 4469 *
4318 * Create a new css associated with @cgrp - @ss pair. On success, the new 4470 * Create a new css associated with @cgrp - @ss pair. On success, the new
4319 * css is online and installed in @cgrp with all interface files created. 4471 * css is online and installed in @cgrp with all interface files created if
4320 * Returns 0 on success, -errno on failure. 4472 * @visible. Returns 0 on success, -errno on failure.
4321 */ 4473 */
4322static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 4474static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4475 bool visible)
4323{ 4476{
4324 struct cgroup *parent = cgroup_parent(cgrp); 4477 struct cgroup *parent = cgroup_parent(cgrp);
4325 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); 4478 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
@@ -4343,9 +4496,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4343 goto err_free_percpu_ref; 4496 goto err_free_percpu_ref;
4344 css->id = err; 4497 css->id = err;
4345 4498
4346 err = cgroup_populate_dir(cgrp, 1 << ss->id); 4499 if (visible) {
4347 if (err) 4500 err = cgroup_populate_dir(cgrp, 1 << ss->id);
4348 goto err_free_id; 4501 if (err)
4502 goto err_free_id;
4503 }
4349 4504
4350 /* @css is ready to be brought online now, make it visible */ 4505 /* @css is ready to be brought online now, make it visible */
4351 list_add_tail_rcu(&css->sibling, &parent_css->children); 4506 list_add_tail_rcu(&css->sibling, &parent_css->children);
@@ -4372,7 +4527,7 @@ err_list_del:
4372err_free_id: 4527err_free_id:
4373 cgroup_idr_remove(&ss->css_idr, css->id); 4528 cgroup_idr_remove(&ss->css_idr, css->id);
4374err_free_percpu_ref: 4529err_free_percpu_ref:
4375 percpu_ref_cancel_init(&css->refcnt); 4530 percpu_ref_exit(&css->refcnt);
4376err_free_css: 4531err_free_css:
4377 call_rcu(&css->rcu_head, css_free_rcu_fn); 4532 call_rcu(&css->rcu_head, css_free_rcu_fn);
4378 return err; 4533 return err;
@@ -4385,6 +4540,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4385 struct cgroup_root *root; 4540 struct cgroup_root *root;
4386 struct cgroup_subsys *ss; 4541 struct cgroup_subsys *ss;
4387 struct kernfs_node *kn; 4542 struct kernfs_node *kn;
4543 struct cftype *base_files;
4388 int ssid, ret; 4544 int ssid, ret;
4389 4545
4390 parent = cgroup_kn_lock_live(parent_kn); 4546 parent = cgroup_kn_lock_live(parent_kn);
@@ -4455,14 +4611,20 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4455 if (ret) 4611 if (ret)
4456 goto out_destroy; 4612 goto out_destroy;
4457 4613
4458 ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4614 if (cgroup_on_dfl(cgrp))
4615 base_files = cgroup_dfl_base_files;
4616 else
4617 base_files = cgroup_legacy_base_files;
4618
4619 ret = cgroup_addrm_files(cgrp, base_files, true);
4459 if (ret) 4620 if (ret)
4460 goto out_destroy; 4621 goto out_destroy;
4461 4622
4462 /* let's create and online css's */ 4623 /* let's create and online css's */
4463 for_each_subsys(ss, ssid) { 4624 for_each_subsys(ss, ssid) {
4464 if (parent->child_subsys_mask & (1 << ssid)) { 4625 if (parent->child_subsys_mask & (1 << ssid)) {
4465 ret = create_css(cgrp, ss); 4626 ret = create_css(cgrp, ss,
4627 parent->subtree_control & (1 << ssid));
4466 if (ret) 4628 if (ret)
4467 goto out_destroy; 4629 goto out_destroy;
4468 } 4630 }
@@ -4470,10 +4632,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4470 4632
4471 /* 4633 /*
4472 * On the default hierarchy, a child doesn't automatically inherit 4634 * On the default hierarchy, a child doesn't automatically inherit
4473 * child_subsys_mask from the parent. Each is configured manually. 4635 * subtree_control from the parent. Each is configured manually.
4474 */ 4636 */
4475 if (!cgroup_on_dfl(cgrp)) 4637 if (!cgroup_on_dfl(cgrp)) {
4476 cgrp->child_subsys_mask = parent->child_subsys_mask; 4638 cgrp->subtree_control = parent->subtree_control;
4639 cgroup_refresh_child_subsys_mask(cgrp);
4640 }
4477 4641
4478 kernfs_activate(kn); 4642 kernfs_activate(kn);
4479 4643
@@ -4483,7 +4647,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4483out_free_id: 4647out_free_id:
4484 cgroup_idr_remove(&root->cgroup_idr, cgrp->id); 4648 cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
4485out_cancel_ref: 4649out_cancel_ref:
4486 percpu_ref_cancel_init(&cgrp->self.refcnt); 4650 percpu_ref_exit(&cgrp->self.refcnt);
4487out_free_cgrp: 4651out_free_cgrp:
4488 kfree(cgrp); 4652 kfree(cgrp);
4489out_unlock: 4653out_unlock:
@@ -4736,8 +4900,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4736 */ 4900 */
4737int __init cgroup_init_early(void) 4901int __init cgroup_init_early(void)
4738{ 4902{
4739 static struct cgroup_sb_opts __initdata opts = 4903 static struct cgroup_sb_opts __initdata opts;
4740 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4741 struct cgroup_subsys *ss; 4904 struct cgroup_subsys *ss;
4742 int i; 4905 int i;
4743 4906
@@ -4775,7 +4938,8 @@ int __init cgroup_init(void)
4775 unsigned long key; 4938 unsigned long key;
4776 int ssid, err; 4939 int ssid, err;
4777 4940
4778 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4941 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
4942 BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
4779 4943
4780 mutex_lock(&cgroup_mutex); 4944 mutex_lock(&cgroup_mutex);
4781 4945
@@ -4807,9 +4971,22 @@ int __init cgroup_init(void)
4807 * disabled flag and cftype registration needs kmalloc, 4971 * disabled flag and cftype registration needs kmalloc,
4808 * both of which aren't available during early_init. 4972 * both of which aren't available during early_init.
4809 */ 4973 */
4810 if (!ss->disabled) { 4974 if (ss->disabled)
4811 cgrp_dfl_root.subsys_mask |= 1 << ss->id; 4975 continue;
4812 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); 4976
4977 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4978
4979 if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
4980 ss->dfl_cftypes = ss->legacy_cftypes;
4981
4982 if (!ss->dfl_cftypes)
4983 cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
4984
4985 if (ss->dfl_cftypes == ss->legacy_cftypes) {
4986 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
4987 } else {
4988 WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
4989 WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
4813 } 4990 }
4814 } 4991 }
4815 4992
@@ -5205,6 +5382,14 @@ static int __init cgroup_disable(char *str)
5205} 5382}
5206__setup("cgroup_disable=", cgroup_disable); 5383__setup("cgroup_disable=", cgroup_disable);
5207 5384
5385static int __init cgroup_set_legacy_files_on_dfl(char *str)
5386{
5387 printk("cgroup: using legacy files on the default hierarchy\n");
5388 cgroup_legacy_files_on_dfl = true;
5389 return 0;
5390}
5391__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
5392
5208/** 5393/**
5209 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry 5394 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
5210 * @dentry: directory dentry of interest 5395 * @dentry: directory dentry of interest
@@ -5399,6 +5584,6 @@ static struct cftype debug_files[] = {
5399struct cgroup_subsys debug_cgrp_subsys = { 5584struct cgroup_subsys debug_cgrp_subsys = {
5400 .css_alloc = debug_css_alloc, 5585 .css_alloc = debug_css_alloc,
5401 .css_free = debug_css_free, 5586 .css_free = debug_css_free,
5402 .base_cftypes = debug_files, 5587 .legacy_cftypes = debug_files,
5403}; 5588};
5404#endif /* CONFIG_CGROUP_DEBUG */ 5589#endif /* CONFIG_CGROUP_DEBUG */
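With the cftype split above, a controller points ->dfl_cftypes and ->legacy_cftypes at separate arrays and cgroup_init() registers them through cgroup_add_dfl_cftypes() and cgroup_add_legacy_cftypes(); the cgroup_freezer hunk below shows the simpler legacy-only case. A minimal sketch of a controller using both, assuming a hypothetical "example" subsystem whose files and helpers are made up for illustration:

/* Illustrative only; not part of this diff. */
#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

struct example_css {
	struct cgroup_subsys_state css;
};

static struct cgroup_subsys_state *
example_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct example_css *ex = kzalloc(sizeof(*ex), GFP_KERNEL);

	return ex ? &ex->css : ERR_PTR(-ENOMEM);
}

static void example_css_free(struct cgroup_subsys_state *css)
{
	kfree(container_of(css, struct example_css, css));
}

static int example_stat_show(struct seq_file *sf, void *v)
{
	seq_puts(sf, "0\n");
	return 0;
}

/* used only on the default hierarchy (cgroup_add_dfl_cftypes()) */
static struct cftype example_dfl_files[] = {
	{ .name = "example.stat", .seq_show = example_stat_show },
	{ }	/* terminate */
};

/* used only on legacy hierarchies (cgroup_add_legacy_cftypes()) */
static struct cftype example_legacy_files[] = {
	{ .name = "stat", .seq_show = example_stat_show },
	{ }	/* terminate */
};

struct cgroup_subsys example_cgrp_subsys = {
	.css_alloc	= example_css_alloc,
	.css_free	= example_css_free,
	.dfl_cftypes	= example_dfl_files,
	.legacy_cftypes	= example_legacy_files,
};

A real controller would also need a SUBSYS() entry in include/linux/cgroup_subsys.h; the sketch only shows where the two cftype arrays plug in.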
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index a79e40f9d700..92b98cc0ee76 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -480,5 +480,5 @@ struct cgroup_subsys freezer_cgrp_subsys = {
480 .css_free = freezer_css_free, 480 .css_free = freezer_css_free,
481 .attach = freezer_attach, 481 .attach = freezer_attach,
482 .fork = freezer_fork, 482 .fork = freezer_fork,
483 .base_cftypes = files, 483 .legacy_cftypes = files,
484}; 484};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a343bde710b1..81e2a388a0f6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -274,21 +274,28 @@ void clear_tasks_mm_cpumask(int cpu)
274 rcu_read_unlock(); 274 rcu_read_unlock();
275} 275}
276 276
277static inline void check_for_tasks(int cpu) 277static inline void check_for_tasks(int dead_cpu)
278{ 278{
279 struct task_struct *p; 279 struct task_struct *g, *p;
280 cputime_t utime, stime;
281 280
282 write_lock_irq(&tasklist_lock); 281 read_lock_irq(&tasklist_lock);
283 for_each_process(p) { 282 do_each_thread(g, p) {
284 task_cputime(p, &utime, &stime); 283 if (!p->on_rq)
285 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 284 continue;
286 (utime || stime)) 285 /*
287 pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", 286 * We do the check with unlocked task_rq(p)->lock.
288 p->comm, task_pid_nr(p), cpu, 287 * Order the reads so we do not warn about a task
289 p->state, p->flags); 288 * that was running on this cpu in the past and
290 } 289 * has just been woken on another cpu.
291 write_unlock_irq(&tasklist_lock); 290 */
291 rmb();
292 if (task_cpu(p) != dead_cpu)
293 continue;
294
295 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
296 p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
297 } while_each_thread(g, p);
298 read_unlock_irq(&tasklist_lock);
292} 299}
293 300
294struct take_cpu_down_param { 301struct take_cpu_down_param {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 116a4164720a..22874d7cf2c0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -76,8 +76,34 @@ struct cpuset {
76 struct cgroup_subsys_state css; 76 struct cgroup_subsys_state css;
77 77
78 unsigned long flags; /* "unsigned long" so bitops work */ 78 unsigned long flags; /* "unsigned long" so bitops work */
79 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 79
80 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 80 /*
81 * On default hierarchy:
82 *
83 * The user-configured masks can only be changed by writing to
84 * cpuset.cpus and cpuset.mems, and won't be limited by the
85 * parent masks.
86 *
 87 * The effective masks are the real masks that apply to the tasks
88 * in the cpuset. They may be changed if the configured masks are
89 * changed or hotplug happens.
90 *
91 * effective_mask == configured_mask & parent's effective_mask,
92 * and if it ends up empty, it will inherit the parent's mask.
93 *
94 *
 95 * On legacy hierarchy:
96 *
 97 * The user-configured masks are always the same as the effective masks.
98 */
99
 100 /* user-configured CPUs and Memory Nodes allowed to tasks */
101 cpumask_var_t cpus_allowed;
102 nodemask_t mems_allowed;
103
 104 /* effective CPUs and Memory Nodes allowed to tasks */
105 cpumask_var_t effective_cpus;
106 nodemask_t effective_mems;
81 107
82 /* 108 /*
83 * This is old Memory Nodes tasks took on. 109 * This is old Memory Nodes tasks took on.
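The new comment block pins down the default-hierarchy rule: effective = user-configured & parent's effective, falling back to the parent's effective mask when the intersection is empty. A minimal illustration of that rule only; the helper name is made up, and update_cpumasks_hier() later in this diff is the real implementation applied to every descendant:

#include <linux/cpumask.h>

static void example_effective_cpus(struct cpumask *effective,
				   const struct cpumask *configured,
				   const struct cpumask *parent_effective)
{
	/* effective = configured & parent's effective ... */
	cpumask_and(effective, configured, parent_effective);

	/* ... and if that ends up empty, inherit the parent's mask */
	if (cpumask_empty(effective))
		cpumask_copy(effective, parent_effective);
}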
@@ -307,9 +333,9 @@ static struct file_system_type cpuset_fs_type = {
307 */ 333 */
308static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) 334static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
309{ 335{
310 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 336 while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
311 cs = parent_cs(cs); 337 cs = parent_cs(cs);
312 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); 338 cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
313} 339}
314 340
315/* 341/*
@@ -325,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
325 */ 351 */
326static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) 352static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
327{ 353{
328 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) 354 while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
329 cs = parent_cs(cs); 355 cs = parent_cs(cs);
330 nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); 356 nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
331} 357}
332 358
333/* 359/*
@@ -376,13 +402,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
376 if (!trial) 402 if (!trial)
377 return NULL; 403 return NULL;
378 404
379 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { 405 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
380 kfree(trial); 406 goto free_cs;
381 return NULL; 407 if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
382 } 408 goto free_cpus;
383 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
384 409
410 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
411 cpumask_copy(trial->effective_cpus, cs->effective_cpus);
385 return trial; 412 return trial;
413
414free_cpus:
415 free_cpumask_var(trial->cpus_allowed);
416free_cs:
417 kfree(trial);
418 return NULL;
386} 419}
387 420
388/** 421/**
@@ -391,6 +424,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
391 */ 424 */
392static void free_trial_cpuset(struct cpuset *trial) 425static void free_trial_cpuset(struct cpuset *trial)
393{ 426{
427 free_cpumask_var(trial->effective_cpus);
394 free_cpumask_var(trial->cpus_allowed); 428 free_cpumask_var(trial->cpus_allowed);
395 kfree(trial); 429 kfree(trial);
396} 430}
@@ -436,9 +470,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
436 470
437 par = parent_cs(cur); 471 par = parent_cs(cur);
438 472
 439 /* We must be a subset of our parent cpuset */ 473 /* On legacy hierarchy, we must be a subset of our parent cpuset. */
440 ret = -EACCES; 474 ret = -EACCES;
441 if (!is_cpuset_subset(trial, par)) 475 if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
442 goto out; 476 goto out;
443 477
444 /* 478 /*
@@ -480,11 +514,11 @@ out:
480#ifdef CONFIG_SMP 514#ifdef CONFIG_SMP
481/* 515/*
482 * Helper routine for generate_sched_domains(). 516 * Helper routine for generate_sched_domains().
483 * Do cpusets a, b have overlapping cpus_allowed masks? 517 * Do cpusets a, b have overlapping effective cpus_allowed masks?
484 */ 518 */
485static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 519static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
486{ 520{
487 return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); 521 return cpumask_intersects(a->effective_cpus, b->effective_cpus);
488} 522}
489 523
490static void 524static void
@@ -601,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
601 *dattr = SD_ATTR_INIT; 635 *dattr = SD_ATTR_INIT;
602 update_domain_attr_tree(dattr, &top_cpuset); 636 update_domain_attr_tree(dattr, &top_cpuset);
603 } 637 }
604 cpumask_copy(doms[0], top_cpuset.cpus_allowed); 638 cpumask_copy(doms[0], top_cpuset.effective_cpus);
605 639
606 goto done; 640 goto done;
607 } 641 }
@@ -705,7 +739,7 @@ restart:
705 struct cpuset *b = csa[j]; 739 struct cpuset *b = csa[j];
706 740
707 if (apn == b->pn) { 741 if (apn == b->pn) {
708 cpumask_or(dp, dp, b->cpus_allowed); 742 cpumask_or(dp, dp, b->effective_cpus);
709 if (dattr) 743 if (dattr)
710 update_domain_attr_tree(dattr + nslot, b); 744 update_domain_attr_tree(dattr + nslot, b);
711 745
@@ -757,7 +791,7 @@ static void rebuild_sched_domains_locked(void)
757 * passing doms with offlined cpu to partition_sched_domains(). 791 * passing doms with offlined cpu to partition_sched_domains().
758 * Anyways, hotplug work item will rebuild sched domains. 792 * Anyways, hotplug work item will rebuild sched domains.
759 */ 793 */
760 if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) 794 if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
761 goto out; 795 goto out;
762 796
763 /* Generate domain masks and attrs */ 797 /* Generate domain masks and attrs */
@@ -781,45 +815,6 @@ void rebuild_sched_domains(void)
781 mutex_unlock(&cpuset_mutex); 815 mutex_unlock(&cpuset_mutex);
782} 816}
783 817
784/*
785 * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
786 * @cs: the cpuset in interest
787 *
788 * A cpuset's effective cpumask is the cpumask of the nearest ancestor
789 * with non-empty cpus. We use effective cpumask whenever:
790 * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
791 * if the cpuset they reside in has no cpus)
792 * - we want to retrieve task_cs(tsk)'s cpus_allowed.
793 *
794 * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
795 * exception. See comments there.
796 */
797static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
798{
799 while (cpumask_empty(cs->cpus_allowed))
800 cs = parent_cs(cs);
801 return cs;
802}
803
804/*
805 * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
806 * @cs: the cpuset in interest
807 *
808 * A cpuset's effective nodemask is the nodemask of the nearest ancestor
809 * with non-empty memss. We use effective nodemask whenever:
810 * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
811 * if the cpuset they reside in has no mems)
812 * - we want to retrieve task_cs(tsk)'s mems_allowed.
813 *
814 * Called with cpuset_mutex held.
815 */
816static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
817{
818 while (nodes_empty(cs->mems_allowed))
819 cs = parent_cs(cs);
820 return cs;
821}
822
823/** 818/**
824 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 819 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
825 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 820 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -830,53 +825,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
830 */ 825 */
831static void update_tasks_cpumask(struct cpuset *cs) 826static void update_tasks_cpumask(struct cpuset *cs)
832{ 827{
833 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
834 struct css_task_iter it; 828 struct css_task_iter it;
835 struct task_struct *task; 829 struct task_struct *task;
836 830
837 css_task_iter_start(&cs->css, &it); 831 css_task_iter_start(&cs->css, &it);
838 while ((task = css_task_iter_next(&it))) 832 while ((task = css_task_iter_next(&it)))
839 set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); 833 set_cpus_allowed_ptr(task, cs->effective_cpus);
840 css_task_iter_end(&it); 834 css_task_iter_end(&it);
841} 835}
842 836
843/* 837/*
844 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 838 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
845 * @root_cs: the root cpuset of the hierarchy 839 * @cs: the cpuset to consider
846 * @update_root: update root cpuset or not? 840 * @new_cpus: temp variable for calculating new effective_cpus
841 *
 842 * When the configured cpumask is changed, the effective cpumasks of this cpuset
843 * and all its descendants need to be updated.
847 * 844 *
 848 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 845 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
849 * which take on cpumask of @root_cs.
850 * 846 *
851 * Called with cpuset_mutex held 847 * Called with cpuset_mutex held
852 */ 848 */
853static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) 849static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
854{ 850{
855 struct cpuset *cp; 851 struct cpuset *cp;
856 struct cgroup_subsys_state *pos_css; 852 struct cgroup_subsys_state *pos_css;
853 bool need_rebuild_sched_domains = false;
857 854
858 rcu_read_lock(); 855 rcu_read_lock();
859 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { 856 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
860 if (cp == root_cs) { 857 struct cpuset *parent = parent_cs(cp);
861 if (!update_root) 858
862 continue; 859 cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
863 } else { 860
864 /* skip the whole subtree if @cp have some CPU */ 861 /*
865 if (!cpumask_empty(cp->cpus_allowed)) { 862 * If it becomes empty, inherit the effective mask of the
866 pos_css = css_rightmost_descendant(pos_css); 863 * parent, which is guaranteed to have some CPUs.
867 continue; 864 */
868 } 865 if (cpumask_empty(new_cpus))
866 cpumask_copy(new_cpus, parent->effective_cpus);
867
868 /* Skip the whole subtree if the cpumask remains the same. */
869 if (cpumask_equal(new_cpus, cp->effective_cpus)) {
870 pos_css = css_rightmost_descendant(pos_css);
871 continue;
869 } 872 }
873
870 if (!css_tryget_online(&cp->css)) 874 if (!css_tryget_online(&cp->css))
871 continue; 875 continue;
872 rcu_read_unlock(); 876 rcu_read_unlock();
873 877
878 mutex_lock(&callback_mutex);
879 cpumask_copy(cp->effective_cpus, new_cpus);
880 mutex_unlock(&callback_mutex);
881
882 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
883 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
884
874 update_tasks_cpumask(cp); 885 update_tasks_cpumask(cp);
875 886
887 /*
888 * If the effective cpumask of any non-empty cpuset is changed,
889 * we need to rebuild sched domains.
890 */
891 if (!cpumask_empty(cp->cpus_allowed) &&
892 is_sched_load_balance(cp))
893 need_rebuild_sched_domains = true;
894
876 rcu_read_lock(); 895 rcu_read_lock();
877 css_put(&cp->css); 896 css_put(&cp->css);
878 } 897 }
879 rcu_read_unlock(); 898 rcu_read_unlock();
899
900 if (need_rebuild_sched_domains)
901 rebuild_sched_domains_locked();
880} 902}
881 903
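The hunk above applies one rule at every level of the subtree: a child's effective_cpus is its configured mask intersected with the parent's effective mask, and an empty intersection falls back to the parent's mask so tasks always have CPUs. A minimal userspace sketch of that rule, modelling a cpumask as a single 64-bit word; the helper name and types are illustrative, not kernel API:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t cpumask_model_t;	/* one bit per CPU, illustration only */

/*
 * Per-level rule from update_cpumasks_hier(): effective = configured
 * intersected with the parent's effective mask; if that is empty,
 * inherit the parent's effective mask instead.
 */
static cpumask_model_t child_effective(cpumask_model_t configured,
				       cpumask_model_t parent_effective)
{
	cpumask_model_t eff = configured & parent_effective;

	return eff ? eff : parent_effective;
}

int main(void)
{
	cpumask_model_t parent_eff = 0x0f;	/* parent runs on CPUs 0-3 */
	cpumask_model_t child_cfg = 0x30;	/* child configured for CPUs 4-5 */

	/* Intersection is empty, so the child borrows CPUs 0-3. */
	printf("child effective = %#llx\n",
	       (unsigned long long)child_effective(child_cfg, parent_eff));
	return 0;
}

update_nodemasks_hier() further down applies the same rule to nodemasks.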
882/** 904/**
@@ -889,7 +911,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
889 const char *buf) 911 const char *buf)
890{ 912{
891 int retval; 913 int retval;
892 int is_load_balanced;
893 914
894 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ 915 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
895 if (cs == &top_cpuset) 916 if (cs == &top_cpuset)
@@ -908,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
908 if (retval < 0) 929 if (retval < 0)
909 return retval; 930 return retval;
910 931
911 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) 932 if (!cpumask_subset(trialcs->cpus_allowed,
933 top_cpuset.cpus_allowed))
912 return -EINVAL; 934 return -EINVAL;
913 } 935 }
914 936
@@ -920,16 +942,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 if (retval < 0) 942 if (retval < 0)
921 return retval; 943 return retval;
922 944
923 is_load_balanced = is_sched_load_balance(trialcs);
924
925 mutex_lock(&callback_mutex); 945 mutex_lock(&callback_mutex);
926 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 946 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
927 mutex_unlock(&callback_mutex); 947 mutex_unlock(&callback_mutex);
928 948
929 update_tasks_cpumask_hier(cs, true); 949 /* use trialcs->cpus_allowed as a temp variable */
930 950 update_cpumasks_hier(cs, trialcs->cpus_allowed);
931 if (is_load_balanced)
932 rebuild_sched_domains_locked();
933 return 0; 951 return 0;
934} 952}
935 953
@@ -951,15 +969,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
951 const nodemask_t *to) 969 const nodemask_t *to)
952{ 970{
953 struct task_struct *tsk = current; 971 struct task_struct *tsk = current;
954 struct cpuset *mems_cs;
955 972
956 tsk->mems_allowed = *to; 973 tsk->mems_allowed = *to;
957 974
958 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 975 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
959 976
960 rcu_read_lock(); 977 rcu_read_lock();
961 mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 978 guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
962 guarantee_online_mems(mems_cs, &tsk->mems_allowed);
963 rcu_read_unlock(); 979 rcu_read_unlock();
964} 980}
965 981
@@ -1028,13 +1044,12 @@ static void *cpuset_being_rebound;
1028static void update_tasks_nodemask(struct cpuset *cs) 1044static void update_tasks_nodemask(struct cpuset *cs)
1029{ 1045{
1030 static nodemask_t newmems; /* protected by cpuset_mutex */ 1046 static nodemask_t newmems; /* protected by cpuset_mutex */
1031 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1032 struct css_task_iter it; 1047 struct css_task_iter it;
1033 struct task_struct *task; 1048 struct task_struct *task;
1034 1049
1035 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1050 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1036 1051
1037 guarantee_online_mems(mems_cs, &newmems); 1052 guarantee_online_mems(cs, &newmems);
1038 1053
1039 /* 1054 /*
1040 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1055 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1077,36 +1092,52 @@ static void update_tasks_nodemask(struct cpuset *cs)
1077} 1092}
1078 1093
1079/* 1094/*
1080 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1095 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
1081 * @cs: the root cpuset of the hierarchy 1096 * @cs: the cpuset to consider
1082 * @update_root: update the root cpuset or not? 1097 * @new_mems: a temp variable for calculating new effective_mems
1083 * 1098 *
1084 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1099 * When configured nodemask is changed, the effective nodemasks of this cpuset
1085 * which take on nodemask of @root_cs. 1100 * and all its descendants need to be updated.
1101 *
1102	 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
1086 * 1103 *
1087 * Called with cpuset_mutex held 1104 * Called with cpuset_mutex held
1088 */ 1105 */
1089static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) 1106static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1090{ 1107{
1091 struct cpuset *cp; 1108 struct cpuset *cp;
1092 struct cgroup_subsys_state *pos_css; 1109 struct cgroup_subsys_state *pos_css;
1093 1110
1094 rcu_read_lock(); 1111 rcu_read_lock();
1095 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { 1112 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1096 if (cp == root_cs) { 1113 struct cpuset *parent = parent_cs(cp);
1097 if (!update_root) 1114
1098 continue; 1115 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1099 } else { 1116
1100 /* skip the whole subtree if @cp have some CPU */ 1117 /*
1101 if (!nodes_empty(cp->mems_allowed)) { 1118 * If it becomes empty, inherit the effective mask of the
1102 pos_css = css_rightmost_descendant(pos_css); 1119 * parent, which is guaranteed to have some MEMs.
1103 continue; 1120 */
1104 } 1121 if (nodes_empty(*new_mems))
1122 *new_mems = parent->effective_mems;
1123
1124 /* Skip the whole subtree if the nodemask remains the same. */
1125 if (nodes_equal(*new_mems, cp->effective_mems)) {
1126 pos_css = css_rightmost_descendant(pos_css);
1127 continue;
1105 } 1128 }
1129
1106 if (!css_tryget_online(&cp->css)) 1130 if (!css_tryget_online(&cp->css))
1107 continue; 1131 continue;
1108 rcu_read_unlock(); 1132 rcu_read_unlock();
1109 1133
1134 mutex_lock(&callback_mutex);
1135 cp->effective_mems = *new_mems;
1136 mutex_unlock(&callback_mutex);
1137
1138 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
1139 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1140
1110 update_tasks_nodemask(cp); 1141 update_tasks_nodemask(cp);
1111 1142
1112 rcu_read_lock(); 1143 rcu_read_lock();
@@ -1156,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1156 goto done; 1187 goto done;
1157 1188
1158 if (!nodes_subset(trialcs->mems_allowed, 1189 if (!nodes_subset(trialcs->mems_allowed,
1159 node_states[N_MEMORY])) { 1190 top_cpuset.mems_allowed)) {
1160 retval = -EINVAL; 1191 retval = -EINVAL;
1161 goto done; 1192 goto done;
1162 } 1193 }
1163 } 1194 }
@@ -1174,7 +1205,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1174 cs->mems_allowed = trialcs->mems_allowed; 1205 cs->mems_allowed = trialcs->mems_allowed;
1175 mutex_unlock(&callback_mutex); 1206 mutex_unlock(&callback_mutex);
1176 1207
1177 update_tasks_nodemask_hier(cs, true); 1208 /* use trialcs->mems_allowed as a temp variable */
1209 update_nodemasks_hier(cs, &cs->mems_allowed);
1178done: 1210done:
1179 return retval; 1211 return retval;
1180} 1212}
@@ -1389,12 +1421,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1389 1421
1390 mutex_lock(&cpuset_mutex); 1422 mutex_lock(&cpuset_mutex);
1391 1423
1392 /* 1424 /* allow moving tasks into an empty cpuset if on default hierarchy */
1393 * We allow to move tasks into an empty cpuset if sane_behavior
1394 * flag is set.
1395 */
1396 ret = -ENOSPC; 1425 ret = -ENOSPC;
1397 if (!cgroup_sane_behavior(css->cgroup) && 1426 if (!cgroup_on_dfl(css->cgroup) &&
1398 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1427 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1399 goto out_unlock; 1428 goto out_unlock;
1400 1429
@@ -1452,8 +1481,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1452 struct task_struct *leader = cgroup_taskset_first(tset); 1481 struct task_struct *leader = cgroup_taskset_first(tset);
1453 struct cpuset *cs = css_cs(css); 1482 struct cpuset *cs = css_cs(css);
1454 struct cpuset *oldcs = cpuset_attach_old_cs; 1483 struct cpuset *oldcs = cpuset_attach_old_cs;
1455 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1456 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1457 1484
1458 mutex_lock(&cpuset_mutex); 1485 mutex_lock(&cpuset_mutex);
1459 1486
@@ -1461,9 +1488,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1461 if (cs == &top_cpuset) 1488 if (cs == &top_cpuset)
1462 cpumask_copy(cpus_attach, cpu_possible_mask); 1489 cpumask_copy(cpus_attach, cpu_possible_mask);
1463 else 1490 else
1464 guarantee_online_cpus(cpus_cs, cpus_attach); 1491 guarantee_online_cpus(cs, cpus_attach);
1465 1492
1466 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1493 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1467 1494
1468 cgroup_taskset_for_each(task, tset) { 1495 cgroup_taskset_for_each(task, tset) {
1469 /* 1496 /*
@@ -1480,11 +1507,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1480 * Change mm, possibly for multiple threads in a threadgroup. This is 1507 * Change mm, possibly for multiple threads in a threadgroup. This is
1481 * expensive and may sleep. 1508 * expensive and may sleep.
1482 */ 1509 */
1483 cpuset_attach_nodemask_to = cs->mems_allowed; 1510 cpuset_attach_nodemask_to = cs->effective_mems;
1484 mm = get_task_mm(leader); 1511 mm = get_task_mm(leader);
1485 if (mm) { 1512 if (mm) {
1486 struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
1487
1488 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1513 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1489 1514
1490 /* 1515 /*
@@ -1495,7 +1520,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1495 * mm from. 1520 * mm from.
1496 */ 1521 */
1497 if (is_memory_migrate(cs)) { 1522 if (is_memory_migrate(cs)) {
1498 cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, 1523 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
1499 &cpuset_attach_nodemask_to); 1524 &cpuset_attach_nodemask_to);
1500 } 1525 }
1501 mmput(mm); 1526 mmput(mm);
@@ -1516,6 +1541,8 @@ typedef enum {
1516 FILE_MEMORY_MIGRATE, 1541 FILE_MEMORY_MIGRATE,
1517 FILE_CPULIST, 1542 FILE_CPULIST,
1518 FILE_MEMLIST, 1543 FILE_MEMLIST,
1544 FILE_EFFECTIVE_CPULIST,
1545 FILE_EFFECTIVE_MEMLIST,
1519 FILE_CPU_EXCLUSIVE, 1546 FILE_CPU_EXCLUSIVE,
1520 FILE_MEM_EXCLUSIVE, 1547 FILE_MEM_EXCLUSIVE,
1521 FILE_MEM_HARDWALL, 1548 FILE_MEM_HARDWALL,
@@ -1694,6 +1721,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1694 case FILE_MEMLIST: 1721 case FILE_MEMLIST:
1695 s += nodelist_scnprintf(s, count, cs->mems_allowed); 1722 s += nodelist_scnprintf(s, count, cs->mems_allowed);
1696 break; 1723 break;
1724 case FILE_EFFECTIVE_CPULIST:
1725 s += cpulist_scnprintf(s, count, cs->effective_cpus);
1726 break;
1727 case FILE_EFFECTIVE_MEMLIST:
1728 s += nodelist_scnprintf(s, count, cs->effective_mems);
1729 break;
1697 default: 1730 default:
1698 ret = -EINVAL; 1731 ret = -EINVAL;
1699 goto out_unlock; 1732 goto out_unlock;
@@ -1779,6 +1812,18 @@ static struct cftype files[] = {
1779 }, 1812 },
1780 1813
1781 { 1814 {
1815 .name = "effective_cpus",
1816 .seq_show = cpuset_common_seq_show,
1817 .private = FILE_EFFECTIVE_CPULIST,
1818 },
1819
1820 {
1821 .name = "effective_mems",
1822 .seq_show = cpuset_common_seq_show,
1823 .private = FILE_EFFECTIVE_MEMLIST,
1824 },
1825
1826 {
1782 .name = "cpu_exclusive", 1827 .name = "cpu_exclusive",
1783 .read_u64 = cpuset_read_u64, 1828 .read_u64 = cpuset_read_u64,
1784 .write_u64 = cpuset_write_u64, 1829 .write_u64 = cpuset_write_u64,
@@ -1869,18 +1914,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1869 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1914 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1870 if (!cs) 1915 if (!cs)
1871 return ERR_PTR(-ENOMEM); 1916 return ERR_PTR(-ENOMEM);
1872 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { 1917 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
1873 kfree(cs); 1918 goto free_cs;
1874 return ERR_PTR(-ENOMEM); 1919 if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
1875 } 1920 goto free_cpus;
1876 1921
1877 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1922 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1878 cpumask_clear(cs->cpus_allowed); 1923 cpumask_clear(cs->cpus_allowed);
1879 nodes_clear(cs->mems_allowed); 1924 nodes_clear(cs->mems_allowed);
1925 cpumask_clear(cs->effective_cpus);
1926 nodes_clear(cs->effective_mems);
1880 fmeter_init(&cs->fmeter); 1927 fmeter_init(&cs->fmeter);
1881 cs->relax_domain_level = -1; 1928 cs->relax_domain_level = -1;
1882 1929
1883 return &cs->css; 1930 return &cs->css;
1931
1932free_cpus:
1933 free_cpumask_var(cs->cpus_allowed);
1934free_cs:
1935 kfree(cs);
1936 return ERR_PTR(-ENOMEM);
1884} 1937}
1885 1938
1886static int cpuset_css_online(struct cgroup_subsys_state *css) 1939static int cpuset_css_online(struct cgroup_subsys_state *css)
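The reworked cpuset_css_alloc() above replaces the inline kfree/return with the usual goto-based unwind for its two allocations. A generic userspace sketch of the same error-path shape; the structure and names below are invented for illustration:

#include <stdlib.h>

struct two_bufs {
	void *a;
	void *b;
};

/*
 * Same unwind shape as the new cpuset_css_alloc() error path: each
 * failing allocation jumps to a label that frees only what was
 * already allocated, in reverse order.
 */
static struct two_bufs *two_bufs_alloc(void)
{
	struct two_bufs *t = calloc(1, sizeof(*t));

	if (!t)
		return NULL;
	t->a = malloc(64);
	if (!t->a)
		goto free_t;
	t->b = malloc(64);
	if (!t->b)
		goto free_a;
	return t;

free_a:
	free(t->a);
free_t:
	free(t);
	return NULL;
}

int main(void)
{
	struct two_bufs *t = two_bufs_alloc();

	if (t) {
		free(t->b);
		free(t->a);
		free(t);
	}
	return 0;
}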
@@ -1903,6 +1956,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1903 1956
1904 cpuset_inc(); 1957 cpuset_inc();
1905 1958
1959 mutex_lock(&callback_mutex);
1960 if (cgroup_on_dfl(cs->css.cgroup)) {
1961 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1962 cs->effective_mems = parent->effective_mems;
1963 }
1964 mutex_unlock(&callback_mutex);
1965
1906 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1966 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1907 goto out_unlock; 1967 goto out_unlock;
1908 1968
@@ -1962,20 +2022,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
1962{ 2022{
1963 struct cpuset *cs = css_cs(css); 2023 struct cpuset *cs = css_cs(css);
1964 2024
2025 free_cpumask_var(cs->effective_cpus);
1965 free_cpumask_var(cs->cpus_allowed); 2026 free_cpumask_var(cs->cpus_allowed);
1966 kfree(cs); 2027 kfree(cs);
1967} 2028}
1968 2029
2030static void cpuset_bind(struct cgroup_subsys_state *root_css)
2031{
2032 mutex_lock(&cpuset_mutex);
2033 mutex_lock(&callback_mutex);
2034
2035 if (cgroup_on_dfl(root_css->cgroup)) {
2036 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2037 top_cpuset.mems_allowed = node_possible_map;
2038 } else {
2039 cpumask_copy(top_cpuset.cpus_allowed,
2040 top_cpuset.effective_cpus);
2041 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2042 }
2043
2044 mutex_unlock(&callback_mutex);
2045 mutex_unlock(&cpuset_mutex);
2046}
2047
1969struct cgroup_subsys cpuset_cgrp_subsys = { 2048struct cgroup_subsys cpuset_cgrp_subsys = {
1970 .css_alloc = cpuset_css_alloc, 2049 .css_alloc = cpuset_css_alloc,
1971 .css_online = cpuset_css_online, 2050 .css_online = cpuset_css_online,
1972 .css_offline = cpuset_css_offline, 2051 .css_offline = cpuset_css_offline,
1973 .css_free = cpuset_css_free, 2052 .css_free = cpuset_css_free,
1974 .can_attach = cpuset_can_attach, 2053 .can_attach = cpuset_can_attach,
1975 .cancel_attach = cpuset_cancel_attach, 2054 .cancel_attach = cpuset_cancel_attach,
1976 .attach = cpuset_attach, 2055 .attach = cpuset_attach,
1977 .base_cftypes = files, 2056 .bind = cpuset_bind,
1978 .early_init = 1, 2057 .legacy_cftypes = files,
2058 .early_init = 1,
1979}; 2059};
1980 2060
1981/** 2061/**
@@ -1990,9 +2070,13 @@ int __init cpuset_init(void)
1990 2070
1991 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) 2071 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1992 BUG(); 2072 BUG();
2073 if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
2074 BUG();
1993 2075
1994 cpumask_setall(top_cpuset.cpus_allowed); 2076 cpumask_setall(top_cpuset.cpus_allowed);
1995 nodes_setall(top_cpuset.mems_allowed); 2077 nodes_setall(top_cpuset.mems_allowed);
2078 cpumask_setall(top_cpuset.effective_cpus);
2079 nodes_setall(top_cpuset.effective_mems);
1996 2080
1997 fmeter_init(&top_cpuset.fmeter); 2081 fmeter_init(&top_cpuset.fmeter);
1998 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 2082 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
@@ -2035,6 +2119,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2035 } 2119 }
2036} 2120}
2037 2121
2122static void
2123hotplug_update_tasks_legacy(struct cpuset *cs,
2124 struct cpumask *new_cpus, nodemask_t *new_mems,
2125 bool cpus_updated, bool mems_updated)
2126{
2127 bool is_empty;
2128
2129 mutex_lock(&callback_mutex);
2130 cpumask_copy(cs->cpus_allowed, new_cpus);
2131 cpumask_copy(cs->effective_cpus, new_cpus);
2132 cs->mems_allowed = *new_mems;
2133 cs->effective_mems = *new_mems;
2134 mutex_unlock(&callback_mutex);
2135
2136 /*
2137 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
2138	 * as the tasks will be migrated to an ancestor.
2139 */
2140 if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
2141 update_tasks_cpumask(cs);
2142 if (mems_updated && !nodes_empty(cs->mems_allowed))
2143 update_tasks_nodemask(cs);
2144
2145 is_empty = cpumask_empty(cs->cpus_allowed) ||
2146 nodes_empty(cs->mems_allowed);
2147
2148 mutex_unlock(&cpuset_mutex);
2149
2150 /*
2151	 * Move tasks to the nearest ancestor with execution resources.
2152	 * This is a full cgroup operation which will also call back into
2153 * cpuset. Should be done outside any lock.
2154 */
2155 if (is_empty)
2156 remove_tasks_in_empty_cpuset(cs);
2157
2158 mutex_lock(&cpuset_mutex);
2159}
2160
2161static void
2162hotplug_update_tasks(struct cpuset *cs,
2163 struct cpumask *new_cpus, nodemask_t *new_mems,
2164 bool cpus_updated, bool mems_updated)
2165{
2166 if (cpumask_empty(new_cpus))
2167 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
2168 if (nodes_empty(*new_mems))
2169 *new_mems = parent_cs(cs)->effective_mems;
2170
2171 mutex_lock(&callback_mutex);
2172 cpumask_copy(cs->effective_cpus, new_cpus);
2173 cs->effective_mems = *new_mems;
2174 mutex_unlock(&callback_mutex);
2175
2176 if (cpus_updated)
2177 update_tasks_cpumask(cs);
2178 if (mems_updated)
2179 update_tasks_nodemask(cs);
2180}
2181
2038/** 2182/**
2039 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug 2183 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
2040 * @cs: cpuset in interest 2184 * @cs: cpuset in interest
@@ -2045,11 +2189,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2045 */ 2189 */
2046static void cpuset_hotplug_update_tasks(struct cpuset *cs) 2190static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2047{ 2191{
2048 static cpumask_t off_cpus; 2192 static cpumask_t new_cpus;
2049 static nodemask_t off_mems; 2193 static nodemask_t new_mems;
2050 bool is_empty; 2194 bool cpus_updated;
2051 bool sane = cgroup_sane_behavior(cs->css.cgroup); 2195 bool mems_updated;
2052
2053retry: 2196retry:
2054 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); 2197 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2055 2198
@@ -2064,51 +2207,20 @@ retry:
2064 goto retry; 2207 goto retry;
2065 } 2208 }
2066 2209
2067 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); 2210 cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
2068 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); 2211 nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
2069
2070 mutex_lock(&callback_mutex);
2071 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2072 mutex_unlock(&callback_mutex);
2073
2074 /*
2075 * If sane_behavior flag is set, we need to update tasks' cpumask
2076 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
2077 * call update_tasks_cpumask() if the cpuset becomes empty, as
2078 * the tasks in it will be migrated to an ancestor.
2079 */
2080 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2081 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
2082 update_tasks_cpumask(cs);
2083 2212
2084 mutex_lock(&callback_mutex); 2213 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
2085 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2214 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
2086 mutex_unlock(&callback_mutex);
2087
2088 /*
2089 * If sane_behavior flag is set, we need to update tasks' nodemask
2090 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
2091 * call update_tasks_nodemask() if the cpuset becomes empty, as
2092 * the tasks in it will be migratd to an ancestor.
2093 */
2094 if ((sane && nodes_empty(cs->mems_allowed)) ||
2095 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
2096 update_tasks_nodemask(cs);
2097 2215
2098 is_empty = cpumask_empty(cs->cpus_allowed) || 2216 if (cgroup_on_dfl(cs->css.cgroup))
2099 nodes_empty(cs->mems_allowed); 2217 hotplug_update_tasks(cs, &new_cpus, &new_mems,
2218 cpus_updated, mems_updated);
2219 else
2220 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
2221 cpus_updated, mems_updated);
2100 2222
2101 mutex_unlock(&cpuset_mutex); 2223 mutex_unlock(&cpuset_mutex);
2102
2103 /*
2104 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
2105 *
2106 * Otherwise move tasks to the nearest ancestor with execution
2107 * resources. This is full cgroup operation which will
2108 * also call back into cpuset. Should be done outside any lock.
2109 */
2110 if (!sane && is_empty)
2111 remove_tasks_in_empty_cpuset(cs);
2112} 2224}
2113 2225
2114/** 2226/**
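The reworked cpuset_hotplug_update_tasks() above computes the candidate masks against the parent's effective masks and then dispatches to hotplug_update_tasks() or hotplug_update_tasks_legacy(). A compact model of that split, with masks again reduced to 64-bit words and every name below assumed purely for illustration:

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t mask_model_t;

/*
 * Model of the hotplug decision: the candidate mask is the configured
 * mask intersected with the parent's effective mask. On the default
 * hierarchy an empty result borrows the parent's mask and the tasks
 * stay put; on the legacy hierarchy the empty result is kept and the
 * caller later evacuates the tasks to an ancestor.
 */
mask_model_t hotplug_effective(mask_model_t configured,
			       mask_model_t parent_effective,
			       bool on_default_hierarchy,
			       bool *needs_evacuation)
{
	mask_model_t m = configured & parent_effective;

	*needs_evacuation = false;
	if (!m) {
		if (on_default_hierarchy)
			m = parent_effective;
		else
			*needs_evacuation = true;
	}
	return m;
}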
@@ -2132,6 +2244,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2132 static cpumask_t new_cpus; 2244 static cpumask_t new_cpus;
2133 static nodemask_t new_mems; 2245 static nodemask_t new_mems;
2134 bool cpus_updated, mems_updated; 2246 bool cpus_updated, mems_updated;
2247 bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
2135 2248
2136 mutex_lock(&cpuset_mutex); 2249 mutex_lock(&cpuset_mutex);
2137 2250
@@ -2139,13 +2252,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2139 cpumask_copy(&new_cpus, cpu_active_mask); 2252 cpumask_copy(&new_cpus, cpu_active_mask);
2140 new_mems = node_states[N_MEMORY]; 2253 new_mems = node_states[N_MEMORY];
2141 2254
2142 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); 2255 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
2143 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); 2256 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
2144 2257
2145 /* synchronize cpus_allowed to cpu_active_mask */ 2258 /* synchronize cpus_allowed to cpu_active_mask */
2146 if (cpus_updated) { 2259 if (cpus_updated) {
2147 mutex_lock(&callback_mutex); 2260 mutex_lock(&callback_mutex);
2148 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); 2261 if (!on_dfl)
2262 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2263 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2149 mutex_unlock(&callback_mutex); 2264 mutex_unlock(&callback_mutex);
2150 /* we don't mess with cpumasks of tasks in top_cpuset */ 2265 /* we don't mess with cpumasks of tasks in top_cpuset */
2151 } 2266 }
@@ -2153,7 +2268,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2153 /* synchronize mems_allowed to N_MEMORY */ 2268 /* synchronize mems_allowed to N_MEMORY */
2154 if (mems_updated) { 2269 if (mems_updated) {
2155 mutex_lock(&callback_mutex); 2270 mutex_lock(&callback_mutex);
2156 top_cpuset.mems_allowed = new_mems; 2271 if (!on_dfl)
2272 top_cpuset.mems_allowed = new_mems;
2273 top_cpuset.effective_mems = new_mems;
2157 mutex_unlock(&callback_mutex); 2274 mutex_unlock(&callback_mutex);
2158 update_tasks_nodemask(&top_cpuset); 2275 update_tasks_nodemask(&top_cpuset);
2159 } 2276 }
@@ -2228,6 +2345,9 @@ void __init cpuset_init_smp(void)
2228 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2345 top_cpuset.mems_allowed = node_states[N_MEMORY];
2229 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; 2346 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2230 2347
2348 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
2349 top_cpuset.effective_mems = node_states[N_MEMORY];
2350
2231 register_hotmemory_notifier(&cpuset_track_online_nodes_nb); 2351 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2232} 2352}
2233 2353
@@ -2244,23 +2364,17 @@ void __init cpuset_init_smp(void)
2244 2364
2245void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2365void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2246{ 2366{
2247 struct cpuset *cpus_cs;
2248
2249 mutex_lock(&callback_mutex); 2367 mutex_lock(&callback_mutex);
2250 rcu_read_lock(); 2368 rcu_read_lock();
2251 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2369 guarantee_online_cpus(task_cs(tsk), pmask);
2252 guarantee_online_cpus(cpus_cs, pmask);
2253 rcu_read_unlock(); 2370 rcu_read_unlock();
2254 mutex_unlock(&callback_mutex); 2371 mutex_unlock(&callback_mutex);
2255} 2372}
2256 2373
2257void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2374void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2258{ 2375{
2259 struct cpuset *cpus_cs;
2260
2261 rcu_read_lock(); 2376 rcu_read_lock();
2262 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2377 do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
2263 do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
2264 rcu_read_unlock(); 2378 rcu_read_unlock();
2265 2379
2266 /* 2380 /*
@@ -2299,13 +2413,11 @@ void cpuset_init_current_mems_allowed(void)
2299 2413
2300nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2414nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2301{ 2415{
2302 struct cpuset *mems_cs;
2303 nodemask_t mask; 2416 nodemask_t mask;
2304 2417
2305 mutex_lock(&callback_mutex); 2418 mutex_lock(&callback_mutex);
2306 rcu_read_lock(); 2419 rcu_read_lock();
2307 mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 2420 guarantee_online_mems(task_cs(tsk), &mask);
2308 guarantee_online_mems(mems_cs, &mask);
2309 rcu_read_unlock(); 2421 rcu_read_unlock();
2310 mutex_unlock(&callback_mutex); 2422 mutex_unlock(&callback_mutex);
2311 2423
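The new read-only effective_cpus and effective_mems files expose the masks the kernel actually applies to tasks. A small userspace reader as an example; the mount point and group path below are assumptions, and on a legacy-hierarchy mount the files normally carry the "cpuset." prefix:

#include <stdio.h>

int main(void)
{
	/* Hypothetical mount point and group; adjust to the local setup. */
	const char *path =
		"/sys/fs/cgroup/cpuset/mygroup/cpuset.effective_cpus";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("effective_cpus: %s", line);
	fclose(f);
	return 0;
}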
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a33d9a2bcbd7..1cf24b3e42ec 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2320,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2320 next_parent = rcu_dereference(next_ctx->parent_ctx); 2320 next_parent = rcu_dereference(next_ctx->parent_ctx);
2321 2321
2322 /* If neither context have a parent context; they cannot be clones. */ 2322 /* If neither context have a parent context; they cannot be clones. */
2323 if (!parent && !next_parent) 2323 if (!parent || !next_parent)
2324 goto unlock; 2324 goto unlock;
2325 2325
2326 if (next_parent == ctx || next_ctx == parent || next_parent == parent) { 2326 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -5266,6 +5266,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5266 5266
5267 goto got_name; 5267 goto got_name;
5268 } else { 5268 } else {
5269 if (vma->vm_ops && vma->vm_ops->name) {
5270 name = (char *) vma->vm_ops->name(vma);
5271 if (name)
5272 goto cpy_name;
5273 }
5274
5269 name = (char *)arch_vma_name(vma); 5275 name = (char *)arch_vma_name(vma);
5270 if (name) 5276 if (name)
5271 goto cpy_name; 5277 goto cpy_name;
@@ -7458,7 +7464,19 @@ __perf_event_exit_task(struct perf_event *child_event,
7458 struct perf_event_context *child_ctx, 7464 struct perf_event_context *child_ctx,
7459 struct task_struct *child) 7465 struct task_struct *child)
7460{ 7466{
7461 perf_remove_from_context(child_event, true); 7467 /*
7468 * Do not destroy the 'original' grouping; because of the context
7469 * switch optimization the original events could've ended up in a
7470 * random child task.
7471 *
7472 * If we were to destroy the original group, all group related
7473 * operations would cease to function properly after this random
7474 * child dies.
7475 *
7476 * Do destroy all inherited groups, we don't care about those
7477 * and being thorough is better.
7478 */
7479 perf_remove_from_context(child_event, !!child_event->parent);
7462 7480
7463 /* 7481 /*
7464 * It can happen that the parent exits first, and has events 7482 * It can happen that the parent exits first, and has events
@@ -7474,7 +7492,7 @@ __perf_event_exit_task(struct perf_event *child_event,
7474static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 7492static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7475{ 7493{
7476 struct perf_event *child_event, *next; 7494 struct perf_event *child_event, *next;
7477 struct perf_event_context *child_ctx; 7495 struct perf_event_context *child_ctx, *parent_ctx;
7478 unsigned long flags; 7496 unsigned long flags;
7479 7497
7480 if (likely(!child->perf_event_ctxp[ctxn])) { 7498 if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7499,6 +7517,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7499 raw_spin_lock(&child_ctx->lock); 7517 raw_spin_lock(&child_ctx->lock);
7500 task_ctx_sched_out(child_ctx); 7518 task_ctx_sched_out(child_ctx);
7501 child->perf_event_ctxp[ctxn] = NULL; 7519 child->perf_event_ctxp[ctxn] = NULL;
7520
7521 /*
7522 * In order to avoid freeing: child_ctx->parent_ctx->task
7523 * under perf_event_context::lock, grab another reference.
7524 */
7525 parent_ctx = child_ctx->parent_ctx;
7526 if (parent_ctx)
7527 get_ctx(parent_ctx);
7528
7502 /* 7529 /*
7503 * If this context is a clone; unclone it so it can't get 7530 * If this context is a clone; unclone it so it can't get
7504 * swapped to another process while we're removing all 7531 * swapped to another process while we're removing all
@@ -7509,6 +7536,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7509 raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 7536 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7510 7537
7511 /* 7538 /*
7539 * Now that we no longer hold perf_event_context::lock, drop
7540 * our extra child_ctx->parent_ctx reference.
7541 */
7542 if (parent_ctx)
7543 put_ctx(parent_ctx);
7544
7545 /*
7512 * Report the task dead after unscheduling the events so that we 7546 * Report the task dead after unscheduling the events so that we
7513 * won't get any samples after PERF_RECORD_EXIT. We can however still 7547 * won't get any samples after PERF_RECORD_EXIT. We can however still
7514 * get a few PERF_RECORD_READ events. 7548 * get a few PERF_RECORD_READ events.
@@ -7776,7 +7810,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
7776/* 7810/*
7777 * Initialize the perf_event context in task_struct 7811 * Initialize the perf_event context in task_struct
7778 */ 7812 */
7779int perf_event_init_context(struct task_struct *child, int ctxn) 7813static int perf_event_init_context(struct task_struct *child, int ctxn)
7780{ 7814{
7781 struct perf_event_context *child_ctx, *parent_ctx; 7815 struct perf_event_context *child_ctx, *parent_ctx;
7782 struct perf_event_context *cloned_ctx; 7816 struct perf_event_context *cloned_ctx;
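The perf_event_exit_task_context() change above pins child_ctx->parent_ctx with an extra reference so it cannot be freed while perf_event_context::lock is still held, and drops that reference only after the lock is released. A stripped-down userspace model of the pattern; the refcount and names are invented for illustration and there is no real locking here:

#include <stdlib.h>

struct ctx_model {
	int refcount;
	struct ctx_model *parent;
};

void get_ref(struct ctx_model *c)
{
	c->refcount++;
}

void put_ref(struct ctx_model *c)
{
	if (--c->refcount == 0)
		free(c);
}

void exit_context(struct ctx_model *child)
{
	struct ctx_model *parent = child->parent;

	if (parent)
		get_ref(parent);	/* keep it alive while "locked" */

	/* ... work done under, and then after dropping, the lock ... */

	if (parent)
		put_ref(parent);	/* safe to let it be freed now */
}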
diff --git a/kernel/fork.c b/kernel/fork.c
index 627b7f80afb0..5f1bf3bebb4f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1095,7 +1095,6 @@ static void rt_mutex_init_task(struct task_struct *p)
1095 p->pi_waiters = RB_ROOT; 1095 p->pi_waiters = RB_ROOT;
1096 p->pi_waiters_leftmost = NULL; 1096 p->pi_waiters_leftmost = NULL;
1097 p->pi_blocked_on = NULL; 1097 p->pi_blocked_on = NULL;
1098 p->pi_top_task = NULL;
1099#endif 1098#endif
1100} 1099}
1101 1100
diff --git a/kernel/futex.c b/kernel/futex.c
index b632b5f3f094..d3a9d946d0b7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -792,94 +792,91 @@ void exit_pi_state_list(struct task_struct *curr)
792 * [10] There is no transient state which leaves owner and user space 792 * [10] There is no transient state which leaves owner and user space
793 * TID out of sync. 793 * TID out of sync.
794 */ 794 */
795static int 795
796lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, 796/*
797 union futex_key *key, struct futex_pi_state **ps) 797 * Validate that the existing waiter has a pi_state and sanity check
798 * the pi_state against the user space value. If correct, attach to
799 * it.
800 */
801static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
802 struct futex_pi_state **ps)
798{ 803{
799 struct futex_pi_state *pi_state = NULL;
800 struct futex_q *this, *next;
801 struct task_struct *p;
802 pid_t pid = uval & FUTEX_TID_MASK; 804 pid_t pid = uval & FUTEX_TID_MASK;
803 805
804 plist_for_each_entry_safe(this, next, &hb->chain, list) { 806 /*
805 if (match_futex(&this->key, key)) { 807 * Userspace might have messed up non-PI and PI futexes [3]
806 /* 808 */
807 * Sanity check the waiter before increasing 809 if (unlikely(!pi_state))
808 * the refcount and attaching to it. 810 return -EINVAL;
809 */
810 pi_state = this->pi_state;
811 /*
812 * Userspace might have messed up non-PI and
813 * PI futexes [3]
814 */
815 if (unlikely(!pi_state))
816 return -EINVAL;
817 811
818 WARN_ON(!atomic_read(&pi_state->refcount)); 812 WARN_ON(!atomic_read(&pi_state->refcount));
819 813
814 /*
815 * Handle the owner died case:
816 */
817 if (uval & FUTEX_OWNER_DIED) {
818 /*
819 * exit_pi_state_list sets owner to NULL and wakes the
820 * topmost waiter. The task which acquires the
821 * pi_state->rt_mutex will fixup owner.
822 */
823 if (!pi_state->owner) {
820 /* 824 /*
821 * Handle the owner died case: 825 * No pi state owner, but the user space TID
826 * is not 0. Inconsistent state. [5]
822 */ 827 */
823 if (uval & FUTEX_OWNER_DIED) { 828 if (pid)
824 /* 829 return -EINVAL;
825 * exit_pi_state_list sets owner to NULL and
826 * wakes the topmost waiter. The task which
827 * acquires the pi_state->rt_mutex will fixup
828 * owner.
829 */
830 if (!pi_state->owner) {
831 /*
832 * No pi state owner, but the user
833 * space TID is not 0. Inconsistent
834 * state. [5]
835 */
836 if (pid)
837 return -EINVAL;
838 /*
839 * Take a ref on the state and
840 * return. [4]
841 */
842 goto out_state;
843 }
844
845 /*
846 * If TID is 0, then either the dying owner
847 * has not yet executed exit_pi_state_list()
848 * or some waiter acquired the rtmutex in the
849 * pi state, but did not yet fixup the TID in
850 * user space.
851 *
852 * Take a ref on the state and return. [6]
853 */
854 if (!pid)
855 goto out_state;
856 } else {
857 /*
858 * If the owner died bit is not set,
859 * then the pi_state must have an
860 * owner. [7]
861 */
862 if (!pi_state->owner)
863 return -EINVAL;
864 }
865
866 /* 830 /*
867 * Bail out if user space manipulated the 831 * Take a ref on the state and return success. [4]
868 * futex value. If pi state exists then the
869 * owner TID must be the same as the user
870 * space TID. [9/10]
871 */ 832 */
872 if (pid != task_pid_vnr(pi_state->owner)) 833 goto out_state;
873 return -EINVAL;
874
875 out_state:
876 atomic_inc(&pi_state->refcount);
877 *ps = pi_state;
878 return 0;
879 } 834 }
835
836 /*
837 * If TID is 0, then either the dying owner has not
838 * yet executed exit_pi_state_list() or some waiter
839 * acquired the rtmutex in the pi state, but did not
840 * yet fixup the TID in user space.
841 *
842 * Take a ref on the state and return success. [6]
843 */
844 if (!pid)
845 goto out_state;
846 } else {
847 /*
848 * If the owner died bit is not set, then the pi_state
849 * must have an owner. [7]
850 */
851 if (!pi_state->owner)
852 return -EINVAL;
880 } 853 }
881 854
882 /* 855 /*
856 * Bail out if user space manipulated the futex value. If pi
857 * state exists then the owner TID must be the same as the
858 * user space TID. [9/10]
859 */
860 if (pid != task_pid_vnr(pi_state->owner))
861 return -EINVAL;
862out_state:
863 atomic_inc(&pi_state->refcount);
864 *ps = pi_state;
865 return 0;
866}
867
868/*
869 * Lookup the task for the TID provided from user space and attach to
870 * it after doing proper sanity checks.
871 */
872static int attach_to_pi_owner(u32 uval, union futex_key *key,
873 struct futex_pi_state **ps)
874{
875 pid_t pid = uval & FUTEX_TID_MASK;
876 struct futex_pi_state *pi_state;
877 struct task_struct *p;
878
879 /*
883 * We are the first waiter - try to look up the real owner and attach 880 * We are the first waiter - try to look up the real owner and attach
884 * the new pi_state to it, but bail out when TID = 0 [1] 881 * the new pi_state to it, but bail out when TID = 0 [1]
885 */ 882 */
@@ -920,7 +917,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
920 pi_state = alloc_pi_state(); 917 pi_state = alloc_pi_state();
921 918
922 /* 919 /*
923 * Initialize the pi_mutex in locked state and make 'p' 920 * Initialize the pi_mutex in locked state and make @p
924 * the owner of it: 921 * the owner of it:
925 */ 922 */
926 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 923 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
@@ -940,6 +937,36 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
940 return 0; 937 return 0;
941} 938}
942 939
940static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
941 union futex_key *key, struct futex_pi_state **ps)
942{
943 struct futex_q *match = futex_top_waiter(hb, key);
944
945 /*
946 * If there is a waiter on that futex, validate it and
947 * attach to the pi_state when the validation succeeds.
948 */
949 if (match)
950 return attach_to_pi_state(uval, match->pi_state, ps);
951
952 /*
953 * We are the first waiter - try to look up the owner based on
954 * @uval and attach to it.
955 */
956 return attach_to_pi_owner(uval, key, ps);
957}
958
959static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
960{
961 u32 uninitialized_var(curval);
962
963 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
964 return -EFAULT;
965
966	 /* If the user space value changed, let the caller retry */
967 return curval != uval ? -EAGAIN : 0;
968}
969
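The new lock_pi_update_atomic() helper folds the cmpxchg-and-compare pattern into one place: -EFAULT when the user page cannot be accessed, -EAGAIN when the value changed underneath us, 0 on success. A userspace approximation using C11 atomics; the fault case has no userspace equivalent and is omitted:

#include <stdatomic.h>
#include <stdint.h>
#include <errno.h>

int update_atomic_model(_Atomic uint32_t *word, uint32_t uval, uint32_t newval)
{
	uint32_t expected = uval;

	/* 0 on success, -EAGAIN if somebody changed the word meanwhile. */
	return atomic_compare_exchange_strong(word, &expected, newval)
		? 0 : -EAGAIN;
}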
943/** 970/**
944 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex 971 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
945 * @uaddr: the pi futex user address 972 * @uaddr: the pi futex user address
@@ -963,113 +990,69 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
963 struct futex_pi_state **ps, 990 struct futex_pi_state **ps,
964 struct task_struct *task, int set_waiters) 991 struct task_struct *task, int set_waiters)
965{ 992{
966 int lock_taken, ret, force_take = 0; 993 u32 uval, newval, vpid = task_pid_vnr(task);
967 u32 uval, newval, curval, vpid = task_pid_vnr(task); 994 struct futex_q *match;
968 995 int ret;
969retry:
970 ret = lock_taken = 0;
971 996
972 /* 997 /*
973 * To avoid races, we attempt to take the lock here again 998 * Read the user space value first so we can validate a few
974 * (by doing a 0 -> TID atomic cmpxchg), while holding all 999 * things before proceeding further.
975 * the locks. It will most likely not succeed.
976 */ 1000 */
977 newval = vpid; 1001 if (get_futex_value_locked(&uval, uaddr))
978 if (set_waiters)
979 newval |= FUTEX_WAITERS;
980
981 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
982 return -EFAULT; 1002 return -EFAULT;
983 1003
984 /* 1004 /*
985 * Detect deadlocks. 1005 * Detect deadlocks.
986 */ 1006 */
987 if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) 1007 if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
988 return -EDEADLK; 1008 return -EDEADLK;
989 1009
990 /* 1010 /*
991 * Surprise - we got the lock, but we do not trust user space at all. 1011 * Lookup existing state first. If it exists, try to attach to
992 */ 1012 * its pi_state.
993 if (unlikely(!curval)) {
994 /*
995 * We verify whether there is kernel state for this
996 * futex. If not, we can safely assume, that the 0 ->
997 * TID transition is correct. If state exists, we do
998 * not bother to fixup the user space state as it was
999 * corrupted already.
1000 */
1001 return futex_top_waiter(hb, key) ? -EINVAL : 1;
1002 }
1003
1004 uval = curval;
1005
1006 /*
1007 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
1008 * to wake at the next unlock.
1009 */ 1013 */
1010 newval = curval | FUTEX_WAITERS; 1014 match = futex_top_waiter(hb, key);
1015 if (match)
1016 return attach_to_pi_state(uval, match->pi_state, ps);
1011 1017
1012 /* 1018 /*
1013 * Should we force take the futex? See below. 1019 * No waiter and user TID is 0. We are here because the
1020 * waiters or the owner died bit is set or called from
1021 * requeue_cmp_pi or for whatever reason something took the
1022 * syscall.
1014 */ 1023 */
1015 if (unlikely(force_take)) { 1024 if (!(uval & FUTEX_TID_MASK)) {
1016 /* 1025 /*
1017 * Keep the OWNER_DIED and the WAITERS bit and set the 1026 * We take over the futex. No other waiters and the user space
1018 * new TID value. 1027 * TID is 0. We preserve the owner died bit.
1019 */ 1028 */
1020 newval = (curval & ~FUTEX_TID_MASK) | vpid; 1029 newval = uval & FUTEX_OWNER_DIED;
1021 force_take = 0; 1030 newval |= vpid;
1022 lock_taken = 1;
1023 }
1024 1031
1025 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) 1032 /* The futex requeue_pi code can enforce the waiters bit */
1026 return -EFAULT; 1033 if (set_waiters)
1027 if (unlikely(curval != uval)) 1034 newval |= FUTEX_WAITERS;
1028 goto retry; 1035
1036 ret = lock_pi_update_atomic(uaddr, uval, newval);
1037 /* If the take over worked, return 1 */
1038 return ret < 0 ? ret : 1;
1039 }
1029 1040
1030 /* 1041 /*
1031	 * We took the lock due to forced take over. 1042	 * First waiter. Set the waiters bit before attaching ourselves to
1043	 * the owner. If the owner tries to unlock, it will be forced into
1044 * the kernel and blocked on hb->lock.
1032 */ 1045 */
1033 if (unlikely(lock_taken)) 1046 newval = uval | FUTEX_WAITERS;
1034 return 1; 1047 ret = lock_pi_update_atomic(uaddr, uval, newval);
1035 1048 if (ret)
1049 return ret;
1036 /* 1050 /*
1037 * We dont have the lock. Look up the PI state (or create it if 1051 * If the update of the user space value succeeded, we try to
1038 * we are the first waiter): 1052 * attach to the owner. If that fails, no harm done, we only
1053 * set the FUTEX_WAITERS bit in the user space variable.
1039 */ 1054 */
1040 ret = lookup_pi_state(uval, hb, key, ps); 1055 return attach_to_pi_owner(uval, key, ps);
1041
1042 if (unlikely(ret)) {
1043 switch (ret) {
1044 case -ESRCH:
1045 /*
1046 * We failed to find an owner for this
1047 * futex. So we have no pi_state to block
1048 * on. This can happen in two cases:
1049 *
1050 * 1) The owner died
1051 * 2) A stale FUTEX_WAITERS bit
1052 *
1053 * Re-read the futex value.
1054 */
1055 if (get_futex_value_locked(&curval, uaddr))
1056 return -EFAULT;
1057
1058 /*
1059 * If the owner died or we have a stale
1060 * WAITERS bit the owner TID in the user space
1061 * futex is 0.
1062 */
1063 if (!(curval & FUTEX_TID_MASK)) {
1064 force_take = 1;
1065 goto retry;
1066 }
1067 default:
1068 break;
1069 }
1070 }
1071
1072 return ret;
1073} 1056}
1074 1057
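In the rewritten futex_lock_pi_atomic() above, the take-over path (user space TID is 0) builds the new futex word from our TID, keeps only the OWNER_DIED bit of the old value, and may force the WAITERS bit. A standalone sketch of that computation; the MODEL_* constants simply mirror the uapi futex bits for illustration:

#include <stdint.h>
#include <stdbool.h>

#define MODEL_FUTEX_WAITERS	0x80000000u
#define MODEL_FUTEX_OWNER_DIED	0x40000000u

uint32_t takeover_value(uint32_t uval, uint32_t vpid, bool set_waiters)
{
	/* Preserve only the owner-died bit, claim the lock with our TID. */
	uint32_t newval = (uval & MODEL_FUTEX_OWNER_DIED) | vpid;

	if (set_waiters)
		newval |= MODEL_FUTEX_WAITERS;	/* requeue_pi can enforce this */
	return newval;
}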
1075/** 1058/**
@@ -1186,22 +1169,6 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
1186 return 0; 1169 return 0;
1187} 1170}
1188 1171
1189static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
1190{
1191 u32 uninitialized_var(oldval);
1192
1193 /*
1194 * There is no waiter, so we unlock the futex. The owner died
1195 * bit has not to be preserved here. We are the owner:
1196 */
1197 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
1198 return -EFAULT;
1199 if (oldval != uval)
1200 return -EAGAIN;
1201
1202 return 0;
1203}
1204
1205/* 1172/*
1206 * Express the locking dependencies for lockdep: 1173 * Express the locking dependencies for lockdep:
1207 */ 1174 */
@@ -1659,7 +1626,12 @@ retry_private:
1659 goto retry; 1626 goto retry;
1660 goto out; 1627 goto out;
1661 case -EAGAIN: 1628 case -EAGAIN:
1662 /* The owner was exiting, try again. */ 1629 /*
1630 * Two reasons for this:
1631 * - Owner is exiting and we just wait for the
1632 * exit to complete.
1633 * - The user space value changed.
1634 */
1663 double_unlock_hb(hb1, hb2); 1635 double_unlock_hb(hb1, hb2);
1664 hb_waiters_dec(hb2); 1636 hb_waiters_dec(hb2);
1665 put_futex_key(&key2); 1637 put_futex_key(&key2);
@@ -1718,7 +1690,7 @@ retry_private:
1718 this->pi_state = pi_state; 1690 this->pi_state = pi_state;
1719 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 1691 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1720 this->rt_waiter, 1692 this->rt_waiter,
1721 this->task, 1); 1693 this->task);
1722 if (ret == 1) { 1694 if (ret == 1) {
1723 /* We got the lock. */ 1695 /* We got the lock. */
1724 requeue_pi_wake_futex(this, &key2, hb2); 1696 requeue_pi_wake_futex(this, &key2, hb2);
@@ -2316,8 +2288,10 @@ retry_private:
2316 goto uaddr_faulted; 2288 goto uaddr_faulted;
2317 case -EAGAIN: 2289 case -EAGAIN:
2318 /* 2290 /*
2319 * Task is exiting and we just wait for the 2291 * Two reasons for this:
2320 * exit to complete. 2292 * - Task is exiting and we just wait for the
2293 * exit to complete.
2294 * - The user space value changed.
2321 */ 2295 */
2322 queue_unlock(hb); 2296 queue_unlock(hb);
2323 put_futex_key(&q.key); 2297 put_futex_key(&q.key);
@@ -2337,9 +2311,9 @@ retry_private:
2337 /* 2311 /*
2338 * Block on the PI mutex: 2312 * Block on the PI mutex:
2339 */ 2313 */
2340 if (!trylock) 2314 if (!trylock) {
2341 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); 2315 ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
2342 else { 2316 } else {
2343 ret = rt_mutex_trylock(&q.pi_state->pi_mutex); 2317 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2344 /* Fixup the trylock return value: */ 2318 /* Fixup the trylock return value: */
2345 ret = ret ? 0 : -EWOULDBLOCK; 2319 ret = ret ? 0 : -EWOULDBLOCK;
@@ -2401,10 +2375,10 @@ uaddr_faulted:
2401 */ 2375 */
2402static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) 2376static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2403{ 2377{
2404 struct futex_hash_bucket *hb; 2378 u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
2405 struct futex_q *this, *next;
2406 union futex_key key = FUTEX_KEY_INIT; 2379 union futex_key key = FUTEX_KEY_INIT;
2407 u32 uval, vpid = task_pid_vnr(current); 2380 struct futex_hash_bucket *hb;
2381 struct futex_q *match;
2408 int ret; 2382 int ret;
2409 2383
2410retry: 2384retry:
@@ -2417,57 +2391,47 @@ retry:
2417 return -EPERM; 2391 return -EPERM;
2418 2392
2419 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); 2393 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2420 if (unlikely(ret != 0)) 2394 if (ret)
2421 goto out; 2395 return ret;
2422 2396
2423 hb = hash_futex(&key); 2397 hb = hash_futex(&key);
2424 spin_lock(&hb->lock); 2398 spin_lock(&hb->lock);
2425 2399
2426 /* 2400 /*
2427 * To avoid races, try to do the TID -> 0 atomic transition 2401 * Check waiters first. We do not trust user space values at
2428 * again. If it succeeds then we can return without waking 2402 * all and we at least want to know if user space fiddled
2429 * anyone else up. We only try this if neither the waiters nor 2403 * with the futex value instead of blindly unlocking.
2430 * the owner died bit are set.
2431 */
2432 if (!(uval & ~FUTEX_TID_MASK) &&
2433 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2434 goto pi_faulted;
2435 /*
2436 * Rare case: we managed to release the lock atomically,
2437 * no need to wake anyone else up:
2438 */
2439 if (unlikely(uval == vpid))
2440 goto out_unlock;
2441
2442 /*
2443 * Ok, other tasks may need to be woken up - check waiters
2444 * and do the wakeup if necessary:
2445 */ 2404 */
2446 plist_for_each_entry_safe(this, next, &hb->chain, list) { 2405 match = futex_top_waiter(hb, &key);
2447 if (!match_futex (&this->key, &key)) 2406 if (match) {
2448 continue; 2407 ret = wake_futex_pi(uaddr, uval, match);
2449 ret = wake_futex_pi(uaddr, uval, this);
2450 /* 2408 /*
2451 * The atomic access to the futex value 2409 * The atomic access to the futex value generated a
2452 * generated a pagefault, so retry the 2410 * pagefault, so retry the user-access and the wakeup:
2453 * user-access and the wakeup:
2454 */ 2411 */
2455 if (ret == -EFAULT) 2412 if (ret == -EFAULT)
2456 goto pi_faulted; 2413 goto pi_faulted;
2457 goto out_unlock; 2414 goto out_unlock;
2458 } 2415 }
2416
2459 /* 2417 /*
2460 * No waiters - kernel unlocks the futex: 2418 * We have no kernel internal state, i.e. no waiters in the
2419 * kernel. Waiters which are about to queue themselves are stuck
2420	 * on hb->lock. So we can safely ignore them. We preserve
2421	 * neither the WAITERS bit nor the OWNER_DIED one. We are the
2422 * owner.
2461 */ 2423 */
2462 ret = unlock_futex_pi(uaddr, uval); 2424 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
2463 if (ret == -EFAULT)
2464 goto pi_faulted; 2425 goto pi_faulted;
2465 2426
2427 /*
2428 * If uval has changed, let user space handle it.
2429 */
2430 ret = (curval == uval) ? 0 : -EAGAIN;
2431
2466out_unlock: 2432out_unlock:
2467 spin_unlock(&hb->lock); 2433 spin_unlock(&hb->lock);
2468 put_futex_key(&key); 2434 put_futex_key(&key);
2469
2470out:
2471 return ret; 2435 return ret;
2472 2436
2473pi_faulted: 2437pi_faulted:
@@ -2669,7 +2633,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2669 */ 2633 */
2670 WARN_ON(!q.pi_state); 2634 WARN_ON(!q.pi_state);
2671 pi_mutex = &q.pi_state->pi_mutex; 2635 pi_mutex = &q.pi_state->pi_mutex;
2672 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); 2636 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
2673 debug_rt_mutex_free_waiter(&rt_waiter); 2637 debug_rt_mutex_free_waiter(&rt_waiter);
2674 2638
2675 spin_lock(q.lock_ptr); 2639 spin_lock(q.lock_ptr);
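futex_unlock_pi() now consults the top waiter first and, only when the kernel has no waiters at all, releases the futex word with a single compare-and-exchange, reporting a changed value back to user space as -EAGAIN. The userspace model below captures just that last step, with names assumed for illustration:

#include <stdatomic.h>
#include <stdint.h>
#include <errno.h>

int unlock_without_waiters(_Atomic uint32_t *word, uint32_t uval)
{
	uint32_t expected = uval;

	/* Release only if the word still holds the value we read. */
	if (!atomic_compare_exchange_strong(word, &expected, 0))
		return -EAGAIN;	/* user space changed it; let it retry */
	return 0;
}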
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 452d6f2ba21d..cf80e7b0ddab 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -341,8 +341,8 @@ static struct lock_class_key irq_nested_lock_class;
341/* 341/*
342 * irq_map_generic_chip - Map a generic chip for an irq domain 342 * irq_map_generic_chip - Map a generic chip for an irq domain
343 */ 343 */
344static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, 344int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
345 irq_hw_number_t hw_irq) 345 irq_hw_number_t hw_irq)
346{ 346{
347 struct irq_data *data = irq_get_irq_data(virq); 347 struct irq_data *data = irq_get_irq_data(virq);
348 struct irq_domain_chip_generic *dgc = d->gc; 348 struct irq_domain_chip_generic *dgc = d->gc;
@@ -394,6 +394,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
394 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); 394 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
395 return 0; 395 return 0;
396} 396}
397EXPORT_SYMBOL_GPL(irq_map_generic_chip);
397 398
398struct irq_domain_ops irq_generic_chip_ops = { 399struct irq_domain_ops irq_generic_chip_ops = {
399 .map = irq_map_generic_chip, 400 .map = irq_map_generic_chip,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index eb5e10e32e05..6534ff6ce02e 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -231,7 +231,7 @@ void irq_set_default_host(struct irq_domain *domain)
231} 231}
232EXPORT_SYMBOL_GPL(irq_set_default_host); 232EXPORT_SYMBOL_GPL(irq_set_default_host);
233 233
234static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) 234void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
235{ 235{
236 struct irq_data *irq_data = irq_get_irq_data(irq); 236 struct irq_data *irq_data = irq_get_irq_data(irq);
237 irq_hw_number_t hwirq; 237 irq_hw_number_t hwirq;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index a82170e2fa78..e6bcbe756663 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -16,11 +16,12 @@
16#include <linux/tick.h> 16#include <linux/tick.h>
17#include <linux/cpu.h> 17#include <linux/cpu.h>
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h>
19#include <asm/processor.h> 20#include <asm/processor.h>
20 21
21 22
22static DEFINE_PER_CPU(struct llist_head, irq_work_list); 23static DEFINE_PER_CPU(struct llist_head, raised_list);
23static DEFINE_PER_CPU(int, irq_work_raised); 24static DEFINE_PER_CPU(struct llist_head, lazy_list);
24 25
25/* 26/*
26 * Claim the entry so that no one else will poke at it. 27 * Claim the entry so that no one else will poke at it.
@@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void)
55 */ 56 */
56} 57}
57 58
59#ifdef CONFIG_SMP
58/* 60/*
59 * Enqueue the irq_work @entry unless it's already pending 61 * Enqueue the irq_work @work on @cpu unless it's already pending
60 * somewhere. 62 * somewhere.
61 * 63 *
62 * Can be re-enqueued while the callback is still in progress. 64 * Can be re-enqueued while the callback is still in progress.
63 */ 65 */
66bool irq_work_queue_on(struct irq_work *work, int cpu)
67{
68 /* All work should have been flushed before going offline */
69 WARN_ON_ONCE(cpu_is_offline(cpu));
70
71 /* Arch remote IPI send/receive backend aren't NMI safe */
72 WARN_ON_ONCE(in_nmi());
73
74 /* Only queue if not already pending */
75 if (!irq_work_claim(work))
76 return false;
77
78 if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
79 arch_send_call_function_single_ipi(cpu);
80
81 return true;
82}
83EXPORT_SYMBOL_GPL(irq_work_queue_on);
84#endif
85
86/* Enqueue the irq work @work on the current CPU */
64bool irq_work_queue(struct irq_work *work) 87bool irq_work_queue(struct irq_work *work)
65{ 88{
66 /* Only queue if not already pending */ 89 /* Only queue if not already pending */
@@ -70,15 +93,13 @@ bool irq_work_queue(struct irq_work *work)
70 /* Queue the entry and raise the IPI if needed. */ 93 /* Queue the entry and raise the IPI if needed. */
71 preempt_disable(); 94 preempt_disable();
72 95
73	 llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); 96	 /* If the work is "lazy", handle it from the next tick, if any */
74 97 if (work->flags & IRQ_WORK_LAZY) {
75 /* 98 if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) &&
76 * If the work is not "lazy" or the tick is stopped, raise the irq 99 tick_nohz_tick_stopped())
77 * work interrupt (if supported by the arch), otherwise, just wait 100 arch_irq_work_raise();
78 * for the next tick. 101 } else {
79 */ 102 if (llist_add(&work->llnode, &__get_cpu_var(raised_list)))
80 if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
81 if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
82 arch_irq_work_raise(); 103 arch_irq_work_raise();
83 } 104 }
84 105
@@ -90,10 +111,11 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
90 111
91bool irq_work_needs_cpu(void) 112bool irq_work_needs_cpu(void)
92{ 113{
93 struct llist_head *this_list; 114 struct llist_head *raised, *lazy;
94 115
95 this_list = &__get_cpu_var(irq_work_list); 116 raised = &__get_cpu_var(raised_list);
96 if (llist_empty(this_list)) 117 lazy = &__get_cpu_var(lazy_list);
118 if (llist_empty(raised) && llist_empty(lazy))
97 return false; 119 return false;
98 120
99 /* All work should have been flushed before going offline */ 121 /* All work should have been flushed before going offline */
@@ -102,28 +124,18 @@ bool irq_work_needs_cpu(void)
102 return true; 124 return true;
103} 125}
104 126
105static void __irq_work_run(void) 127static void irq_work_run_list(struct llist_head *list)
106{ 128{
107 unsigned long flags; 129 unsigned long flags;
108 struct irq_work *work; 130 struct irq_work *work;
109 struct llist_head *this_list;
110 struct llist_node *llnode; 131 struct llist_node *llnode;
111 132
133 BUG_ON(!irqs_disabled());
112 134
113 /* 135 if (llist_empty(list))
114 * Reset the "raised" state right before we check the list because
115 * an NMI may enqueue after we find the list empty from the runner.
116 */
117 __this_cpu_write(irq_work_raised, 0);
118 barrier();
119
120 this_list = &__get_cpu_var(irq_work_list);
121 if (llist_empty(this_list))
122 return; 136 return;
123 137
124 BUG_ON(!irqs_disabled()); 138 llnode = llist_del_all(list);
125
126 llnode = llist_del_all(this_list);
127 while (llnode != NULL) { 139 while (llnode != NULL) {
128 work = llist_entry(llnode, struct irq_work, llnode); 140 work = llist_entry(llnode, struct irq_work, llnode);
129 141
@@ -149,13 +161,13 @@ static void __irq_work_run(void)
149} 161}
150 162
151/* 163/*
152 * Run the irq_work entries on this cpu. Requires to be ran from hardirq 164 * hotplug calls this through:
153 * context with local IRQs disabled. 165 * hotplug_cfd() -> flush_smp_call_function_queue()
154 */ 166 */
155void irq_work_run(void) 167void irq_work_run(void)
156{ 168{
157 BUG_ON(!in_irq()); 169 irq_work_run_list(&__get_cpu_var(raised_list));
158 __irq_work_run(); 170 irq_work_run_list(&__get_cpu_var(lazy_list));
159} 171}
160EXPORT_SYMBOL_GPL(irq_work_run); 172EXPORT_SYMBOL_GPL(irq_work_run);
161 173
@@ -171,35 +183,3 @@ void irq_work_sync(struct irq_work *work)
171 cpu_relax(); 183 cpu_relax();
172} 184}
173EXPORT_SYMBOL_GPL(irq_work_sync); 185EXPORT_SYMBOL_GPL(irq_work_sync);
174
175#ifdef CONFIG_HOTPLUG_CPU
176static int irq_work_cpu_notify(struct notifier_block *self,
177 unsigned long action, void *hcpu)
178{
179 long cpu = (long)hcpu;
180
181 switch (action) {
182 case CPU_DYING:
183 /* Called from stop_machine */
184 if (WARN_ON_ONCE(cpu != smp_processor_id()))
185 break;
186 __irq_work_run();
187 break;
188 default:
189 break;
190 }
191 return NOTIFY_OK;
192}
193
194static struct notifier_block cpu_notify;
195
196static __init int irq_work_init_cpu_notifier(void)
197{
198 cpu_notify.notifier_call = irq_work_cpu_notify;
199 cpu_notify.priority = 0;
200 register_cpu_notifier(&cpu_notify);
201 return 0;
202}
203device_initcall(irq_work_init_cpu_notifier);
204
205#endif /* CONFIG_HOTPLUG_CPU */
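
The hunks above replace the single per-CPU irq_work_list plus the irq_work_raised flag with two llists: raised_list is flushed from the IPI (and, via flush_smp_call_function_queue(), on CPU hotplug), while lazy_list is flushed from the next timer tick. A minimal sketch of the resulting enqueue decision, written as it would read inside kernel/irq_work.c where raised_list, lazy_list and arch_irq_work_raise() are visible; this is a simplification (it uses this_cpu_ptr() instead of the older __get_cpu_var()) and not the in-tree code:

static void sketch_irq_work_enqueue(struct irq_work *work)
{
	/* Claiming the work and preempt_disable() are elided; see the hunk. */
	if (work->flags & IRQ_WORK_LAZY) {
		/* Lazy work waits for the next tick unless the tick is stopped. */
		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
		    tick_nohz_tick_stopped())
			arch_irq_work_raise();
	} else {
		/* llist_add() returns true only when the list was empty, so
		 * the IPI is raised once per batch; no irq_work_raised flag. */
		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
			arch_irq_work_raise();
	}
}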
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 369f41a94124..4b8f0c925884 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
33#include <linux/swap.h> 33#include <linux/swap.h>
34#include <linux/syscore_ops.h> 34#include <linux/syscore_ops.h>
35#include <linux/compiler.h> 35#include <linux/compiler.h>
36#include <linux/hugetlb.h>
36 37
37#include <asm/page.h> 38#include <asm/page.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
@@ -1619,6 +1620,9 @@ static int __init crash_save_vmcoreinfo_init(void)
1619#endif 1620#endif
1620 VMCOREINFO_NUMBER(PG_head_mask); 1621 VMCOREINFO_NUMBER(PG_head_mask);
1621 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); 1622 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1623#ifdef CONFIG_HUGETLBFS
1624 VMCOREINFO_SYMBOL(free_huge_page);
1625#endif
1622 1626
1623 arch_crash_save_vmcoreinfo(); 1627 arch_crash_save_vmcoreinfo();
1624 update_vmcoreinfo_note(); 1628 update_vmcoreinfo_note();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3214289df5a7..734e9a7d280b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2037,19 +2037,23 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
2037{ 2037{
2038 unsigned long *iter; 2038 unsigned long *iter;
2039 struct kprobe_blacklist_entry *ent; 2039 struct kprobe_blacklist_entry *ent;
2040 unsigned long offset = 0, size = 0; 2040 unsigned long entry, offset = 0, size = 0;
2041 2041
2042 for (iter = start; iter < end; iter++) { 2042 for (iter = start; iter < end; iter++) {
2043 if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) { 2043 entry = arch_deref_entry_point((void *)*iter);
2044 pr_err("Failed to find blacklist %p\n", (void *)*iter); 2044
2045 if (!kernel_text_address(entry) ||
2046 !kallsyms_lookup_size_offset(entry, &size, &offset)) {
2047 pr_err("Failed to find blacklist at %p\n",
2048 (void *)entry);
2045 continue; 2049 continue;
2046 } 2050 }
2047 2051
2048 ent = kmalloc(sizeof(*ent), GFP_KERNEL); 2052 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
2049 if (!ent) 2053 if (!ent)
2050 return -ENOMEM; 2054 return -ENOMEM;
2051 ent->start_addr = *iter; 2055 ent->start_addr = entry;
2052 ent->end_addr = *iter + size; 2056 ent->end_addr = entry + size;
2053 INIT_LIST_HEAD(&ent->list); 2057 INIT_LIST_HEAD(&ent->list);
2054 list_add_tail(&ent->list, &kprobe_blacklist); 2058 list_add_tail(&ent->list, &kprobe_blacklist);
2055 } 2059 }
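
On architectures that use function descriptors, the raw blacklist symbol value does not point at kernel text, which is why the hunk adds arch_deref_entry_point() and a kernel_text_address() check before the kallsyms lookup. A hedged sketch of the per-entry resolution, keeping only the address handling from the hunk (allocation and list insertion are left out; the helper name is invented for illustration):

#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

/* Sketch: turn one raw blacklist address into a [start, end) text range. */
static int sketch_resolve_blacklist_entry(unsigned long raw,
					  unsigned long *start,
					  unsigned long *end)
{
	unsigned long entry, size = 0, offset = 0;

	/* Dereference a function descriptor where the arch uses them. */
	entry = arch_deref_entry_point((void *)raw);

	if (!kernel_text_address(entry) ||
	    !kallsyms_lookup_size_offset(entry, &size, &offset))
		return -EINVAL;

	*start = entry;
	*end   = entry + size;
	return 0;
}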
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c2390f41307b..ef483220e855 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -591,7 +591,7 @@ static void insert_kthread_work(struct kthread_worker *worker,
591 591
592 list_add_tail(&work->node, pos); 592 list_add_tail(&work->node, pos);
593 work->worker = worker; 593 work->worker = worker;
594 if (likely(worker->task)) 594 if (!worker->current_work && likely(worker->task))
595 wake_up_process(worker->task); 595 wake_up_process(worker->task);
596} 596}
597 597
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index d24e4339b46d..88d0d4420ad2 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -384,7 +384,9 @@ static void print_lockdep_off(const char *bug_msg)
384{ 384{
385 printk(KERN_DEBUG "%s\n", bug_msg); 385 printk(KERN_DEBUG "%s\n", bug_msg);
386 printk(KERN_DEBUG "turning off the locking correctness validator.\n"); 386 printk(KERN_DEBUG "turning off the locking correctness validator.\n");
387#ifdef CONFIG_LOCK_STAT
387 printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); 388 printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
389#endif
388} 390}
389 391
390static int save_trace(struct stack_trace *trace) 392static int save_trace(struct stack_trace *trace)
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e00669..9887a905a762 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -1,6 +1,4 @@
1
2#include <linux/percpu.h> 1#include <linux/percpu.h>
3#include <linux/mutex.h>
4#include <linux/sched.h> 2#include <linux/sched.h>
5#include "mcs_spinlock.h" 3#include "mcs_spinlock.h"
6 4
@@ -14,21 +12,47 @@
14 * called from interrupt context and we have preemption disabled while 12 * called from interrupt context and we have preemption disabled while
15 * spinning. 13 * spinning.
16 */ 14 */
17static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); 15static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
16
17/*
18 * We use the value 0 to represent "no CPU", thus the encoded value
19 * will be the CPU number incremented by 1.
20 */
21static inline int encode_cpu(int cpu_nr)
22{
23 return cpu_nr + 1;
24}
25
26static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
27{
28 int cpu_nr = encoded_cpu_val - 1;
29
30 return per_cpu_ptr(&osq_node, cpu_nr);
31}
18 32
19/* 33/*
20 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. 34 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
21 * Can return NULL in case we were the last queued and we updated @lock instead. 35 * Can return NULL in case we were the last queued and we updated @lock instead.
22 */ 36 */
23static inline struct optimistic_spin_queue * 37static inline struct optimistic_spin_node *
24osq_wait_next(struct optimistic_spin_queue **lock, 38osq_wait_next(struct optimistic_spin_queue *lock,
25 struct optimistic_spin_queue *node, 39 struct optimistic_spin_node *node,
26 struct optimistic_spin_queue *prev) 40 struct optimistic_spin_node *prev)
27{ 41{
28 struct optimistic_spin_queue *next = NULL; 42 struct optimistic_spin_node *next = NULL;
43 int curr = encode_cpu(smp_processor_id());
44 int old;
45
46 /*
47 * If there is a prev node in queue, then the 'old' value will be
48 * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
49 * we're currently last in queue, then the queue will then become empty.
50 */
51 old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
29 52
30 for (;;) { 53 for (;;) {
31 if (*lock == node && cmpxchg(lock, node, prev) == node) { 54 if (atomic_read(&lock->tail) == curr &&
55 atomic_cmpxchg(&lock->tail, curr, old) == curr) {
32 /* 56 /*
33 * We were the last queued, we moved @lock back. @prev 57 * We were the last queued, we moved @lock back. @prev
34 * will now observe @lock and will complete its 58 * will now observe @lock and will complete its
@@ -53,24 +77,29 @@ osq_wait_next(struct optimistic_spin_queue **lock,
53 break; 77 break;
54 } 78 }
55 79
56 arch_mutex_cpu_relax(); 80 cpu_relax_lowlatency();
57 } 81 }
58 82
59 return next; 83 return next;
60} 84}
61 85
62bool osq_lock(struct optimistic_spin_queue **lock) 86bool osq_lock(struct optimistic_spin_queue *lock)
63{ 87{
64 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); 88 struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
65 struct optimistic_spin_queue *prev, *next; 89 struct optimistic_spin_node *prev, *next;
90 int curr = encode_cpu(smp_processor_id());
91 int old;
66 92
67 node->locked = 0; 93 node->locked = 0;
68 node->next = NULL; 94 node->next = NULL;
95 node->cpu = curr;
69 96
70 node->prev = prev = xchg(lock, node); 97 old = atomic_xchg(&lock->tail, curr);
71 if (likely(prev == NULL)) 98 if (old == OSQ_UNLOCKED_VAL)
72 return true; 99 return true;
73 100
101 prev = decode_cpu(old);
102 node->prev = prev;
74 ACCESS_ONCE(prev->next) = node; 103 ACCESS_ONCE(prev->next) = node;
75 104
76 /* 105 /*
@@ -89,7 +118,7 @@ bool osq_lock(struct optimistic_spin_queue **lock)
89 if (need_resched()) 118 if (need_resched())
90 goto unqueue; 119 goto unqueue;
91 120
92 arch_mutex_cpu_relax(); 121 cpu_relax_lowlatency();
93 } 122 }
94 return true; 123 return true;
95 124
@@ -115,7 +144,7 @@ unqueue:
115 if (smp_load_acquire(&node->locked)) 144 if (smp_load_acquire(&node->locked))
116 return true; 145 return true;
117 146
118 arch_mutex_cpu_relax(); 147 cpu_relax_lowlatency();
119 148
120 /* 149 /*
121 * Or we race against a concurrent unqueue()'s step-B, in which 150 * Or we race against a concurrent unqueue()'s step-B, in which
@@ -149,20 +178,21 @@ unqueue:
149 return false; 178 return false;
150} 179}
151 180
152void osq_unlock(struct optimistic_spin_queue **lock) 181void osq_unlock(struct optimistic_spin_queue *lock)
153{ 182{
154 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); 183 struct optimistic_spin_node *node, *next;
155 struct optimistic_spin_queue *next; 184 int curr = encode_cpu(smp_processor_id());
156 185
157 /* 186 /*
158 * Fast path for the uncontended case. 187 * Fast path for the uncontended case.
159 */ 188 */
160 if (likely(cmpxchg(lock, node, NULL) == node)) 189 if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
161 return; 190 return;
162 191
163 /* 192 /*
164 * Second most likely case. 193 * Second most likely case.
165 */ 194 */
195 node = this_cpu_ptr(&osq_node);
166 next = xchg(&node->next, NULL); 196 next = xchg(&node->next, NULL);
167 if (next) { 197 if (next) {
168 ACCESS_ONCE(next->locked) = 1; 198 ACCESS_ONCE(next->locked) = 1;
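
The rewrite above shrinks the OSQ lock word from a node pointer to an atomic CPU index: 0 (OSQ_UNLOCKED_VAL) means "no waiter", any other value is the waiter's CPU number plus one, decoded back into the per-CPU optimistic_spin_node when needed. A small sketch of that convention; osq_is_locked() is added here purely for illustration and is not part of the hunk:

#include <linux/atomic.h>

#define SKETCH_OSQ_UNLOCKED_VAL	(0)

/* Matches the helpers in the hunk: CPU numbers start at 0, so the
 * stored value is cpu + 1 and 0 is reserved for "empty queue". */
static inline int sketch_encode_cpu(int cpu_nr)
{
	return cpu_nr + 1;
}

static inline int sketch_decode_cpu(int encoded_cpu_val)
{
	return encoded_cpu_val - 1;
}

/* Illustrative helper: the queue is contended whenever the tail holds
 * a non-zero encoded CPU value. */
static inline bool sketch_osq_is_locked(atomic_t *tail)
{
	return atomic_read(tail) != SKETCH_OSQ_UNLOCKED_VAL;
}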
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4aca6b..23e89c5930e9 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -27,7 +27,7 @@ struct mcs_spinlock {
27#define arch_mcs_spin_lock_contended(l) \ 27#define arch_mcs_spin_lock_contended(l) \
28do { \ 28do { \
29 while (!(smp_load_acquire(l))) \ 29 while (!(smp_load_acquire(l))) \
30 arch_mutex_cpu_relax(); \ 30 cpu_relax_lowlatency(); \
31} while (0) 31} while (0)
32#endif 32#endif
33 33
@@ -104,7 +104,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
104 return; 104 return;
105 /* Wait until the next pointer is set */ 105 /* Wait until the next pointer is set */
106 while (!(next = ACCESS_ONCE(node->next))) 106 while (!(next = ACCESS_ONCE(node->next)))
107 arch_mutex_cpu_relax(); 107 cpu_relax_lowlatency();
108 } 108 }
109 109
110 /* Pass lock to next waiter. */ 110 /* Pass lock to next waiter. */
@@ -118,12 +118,13 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
118 * mutex_lock()/rwsem_down_{read,write}() etc. 118 * mutex_lock()/rwsem_down_{read,write}() etc.
119 */ 119 */
120 120
121struct optimistic_spin_queue { 121struct optimistic_spin_node {
122 struct optimistic_spin_queue *next, *prev; 122 struct optimistic_spin_node *next, *prev;
123 int locked; /* 1 if lock acquired */ 123 int locked; /* 1 if lock acquired */
124 int cpu; /* encoded CPU # value */
124}; 125};
125 126
126extern bool osq_lock(struct optimistic_spin_queue **lock); 127extern bool osq_lock(struct optimistic_spin_queue *lock);
127extern void osq_unlock(struct optimistic_spin_queue **lock); 128extern void osq_unlock(struct optimistic_spin_queue *lock);
128 129
129#endif /* __LINUX_MCS_SPINLOCK_H */ 130#endif /* __LINUX_MCS_SPINLOCK_H */
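
Only the per-CPU optimistic_spin_node is visible in this header hunk; the optimistic_spin_queue type it now operates on, and the osq_lock_init() used in mutex.c below, are defined elsewhere in the series. A sketch of what they would look like, under that assumption, using the encoded-CPU tail shown in the mcs_spinlock.c hunk:

#include <linux/atomic.h>

/* Assumed shape of the queue type: a single atomic word replaces the
 * old struct optimistic_spin_queue *lock pointer embedded in the mutex. */
struct sketch_optimistic_spin_queue {
	atomic_t tail;	/* 0 == no waiter, else encoded CPU number + 1 */
};

static inline void sketch_osq_lock_init(struct sketch_optimistic_spin_queue *lock)
{
	atomic_set(&lock->tail, 0 /* OSQ_UNLOCKED_VAL */);
}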
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33c6760..ae712b25e492 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -46,12 +46,6 @@
46# include <asm/mutex.h> 46# include <asm/mutex.h>
47#endif 47#endif
48 48
49/*
50 * A negative mutex count indicates that waiters are sleeping waiting for the
51 * mutex.
52 */
53#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
54
55void 49void
56__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) 50__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
57{ 51{
@@ -60,7 +54,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
60 INIT_LIST_HEAD(&lock->wait_list); 54 INIT_LIST_HEAD(&lock->wait_list);
61 mutex_clear_owner(lock); 55 mutex_clear_owner(lock);
62#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 56#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
63 lock->osq = NULL; 57 osq_lock_init(&lock->osq);
64#endif 58#endif
65 59
66 debug_mutex_init(lock, name, key); 60 debug_mutex_init(lock, name, key);
@@ -152,7 +146,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
152 if (need_resched()) 146 if (need_resched())
153 break; 147 break;
154 148
155 arch_mutex_cpu_relax(); 149 cpu_relax_lowlatency();
156 } 150 }
157 rcu_read_unlock(); 151 rcu_read_unlock();
158 152
@@ -388,12 +382,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
388 /* 382 /*
389 * Optimistic spinning. 383 * Optimistic spinning.
390 * 384 *
391 * We try to spin for acquisition when we find that there are no 385 * We try to spin for acquisition when we find that the lock owner
392 * pending waiters and the lock owner is currently running on a 386 * is currently running on a (different) CPU and while we don't
393 * (different) CPU. 387 * need to reschedule. The rationale is that if the lock owner is
394 * 388 * running, it is likely to release the lock soon.
395 * The rationale is that if the lock owner is running, it is likely to
396 * release the lock soon.
397 * 389 *
398 * Since this needs the lock owner, and this mutex implementation 390 * Since this needs the lock owner, and this mutex implementation
399 * doesn't track the owner atomically in the lock field, we need to 391 * doesn't track the owner atomically in the lock field, we need to
@@ -440,7 +432,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
440 if (owner && !mutex_spin_on_owner(lock, owner)) 432 if (owner && !mutex_spin_on_owner(lock, owner))
441 break; 433 break;
442 434
443 if ((atomic_read(&lock->count) == 1) && 435 /* Try to acquire the mutex if it is unlocked. */
436 if (!mutex_is_locked(lock) &&
444 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 437 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
445 lock_acquired(&lock->dep_map, ip); 438 lock_acquired(&lock->dep_map, ip);
446 if (use_ww_ctx) { 439 if (use_ww_ctx) {
@@ -471,7 +464,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
471 * memory barriers as we'll eventually observe the right 464 * memory barriers as we'll eventually observe the right
472 * values at the cost of a few extra spins. 465 * values at the cost of a few extra spins.
473 */ 466 */
474 arch_mutex_cpu_relax(); 467 cpu_relax_lowlatency();
475 } 468 }
476 osq_unlock(&lock->osq); 469 osq_unlock(&lock->osq);
477slowpath: 470slowpath:
@@ -485,8 +478,11 @@ slowpath:
485#endif 478#endif
486 spin_lock_mutex(&lock->wait_lock, flags); 479 spin_lock_mutex(&lock->wait_lock, flags);
487 480
488 /* once more, can we acquire the lock? */ 481 /*
489 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) 482 * Once more, try to acquire the lock. Only try-lock the mutex if
483 * it is unlocked to reduce unnecessary xchg() operations.
484 */
485 if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1))
490 goto skip_wait; 486 goto skip_wait;
491 487
492 debug_mutex_lock_common(lock, &waiter); 488 debug_mutex_lock_common(lock, &waiter);
@@ -506,9 +502,10 @@ slowpath:
506 * it's unlocked. Later on, if we sleep, this is the 502 * it's unlocked. Later on, if we sleep, this is the
507 * operation that gives us the lock. We xchg it to -1, so 503 * operation that gives us the lock. We xchg it to -1, so
508 * that when we release the lock, we properly wake up the 504 * that when we release the lock, we properly wake up the
509 * other waiters: 505 * other waiters. We only attempt the xchg if the count is
506 * non-negative in order to avoid unnecessary xchg operations:
510 */ 507 */
511 if (MUTEX_SHOW_NO_WAITER(lock) && 508 if (atomic_read(&lock->count) >= 0 &&
512 (atomic_xchg(&lock->count, -1) == 1)) 509 (atomic_xchg(&lock->count, -1) == 1))
513 break; 510 break;
514 511
@@ -823,6 +820,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
823 unsigned long flags; 820 unsigned long flags;
824 int prev; 821 int prev;
825 822
823 /* No need to trylock if the mutex is locked. */
824 if (mutex_is_locked(lock))
825 return 0;
826
826 spin_lock_mutex(&lock->wait_lock, flags); 827 spin_lock_mutex(&lock->wait_lock, flags);
827 828
828 prev = atomic_xchg(&lock->count, -1); 829 prev = atomic_xchg(&lock->count, -1);
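
The MUTEX_SHOW_NO_WAITER() macro is gone; its call sites now read the counter directly (1 means unlocked, 0 locked with no waiters, negative means waiters are queued) so the expensive xchg/cmpxchg is only attempted when it can actually succeed. A condensed sketch of that check-before-atomic pattern, assuming the standard mutex counter semantics; the helper name is illustrative only:

#include <linux/atomic.h>
#include <linux/mutex.h>

/* Sketch: opportunistic acquisition as used in the spinning and
 * slowpath hunks above. Reading the counter first keeps the cache
 * line shared while the lock is visibly held. */
static inline bool sketch_mutex_try_fast(struct mutex *lock)
{
	if (mutex_is_locked(lock))	/* count <= 0: don't dirty the line */
		return false;

	/* A 1 -> 0 transition means we took an uncontended mutex. */
	return atomic_cmpxchg(&lock->count, 1, 0) == 1;
}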
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fb5b8ac411a5..f956ede7f90d 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -20,7 +20,6 @@
20#include <linux/cpumask.h> 20#include <linux/cpumask.h>
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/mutex.h>
24#include <asm/qrwlock.h> 23#include <asm/qrwlock.h>
25 24
26/** 25/**
@@ -35,7 +34,7 @@ static __always_inline void
35rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) 34rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
36{ 35{
37 while ((cnts & _QW_WMASK) == _QW_LOCKED) { 36 while ((cnts & _QW_WMASK) == _QW_LOCKED) {
38 arch_mutex_cpu_relax(); 37 cpu_relax_lowlatency();
39 cnts = smp_load_acquire((u32 *)&lock->cnts); 38 cnts = smp_load_acquire((u32 *)&lock->cnts);
40 } 39 }
41} 40}
@@ -75,7 +74,7 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
75 * to make sure that the write lock isn't taken. 74 * to make sure that the write lock isn't taken.
76 */ 75 */
77 while (atomic_read(&lock->cnts) & _QW_WMASK) 76 while (atomic_read(&lock->cnts) & _QW_WMASK)
78 arch_mutex_cpu_relax(); 77 cpu_relax_lowlatency();
79 78
80 cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; 79 cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
81 rspin_until_writer_unlock(lock, cnts); 80 rspin_until_writer_unlock(lock, cnts);
@@ -114,7 +113,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
114 cnts | _QW_WAITING) == cnts)) 113 cnts | _QW_WAITING) == cnts))
115 break; 114 break;
116 115
117 arch_mutex_cpu_relax(); 116 cpu_relax_lowlatency();
118 } 117 }
119 118
120 /* When no more readers, set the locked flag */ 119 /* When no more readers, set the locked flag */
@@ -125,7 +124,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
125 _QW_LOCKED) == _QW_WAITING)) 124 _QW_LOCKED) == _QW_WAITING))
126 break; 125 break;
127 126
128 arch_mutex_cpu_relax(); 127 cpu_relax_lowlatency();
129 } 128 }
130unlock: 129unlock:
131 arch_spin_unlock(&lock->lock); 130 arch_spin_unlock(&lock->lock);
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 49b2ed3dced8..62b6cee8ea7f 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -66,12 +66,13 @@ void rt_mutex_debug_task_free(struct task_struct *task)
66 * the deadlock. We print when we return. act_waiter can be NULL in 66 * the deadlock. We print when we return. act_waiter can be NULL in
67 * case of a remove waiter operation. 67 * case of a remove waiter operation.
68 */ 68 */
69void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, 69void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
70 struct rt_mutex_waiter *act_waiter,
70 struct rt_mutex *lock) 71 struct rt_mutex *lock)
71{ 72{
72 struct task_struct *task; 73 struct task_struct *task;
73 74
74 if (!debug_locks || detect || !act_waiter) 75 if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
75 return; 76 return;
76 77
77 task = rt_mutex_owner(act_waiter->lock); 78 task = rt_mutex_owner(act_waiter->lock);
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index ab29b6a22669..d0519c3432b6 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -20,14 +20,15 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
20extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, 20extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
21 struct task_struct *powner); 21 struct task_struct *powner);
22extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); 22extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
23extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, 23extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
24 struct rt_mutex_waiter *waiter,
24 struct rt_mutex *lock); 25 struct rt_mutex *lock);
25extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); 26extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
26# define debug_rt_mutex_reset_waiter(w) \ 27# define debug_rt_mutex_reset_waiter(w) \
27 do { (w)->deadlock_lock = NULL; } while (0) 28 do { (w)->deadlock_lock = NULL; } while (0)
28 29
29static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, 30static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
30 int detect) 31 enum rtmutex_chainwalk walk)
31{ 32{
32 return (waiter != NULL); 33 return (waiter != NULL);
33} 34}
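
Both debug helpers now take an enum rtmutex_chainwalk instead of the old int detect flag. The enum itself is not shown in these hunks (it presumably lives in rtmutex_common.h, which this series also touches); the sketch below reconstructs it from the two values used throughout the rtmutex.c diff that follows:

/* Assumed definition, reconstructed from its uses below. */
enum rtmutex_chainwalk {
	RT_MUTEX_MIN_CHAINWALK,		/* stop walking once boosting is done */
	RT_MUTEX_FULL_CHAINWALK,	/* walk the whole chain for deadlock detection */
};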
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index fc605941b9b8..a0ea2a141b3b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -308,6 +308,32 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
308} 308}
309 309
310/* 310/*
311 * Deadlock detection is conditional:
312 *
313 * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
314 * if the detect argument is == RT_MUTEX_FULL_CHAINWALK.
315 *
316 * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always
317 * conducted independent of the detect argument.
318 *
319 * If the waiter argument is NULL this indicates the deboost path and
320 * deadlock detection is disabled independent of the detect argument
321 * and the config settings.
322 */
323static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
324 enum rtmutex_chainwalk chwalk)
325{
326 /*
327 * This is just a wrapper function for the following call,
328 * because debug_rt_mutex_detect_deadlock() smells like a magic
329 * debug feature and I wanted to keep the cond function in the
330 * main source file along with the comments instead of having
331 * two of the same in the headers.
332 */
333 return debug_rt_mutex_detect_deadlock(waiter, chwalk);
334}
335
336/*
311 * Max number of times we'll walk the boosting chain: 337 * Max number of times we'll walk the boosting chain:
312 */ 338 */
313int max_lock_depth = 1024; 339int max_lock_depth = 1024;
@@ -337,21 +363,65 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
337 * @top_task: the current top waiter 363 * @top_task: the current top waiter
338 * 364 *
339 * Returns 0 or -EDEADLK. 365 * Returns 0 or -EDEADLK.
366 *
367 * Chain walk basics and protection scope
368 *
369 * [R] refcount on task
370 * [P] task->pi_lock held
371 * [L] rtmutex->wait_lock held
372 *
373 * Step Description Protected by
374 * function arguments:
375 * @task [R]
376 * @orig_lock if != NULL @top_task is blocked on it
377 * @next_lock Unprotected. Cannot be
378 * dereferenced. Only used for
379 * comparison.
380 * @orig_waiter if != NULL @top_task is blocked on it
381 * @top_task current, or in case of proxy
382 * locking protected by calling
383 * code
384 * again:
385 * loop_sanity_check();
386 * retry:
387 * [1] lock(task->pi_lock); [R] acquire [P]
388 * [2] waiter = task->pi_blocked_on; [P]
389 * [3] check_exit_conditions_1(); [P]
390 * [4] lock = waiter->lock; [P]
391 * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
392 * unlock(task->pi_lock); release [P]
393 * goto retry;
394 * }
395 * [6] check_exit_conditions_2(); [P] + [L]
396 * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
397 * [8] unlock(task->pi_lock); release [P]
398 * put_task_struct(task); release [R]
399 * [9] check_exit_conditions_3(); [L]
400 * [10] task = owner(lock); [L]
401 * get_task_struct(task); [L] acquire [R]
402 * lock(task->pi_lock); [L] acquire [P]
403 * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
404 * [12] check_exit_conditions_4(); [P] + [L]
405 * [13] unlock(task->pi_lock); release [P]
406 * unlock(lock->wait_lock); release [L]
407 * goto again;
340 */ 408 */
341static int rt_mutex_adjust_prio_chain(struct task_struct *task, 409static int rt_mutex_adjust_prio_chain(struct task_struct *task,
342 int deadlock_detect, 410 enum rtmutex_chainwalk chwalk,
343 struct rt_mutex *orig_lock, 411 struct rt_mutex *orig_lock,
344 struct rt_mutex *next_lock, 412 struct rt_mutex *next_lock,
345 struct rt_mutex_waiter *orig_waiter, 413 struct rt_mutex_waiter *orig_waiter,
346 struct task_struct *top_task) 414 struct task_struct *top_task)
347{ 415{
348 struct rt_mutex *lock;
349 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; 416 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
350 int detect_deadlock, ret = 0, depth = 0; 417 struct rt_mutex_waiter *prerequeue_top_waiter;
418 int ret = 0, depth = 0;
419 struct rt_mutex *lock;
420 bool detect_deadlock;
351 unsigned long flags; 421 unsigned long flags;
422 bool requeue = true;
352 423
353 detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, 424 detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
354 deadlock_detect);
355 425
356 /* 426 /*
357 * The (de)boosting is a step by step approach with a lot of 427 * The (de)boosting is a step by step approach with a lot of
@@ -360,6 +430,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
360 * carefully whether things change under us. 430 * carefully whether things change under us.
361 */ 431 */
362 again: 432 again:
433 /*
434 * We limit the lock chain length for each invocation.
435 */
363 if (++depth > max_lock_depth) { 436 if (++depth > max_lock_depth) {
364 static int prev_max; 437 static int prev_max;
365 438
@@ -377,13 +450,28 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
377 450
378 return -EDEADLK; 451 return -EDEADLK;
379 } 452 }
453
454 /*
455 * We are fully preemptible here and only hold the refcount on
456 * @task. So everything can have changed under us since the
457 * caller or our own code below (goto retry/again) dropped all
458 * locks.
459 */
380 retry: 460 retry:
381 /* 461 /*
382 * Task can not go away as we did a get_task() before ! 462 * [1] Task cannot go away as we did a get_task() before !
383 */ 463 */
384 raw_spin_lock_irqsave(&task->pi_lock, flags); 464 raw_spin_lock_irqsave(&task->pi_lock, flags);
385 465
466 /*
467 * [2] Get the waiter on which @task is blocked on.
468 */
386 waiter = task->pi_blocked_on; 469 waiter = task->pi_blocked_on;
470
471 /*
472 * [3] check_exit_conditions_1() protected by task->pi_lock.
473 */
474
387 /* 475 /*
388 * Check whether the end of the boosting chain has been 476 * Check whether the end of the boosting chain has been
389 * reached or the state of the chain has changed while we 477 * reached or the state of the chain has changed while we
@@ -421,20 +509,41 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
421 goto out_unlock_pi; 509 goto out_unlock_pi;
422 /* 510 /*
423 * If deadlock detection is off, we stop here if we 511 * If deadlock detection is off, we stop here if we
424 * are not the top pi waiter of the task. 512 * are not the top pi waiter of the task. If deadlock
513 * detection is enabled we continue, but stop the
514 * requeueing in the chain walk.
425 */ 515 */
426 if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) 516 if (top_waiter != task_top_pi_waiter(task)) {
427 goto out_unlock_pi; 517 if (!detect_deadlock)
518 goto out_unlock_pi;
519 else
520 requeue = false;
521 }
428 } 522 }
429 523
430 /* 524 /*
431 * When deadlock detection is off then we check, if further 525 * If the waiter priority is the same as the task priority
432 * priority adjustment is necessary. 526 * then there is no further priority adjustment necessary. If
527 * deadlock detection is off, we stop the chain walk. If its
528 * enabled we continue, but stop the requeueing in the chain
529 * walk.
433 */ 530 */
434 if (!detect_deadlock && waiter->prio == task->prio) 531 if (waiter->prio == task->prio) {
435 goto out_unlock_pi; 532 if (!detect_deadlock)
533 goto out_unlock_pi;
534 else
535 requeue = false;
536 }
436 537
538 /*
539 * [4] Get the next lock
540 */
437 lock = waiter->lock; 541 lock = waiter->lock;
542 /*
543 * [5] We need to trylock here as we are holding task->pi_lock,
544 * which is the reverse lock order versus the other rtmutex
545 * operations.
546 */
438 if (!raw_spin_trylock(&lock->wait_lock)) { 547 if (!raw_spin_trylock(&lock->wait_lock)) {
439 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 548 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
440 cpu_relax(); 549 cpu_relax();
@@ -442,79 +551,180 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
442 } 551 }
443 552
444 /* 553 /*
554 * [6] check_exit_conditions_2() protected by task->pi_lock and
555 * lock->wait_lock.
556 *
445 * Deadlock detection. If the lock is the same as the original 557 * Deadlock detection. If the lock is the same as the original
446 * lock which caused us to walk the lock chain or if the 558 * lock which caused us to walk the lock chain or if the
447 * current lock is owned by the task which initiated the chain 559 * current lock is owned by the task which initiated the chain
448 * walk, we detected a deadlock. 560 * walk, we detected a deadlock.
449 */ 561 */
450 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 562 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
451 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 563 debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
452 raw_spin_unlock(&lock->wait_lock); 564 raw_spin_unlock(&lock->wait_lock);
453 ret = -EDEADLK; 565 ret = -EDEADLK;
454 goto out_unlock_pi; 566 goto out_unlock_pi;
455 } 567 }
456 568
457 top_waiter = rt_mutex_top_waiter(lock); 569 /*
570 * If we just follow the lock chain for deadlock detection, no
571 * need to do all the requeue operations. To avoid a truckload
572 * of conditionals around the various places below, just do the
573 * minimum chain walk checks.
574 */
575 if (!requeue) {
576 /*
577 * No requeue[7] here. Just release @task [8]
578 */
579 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
580 put_task_struct(task);
581
582 /*
583 * [9] check_exit_conditions_3 protected by lock->wait_lock.
584 * If there is no owner of the lock, end of chain.
585 */
586 if (!rt_mutex_owner(lock)) {
587 raw_spin_unlock(&lock->wait_lock);
588 return 0;
589 }
590
591 /* [10] Grab the next task, i.e. owner of @lock */
592 task = rt_mutex_owner(lock);
593 get_task_struct(task);
594 raw_spin_lock_irqsave(&task->pi_lock, flags);
595
596 /*
597 * No requeue [11] here. We just do deadlock detection.
598 *
599 * [12] Store whether owner is blocked
600 * itself. Decision is made after dropping the locks
601 */
602 next_lock = task_blocked_on_lock(task);
603 /*
604 * Get the top waiter for the next iteration
605 */
606 top_waiter = rt_mutex_top_waiter(lock);
607
608 /* [13] Drop locks */
609 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
610 raw_spin_unlock(&lock->wait_lock);
611
612 /* If owner is not blocked, end of chain. */
613 if (!next_lock)
614 goto out_put_task;
615 goto again;
616 }
458 617
459 /* Requeue the waiter */ 618 /*
619 * Store the current top waiter before doing the requeue
620 * operation on @lock. We need it for the boost/deboost
621 * decision below.
622 */
623 prerequeue_top_waiter = rt_mutex_top_waiter(lock);
624
625 /* [7] Requeue the waiter in the lock waiter list. */
460 rt_mutex_dequeue(lock, waiter); 626 rt_mutex_dequeue(lock, waiter);
461 waiter->prio = task->prio; 627 waiter->prio = task->prio;
462 rt_mutex_enqueue(lock, waiter); 628 rt_mutex_enqueue(lock, waiter);
463 629
464 /* Release the task */ 630 /* [8] Release the task */
465 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 631 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
632 put_task_struct(task);
633
634 /*
635 * [9] check_exit_conditions_3 protected by lock->wait_lock.
636 *
637 * We must abort the chain walk if there is no lock owner even
638 * in the dead lock detection case, as we have nothing to
639 * follow here. This is the end of the chain we are walking.
640 */
466 if (!rt_mutex_owner(lock)) { 641 if (!rt_mutex_owner(lock)) {
467 /* 642 /*
468 * If the requeue above changed the top waiter, then we need 643 * If the requeue [7] above changed the top waiter,
469 * to wake the new top waiter up to try to get the lock. 644 * then we need to wake the new top waiter up to try
645 * to get the lock.
470 */ 646 */
471 647 if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
472 if (top_waiter != rt_mutex_top_waiter(lock))
473 wake_up_process(rt_mutex_top_waiter(lock)->task); 648 wake_up_process(rt_mutex_top_waiter(lock)->task);
474 raw_spin_unlock(&lock->wait_lock); 649 raw_spin_unlock(&lock->wait_lock);
475 goto out_put_task; 650 return 0;
476 } 651 }
477 put_task_struct(task);
478 652
479 /* Grab the next task */ 653 /* [10] Grab the next task, i.e. the owner of @lock */
480 task = rt_mutex_owner(lock); 654 task = rt_mutex_owner(lock);
481 get_task_struct(task); 655 get_task_struct(task);
482 raw_spin_lock_irqsave(&task->pi_lock, flags); 656 raw_spin_lock_irqsave(&task->pi_lock, flags);
483 657
658 /* [11] requeue the pi waiters if necessary */
484 if (waiter == rt_mutex_top_waiter(lock)) { 659 if (waiter == rt_mutex_top_waiter(lock)) {
485 /* Boost the owner */ 660 /*
486 rt_mutex_dequeue_pi(task, top_waiter); 661 * The waiter became the new top (highest priority)
662 * waiter on the lock. Replace the previous top waiter
663 * in the owner tasks pi waiters list with this waiter
664 * and adjust the priority of the owner.
665 */
666 rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
487 rt_mutex_enqueue_pi(task, waiter); 667 rt_mutex_enqueue_pi(task, waiter);
488 __rt_mutex_adjust_prio(task); 668 __rt_mutex_adjust_prio(task);
489 669
490 } else if (top_waiter == waiter) { 670 } else if (prerequeue_top_waiter == waiter) {
491 /* Deboost the owner */ 671 /*
672 * The waiter was the top waiter on the lock, but is
 673 * no longer the top priority waiter. Replace waiter in
674 * the owner tasks pi waiters list with the new top
675 * (highest priority) waiter and adjust the priority
676 * of the owner.
677 * The new top waiter is stored in @waiter so that
678 * @waiter == @top_waiter evaluates to true below and
679 * we continue to deboost the rest of the chain.
680 */
492 rt_mutex_dequeue_pi(task, waiter); 681 rt_mutex_dequeue_pi(task, waiter);
493 waiter = rt_mutex_top_waiter(lock); 682 waiter = rt_mutex_top_waiter(lock);
494 rt_mutex_enqueue_pi(task, waiter); 683 rt_mutex_enqueue_pi(task, waiter);
495 __rt_mutex_adjust_prio(task); 684 __rt_mutex_adjust_prio(task);
685 } else {
686 /*
687 * Nothing changed. No need to do any priority
688 * adjustment.
689 */
496 } 690 }
497 691
498 /* 692 /*
693 * [12] check_exit_conditions_4() protected by task->pi_lock
694 * and lock->wait_lock. The actual decisions are made after we
695 * dropped the locks.
696 *
499 * Check whether the task which owns the current lock is pi 697 * Check whether the task which owns the current lock is pi
500 * blocked itself. If yes we store a pointer to the lock for 698 * blocked itself. If yes we store a pointer to the lock for
501 * the lock chain change detection above. After we dropped 699 * the lock chain change detection above. After we dropped
502 * task->pi_lock next_lock cannot be dereferenced anymore. 700 * task->pi_lock next_lock cannot be dereferenced anymore.
503 */ 701 */
504 next_lock = task_blocked_on_lock(task); 702 next_lock = task_blocked_on_lock(task);
703 /*
704 * Store the top waiter of @lock for the end of chain walk
705 * decision below.
706 */
707 top_waiter = rt_mutex_top_waiter(lock);
505 708
709 /* [13] Drop the locks */
506 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 710 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
507
508 top_waiter = rt_mutex_top_waiter(lock);
509 raw_spin_unlock(&lock->wait_lock); 711 raw_spin_unlock(&lock->wait_lock);
510 712
511 /* 713 /*
714 * Make the actual exit decisions [12], based on the stored
715 * values.
716 *
512 * We reached the end of the lock chain. Stop right here. No 717 * We reached the end of the lock chain. Stop right here. No
513 * point to go back just to figure that out. 718 * point to go back just to figure that out.
514 */ 719 */
515 if (!next_lock) 720 if (!next_lock)
516 goto out_put_task; 721 goto out_put_task;
517 722
723 /*
724 * If the current waiter is not the top waiter on the lock,
725 * then we can stop the chain walk here if we are not in full
726 * deadlock detection mode.
727 */
518 if (!detect_deadlock && waiter != top_waiter) 728 if (!detect_deadlock && waiter != top_waiter)
519 goto out_put_task; 729 goto out_put_task;
520 730
@@ -533,76 +743,119 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
533 * 743 *
534 * Must be called with lock->wait_lock held. 744 * Must be called with lock->wait_lock held.
535 * 745 *
536 * @lock: the lock to be acquired. 746 * @lock: The lock to be acquired.
537 * @task: the task which wants to acquire the lock 747 * @task: The task which wants to acquire the lock
538 * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) 748 * @waiter: The waiter that is queued to the lock's wait list if the
749 * callsite called task_blocked_on_lock(), otherwise NULL
539 */ 750 */
540static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, 751static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
541 struct rt_mutex_waiter *waiter) 752 struct rt_mutex_waiter *waiter)
542{ 753{
754 unsigned long flags;
755
543 /* 756 /*
544 * We have to be careful here if the atomic speedups are 757 * Before testing whether we can acquire @lock, we set the
545 * enabled, such that, when 758 * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
546 * - no other waiter is on the lock 759 * other tasks which try to modify @lock into the slow path
547 * - the lock has been released since we did the cmpxchg 760 * and they serialize on @lock->wait_lock.
548 * the lock can be released or taken while we are doing the 761 *
549 * checks and marking the lock with RT_MUTEX_HAS_WAITERS. 762 * The RT_MUTEX_HAS_WAITERS bit can have a transitional state
763 * as explained at the top of this file if and only if:
550 * 764 *
551 * The atomic acquire/release aware variant of 765 * - There is a lock owner. The caller must fixup the
552 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting 766 * transient state if it does a trylock or leaves the lock
553 * the WAITERS bit, the atomic release / acquire can not 767 * function due to a signal or timeout.
554 * happen anymore and lock->wait_lock protects us from the
555 * non-atomic case.
556 * 768 *
557 * Note, that this might set lock->owner = 769 * - @task acquires the lock and there are no other
558 * RT_MUTEX_HAS_WAITERS in the case the lock is not contended 770 * waiters. This is undone in rt_mutex_set_owner(@task) at
559 * any more. This is fixed up when we take the ownership. 771 * the end of this function.
560 * This is the transitional state explained at the top of this file.
561 */ 772 */
562 mark_rt_mutex_waiters(lock); 773 mark_rt_mutex_waiters(lock);
563 774
775 /*
776 * If @lock has an owner, give up.
777 */
564 if (rt_mutex_owner(lock)) 778 if (rt_mutex_owner(lock))
565 return 0; 779 return 0;
566 780
567 /* 781 /*
568 * It will get the lock because of one of these conditions: 782 * If @waiter != NULL, @task has already enqueued the waiter
569 * 1) there is no waiter 783 * into @lock waiter list. If @waiter == NULL then this is a
570 * 2) higher priority than waiters 784 * trylock attempt.
571 * 3) it is top waiter
572 */ 785 */
573 if (rt_mutex_has_waiters(lock)) { 786 if (waiter) {
574 if (task->prio >= rt_mutex_top_waiter(lock)->prio) { 787 /*
575 if (!waiter || waiter != rt_mutex_top_waiter(lock)) 788 * If waiter is not the highest priority waiter of
576 return 0; 789 * @lock, give up.
577 } 790 */
578 } 791 if (waiter != rt_mutex_top_waiter(lock))
579 792 return 0;
580 if (waiter || rt_mutex_has_waiters(lock)) {
581 unsigned long flags;
582 struct rt_mutex_waiter *top;
583
584 raw_spin_lock_irqsave(&task->pi_lock, flags);
585 793
586 /* remove the queued waiter. */ 794 /*
587 if (waiter) { 795 * We can acquire the lock. Remove the waiter from the
588 rt_mutex_dequeue(lock, waiter); 796 * lock waiters list.
589 task->pi_blocked_on = NULL; 797 */
590 } 798 rt_mutex_dequeue(lock, waiter);
591 799
800 } else {
592 /* 801 /*
593 * We have to enqueue the top waiter(if it exists) into 802 * If the lock has waiters already we check whether @task is
594 * task->pi_waiters list. 803 * eligible to take over the lock.
804 *
805 * If there are no other waiters, @task can acquire
806 * the lock. @task->pi_blocked_on is NULL, so it does
807 * not need to be dequeued.
595 */ 808 */
596 if (rt_mutex_has_waiters(lock)) { 809 if (rt_mutex_has_waiters(lock)) {
597 top = rt_mutex_top_waiter(lock); 810 /*
598 rt_mutex_enqueue_pi(task, top); 811 * If @task->prio is greater than or equal to
812 * the top waiter priority (kernel view),
813 * @task lost.
814 */
815 if (task->prio >= rt_mutex_top_waiter(lock)->prio)
816 return 0;
817
818 /*
819 * The current top waiter stays enqueued. We
820 * don't have to change anything in the lock
821 * waiters order.
822 */
823 } else {
824 /*
825 * No waiters. Take the lock without the
 826 * pi_lock dance. @task->pi_blocked_on is NULL
827 * and we have no waiters to enqueue in @task
828 * pi waiters list.
829 */
830 goto takeit;
599 } 831 }
600 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
601 } 832 }
602 833
834 /*
835 * Clear @task->pi_blocked_on. Requires protection by
836 * @task->pi_lock. Redundant operation for the @waiter == NULL
837 * case, but conditionals are more expensive than a redundant
838 * store.
839 */
840 raw_spin_lock_irqsave(&task->pi_lock, flags);
841 task->pi_blocked_on = NULL;
842 /*
843 * Finish the lock acquisition. @task is the new owner. If
844 * other waiters exist we have to insert the highest priority
845 * waiter into @task->pi_waiters list.
846 */
847 if (rt_mutex_has_waiters(lock))
848 rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
849 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
850
851takeit:
603 /* We got the lock. */ 852 /* We got the lock. */
604 debug_rt_mutex_lock(lock); 853 debug_rt_mutex_lock(lock);
605 854
855 /*
856 * This either preserves the RT_MUTEX_HAS_WAITERS bit if there
857 * are still waiters or clears it.
858 */
606 rt_mutex_set_owner(lock, task); 859 rt_mutex_set_owner(lock, task);
607 860
608 rt_mutex_deadlock_account_lock(lock, task); 861 rt_mutex_deadlock_account_lock(lock, task);
@@ -620,7 +873,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
620static int task_blocks_on_rt_mutex(struct rt_mutex *lock, 873static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
621 struct rt_mutex_waiter *waiter, 874 struct rt_mutex_waiter *waiter,
622 struct task_struct *task, 875 struct task_struct *task,
623 int detect_deadlock) 876 enum rtmutex_chainwalk chwalk)
624{ 877{
625 struct task_struct *owner = rt_mutex_owner(lock); 878 struct task_struct *owner = rt_mutex_owner(lock);
626 struct rt_mutex_waiter *top_waiter = waiter; 879 struct rt_mutex_waiter *top_waiter = waiter;
@@ -666,7 +919,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
666 __rt_mutex_adjust_prio(owner); 919 __rt_mutex_adjust_prio(owner);
667 if (owner->pi_blocked_on) 920 if (owner->pi_blocked_on)
668 chain_walk = 1; 921 chain_walk = 1;
669 } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { 922 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
670 chain_walk = 1; 923 chain_walk = 1;
671 } 924 }
672 925
@@ -691,7 +944,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
691 944
692 raw_spin_unlock(&lock->wait_lock); 945 raw_spin_unlock(&lock->wait_lock);
693 946
694 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, 947 res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
695 next_lock, waiter, task); 948 next_lock, waiter, task);
696 949
697 raw_spin_lock(&lock->wait_lock); 950 raw_spin_lock(&lock->wait_lock);
@@ -753,9 +1006,9 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
753static void remove_waiter(struct rt_mutex *lock, 1006static void remove_waiter(struct rt_mutex *lock,
754 struct rt_mutex_waiter *waiter) 1007 struct rt_mutex_waiter *waiter)
755{ 1008{
756 int first = (waiter == rt_mutex_top_waiter(lock)); 1009 bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
757 struct task_struct *owner = rt_mutex_owner(lock); 1010 struct task_struct *owner = rt_mutex_owner(lock);
758 struct rt_mutex *next_lock = NULL; 1011 struct rt_mutex *next_lock;
759 unsigned long flags; 1012 unsigned long flags;
760 1013
761 raw_spin_lock_irqsave(&current->pi_lock, flags); 1014 raw_spin_lock_irqsave(&current->pi_lock, flags);
@@ -763,29 +1016,31 @@ static void remove_waiter(struct rt_mutex *lock,
763 current->pi_blocked_on = NULL; 1016 current->pi_blocked_on = NULL;
764 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 1017 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
765 1018
766 if (!owner) 1019 /*
1020 * Only update priority if the waiter was the highest priority
1021 * waiter of the lock and there is an owner to update.
1022 */
1023 if (!owner || !is_top_waiter)
767 return; 1024 return;
768 1025
769 if (first) { 1026 raw_spin_lock_irqsave(&owner->pi_lock, flags);
770
771 raw_spin_lock_irqsave(&owner->pi_lock, flags);
772 1027
773 rt_mutex_dequeue_pi(owner, waiter); 1028 rt_mutex_dequeue_pi(owner, waiter);
774 1029
775 if (rt_mutex_has_waiters(lock)) { 1030 if (rt_mutex_has_waiters(lock))
776 struct rt_mutex_waiter *next; 1031 rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
777 1032
778 next = rt_mutex_top_waiter(lock); 1033 __rt_mutex_adjust_prio(owner);
779 rt_mutex_enqueue_pi(owner, next);
780 }
781 __rt_mutex_adjust_prio(owner);
782 1034
783 /* Store the lock on which owner is blocked or NULL */ 1035 /* Store the lock on which owner is blocked or NULL */
784 next_lock = task_blocked_on_lock(owner); 1036 next_lock = task_blocked_on_lock(owner);
785 1037
786 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 1038 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
787 }
788 1039
1040 /*
1041 * Don't walk the chain, if the owner task is not blocked
1042 * itself.
1043 */
789 if (!next_lock) 1044 if (!next_lock)
790 return; 1045 return;
791 1046
@@ -794,7 +1049,8 @@ static void remove_waiter(struct rt_mutex *lock,
794 1049
795 raw_spin_unlock(&lock->wait_lock); 1050 raw_spin_unlock(&lock->wait_lock);
796 1051
797 rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); 1052 rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
1053 next_lock, NULL, current);
798 1054
799 raw_spin_lock(&lock->wait_lock); 1055 raw_spin_lock(&lock->wait_lock);
800} 1056}
@@ -824,7 +1080,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
824 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 1080 /* gets dropped in rt_mutex_adjust_prio_chain()! */
825 get_task_struct(task); 1081 get_task_struct(task);
826 1082
827 rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); 1083 rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
1084 next_lock, NULL, task);
828} 1085}
829 1086
830/** 1087/**
@@ -902,7 +1159,7 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
902static int __sched 1159static int __sched
903rt_mutex_slowlock(struct rt_mutex *lock, int state, 1160rt_mutex_slowlock(struct rt_mutex *lock, int state,
904 struct hrtimer_sleeper *timeout, 1161 struct hrtimer_sleeper *timeout,
905 int detect_deadlock) 1162 enum rtmutex_chainwalk chwalk)
906{ 1163{
907 struct rt_mutex_waiter waiter; 1164 struct rt_mutex_waiter waiter;
908 int ret = 0; 1165 int ret = 0;
@@ -928,7 +1185,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
928 timeout->task = NULL; 1185 timeout->task = NULL;
929 } 1186 }
930 1187
931 ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); 1188 ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
932 1189
933 if (likely(!ret)) 1190 if (likely(!ret))
934 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); 1191 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
@@ -937,7 +1194,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
937 1194
938 if (unlikely(ret)) { 1195 if (unlikely(ret)) {
939 remove_waiter(lock, &waiter); 1196 remove_waiter(lock, &waiter);
940 rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); 1197 rt_mutex_handle_deadlock(ret, chwalk, &waiter);
941 } 1198 }
942 1199
943 /* 1200 /*
@@ -960,22 +1217,31 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
960/* 1217/*
961 * Slow path try-lock function: 1218 * Slow path try-lock function:
962 */ 1219 */
963static inline int 1220static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
964rt_mutex_slowtrylock(struct rt_mutex *lock)
965{ 1221{
966 int ret = 0; 1222 int ret;
1223
1224 /*
1225 * If the lock already has an owner we fail to get the lock.
1226 * This can be done without taking the @lock->wait_lock as
1227 * it is only being read, and this is a trylock anyway.
1228 */
1229 if (rt_mutex_owner(lock))
1230 return 0;
967 1231
1232 /*
1233 * The mutex has currently no owner. Lock the wait lock and
1234 * try to acquire the lock.
1235 */
968 raw_spin_lock(&lock->wait_lock); 1236 raw_spin_lock(&lock->wait_lock);
969 1237
970 if (likely(rt_mutex_owner(lock) != current)) { 1238 ret = try_to_take_rt_mutex(lock, current, NULL);
971 1239
972 ret = try_to_take_rt_mutex(lock, current, NULL); 1240 /*
973 /* 1241 * try_to_take_rt_mutex() sets the lock waiters bit
974 * try_to_take_rt_mutex() sets the lock waiters 1242 * unconditionally. Clean this up.
975 * bit unconditionally. Clean this up. 1243 */
976 */ 1244 fixup_rt_mutex_waiters(lock);
977 fixup_rt_mutex_waiters(lock);
978 }
979 1245
980 raw_spin_unlock(&lock->wait_lock); 1246 raw_spin_unlock(&lock->wait_lock);
981 1247
@@ -1053,30 +1319,31 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
1053 */ 1319 */
1054static inline int 1320static inline int
1055rt_mutex_fastlock(struct rt_mutex *lock, int state, 1321rt_mutex_fastlock(struct rt_mutex *lock, int state,
1056 int detect_deadlock,
1057 int (*slowfn)(struct rt_mutex *lock, int state, 1322 int (*slowfn)(struct rt_mutex *lock, int state,
1058 struct hrtimer_sleeper *timeout, 1323 struct hrtimer_sleeper *timeout,
1059 int detect_deadlock)) 1324 enum rtmutex_chainwalk chwalk))
1060{ 1325{
1061 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { 1326 if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
1062 rt_mutex_deadlock_account_lock(lock, current); 1327 rt_mutex_deadlock_account_lock(lock, current);
1063 return 0; 1328 return 0;
1064 } else 1329 } else
1065 return slowfn(lock, state, NULL, detect_deadlock); 1330 return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
1066} 1331}
1067 1332
1068static inline int 1333static inline int
1069rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, 1334rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
1070 struct hrtimer_sleeper *timeout, int detect_deadlock, 1335 struct hrtimer_sleeper *timeout,
1336 enum rtmutex_chainwalk chwalk,
1071 int (*slowfn)(struct rt_mutex *lock, int state, 1337 int (*slowfn)(struct rt_mutex *lock, int state,
1072 struct hrtimer_sleeper *timeout, 1338 struct hrtimer_sleeper *timeout,
1073 int detect_deadlock)) 1339 enum rtmutex_chainwalk chwalk))
1074{ 1340{
1075 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { 1341 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
1342 likely(rt_mutex_cmpxchg(lock, NULL, current))) {
1076 rt_mutex_deadlock_account_lock(lock, current); 1343 rt_mutex_deadlock_account_lock(lock, current);
1077 return 0; 1344 return 0;
1078 } else 1345 } else
1079 return slowfn(lock, state, timeout, detect_deadlock); 1346 return slowfn(lock, state, timeout, chwalk);
1080} 1347}
1081 1348
1082static inline int 1349static inline int
@@ -1109,54 +1376,61 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
1109{ 1376{
1110 might_sleep(); 1377 might_sleep();
1111 1378
1112 rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); 1379 rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
1113} 1380}
1114EXPORT_SYMBOL_GPL(rt_mutex_lock); 1381EXPORT_SYMBOL_GPL(rt_mutex_lock);
1115 1382
1116/** 1383/**
1117 * rt_mutex_lock_interruptible - lock a rt_mutex interruptible 1384 * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
1118 * 1385 *
1119 * @lock: the rt_mutex to be locked 1386 * @lock: the rt_mutex to be locked
1120 * @detect_deadlock: deadlock detection on/off
1121 * 1387 *
1122 * Returns: 1388 * Returns:
1123 * 0 on success 1389 * 0 on success
1124 * -EINTR when interrupted by a signal 1390 * -EINTR when interrupted by a signal
1125 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
1126 */ 1391 */
1127int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, 1392int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
1128 int detect_deadlock)
1129{ 1393{
1130 might_sleep(); 1394 might_sleep();
1131 1395
1132 return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, 1396 return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
1133 detect_deadlock, rt_mutex_slowlock);
1134} 1397}
1135EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); 1398EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
1136 1399
1400/*
1401 * Futex variant with full deadlock detection.
1402 */
1403int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
1404 struct hrtimer_sleeper *timeout)
1405{
1406 might_sleep();
1407
1408 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
1409 RT_MUTEX_FULL_CHAINWALK,
1410 rt_mutex_slowlock);
1411}
1412
1137/** 1413/**
1138 * rt_mutex_timed_lock - lock a rt_mutex interruptible 1414 * rt_mutex_timed_lock - lock a rt_mutex interruptible
1139 * the timeout structure is provided 1415 * the timeout structure is provided
1140 * by the caller 1416 * by the caller
1141 * 1417 *
1142 * @lock: the rt_mutex to be locked 1418 * @lock: the rt_mutex to be locked
1143 * @timeout: timeout structure or NULL (no timeout) 1419 * @timeout: timeout structure or NULL (no timeout)
1144 * @detect_deadlock: deadlock detection on/off
1145 * 1420 *
1146 * Returns: 1421 * Returns:
1147 * 0 on success 1422 * 0 on success
1148 * -EINTR when interrupted by a signal 1423 * -EINTR when interrupted by a signal
1149 * -ETIMEDOUT when the timeout expired 1424 * -ETIMEDOUT when the timeout expired
1150 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
1151 */ 1425 */
1152int 1426int
1153rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, 1427rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
1154 int detect_deadlock)
1155{ 1428{
1156 might_sleep(); 1429 might_sleep();
1157 1430
1158 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, 1431 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
1159 detect_deadlock, rt_mutex_slowlock); 1432 RT_MUTEX_MIN_CHAINWALK,
1433 rt_mutex_slowlock);
1160} 1434}
1161EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); 1435EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
1162 1436
@@ -1262,7 +1536,6 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1262 * @lock: the rt_mutex to take 1536 * @lock: the rt_mutex to take
1263 * @waiter: the pre-initialized rt_mutex_waiter 1537 * @waiter: the pre-initialized rt_mutex_waiter
1264 * @task: the task to prepare 1538 * @task: the task to prepare
1265 * @detect_deadlock: perform deadlock detection (1) or not (0)
1266 * 1539 *
1267 * Returns: 1540 * Returns:
1268 * 0 - task blocked on lock 1541 * 0 - task blocked on lock
@@ -1273,7 +1546,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1273 */ 1546 */
1274int rt_mutex_start_proxy_lock(struct rt_mutex *lock, 1547int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1275 struct rt_mutex_waiter *waiter, 1548 struct rt_mutex_waiter *waiter,
1276 struct task_struct *task, int detect_deadlock) 1549 struct task_struct *task)
1277{ 1550{
1278 int ret; 1551 int ret;
1279 1552
@@ -1285,7 +1558,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1285 } 1558 }
1286 1559
1287 /* We enforce deadlock detection for futexes */ 1560 /* We enforce deadlock detection for futexes */
1288 ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); 1561 ret = task_blocks_on_rt_mutex(lock, waiter, task,
1562 RT_MUTEX_FULL_CHAINWALK);
1289 1563
1290 if (ret && !rt_mutex_owner(lock)) { 1564 if (ret && !rt_mutex_owner(lock)) {
1291 /* 1565 /*
@@ -1331,22 +1605,20 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
1331 * rt_mutex_finish_proxy_lock() - Complete lock acquisition 1605 * rt_mutex_finish_proxy_lock() - Complete lock acquisition
1332 * @lock: the rt_mutex we were woken on 1606 * @lock: the rt_mutex we were woken on
1333 * @to: the timeout, null if none. hrtimer should already have 1607 * @to: the timeout, null if none. hrtimer should already have
1334 * been started. 1608 * been started.
1335 * @waiter: the pre-initialized rt_mutex_waiter 1609 * @waiter: the pre-initialized rt_mutex_waiter
1336 * @detect_deadlock: perform deadlock detection (1) or not (0)
1337 * 1610 *
1338 * Complete the lock acquisition started our behalf by another thread. 1611 * Complete the lock acquisition started our behalf by another thread.
1339 * 1612 *
1340 * Returns: 1613 * Returns:
1341 * 0 - success 1614 * 0 - success
1342 * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK 1615 * <0 - error, one of -EINTR, -ETIMEDOUT
1343 * 1616 *
1344 * Special API call for PI-futex requeue support 1617 * Special API call for PI-futex requeue support
1345 */ 1618 */
1346int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, 1619int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1347 struct hrtimer_sleeper *to, 1620 struct hrtimer_sleeper *to,
1348 struct rt_mutex_waiter *waiter, 1621 struct rt_mutex_waiter *waiter)
1349 int detect_deadlock)
1350{ 1622{
1351 int ret; 1623 int ret;
1352 1624
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index f6a1f3c133b1..c4060584c407 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -22,10 +22,15 @@
22#define debug_rt_mutex_init(m, n) do { } while (0) 22#define debug_rt_mutex_init(m, n) do { } while (0)
23#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) 23#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
24#define debug_rt_mutex_print_deadlock(w) do { } while (0) 24#define debug_rt_mutex_print_deadlock(w) do { } while (0)
25#define debug_rt_mutex_detect_deadlock(w,d) (d)
26#define debug_rt_mutex_reset_waiter(w) do { } while (0) 25#define debug_rt_mutex_reset_waiter(w) do { } while (0)
27 26
28static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) 27static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
29{ 28{
30 WARN(1, "rtmutex deadlock detected\n"); 29 WARN(1, "rtmutex deadlock detected\n");
31} 30}
31
32static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
33 enum rtmutex_chainwalk walk)
34{
35 return walk == RT_MUTEX_FULL_CHAINWALK;
36}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 7431a9c86f35..855212501407 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -102,6 +102,21 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
102} 102}
103 103
104/* 104/*
105 * Constants for rt mutex functions which have a selectable deadlock
106 * detection.
107 *
108 * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are
109 * no further PI adjustments to be made.
110 *
111 * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full
112 * walk of the lock chain.
113 */
114enum rtmutex_chainwalk {
115 RT_MUTEX_MIN_CHAINWALK,
116 RT_MUTEX_FULL_CHAINWALK,
117};
118
119/*
105 * PI-futex support (proxy locking functions, etc.): 120 * PI-futex support (proxy locking functions, etc.):
106 */ 121 */
107extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); 122extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -111,12 +126,11 @@ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
111 struct task_struct *proxy_owner); 126 struct task_struct *proxy_owner);
112extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, 127extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
113 struct rt_mutex_waiter *waiter, 128 struct rt_mutex_waiter *waiter,
114 struct task_struct *task, 129 struct task_struct *task);
115 int detect_deadlock);
116extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, 130extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
117 struct hrtimer_sleeper *to, 131 struct hrtimer_sleeper *to,
118 struct rt_mutex_waiter *waiter, 132 struct rt_mutex_waiter *waiter);
119 int detect_deadlock); 133extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
120 134
121#ifdef CONFIG_DEBUG_RT_MUTEXES 135#ifdef CONFIG_DEBUG_RT_MUTEXES
122# include "rtmutex-debug.h" 136# include "rtmutex-debug.h"
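Taken together, the rtmutex.h and rtmutex_common.h hunks replace the old integer detect_deadlock flag with the two chain-walk constants: callers request either the minimal walk or the full deadlock-detecting walk, and the debug configuration is meant to promote every walk to a full one. A hedged sketch of that selection, with a hypothetical TOY_DEBUG_RT_MUTEXES macro standing in for CONFIG_DEBUG_RT_MUTEXES:

#include <stdbool.h>
#include <stdio.h>

enum toy_chainwalk { TOY_MIN_CHAINWALK, TOY_FULL_CHAINWALK };

/*
 * Toy version of debug_rt_mutex_detect_deadlock(): with TOY_DEBUG_RT_MUTEXES
 * defined, every chain walk does deadlock detection; otherwise only callers
 * that explicitly ask for the full walk (e.g. the futex paths) get it.
 */
static bool toy_detect_deadlock(enum toy_chainwalk walk)
{
#ifdef TOY_DEBUG_RT_MUTEXES
    (void)walk;
    return true;
#else
    return walk == TOY_FULL_CHAINWALK;
#endif
}

int main(void)
{
    printf("min  -> detect=%d\n", toy_detect_deadlock(TOY_MIN_CHAINWALK));
    printf("full -> detect=%d\n", toy_detect_deadlock(TOY_FULL_CHAINWALK));
    return 0;
}

Building the sketch with -DTOY_DEBUG_RT_MUTEXES makes both calls report detection, which is the behavior the debug build is intended to have.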
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 9be8a9144978..2c93571162cb 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem)
26 unsigned long flags; 26 unsigned long flags;
27 27
28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { 28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
29 ret = (sem->activity != 0); 29 ret = (sem->count != 0);
30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
31 } 31 }
32 return ret; 32 return ret;
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
46 debug_check_no_locks_freed((void *)sem, sizeof(*sem)); 46 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
47 lockdep_init_map(&sem->dep_map, name, key, 0); 47 lockdep_init_map(&sem->dep_map, name, key, 0);
48#endif 48#endif
49 sem->activity = 0; 49 sem->count = 0;
50 raw_spin_lock_init(&sem->wait_lock); 50 raw_spin_lock_init(&sem->wait_lock);
51 INIT_LIST_HEAD(&sem->wait_list); 51 INIT_LIST_HEAD(&sem->wait_list);
52} 52}
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
95 waiter = list_entry(next, struct rwsem_waiter, list); 95 waiter = list_entry(next, struct rwsem_waiter, list);
96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE); 96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
97 97
98 sem->activity += woken; 98 sem->count += woken;
99 99
100 out: 100 out:
101 return sem; 101 return sem;
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem)
126 126
127 raw_spin_lock_irqsave(&sem->wait_lock, flags); 127 raw_spin_lock_irqsave(&sem->wait_lock, flags);
128 128
129 if (sem->activity >= 0 && list_empty(&sem->wait_list)) { 129 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
130 /* granted */ 130 /* granted */
131 sem->activity++; 131 sem->count++;
132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
133 goto out; 133 goto out;
134 } 134 }
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem)
170 170
171 raw_spin_lock_irqsave(&sem->wait_lock, flags); 171 raw_spin_lock_irqsave(&sem->wait_lock, flags);
172 172
173 if (sem->activity >= 0 && list_empty(&sem->wait_list)) { 173 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
174 /* granted */ 174 /* granted */
175 sem->activity++; 175 sem->count++;
176 ret = 1; 176 ret = 1;
177 } 177 }
178 178
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
206 * itself into sleep and waiting for system woke it or someone 206 * itself into sleep and waiting for system woke it or someone
207 * else in the head of the wait list up. 207 * else in the head of the wait list up.
208 */ 208 */
209 if (sem->activity == 0) 209 if (sem->count == 0)
210 break; 210 break;
211 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 211 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
214 raw_spin_lock_irqsave(&sem->wait_lock, flags); 214 raw_spin_lock_irqsave(&sem->wait_lock, flags);
215 } 215 }
216 /* got the lock */ 216 /* got the lock */
217 sem->activity = -1; 217 sem->count = -1;
218 list_del(&waiter.list); 218 list_del(&waiter.list);
219 219
220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem)
235 235
236 raw_spin_lock_irqsave(&sem->wait_lock, flags); 236 raw_spin_lock_irqsave(&sem->wait_lock, flags);
237 237
238 if (sem->activity == 0) { 238 if (sem->count == 0) {
239 /* got the lock */ 239 /* got the lock */
240 sem->activity = -1; 240 sem->count = -1;
241 ret = 1; 241 ret = 1;
242 } 242 }
243 243
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem)
255 255
256 raw_spin_lock_irqsave(&sem->wait_lock, flags); 256 raw_spin_lock_irqsave(&sem->wait_lock, flags);
257 257
258 if (--sem->activity == 0 && !list_empty(&sem->wait_list)) 258 if (--sem->count == 0 && !list_empty(&sem->wait_list))
259 sem = __rwsem_wake_one_writer(sem); 259 sem = __rwsem_wake_one_writer(sem);
260 260
261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem)
270 270
271 raw_spin_lock_irqsave(&sem->wait_lock, flags); 271 raw_spin_lock_irqsave(&sem->wait_lock, flags);
272 272
273 sem->activity = 0; 273 sem->count = 0;
274 if (!list_empty(&sem->wait_list)) 274 if (!list_empty(&sem->wait_list))
275 sem = __rwsem_do_wake(sem, 1); 275 sem = __rwsem_do_wake(sem, 1);
276 276
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem)
287 287
288 raw_spin_lock_irqsave(&sem->wait_lock, flags); 288 raw_spin_lock_irqsave(&sem->wait_lock, flags);
289 289
290 sem->activity = 1; 290 sem->count = 1;
291 if (!list_empty(&sem->wait_list)) 291 if (!list_empty(&sem->wait_list))
292 sem = __rwsem_do_wake(sem, 0); 292 sem = __rwsem_do_wake(sem, 0);
293 293
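After the rename, the spinlock-based rwsem keeps its whole state in ->count: 0 means unlocked, a positive value counts active readers, and -1 marks a writer. A small userspace model of just the trylock paths (a pthread mutex replaces the wait_lock, there is no wait list, and all toy_* names are invented):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_rwsem {
    pthread_mutex_t wait_lock;
    int count;      /* 0 = unlocked, >0 = readers, -1 = writer */
};

static bool toy_down_read_trylock(struct toy_rwsem *sem)
{
    bool ok = false;

    pthread_mutex_lock(&sem->wait_lock);
    if (sem->count >= 0) {          /* no writer: grant another reader */
        sem->count++;
        ok = true;
    }
    pthread_mutex_unlock(&sem->wait_lock);
    return ok;
}

static bool toy_down_write_trylock(struct toy_rwsem *sem)
{
    bool ok = false;

    pthread_mutex_lock(&sem->wait_lock);
    if (sem->count == 0) {          /* completely idle: take it exclusively */
        sem->count = -1;
        ok = true;
    }
    pthread_mutex_unlock(&sem->wait_lock);
    return ok;
}

int main(void)
{
    struct toy_rwsem sem = { PTHREAD_MUTEX_INITIALIZER, 0 };

    printf("read : %d\n", toy_down_read_trylock(&sem));    /* 1 */
    printf("read : %d\n", toy_down_read_trylock(&sem));    /* 1 */
    printf("write: %d\n", toy_down_write_trylock(&sem));   /* 0: readers hold it */
    return 0;
}

The real implementation additionally queues waiters and wakes the first writer when the last reader drops ->count back to zero; the sketch keeps only the counter convention.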
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index dacc32142fcc..d6203faf2eb1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -82,9 +82,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
82 sem->count = RWSEM_UNLOCKED_VALUE; 82 sem->count = RWSEM_UNLOCKED_VALUE;
83 raw_spin_lock_init(&sem->wait_lock); 83 raw_spin_lock_init(&sem->wait_lock);
84 INIT_LIST_HEAD(&sem->wait_list); 84 INIT_LIST_HEAD(&sem->wait_list);
85#ifdef CONFIG_SMP 85#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
86 sem->owner = NULL; 86 sem->owner = NULL;
87 sem->osq = NULL; 87 osq_lock_init(&sem->osq);
88#endif 88#endif
89} 89}
90 90
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
262 return false; 262 return false;
263} 263}
264 264
265#ifdef CONFIG_SMP 265#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
266/* 266/*
267 * Try to acquire write lock before the writer has been put on wait queue. 267 * Try to acquire write lock before the writer has been put on wait queue.
268 */ 268 */
@@ -285,10 +285,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
285static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) 285static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
286{ 286{
287 struct task_struct *owner; 287 struct task_struct *owner;
288 bool on_cpu = true; 288 bool on_cpu = false;
289 289
290 if (need_resched()) 290 if (need_resched())
291 return 0; 291 return false;
292 292
293 rcu_read_lock(); 293 rcu_read_lock();
294 owner = ACCESS_ONCE(sem->owner); 294 owner = ACCESS_ONCE(sem->owner);
@@ -297,9 +297,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
297 rcu_read_unlock(); 297 rcu_read_unlock();
298 298
299 /* 299 /*
300 * If sem->owner is not set, the rwsem owner may have 300 * If sem->owner is not set, yet we have just recently entered the
301 * just acquired it and not set the owner yet or the rwsem 301 * slowpath, then there is a possibility reader(s) may have the lock.
302 * has been released. 302 * To be safe, avoid spinning in these situations.
303 */ 303 */
304 return on_cpu; 304 return on_cpu;
305} 305}
@@ -329,7 +329,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
329 if (need_resched()) 329 if (need_resched())
330 break; 330 break;
331 331
332 arch_mutex_cpu_relax(); 332 cpu_relax_lowlatency();
333 } 333 }
334 rcu_read_unlock(); 334 rcu_read_unlock();
335 335
@@ -381,7 +381,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
381 * memory barriers as we'll eventually observe the right 381 * memory barriers as we'll eventually observe the right
382 * values at the cost of a few extra spins. 382 * values at the cost of a few extra spins.
383 */ 383 */
384 arch_mutex_cpu_relax(); 384 cpu_relax_lowlatency();
385 } 385 }
386 osq_unlock(&sem->osq); 386 osq_unlock(&sem->osq);
387done: 387done:
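The rwsem_can_spin_on_owner() change flips the default: if ->owner is NULL after the writer has entered the slowpath, readers may hold the lock, so optimistic spinning is skipped. A plain-C sketch of that decision (the struct and names are illustrative, not the kernel's):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_task { bool on_cpu; };

/*
 * Shape of the decision after the change: default to "do not spin"; only
 * spin when a writer owner is visible and currently running on a CPU.
 */
static bool toy_can_spin(const struct toy_task *owner, bool need_resched_now)
{
    bool on_cpu = false;

    if (need_resched_now)
        return false;
    if (owner)
        on_cpu = owner->on_cpu;
    return on_cpu;
}

int main(void)
{
    struct toy_task writer = { .on_cpu = true };

    printf("%d\n", toy_can_spin(&writer, false));   /* 1: spin on a running writer */
    printf("%d\n", toy_can_spin(NULL, false));      /* 0: maybe readers, don't spin */
    printf("%d\n", toy_can_spin(&writer, true));    /* 0: we should reschedule */
    return 0;
}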
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 42f806de49d4..e2d3bc7f03b4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,7 +12,7 @@
12 12
13#include <linux/atomic.h> 13#include <linux/atomic.h>
14 14
15#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) 15#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
16static inline void rwsem_set_owner(struct rw_semaphore *sem) 16static inline void rwsem_set_owner(struct rw_semaphore *sem)
17{ 17{
18 sem->owner = current; 18 sem->owner = current;
diff --git a/kernel/module.c b/kernel/module.c
index 81e727cf6df9..ae79ce615cb9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -60,7 +60,6 @@
60#include <linux/jump_label.h> 60#include <linux/jump_label.h>
61#include <linux/pfn.h> 61#include <linux/pfn.h>
62#include <linux/bsearch.h> 62#include <linux/bsearch.h>
63#include <linux/fips.h>
64#include <uapi/linux/module.h> 63#include <uapi/linux/module.h>
65#include "module-internal.h" 64#include "module-internal.h"
66 65
@@ -2448,9 +2447,6 @@ static int module_sig_check(struct load_info *info)
2448 } 2447 }
2449 2448
2450 /* Not having a signature is only an error if we're strict. */ 2449 /* Not having a signature is only an error if we're strict. */
2451 if (err < 0 && fips_enabled)
2452 panic("Module verification failed with error %d in FIPS mode\n",
2453 err);
2454 if (err == -ENOKEY && !sig_enforce) 2450 if (err == -ENOKEY && !sig_enforce)
2455 err = 0; 2451 err = 0;
2456 2452
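With the FIPS panic gone, the remaining policy is simply that a missing key is tolerated unless signatures are enforced. A small sketch of that error mapping; sig_enforce here is just a local stand-in for the module.sig_enforce setting, and Linux errno values are assumed:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Mirrors the shape of the check that survives the hunk above. */
static int toy_sig_check_result(int err, bool sig_enforce)
{
    if (err == -ENOKEY && !sig_enforce)
        return 0;       /* unsigned module tolerated */
    return err;
}

int main(void)
{
    printf("%d\n", toy_sig_check_result(-ENOKEY, false));   /* 0 */
    printf("%d\n", toy_sig_check_result(-ENOKEY, true));    /* -ENOKEY */
    printf("%d\n", toy_sig_check_result(-EBADMSG, false));  /* still an error */
    return 0;
}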
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index fcc2611d3f14..a9dfa79b6bab 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -371,7 +371,6 @@ int hibernation_snapshot(int platform_mode)
371 } 371 }
372 372
373 suspend_console(); 373 suspend_console();
374 ftrace_stop();
375 pm_restrict_gfp_mask(); 374 pm_restrict_gfp_mask();
376 375
377 error = dpm_suspend(PMSG_FREEZE); 376 error = dpm_suspend(PMSG_FREEZE);
@@ -397,7 +396,6 @@ int hibernation_snapshot(int platform_mode)
397 if (error || !in_suspend) 396 if (error || !in_suspend)
398 pm_restore_gfp_mask(); 397 pm_restore_gfp_mask();
399 398
400 ftrace_start();
401 resume_console(); 399 resume_console();
402 dpm_complete(msg); 400 dpm_complete(msg);
403 401
@@ -500,7 +498,6 @@ int hibernation_restore(int platform_mode)
500 498
501 pm_prepare_console(); 499 pm_prepare_console();
502 suspend_console(); 500 suspend_console();
503 ftrace_stop();
504 pm_restrict_gfp_mask(); 501 pm_restrict_gfp_mask();
505 error = dpm_suspend_start(PMSG_QUIESCE); 502 error = dpm_suspend_start(PMSG_QUIESCE);
506 if (!error) { 503 if (!error) {
@@ -508,7 +505,6 @@ int hibernation_restore(int platform_mode)
508 dpm_resume_end(PMSG_RECOVER); 505 dpm_resume_end(PMSG_RECOVER);
509 } 506 }
510 pm_restore_gfp_mask(); 507 pm_restore_gfp_mask();
511 ftrace_start();
512 resume_console(); 508 resume_console();
513 pm_restore_console(); 509 pm_restore_console();
514 return error; 510 return error;
@@ -535,7 +531,6 @@ int hibernation_platform_enter(void)
535 531
536 entering_platform_hibernation = true; 532 entering_platform_hibernation = true;
537 suspend_console(); 533 suspend_console();
538 ftrace_stop();
539 error = dpm_suspend_start(PMSG_HIBERNATE); 534 error = dpm_suspend_start(PMSG_HIBERNATE);
540 if (error) { 535 if (error) {
541 if (hibernation_ops->recover) 536 if (hibernation_ops->recover)
@@ -579,7 +574,6 @@ int hibernation_platform_enter(void)
579 Resume_devices: 574 Resume_devices:
580 entering_platform_hibernation = false; 575 entering_platform_hibernation = false;
581 dpm_resume_end(PMSG_RESTORE); 576 dpm_resume_end(PMSG_RESTORE);
582 ftrace_start();
583 resume_console(); 577 resume_console();
584 578
585 Close: 579 Close:
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0ca8d83e2369..4ee194eb524b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -186,6 +186,7 @@ void thaw_processes(void)
186 186
187 printk("Restarting tasks ... "); 187 printk("Restarting tasks ... ");
188 188
189 __usermodehelper_set_disable_depth(UMH_FREEZING);
189 thaw_workqueues(); 190 thaw_workqueues();
190 191
191 read_lock(&tasklist_lock); 192 read_lock(&tasklist_lock);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4dd8822f732a..4b736b4dfa96 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -248,7 +248,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
248 goto Platform_wake; 248 goto Platform_wake;
249 } 249 }
250 250
251 ftrace_stop();
252 error = disable_nonboot_cpus(); 251 error = disable_nonboot_cpus();
253 if (error || suspend_test(TEST_CPUS)) 252 if (error || suspend_test(TEST_CPUS))
254 goto Enable_cpus; 253 goto Enable_cpus;
@@ -275,7 +274,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
275 274
276 Enable_cpus: 275 Enable_cpus:
277 enable_nonboot_cpus(); 276 enable_nonboot_cpus();
278 ftrace_start();
279 277
280 Platform_wake: 278 Platform_wake:
281 if (need_suspend_ops(state) && suspend_ops->wake) 279 if (need_suspend_ops(state) && suspend_ops->wake)
@@ -306,7 +304,7 @@ int suspend_devices_and_enter(suspend_state_t state)
306 error = suspend_ops->begin(state); 304 error = suspend_ops->begin(state);
307 if (error) 305 if (error)
308 goto Close; 306 goto Close;
309 } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) { 307 } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) {
310 error = freeze_ops->begin(); 308 error = freeze_ops->begin();
311 if (error) 309 if (error)
312 goto Close; 310 goto Close;
@@ -335,7 +333,7 @@ int suspend_devices_and_enter(suspend_state_t state)
335 Close: 333 Close:
336 if (need_suspend_ops(state) && suspend_ops->end) 334 if (need_suspend_ops(state) && suspend_ops->end)
337 suspend_ops->end(); 335 suspend_ops->end();
338 else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) 336 else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
339 freeze_ops->end(); 337 freeze_ops->end();
340 338
341 return error; 339 return error;
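The suspend.c hunks add a NULL check on freeze_ops itself, since no freeze operations may be registered at all. A minimal sketch of the same defensive pattern for an optional ops structure (all names are hypothetical):

#include <stdio.h>

/* Optional platform hooks; the whole struct or individual ops may be absent. */
struct toy_freeze_ops {
    int  (*begin)(void);
    void (*end)(void);
};

static struct toy_freeze_ops *toy_freeze_ops;   /* may stay NULL */

static int toy_freeze_begin(void)
{
    /* Mirrors the added "freeze_ops && freeze_ops->begin" guard. */
    if (toy_freeze_ops && toy_freeze_ops->begin)
        return toy_freeze_ops->begin();
    return 0;
}

static int real_begin(void) { return 42; }

int main(void)
{
    struct toy_freeze_ops ops = { .begin = real_begin, .end = NULL };

    printf("no ops : %d\n", toy_freeze_begin());    /* 0: guard short-circuits */
    toy_freeze_ops = &ops;
    printf("with op: %d\n", toy_freeze_begin());    /* 42 */
    return 0;
}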
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index adf98622cb32..54e75226c2c4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,12 +28,6 @@
28#include <linux/compat.h> 28#include <linux/compat.h>
29 29
30 30
31static int ptrace_trapping_sleep_fn(void *flags)
32{
33 schedule();
34 return 0;
35}
36
37/* 31/*
38 * ptrace a task: make the debugger its new parent and 32 * ptrace a task: make the debugger its new parent and
39 * move it to the ptrace list. 33 * move it to the ptrace list.
@@ -371,7 +365,7 @@ unlock_creds:
371out: 365out:
372 if (!retval) { 366 if (!retval) {
373 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, 367 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
374 ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); 368 TASK_UNINTERRUPTIBLE);
375 proc_ptrace_connector(task, PTRACE_ATTACH); 369 proc_ptrace_connector(task, PTRACE_ATTACH);
376 } 370 }
377 371
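The ptrace hunk reflects the simplified wait_on_bit() API: callers now pass only the task state instead of supplying their own sleep callback. A userspace analogue of waiting for a flag to clear, using a mutex/condvar pair in place of the kernel's bit wait-queues (all names are invented; build with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_flag {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    bool set;
};

static void toy_wait_on_flag(struct toy_flag *f)
{
    pthread_mutex_lock(&f->lock);
    while (f->set)                      /* wait until the "trapping" flag clears */
        pthread_cond_wait(&f->cond, &f->lock);
    pthread_mutex_unlock(&f->lock);
}

static void toy_clear_flag(struct toy_flag *f)
{
    pthread_mutex_lock(&f->lock);
    f->set = false;
    pthread_cond_broadcast(&f->cond);
    pthread_mutex_unlock(&f->lock);
}

static void *clearer(void *arg)
{
    toy_clear_flag(arg);
    return NULL;
}

int main(void)
{
    struct toy_flag f = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, true };
    pthread_t t;

    pthread_create(&t, NULL, clearer, &f);
    toy_wait_on_flag(&f);               /* like waiting for JOBCTL_TRAPPING_BIT to clear */
    pthread_join(t, NULL);
    printf("flag cleared\n");
    return 0;
}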
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index bfda2726ca45..ff1a6de62f17 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -99,6 +99,10 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
99 99
100void kfree(const void *); 100void kfree(const void *);
101 101
102/*
103 * Reclaim the specified callback, either by invoking it (non-lazy case)
104 * or freeing it directly (lazy case). Return true if lazy, false otherwise.
105 */
102static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) 106static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
103{ 107{
104 unsigned long offset = (unsigned long)head->func; 108 unsigned long offset = (unsigned long)head->func;
@@ -108,12 +112,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
108 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); 112 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
109 kfree((void *)head - offset); 113 kfree((void *)head - offset);
110 rcu_lock_release(&rcu_callback_map); 114 rcu_lock_release(&rcu_callback_map);
111 return 1; 115 return true;
112 } else { 116 } else {
113 RCU_TRACE(trace_rcu_invoke_callback(rn, head)); 117 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
114 head->func(head); 118 head->func(head);
115 rcu_lock_release(&rcu_callback_map); 119 rcu_lock_release(&rcu_callback_map);
116 return 0; 120 return false;
117 } 121 }
118} 122}
119 123
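The comment added to __rcu_reclaim() documents the lazy/non-lazy split: when ->func holds a small value it is really the offset of the rcu_head inside the enclosing object (the kfree_rcu() case), otherwise it is a real callback to invoke. A userspace model of that encoding; the 4096 threshold mirrors the kernel's small-offset check, and everything prefixed toy_ is invented:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_rcu_head {
    void (*func)(struct toy_rcu_head *);
};

struct toy_obj {
    int payload;
    struct toy_rcu_head rh;             /* offsetof(struct toy_obj, rh) is small */
};

/* Below this value, ->func is treated as an offset (the kfree case). */
#define TOY_OFFSET_LIMIT 4096UL

static bool toy_reclaim(struct toy_rcu_head *head)
{
    uintptr_t offset = (uintptr_t)head->func;

    if (offset < TOY_OFFSET_LIMIT) {
        free((char *)head - offset);    /* lazy: free the enclosing object */
        return true;
    }
    head->func(head);                   /* non-lazy: invoke the callback */
    return false;
}

static void toy_cb(struct toy_rcu_head *head)
{
    printf("callback invoked for %p\n", (void *)head);
}

int main(void)
{
    struct toy_obj *o = malloc(sizeof(*o));
    static struct toy_rcu_head plain;

    if (!o)
        return 1;
    /* Lazy case: stash the embedded offset instead of a real function. */
    o->rh.func = (void (*)(struct toy_rcu_head *))(uintptr_t)offsetof(struct toy_obj, rh);
    printf("lazy=%d\n", toy_reclaim(&o->rh));

    plain.func = toy_cb;                /* non-lazy case */
    printf("lazy=%d\n", toy_reclaim(&plain));
    return 0;
}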
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7fa34f86e5ba..948a7693748e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -18,7 +18,7 @@
18 * Copyright (C) IBM Corporation, 2005, 2006 18 * Copyright (C) IBM Corporation, 2005, 2006
19 * 19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com> 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * Josh Triplett <josh@freedesktop.org> 21 * Josh Triplett <josh@joshtriplett.org>
22 * 22 *
23 * See also: Documentation/RCU/torture.txt 23 * See also: Documentation/RCU/torture.txt
24 */ 24 */
@@ -51,7 +51,7 @@
51#include <linux/torture.h> 51#include <linux/torture.h>
52 52
53MODULE_LICENSE("GPL"); 53MODULE_LICENSE("GPL");
54MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 54MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
55 55
56 56
57torture_param(int, fqs_duration, 0, 57torture_param(int, fqs_duration, 0,
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index c639556f3fa0..e037f3eb2f7b 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -298,9 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
298 298
299 idx = ACCESS_ONCE(sp->completed) & 0x1; 299 idx = ACCESS_ONCE(sp->completed) & 0x1;
300 preempt_disable(); 300 preempt_disable();
301 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; 301 __this_cpu_inc(sp->per_cpu_ref->c[idx]);
302 smp_mb(); /* B */ /* Avoid leaking the critical section. */ 302 smp_mb(); /* B */ /* Avoid leaking the critical section. */
303 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; 303 __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
304 preempt_enable(); 304 preempt_enable();
305 return idx; 305 return idx;
306} 306}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f1ba77363fbb..1b70cb6fbe3c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu)
206 rdp->passed_quiesce = 1; 206 rdp->passed_quiesce = 1;
207} 207}
208 208
209static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
210
211static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
212 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
213 .dynticks = ATOMIC_INIT(1),
214#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
215 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
216 .dynticks_idle = ATOMIC_INIT(1),
217#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
218};
219
220/*
221 * Let the RCU core know that this CPU has gone through the scheduler,
222 * which is a quiescent state. This is called when the need for a
223 * quiescent state is urgent, so we burn an atomic operation and full
224 * memory barriers to let the RCU core know about it, regardless of what
225 * this CPU might (or might not) do in the near future.
226 *
227 * We inform the RCU core by emulating a zero-duration dyntick-idle
228 * period, which we in turn do by incrementing the ->dynticks counter
229 * by two.
230 */
231static void rcu_momentary_dyntick_idle(void)
232{
233 unsigned long flags;
234 struct rcu_data *rdp;
235 struct rcu_dynticks *rdtp;
236 int resched_mask;
237 struct rcu_state *rsp;
238
239 local_irq_save(flags);
240
241 /*
242 * Yes, we can lose flag-setting operations. This is OK, because
243 * the flag will be set again after some delay.
244 */
245 resched_mask = raw_cpu_read(rcu_sched_qs_mask);
246 raw_cpu_write(rcu_sched_qs_mask, 0);
247
248 /* Find the flavor that needs a quiescent state. */
249 for_each_rcu_flavor(rsp) {
250 rdp = raw_cpu_ptr(rsp->rda);
251 if (!(resched_mask & rsp->flavor_mask))
252 continue;
253 smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
254 if (ACCESS_ONCE(rdp->mynode->completed) !=
255 ACCESS_ONCE(rdp->cond_resched_completed))
256 continue;
257
258 /*
259 * Pretend to be momentarily idle for the quiescent state.
260 * This allows the grace-period kthread to record the
261 * quiescent state, with no need for this CPU to do anything
262 * further.
263 */
264 rdtp = this_cpu_ptr(&rcu_dynticks);
265 smp_mb__before_atomic(); /* Earlier stuff before QS. */
266 atomic_add(2, &rdtp->dynticks); /* QS. */
267 smp_mb__after_atomic(); /* Later stuff after QS. */
268 break;
269 }
270 local_irq_restore(flags);
271}
272
209/* 273/*
210 * Note a context switch. This is a quiescent state for RCU-sched, 274 * Note a context switch. This is a quiescent state for RCU-sched,
211 * and requires special handling for preemptible RCU. 275 * and requires special handling for preemptible RCU.
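rcu_momentary_dyntick_idle() works because the grace-period machinery snapshots each CPU's ->dynticks counter and treats an even value, or a counter that has advanced by at least two, as evidence of a quiescent state; adding two keeps the counter odd ("not idle") while still moving it on. A toy model of that check, ignoring the kernel's surrounding memory barriers and per-CPU plumbing:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint dynticks = 1;        /* odd: CPU considered non-idle */

static unsigned int snapshot(void)
{
    return atomic_load(&dynticks);
}

static bool passed_quiescent_state(unsigned int snap)
{
    unsigned int curr = atomic_load(&dynticks);

    /* Even = idle right now; advanced by >= 2 = passed through idle meanwhile. */
    return (curr & 1) == 0 || (curr - snap) >= 2;
}

static void momentary_dyntick_idle(void)
{
    atomic_fetch_add(&dynticks, 2);     /* emulate a zero-duration idle period */
}

int main(void)
{
    unsigned int snap = snapshot();

    printf("before: qs=%d\n", passed_quiescent_state(snap));    /* 0 */
    momentary_dyntick_idle();
    printf("after : qs=%d\n", passed_quiescent_state(snap));    /* 1 */
    return 0;
}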
@@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu)
216 trace_rcu_utilization(TPS("Start context switch")); 280 trace_rcu_utilization(TPS("Start context switch"));
217 rcu_sched_qs(cpu); 281 rcu_sched_qs(cpu);
218 rcu_preempt_note_context_switch(cpu); 282 rcu_preempt_note_context_switch(cpu);
283 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
284 rcu_momentary_dyntick_idle();
219 trace_rcu_utilization(TPS("End context switch")); 285 trace_rcu_utilization(TPS("End context switch"));
220} 286}
221EXPORT_SYMBOL_GPL(rcu_note_context_switch); 287EXPORT_SYMBOL_GPL(rcu_note_context_switch);
222 288
223static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
224 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
225 .dynticks = ATOMIC_INIT(1),
226#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
227 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
228 .dynticks_idle = ATOMIC_INIT(1),
229#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
230};
231
232static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 289static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
233static long qhimark = 10000; /* If this many pending, ignore blimit. */ 290static long qhimark = 10000; /* If this many pending, ignore blimit. */
234static long qlowmark = 100; /* Once only this many pending, use blimit. */ 291static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -243,6 +300,13 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
243module_param(jiffies_till_first_fqs, ulong, 0644); 300module_param(jiffies_till_first_fqs, ulong, 0644);
244module_param(jiffies_till_next_fqs, ulong, 0644); 301module_param(jiffies_till_next_fqs, ulong, 0644);
245 302
303/*
304 * How long the grace period must be before we start recruiting
305 * quiescent-state help from rcu_note_context_switch().
306 */
307static ulong jiffies_till_sched_qs = HZ / 20;
308module_param(jiffies_till_sched_qs, ulong, 0644);
309
246static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 310static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
247 struct rcu_data *rdp); 311 struct rcu_data *rdp);
248static void force_qs_rnp(struct rcu_state *rsp, 312static void force_qs_rnp(struct rcu_state *rsp,
@@ -853,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
853 bool *isidle, unsigned long *maxj) 917 bool *isidle, unsigned long *maxj)
854{ 918{
855 unsigned int curr; 919 unsigned int curr;
920 int *rcrmp;
856 unsigned int snap; 921 unsigned int snap;
857 922
858 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); 923 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
@@ -893,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
893 } 958 }
894 959
895 /* 960 /*
896 * There is a possibility that a CPU in adaptive-ticks state 961 * A CPU running for an extended time within the kernel can
897 * might run in the kernel with the scheduling-clock tick disabled 962 * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
898 * for an extended time period. Invoke rcu_kick_nohz_cpu() to 963 * even context-switching back and forth between a pair of
899 * force the CPU to restart the scheduling-clock tick in this 964 * in-kernel CPU-bound tasks cannot advance grace periods.
900 * CPU is in this state. 965 * So if the grace period is old enough, make the CPU pay attention.
901 */ 966 * Note that the unsynchronized assignments to the per-CPU
902 rcu_kick_nohz_cpu(rdp->cpu); 967 * rcu_sched_qs_mask variable are safe. Yes, setting of
903 968 * bits can be lost, but they will be set again on the next
904 /* 969 * force-quiescent-state pass. So lost bit sets do not result
905 * Alternatively, the CPU might be running in the kernel 970 * in incorrect behavior, merely in a grace period lasting
906 * for an extended period of time without a quiescent state. 971 * a few jiffies longer than it might otherwise. Because
907 * Attempt to force the CPU through the scheduler to gain the 972 * there are at most four threads involved, and because the
908 * needed quiescent state, but only if the grace period has gone 973 * updates are only once every few jiffies, the probability of
909 * on for an uncommonly long time. If there are many stuck CPUs, 974 * lossage (and thus of slight grace-period extension) is
910 * we will beat on the first one until it gets unstuck, then move 975 * quite low.
911 * to the next. Only do this for the primary flavor of RCU. 976 *
977 * Note that if the jiffies_till_sched_qs boot/sysfs parameter
978 * is set too high, we override with half of the RCU CPU stall
979 * warning delay.
912 */ 980 */
913 if (rdp->rsp == rcu_state_p && 981 rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
982 if (ULONG_CMP_GE(jiffies,
983 rdp->rsp->gp_start + jiffies_till_sched_qs) ||
914 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { 984 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
915 rdp->rsp->jiffies_resched += 5; 985 if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
916 resched_cpu(rdp->cpu); 986 ACCESS_ONCE(rdp->cond_resched_completed) =
987 ACCESS_ONCE(rdp->mynode->completed);
988 smp_mb(); /* ->cond_resched_completed before *rcrmp. */
989 ACCESS_ONCE(*rcrmp) =
990 ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
991 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
992 rdp->rsp->jiffies_resched += 5; /* Enable beating. */
993 } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
994 /* Time to beat on that CPU again! */
995 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
996 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
997 }
917 } 998 }
918 999
919 return 0; 1000 return 0;
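The comment block above describes a two-stage policy: once a grace period is older than jiffies_till_sched_qs, set the per-CPU help bit so that the next context switch reports a quiescent state, and after that keep nudging the CPU into the scheduler every few jiffies. A much-simplified, single-CPU sketch of that decision; all names are local to the sketch, and the 30-tick seed for next_resched merely stands in for the stall-derived jiffies_resched value:

#include <stdbool.h>
#include <stdio.h>

#define TOY_HZ              100UL
#define TOY_JIFFIES_TILL_QS (TOY_HZ / 20)

struct toy_cpu_state {
    bool help_requested;            /* stands in for the rcu_sched_qs_mask bit */
    unsigned long next_resched;     /* stands in for rsp->jiffies_resched */
};

/* Returns true when the CPU should be forced through the scheduler now. */
static bool toy_check_cpu(struct toy_cpu_state *cs,
                          unsigned long now, unsigned long gp_start)
{
    if (now < gp_start + TOY_JIFFIES_TILL_QS && now < cs->next_resched)
        return false;               /* grace period still young: leave CPU alone */

    if (!cs->help_requested) {
        cs->help_requested = true;  /* ask the next context switch for a QS */
        cs->next_resched = now + 5;
        return true;
    }
    if (now >= cs->next_resched) {
        cs->next_resched = now + 5; /* time to beat on that CPU again */
        return true;
    }
    return false;
}

int main(void)
{
    unsigned long gp_start = 1000;
    struct toy_cpu_state cs = { false, gp_start + 30 };

    printf("%d\n", toy_check_cpu(&cs, 1002, gp_start));    /* 0: too early */
    printf("%d\n", toy_check_cpu(&cs, 1006, gp_start));    /* 1: request help */
    printf("%d\n", toy_check_cpu(&cs, 1008, gp_start));    /* 0: within backoff */
    printf("%d\n", toy_check_cpu(&cs, 1012, gp_start));    /* 1: beat again */
    return 0;
}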
@@ -932,10 +1013,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
932} 1013}
933 1014
934/* 1015/*
935 * Dump stacks of all tasks running on stalled CPUs. This is a fallback 1016 * Dump stacks of all tasks running on stalled CPUs.
936 * for architectures that do not implement trigger_all_cpu_backtrace().
937 * The NMI-triggered stack traces are more accurate because they are
938 * printed by the target CPU.
939 */ 1017 */
940static void rcu_dump_cpu_stacks(struct rcu_state *rsp) 1018static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
941{ 1019{
@@ -1013,7 +1091,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
1013 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1091 (long)rsp->gpnum, (long)rsp->completed, totqlen);
1014 if (ndetected == 0) 1092 if (ndetected == 0)
1015 pr_err("INFO: Stall ended before state dump start\n"); 1093 pr_err("INFO: Stall ended before state dump start\n");
1016 else if (!trigger_all_cpu_backtrace()) 1094 else
1017 rcu_dump_cpu_stacks(rsp); 1095 rcu_dump_cpu_stacks(rsp);
1018 1096
1019 /* Complain about tasks blocking the grace period. */ 1097 /* Complain about tasks blocking the grace period. */
@@ -1044,8 +1122,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
1044 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", 1122 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
1045 jiffies - rsp->gp_start, 1123 jiffies - rsp->gp_start,
1046 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1124 (long)rsp->gpnum, (long)rsp->completed, totqlen);
1047 if (!trigger_all_cpu_backtrace()) 1125 rcu_dump_cpu_stacks(rsp);
1048 dump_stack();
1049 1126
1050 raw_spin_lock_irqsave(&rnp->lock, flags); 1127 raw_spin_lock_irqsave(&rnp->lock, flags);
1051 if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) 1128 if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
@@ -1224,10 +1301,16 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1224 * believe that a grace period is in progress, then we must wait 1301 * believe that a grace period is in progress, then we must wait
1225 * for the one following, which is in "c". Because our request 1302 * for the one following, which is in "c". Because our request
1226 * will be noticed at the end of the current grace period, we don't 1303 * will be noticed at the end of the current grace period, we don't
1227 * need to explicitly start one. 1304 * need to explicitly start one. We only do the lockless check
1305 * of rnp_root's fields if the current rcu_node structure thinks
1306 * there is no grace period in flight, and because we hold rnp->lock,
1307 * the only possible change is when rnp_root's two fields are
1308 * equal, in which case rnp_root->gpnum might be concurrently
1309 * incremented. But that is OK, as it will just result in our
1310 * doing some extra useless work.
1228 */ 1311 */
1229 if (rnp->gpnum != rnp->completed || 1312 if (rnp->gpnum != rnp->completed ||
1230 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { 1313 ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
1231 rnp->need_future_gp[c & 0x1]++; 1314 rnp->need_future_gp[c & 0x1]++;
1232 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); 1315 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
1233 goto out; 1316 goto out;
@@ -1564,11 +1647,6 @@ static int rcu_gp_init(struct rcu_state *rsp)
1564 rnp->level, rnp->grplo, 1647 rnp->level, rnp->grplo,
1565 rnp->grphi, rnp->qsmask); 1648 rnp->grphi, rnp->qsmask);
1566 raw_spin_unlock_irq(&rnp->lock); 1649 raw_spin_unlock_irq(&rnp->lock);
1567#ifdef CONFIG_PROVE_RCU_DELAY
1568 if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
1569 system_state == SYSTEM_RUNNING)
1570 udelay(200);
1571#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1572 cond_resched(); 1650 cond_resched();
1573 } 1651 }
1574 1652
@@ -2266,7 +2344,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2266 } 2344 }
2267 smp_mb(); /* List handling before counting for rcu_barrier(). */ 2345 smp_mb(); /* List handling before counting for rcu_barrier(). */
2268 rdp->qlen_lazy -= count_lazy; 2346 rdp->qlen_lazy -= count_lazy;
2269 ACCESS_ONCE(rdp->qlen) -= count; 2347 ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
2270 rdp->n_cbs_invoked += count; 2348 rdp->n_cbs_invoked += count;
2271 2349
2272 /* Reinstate batch limit if we have worked down the excess. */ 2350 /* Reinstate batch limit if we have worked down the excess. */
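Several hunks in this file rewrite ACCESS_ONCE(x)++ and ACCESS_ONCE(x) -= n as a plain read plus a single marked store, so only the store goes through the volatile cast. A userspace copy of the idiom: TOY_ACCESS_ONCE mirrors the kernel macro and relies on the GCC-style __typeof__ extension, and the counter here is only written from one context, which is what makes the unmarked read acceptable.

#include <stdio.h>

#define TOY_ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static unsigned long qlen;

static void enqueue_one(void)
{
    TOY_ACCESS_ONCE(qlen) = qlen + 1;   /* single marked store, plain read */
}

static void dequeue_many(unsigned long count)
{
    TOY_ACCESS_ONCE(qlen) = qlen - count;
}

int main(void)
{
    enqueue_one();
    enqueue_one();
    enqueue_one();
    dequeue_many(2);
    printf("qlen=%lu\n", qlen);         /* 1 */
    return 0;
}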
@@ -2404,14 +2482,14 @@ static void force_quiescent_state(struct rcu_state *rsp)
2404 struct rcu_node *rnp_old = NULL; 2482 struct rcu_node *rnp_old = NULL;
2405 2483
2406 /* Funnel through hierarchy to reduce memory contention. */ 2484 /* Funnel through hierarchy to reduce memory contention. */
2407 rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; 2485 rnp = __this_cpu_read(rsp->rda->mynode);
2408 for (; rnp != NULL; rnp = rnp->parent) { 2486 for (; rnp != NULL; rnp = rnp->parent) {
2409 ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || 2487 ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
2410 !raw_spin_trylock(&rnp->fqslock); 2488 !raw_spin_trylock(&rnp->fqslock);
2411 if (rnp_old != NULL) 2489 if (rnp_old != NULL)
2412 raw_spin_unlock(&rnp_old->fqslock); 2490 raw_spin_unlock(&rnp_old->fqslock);
2413 if (ret) { 2491 if (ret) {
2414 ACCESS_ONCE(rsp->n_force_qs_lh)++; 2492 rsp->n_force_qs_lh++;
2415 return; 2493 return;
2416 } 2494 }
2417 rnp_old = rnp; 2495 rnp_old = rnp;
@@ -2423,7 +2501,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2423 smp_mb__after_unlock_lock(); 2501 smp_mb__after_unlock_lock();
2424 raw_spin_unlock(&rnp_old->fqslock); 2502 raw_spin_unlock(&rnp_old->fqslock);
2425 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2503 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
2426 ACCESS_ONCE(rsp->n_force_qs_lh)++; 2504 rsp->n_force_qs_lh++;
2427 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2505 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2428 return; /* Someone beat us to it. */ 2506 return; /* Someone beat us to it. */
2429 } 2507 }
@@ -2581,7 +2659,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2581 unsigned long flags; 2659 unsigned long flags;
2582 struct rcu_data *rdp; 2660 struct rcu_data *rdp;
2583 2661
2584 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ 2662 WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
2585 if (debug_rcu_head_queue(head)) { 2663 if (debug_rcu_head_queue(head)) {
2586 /* Probable double call_rcu(), so leak the callback. */ 2664 /* Probable double call_rcu(), so leak the callback. */
2587 ACCESS_ONCE(head->func) = rcu_leak_callback; 2665 ACCESS_ONCE(head->func) = rcu_leak_callback;
@@ -2612,7 +2690,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2612 local_irq_restore(flags); 2690 local_irq_restore(flags);
2613 return; 2691 return;
2614 } 2692 }
2615 ACCESS_ONCE(rdp->qlen)++; 2693 ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
2616 if (lazy) 2694 if (lazy)
2617 rdp->qlen_lazy++; 2695 rdp->qlen_lazy++;
2618 else 2696 else
@@ -3176,7 +3254,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3176 * ACCESS_ONCE() to prevent the compiler from speculating 3254 * ACCESS_ONCE() to prevent the compiler from speculating
3177 * the increment to precede the early-exit check. 3255 * the increment to precede the early-exit check.
3178 */ 3256 */
3179 ACCESS_ONCE(rsp->n_barrier_done)++; 3257 ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
3180 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); 3258 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
3181 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); 3259 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
3182 smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ 3260 smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
@@ -3226,7 +3304,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3226 3304
3227 /* Increment ->n_barrier_done to prevent duplicate work. */ 3305 /* Increment ->n_barrier_done to prevent duplicate work. */
3228 smp_mb(); /* Keep increment after above mechanism. */ 3306 smp_mb(); /* Keep increment after above mechanism. */
3229 ACCESS_ONCE(rsp->n_barrier_done)++; 3307 ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
3230 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); 3308 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
3231 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); 3309 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
3232 smp_mb(); /* Keep increment before caller's subsequent code. */ 3310 smp_mb(); /* Keep increment before caller's subsequent code. */
@@ -3483,14 +3561,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
3483static void __init rcu_init_one(struct rcu_state *rsp, 3561static void __init rcu_init_one(struct rcu_state *rsp,
3484 struct rcu_data __percpu *rda) 3562 struct rcu_data __percpu *rda)
3485{ 3563{
3486 static char *buf[] = { "rcu_node_0", 3564 static const char * const buf[] = {
3487 "rcu_node_1", 3565 "rcu_node_0",
3488 "rcu_node_2", 3566 "rcu_node_1",
3489 "rcu_node_3" }; /* Match MAX_RCU_LVLS */ 3567 "rcu_node_2",
3490 static char *fqs[] = { "rcu_node_fqs_0", 3568 "rcu_node_3" }; /* Match MAX_RCU_LVLS */
3491 "rcu_node_fqs_1", 3569 static const char * const fqs[] = {
3492 "rcu_node_fqs_2", 3570 "rcu_node_fqs_0",
3493 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ 3571 "rcu_node_fqs_1",
3572 "rcu_node_fqs_2",
3573 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
3574 static u8 fl_mask = 0x1;
3494 int cpustride = 1; 3575 int cpustride = 1;
3495 int i; 3576 int i;
3496 int j; 3577 int j;
@@ -3509,6 +3590,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3509 for (i = 1; i < rcu_num_lvls; i++) 3590 for (i = 1; i < rcu_num_lvls; i++)
3510 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 3591 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
3511 rcu_init_levelspread(rsp); 3592 rcu_init_levelspread(rsp);
3593 rsp->flavor_mask = fl_mask;
3594 fl_mask <<= 1;
3512 3595
3513 /* Initialize the elements themselves, starting from the leaves. */ 3596 /* Initialize the elements themselves, starting from the leaves. */
3514 3597
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bf2c1e669691..71e64c718f75 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -172,6 +172,14 @@ struct rcu_node {
172 /* queued on this rcu_node structure that */ 172 /* queued on this rcu_node structure that */
173 /* are blocking the current grace period, */ 173 /* are blocking the current grace period, */
174 /* there can be no such task. */ 174 /* there can be no such task. */
175 struct completion boost_completion;
176 /* Used to ensure that the rt_mutex used */
177 /* to carry out the boosting is fully */
178 /* released with no future boostee accesses */
179 /* before that rt_mutex is re-initialized. */
180 struct rt_mutex boost_mtx;
181 /* Used only for the priority-boosting */
182 /* side effect, not as a lock. */
175 unsigned long boost_time; 183 unsigned long boost_time;
176 /* When to start boosting (jiffies). */ 184 /* When to start boosting (jiffies). */
177 struct task_struct *boost_kthread_task; 185 struct task_struct *boost_kthread_task;
@@ -307,6 +315,9 @@ struct rcu_data {
307 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 315 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
308 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 316 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
309 unsigned long offline_fqs; /* Kicked due to being offline. */ 317 unsigned long offline_fqs; /* Kicked due to being offline. */
318 unsigned long cond_resched_completed;
319 /* Grace period that needs help */
320 /* from cond_resched(). */
310 321
311 /* 5) __rcu_pending() statistics. */ 322 /* 5) __rcu_pending() statistics. */
312 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 323 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -331,11 +342,29 @@ struct rcu_data {
331 struct rcu_head **nocb_tail; 342 struct rcu_head **nocb_tail;
332 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ 343 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
333 atomic_long_t nocb_q_count_lazy; /* (approximate). */ 344 atomic_long_t nocb_q_count_lazy; /* (approximate). */
345 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
346 struct rcu_head **nocb_follower_tail;
347 atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
348 atomic_long_t nocb_follower_count_lazy; /* (approximate). */
334 int nocb_p_count; /* # CBs being invoked by kthread */ 349 int nocb_p_count; /* # CBs being invoked by kthread */
335 int nocb_p_count_lazy; /* (approximate). */ 350 int nocb_p_count_lazy; /* (approximate). */
336 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 351 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
337 struct task_struct *nocb_kthread; 352 struct task_struct *nocb_kthread;
338 bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 353 bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
354
355 /* The following fields are used by the leader, hence own cacheline. */
356 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
357 /* CBs waiting for GP. */
358 struct rcu_head **nocb_gp_tail;
359 long nocb_gp_count;
360 long nocb_gp_count_lazy;
361 bool nocb_leader_wake; /* Is the nocb leader thread awake? */
362 struct rcu_data *nocb_next_follower;
363 /* Next follower in wakeup chain. */
364
365 /* The following fields are used by the follower, hence new cachline. */
 365 /* The following fields are used by the follower, hence new cacheline. */
366 struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp;
367 /* Leader CPU takes GP-end wakeups. */
339#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 368#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
340 369
341 /* 8) RCU CPU stall data. */ 370 /* 8) RCU CPU stall data. */
@@ -392,6 +421,7 @@ struct rcu_state {
392 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ 421 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
393 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 422 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
394 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 423 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
424 u8 flavor_mask; /* bit in flavor mask. */
395 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 425 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
396 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 426 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
397 void (*func)(struct rcu_head *head)); 427 void (*func)(struct rcu_head *head));
@@ -563,7 +593,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
563static void do_nocb_deferred_wakeup(struct rcu_data *rdp); 593static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
564static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 594static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
565static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 595static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
566static void rcu_kick_nohz_cpu(int cpu); 596static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
567static bool init_nocb_callback_list(struct rcu_data *rdp); 597static bool init_nocb_callback_list(struct rcu_data *rdp);
568static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); 598static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
569static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); 599static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
@@ -583,8 +613,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
583/* Sum up queue lengths for tracing. */ 613/* Sum up queue lengths for tracing. */
584static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) 614static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
585{ 615{
586 *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; 616 *ql = atomic_long_read(&rdp->nocb_q_count) +
587 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; 617 rdp->nocb_p_count +
618 atomic_long_read(&rdp->nocb_follower_count) +
619 rdp->nocb_p_count + rdp->nocb_gp_count;
620 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
621 rdp->nocb_p_count_lazy +
622 atomic_long_read(&rdp->nocb_follower_count_lazy) +
623 rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
588} 624}
589#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 625#else /* #ifdef CONFIG_RCU_NOCB_CPU */
590static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) 626static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index cbc2c45265e2..00dc411e9676 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -33,6 +33,7 @@
33#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
34 34
35#ifdef CONFIG_RCU_BOOST 35#ifdef CONFIG_RCU_BOOST
36#include "../locking/rtmutex_common.h"
36#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO 37#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
37#else 38#else
38#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO 39#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
@@ -336,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t)
336 unsigned long flags; 337 unsigned long flags;
337 struct list_head *np; 338 struct list_head *np;
338#ifdef CONFIG_RCU_BOOST 339#ifdef CONFIG_RCU_BOOST
339 struct rt_mutex *rbmp = NULL; 340 bool drop_boost_mutex = false;
340#endif /* #ifdef CONFIG_RCU_BOOST */ 341#endif /* #ifdef CONFIG_RCU_BOOST */
341 struct rcu_node *rnp; 342 struct rcu_node *rnp;
342 int special; 343 int special;
@@ -398,11 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t)
398#ifdef CONFIG_RCU_BOOST 399#ifdef CONFIG_RCU_BOOST
399 if (&t->rcu_node_entry == rnp->boost_tasks) 400 if (&t->rcu_node_entry == rnp->boost_tasks)
400 rnp->boost_tasks = np; 401 rnp->boost_tasks = np;
401 /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ 402 /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
402 if (t->rcu_boost_mutex) { 403 drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
403 rbmp = t->rcu_boost_mutex;
404 t->rcu_boost_mutex = NULL;
405 }
406#endif /* #ifdef CONFIG_RCU_BOOST */ 404#endif /* #ifdef CONFIG_RCU_BOOST */
407 405
408 /* 406 /*
@@ -427,8 +425,10 @@ void rcu_read_unlock_special(struct task_struct *t)
427 425
428#ifdef CONFIG_RCU_BOOST 426#ifdef CONFIG_RCU_BOOST
429 /* Unboost if we were boosted. */ 427 /* Unboost if we were boosted. */
430 if (rbmp) 428 if (drop_boost_mutex) {
431 rt_mutex_unlock(rbmp); 429 rt_mutex_unlock(&rnp->boost_mtx);
430 complete(&rnp->boost_completion);
431 }
432#endif /* #ifdef CONFIG_RCU_BOOST */ 432#endif /* #ifdef CONFIG_RCU_BOOST */
433 433
434 /* 434 /*
@@ -988,6 +988,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
988 988
989/* Because preemptible RCU does not exist, no quieting of tasks. */ 989/* Because preemptible RCU does not exist, no quieting of tasks. */
990static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 990static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
991 __releases(rnp->lock)
991{ 992{
992 raw_spin_unlock_irqrestore(&rnp->lock, flags); 993 raw_spin_unlock_irqrestore(&rnp->lock, flags);
993} 994}
@@ -1149,7 +1150,6 @@ static void rcu_wake_cond(struct task_struct *t, int status)
1149static int rcu_boost(struct rcu_node *rnp) 1150static int rcu_boost(struct rcu_node *rnp)
1150{ 1151{
1151 unsigned long flags; 1152 unsigned long flags;
1152 struct rt_mutex mtx;
1153 struct task_struct *t; 1153 struct task_struct *t;
1154 struct list_head *tb; 1154 struct list_head *tb;
1155 1155
@@ -1200,11 +1200,15 @@ static int rcu_boost(struct rcu_node *rnp)
1200 * section. 1200 * section.
1201 */ 1201 */
1202 t = container_of(tb, struct task_struct, rcu_node_entry); 1202 t = container_of(tb, struct task_struct, rcu_node_entry);
1203 rt_mutex_init_proxy_locked(&mtx, t); 1203 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
1204 t->rcu_boost_mutex = &mtx; 1204 init_completion(&rnp->boost_completion);
1205 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1205 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1206 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1206 /* Lock only for side effect: boosts task t's priority. */
1207 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1207 rt_mutex_lock(&rnp->boost_mtx);
1208 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
1209
1210 /* Wait for boostee to be done w/boost_mtx before reinitializing. */
1211 wait_for_completion(&rnp->boost_completion);
1208 1212
1209 return ACCESS_ONCE(rnp->exp_tasks) != NULL || 1213 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1210 ACCESS_ONCE(rnp->boost_tasks) != NULL; 1214 ACCESS_ONCE(rnp->boost_tasks) != NULL;
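The rewritten rcu_boost() keeps the rt_mutex embedded in the rcu_node structure, so it must not reinitialize it for the next boost cycle until the boosted reader has completely finished with it; that is what the new boost_completion provides. A pthread sketch of the same "hand off an embedded object, wait for a completion before reusing it" pattern, with a condvar-based completion standing in for struct completion and every name invented for illustration:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_completion {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    bool done;
};

static struct toy_completion boost_completion = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false
};
static int boost_mtx;                   /* placeholder for the shared rt_mutex */

static void toy_complete(struct toy_completion *c)
{
    pthread_mutex_lock(&c->lock);
    c->done = true;
    pthread_cond_signal(&c->cond);
    pthread_mutex_unlock(&c->lock);
}

static void toy_wait_for_completion(struct toy_completion *c)
{
    pthread_mutex_lock(&c->lock);
    while (!c->done)
        pthread_cond_wait(&c->cond, &c->lock);
    pthread_mutex_unlock(&c->lock);
}

static void *boostee(void *arg)
{
    boost_mtx = 0;                      /* last touch of the shared object ("unlock") */
    toy_complete(&boost_completion);
    return arg;
}

int main(void)
{
    pthread_t t;

    boost_mtx = 1;                      /* "proxy locked" on behalf of the boostee */
    pthread_create(&t, NULL, boostee, NULL);
    toy_wait_for_completion(&boost_completion);
    boost_mtx = -1;                     /* now safe to reinitialize for the next boost */
    pthread_join(t, NULL);
    printf("boost_mtx=%d\n", boost_mtx);
    return 0;
}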
@@ -1256,6 +1260,7 @@ static int rcu_boost_kthread(void *arg)
1256 * about it going away. 1260 * about it going away.
1257 */ 1261 */
1258static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1262static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1263 __releases(rnp->lock)
1259{ 1264{
1260 struct task_struct *t; 1265 struct task_struct *t;
1261 1266
@@ -1491,6 +1496,7 @@ static void rcu_prepare_kthreads(int cpu)
1491#else /* #ifdef CONFIG_RCU_BOOST */ 1496#else /* #ifdef CONFIG_RCU_BOOST */
1492 1497
1493static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1498static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1499 __releases(rnp->lock)
1494{ 1500{
1495 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1501 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1496} 1502}
@@ -2060,6 +2066,22 @@ bool rcu_is_nocb_cpu(int cpu)
2060#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 2066#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
2061 2067
2062/* 2068/*
2069 * Kick the leader kthread for this NOCB group.
2070 */
2071static void wake_nocb_leader(struct rcu_data *rdp, bool force)
2072{
2073 struct rcu_data *rdp_leader = rdp->nocb_leader;
2074
2075 if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
2076 return;
2077 if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) {
2078 /* Prior xchg orders against prior callback enqueue. */
2079 ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true;
2080 wake_up(&rdp_leader->nocb_wq);
2081 }
2082}
2083
2084/*
2063 * Enqueue the specified string of rcu_head structures onto the specified 2085 * Enqueue the specified string of rcu_head structures onto the specified
2064 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the 2086 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2065 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy 2087 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
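wake_nocb_leader() above only wakes the group leader when its nocb_leader_wake flag is not already set (or when the caller forces it), which keeps a burst of callback enqueues from turning into a storm of wakeups. A userspace sketch of that handshake, with a mutex/condvar doing the ordering work that ACCESS_ONCE and the kernel wait queue provide; names are invented, build with -pthread:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool leader_wake;
static int wakeups;

static void toy_wake_leader(bool force)
{
    pthread_mutex_lock(&lock);
    if (!leader_wake || force) {
        leader_wake = true;
        wakeups++;
        pthread_cond_signal(&cond);
    }
    pthread_mutex_unlock(&lock);
}

static void *leader(void *arg)
{
    pthread_mutex_lock(&lock);
    while (!leader_wake)
        pthread_cond_wait(&cond, &lock);
    leader_wake = false;                /* consume the wakeup, go process callbacks */
    pthread_mutex_unlock(&lock);
    return arg;
}

int main(void)
{
    pthread_t t;

    toy_wake_leader(false);             /* first enqueue: records a wakeup */
    toy_wake_leader(false);             /* leader already marked awake: suppressed */
    pthread_create(&t, NULL, leader, NULL);
    pthread_join(t, NULL);
    printf("wakeups=%d\n", wakeups);    /* 1 */
    return 0;
}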
@@ -2093,7 +2115,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2093 len = atomic_long_read(&rdp->nocb_q_count); 2115 len = atomic_long_read(&rdp->nocb_q_count);
2094 if (old_rhpp == &rdp->nocb_head) { 2116 if (old_rhpp == &rdp->nocb_head) {
2095 if (!irqs_disabled_flags(flags)) { 2117 if (!irqs_disabled_flags(flags)) {
2096 wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */ 2118 /* ... if queue was empty ... */
2119 wake_nocb_leader(rdp, false);
2097 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2120 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2098 TPS("WakeEmpty")); 2121 TPS("WakeEmpty"));
2099 } else { 2122 } else {
@@ -2103,7 +2126,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2103 } 2126 }
2104 rdp->qlen_last_fqs_check = 0; 2127 rdp->qlen_last_fqs_check = 0;
2105 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2106 wake_up_process(t); /* ... or if many callbacks queued. */ 2129 /* ... or if many callbacks queued. */
2130 wake_nocb_leader(rdp, true);
2107 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2131 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2108 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); 2132 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
2109 } else { 2133 } else {
@@ -2213,13 +2237,150 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2213} 2237}
2214 2238
2215/* 2239/*
2240 * Leaders come here to wait for additional callbacks to show up.
2241 * This function does not return until callbacks appear.
2242 */
2243static void nocb_leader_wait(struct rcu_data *my_rdp)
2244{
2245 bool firsttime = true;
2246 bool gotcbs;
2247 struct rcu_data *rdp;
2248 struct rcu_head **tail;
2249
2250wait_again:
2251
2252 /* Wait for callbacks to appear. */
2253 if (!rcu_nocb_poll) {
2254 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
2255 wait_event_interruptible(my_rdp->nocb_wq,
2256 ACCESS_ONCE(my_rdp->nocb_leader_wake));
2257 /* Memory barrier handled by smp_mb() calls below and repoll. */
2258 } else if (firsttime) {
2259 firsttime = false; /* Don't drown trace log with "Poll"! */
2260 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
2261 }
2262
2263 /*
2264 * Each pass through the following loop checks a follower for CBs.
2265 * We are our own first follower. Any CBs found are moved to
2266 * nocb_gp_head, where they await a grace period.
2267 */
2268 gotcbs = false;
2269 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2270 rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
2271 if (!rdp->nocb_gp_head)
2272 continue; /* No CBs here, try next follower. */
2273
2274 /* Move callbacks to wait-for-GP list, which is empty. */
2275 ACCESS_ONCE(rdp->nocb_head) = NULL;
2276 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2277 rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
2278 rdp->nocb_gp_count_lazy =
2279 atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2280 gotcbs = true;
2281 }
2282
2283 /*
2284 * If there were no callbacks, sleep a bit, rescan after a
2285 * memory barrier, and go retry.
2286 */
2287 if (unlikely(!gotcbs)) {
2288 if (!rcu_nocb_poll)
2289 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
2290 "WokeEmpty");
2291 flush_signals(current);
2292 schedule_timeout_interruptible(1);
2293
2294 /* Rescan in case we were a victim of memory ordering. */
2295 my_rdp->nocb_leader_wake = false;
2296 smp_mb(); /* Ensure _wake false before scan. */
2297 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
2298 if (ACCESS_ONCE(rdp->nocb_head)) {
2299 /* Found CB, so short-circuit next wait. */
2300 my_rdp->nocb_leader_wake = true;
2301 break;
2302 }
2303 goto wait_again;
2304 }
2305
2306 /* Wait for one grace period. */
2307 rcu_nocb_wait_gp(my_rdp);
2308
2309 /*
2310 * We left ->nocb_leader_wake set to reduce cache thrashing.
2311 * We clear it now, but recheck for new callbacks while
2312 * traversing our follower list.
2313 */
2314 my_rdp->nocb_leader_wake = false;
2315 smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */
2316
2317 /* Each pass through the following loop wakes a follower, if needed. */
2318 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2319 if (ACCESS_ONCE(rdp->nocb_head))
2320 my_rdp->nocb_leader_wake = true; /* No need to wait. */
2321 if (!rdp->nocb_gp_head)
2322 continue; /* No CBs, so no need to wake follower. */
2323
2324 /* Append callbacks to follower's "done" list. */
2325 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
2326 *tail = rdp->nocb_gp_head;
2327 atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
2328 atomic_long_add(rdp->nocb_gp_count_lazy,
2329 &rdp->nocb_follower_count_lazy);
2330 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2331 /*
2332 * List was empty, wake up the follower.
2333 * Memory barriers supplied by atomic_long_add().
2334 */
2335 wake_up(&rdp->nocb_wq);
2336 }
2337 }
2338
2339 /* If we (the leader) don't have CBs, go wait some more. */
2340 if (!my_rdp->nocb_follower_head)
2341 goto wait_again;
2342}
2343
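
nocb_leader_wait() detaches each follower's pending callbacks in O(1): it NULLs ->nocb_head and xchg()s ->nocb_tail back to point at the now-empty head, keeping the old tail so the captured sublist can later be spliced onto the follower's done list. The head/indirect-tail queue shape behind that trick, shown single-threaded for clarity (the kernel relies on xchg() and ACCESS_ONCE() for ordering, which this sketch omits):

    #include <stdio.h>
    #include <stddef.h>

    struct cb {
        struct cb *next;
        int id;
    };

    struct cbq {
        struct cb  *head;
        struct cb **tail;    /* points at head, or at the last element's ->next */
    };

    static void cbq_init(struct cbq *q)
    {
        q->head = NULL;
        q->tail = &q->head;
    }

    static void cbq_enqueue(struct cbq *q, struct cb *c)
    {
        c->next  = NULL;
        *q->tail = c;          /* link after the current last element (or head) */
        q->tail  = &c->next;   /* tail now points at the new element's next     */
    }

    /* Detach the whole list in O(1), leaving the queue empty. */
    static struct cb *cbq_detach_all(struct cbq *q)
    {
        struct cb *list = q->head;

        q->head = NULL;
        q->tail = &q->head;    /* kernel: xchg(&rdp->nocb_tail, &rdp->nocb_head) */
        return list;
    }

    int main(void)
    {
        struct cbq q;
        struct cb a = { .id = 1 }, b = { .id = 2 };
        struct cb *c;

        cbq_init(&q);
        cbq_enqueue(&q, &a);
        cbq_enqueue(&q, &b);
        for (c = cbq_detach_all(&q); c; c = c->next)
            printf("cb %d\n", c->id);
        return 0;
    }
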
2344/*
2345 * Followers come here to wait for additional callbacks to show up.
2346 * This function does not return until callbacks appear.
2347 */
2348static void nocb_follower_wait(struct rcu_data *rdp)
2349{
2350 bool firsttime = true;
2351
2352 for (;;) {
2353 if (!rcu_nocb_poll) {
2354 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2355 "FollowerSleep");
2356 wait_event_interruptible(rdp->nocb_wq,
2357 ACCESS_ONCE(rdp->nocb_follower_head));
2358 } else if (firsttime) {
2359 /* Don't drown trace log with "Poll"! */
2360 firsttime = false;
2361 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
2362 }
2363 if (smp_load_acquire(&rdp->nocb_follower_head)) {
2364 /* ^^^ Ensure CB invocation follows _head test. */
2365 return;
2366 }
2367 if (!rcu_nocb_poll)
2368 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2369 "WokeEmpty");
2370 flush_signals(current);
2371 schedule_timeout_interruptible(1);
2372 }
2373}
2374
2375/*
2216 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes 2376 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
2217 * callbacks queued by the corresponding no-CBs CPU. 2377 * callbacks queued by the corresponding no-CBs CPU, however, there is
2378 * an optional leader-follower relationship so that the grace-period
2379 * kthreads don't have to do quite so many wakeups.
2218 */ 2380 */
2219static int rcu_nocb_kthread(void *arg) 2381static int rcu_nocb_kthread(void *arg)
2220{ 2382{
2221 int c, cl; 2383 int c, cl;
2222 bool firsttime = 1;
2223 struct rcu_head *list; 2384 struct rcu_head *list;
2224 struct rcu_head *next; 2385 struct rcu_head *next;
2225 struct rcu_head **tail; 2386 struct rcu_head **tail;
@@ -2227,41 +2388,22 @@ static int rcu_nocb_kthread(void *arg)
2227 2388
2228 /* Each pass through this loop invokes one batch of callbacks */ 2389 /* Each pass through this loop invokes one batch of callbacks */
2229 for (;;) { 2390 for (;;) {
2230 /* If not polling, wait for next batch of callbacks. */ 2391 /* Wait for callbacks. */
2231 if (!rcu_nocb_poll) { 2392 if (rdp->nocb_leader == rdp)
2232 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2393 nocb_leader_wait(rdp);
2233 TPS("Sleep")); 2394 else
2234 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); 2395 nocb_follower_wait(rdp);
2235 /* Memory barrier provide by xchg() below. */ 2396
2236 } else if (firsttime) { 2397 /* Pull the ready-to-invoke callbacks onto local list. */
2237 firsttime = 0; 2398 list = ACCESS_ONCE(rdp->nocb_follower_head);
2238 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2399 BUG_ON(!list);
2239 TPS("Poll")); 2400 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
2240 } 2401 ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
2241 list = ACCESS_ONCE(rdp->nocb_head); 2402 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
2242 if (!list) { 2403 c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
2243 if (!rcu_nocb_poll) 2404 cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
2244 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2405 rdp->nocb_p_count += c;
2245 TPS("WokeEmpty")); 2406 rdp->nocb_p_count_lazy += cl;
2246 schedule_timeout_interruptible(1);
2247 flush_signals(current);
2248 continue;
2249 }
2250 firsttime = 1;
2251 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2252 TPS("WokeNonEmpty"));
2253
2254 /*
2255 * Extract queued callbacks, update counts, and wait
2256 * for a grace period to elapse.
2257 */
2258 ACCESS_ONCE(rdp->nocb_head) = NULL;
2259 tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2260 c = atomic_long_xchg(&rdp->nocb_q_count, 0);
2261 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2262 ACCESS_ONCE(rdp->nocb_p_count) += c;
2263 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2264 rcu_nocb_wait_gp(rdp);
2265 2407
2266 /* Each pass through the following loop invokes a callback. */ 2408 /* Each pass through the following loop invokes a callback. */
2267 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); 2409 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2305,7 +2447,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2305 if (!rcu_nocb_need_deferred_wakeup(rdp)) 2447 if (!rcu_nocb_need_deferred_wakeup(rdp))
2306 return; 2448 return;
2307 ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; 2449 ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
2308 wake_up(&rdp->nocb_wq); 2450 wake_nocb_leader(rdp, false);
2309 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); 2451 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
2310} 2452}
2311 2453
@@ -2314,19 +2456,57 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2314{ 2456{
2315 rdp->nocb_tail = &rdp->nocb_head; 2457 rdp->nocb_tail = &rdp->nocb_head;
2316 init_waitqueue_head(&rdp->nocb_wq); 2458 init_waitqueue_head(&rdp->nocb_wq);
2459 rdp->nocb_follower_tail = &rdp->nocb_follower_head;
2317} 2460}
2318 2461
2319/* Create a kthread for each RCU flavor for each no-CBs CPU. */ 2462/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
2463static int rcu_nocb_leader_stride = -1;
2464module_param(rcu_nocb_leader_stride, int, 0444);
2465
2466/*
2467 * Create a kthread for each RCU flavor for each no-CBs CPU.
2468 * Also initialize leader-follower relationships.
2469 */
2320static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2470static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2321{ 2471{
2322 int cpu; 2472 int cpu;
2473 int ls = rcu_nocb_leader_stride;
2474 int nl = 0; /* Next leader. */
2323 struct rcu_data *rdp; 2475 struct rcu_data *rdp;
2476 struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
2477 struct rcu_data *rdp_prev = NULL;
2324 struct task_struct *t; 2478 struct task_struct *t;
2325 2479
2326 if (rcu_nocb_mask == NULL) 2480 if (rcu_nocb_mask == NULL)
2327 return; 2481 return;
2482#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL)
2483 if (tick_nohz_full_running)
2484 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
2485#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */
2486 if (ls == -1) {
2487 ls = int_sqrt(nr_cpu_ids);
2488 rcu_nocb_leader_stride = ls;
2489 }
2490
2491 /*
2492 * Each pass through this loop sets up one rcu_data structure and
2493 * spawns one rcu_nocb_kthread().
2494 */
2328 for_each_cpu(cpu, rcu_nocb_mask) { 2495 for_each_cpu(cpu, rcu_nocb_mask) {
2329 rdp = per_cpu_ptr(rsp->rda, cpu); 2496 rdp = per_cpu_ptr(rsp->rda, cpu);
2497 if (rdp->cpu >= nl) {
2498 /* New leader, set up for followers & next leader. */
2499 nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
2500 rdp->nocb_leader = rdp;
2501 rdp_leader = rdp;
2502 } else {
2503 /* Another follower, link to previous leader. */
2504 rdp->nocb_leader = rdp_leader;
2505 rdp_prev->nocb_next_follower = rdp;
2506 }
2507 rdp_prev = rdp;
2508
2509 /* Spawn the kthread for this CPU. */
2330 t = kthread_run(rcu_nocb_kthread, rdp, 2510 t = kthread_run(rcu_nocb_kthread, rdp,
2331 "rcuo%c/%d", rsp->abbr, cpu); 2511 "rcuo%c/%d", rsp->abbr, cpu);
2332 BUG_ON(IS_ERR(t)); 2512 BUG_ON(IS_ERR(t));
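
The leader/follower wiring above carves the no-CBs CPUs into blocks of rcu_nocb_leader_stride consecutive CPU IDs (defaulting to roughly sqrt(nr_cpu_ids)); the first CPU at or past each block boundary becomes the leader, and later CPUs in the block chain onto it as followers. A quick user-space check of that boundary arithmetic, with a made-up CPU count:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    int main(void)
    {
        int nr_cpu_ids = 16;    /* made-up CPU count                   */
        int ls = 4;             /* int_sqrt(nr_cpu_ids) for 16 CPUs    */
        int nl = 0;             /* next leader boundary                */
        int cpu;

        for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
            if (cpu >= nl) {
                nl = DIV_ROUND_UP(cpu + 1, ls) * ls;
                printf("cpu %2d: leader (group runs up to cpu %d)\n", cpu, nl - 1);
            } else {
                printf("cpu %2d: follower\n", cpu);
            }
        }
        return 0;
    }

With 16 CPUs and a stride of 4, CPUs 0, 4, 8 and 12 come out as leaders, each followed by the next three CPU IDs.
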
@@ -2404,7 +2584,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2404 * if an adaptive-ticks CPU is failing to respond to the current grace 2584 * if an adaptive-ticks CPU is failing to respond to the current grace
2405 * period and has not been idle from an RCU perspective, kick it. 2585 * period and has not been idle from an RCU perspective, kick it.
2406 */ 2586 */
2407static void rcu_kick_nohz_cpu(int cpu) 2587static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
2408{ 2588{
2409#ifdef CONFIG_NO_HZ_FULL 2589#ifdef CONFIG_NO_HZ_FULL
2410 if (tick_nohz_full_cpu(cpu)) 2590 if (tick_nohz_full_cpu(cpu))
@@ -2843,12 +3023,16 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
2843 */ 3023 */
2844static void rcu_bind_gp_kthread(void) 3024static void rcu_bind_gp_kthread(void)
2845{ 3025{
2846#ifdef CONFIG_NO_HZ_FULL 3026 int __maybe_unused cpu;
2847 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2848 3027
2849 if (cpu < 0 || cpu >= nr_cpu_ids) 3028 if (!tick_nohz_full_enabled())
2850 return; 3029 return;
2851 if (raw_smp_processor_id() != cpu) 3030#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
3031 cpu = tick_do_timer_cpu;
3032 if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu)
2852 set_cpus_allowed_ptr(current, cpumask_of(cpu)); 3033 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2853#endif /* #ifdef CONFIG_NO_HZ_FULL */ 3034#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
3035 if (!is_housekeeping_cpu(raw_smp_processor_id()))
3036 housekeeping_affine(current);
3037#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2854} 3038}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a2aeb4df0f60..4056d7992a6c 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -90,9 +90,6 @@ void __rcu_read_unlock(void)
90 } else { 90 } else {
91 barrier(); /* critical section before exit code. */ 91 barrier(); /* critical section before exit code. */
92 t->rcu_read_lock_nesting = INT_MIN; 92 t->rcu_read_lock_nesting = INT_MIN;
93#ifdef CONFIG_PROVE_RCU_DELAY
94 udelay(10); /* Make preemption more probable. */
95#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
96 barrier(); /* assign before ->rcu_read_unlock_special load */ 93 barrier(); /* assign before ->rcu_read_unlock_special load */
97 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 94 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
98 rcu_read_unlock_special(t); 95 rcu_read_unlock_special(t);
@@ -200,12 +197,12 @@ void wait_rcu_gp(call_rcu_func_t crf)
200EXPORT_SYMBOL_GPL(wait_rcu_gp); 197EXPORT_SYMBOL_GPL(wait_rcu_gp);
201 198
202#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 199#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
203static inline void debug_init_rcu_head(struct rcu_head *head) 200void init_rcu_head(struct rcu_head *head)
204{ 201{
205 debug_object_init(head, &rcuhead_debug_descr); 202 debug_object_init(head, &rcuhead_debug_descr);
206} 203}
207 204
208static inline void debug_rcu_head_free(struct rcu_head *head) 205void destroy_rcu_head(struct rcu_head *head)
209{ 206{
210 debug_object_free(head, &rcuhead_debug_descr); 207 debug_object_free(head, &rcuhead_debug_descr);
211} 208}
@@ -350,21 +347,3 @@ static int __init check_cpu_stall_init(void)
350early_initcall(check_cpu_stall_init); 347early_initcall(check_cpu_stall_init);
351 348
352#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 349#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
353
354/*
355 * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
356 */
357
358DEFINE_PER_CPU(int, rcu_cond_resched_count);
359
360/*
361 * Report a set of RCU quiescent states, for use by cond_resched()
362 * and friends. Out of line due to being called infrequently.
363 */
364void rcu_resched(void)
365{
366 preempt_disable();
367 __this_cpu_write(rcu_cond_resched_count, 0);
368 rcu_note_context_switch(smp_processor_id());
369 preempt_enable();
370}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3bdf01b494fe..1211575a2208 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq)
139 return; 139 return;
140 140
141 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 141 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
142 if (delta < 0)
143 return;
142 rq->clock += delta; 144 rq->clock += delta;
143 update_rq_clock_task(rq, delta); 145 update_rq_clock_task(rq, delta);
144} 146}
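
The new delta < 0 test in update_rq_clock() simply discards readings where sched_clock_cpu() appears to have gone backwards rather than winding the runqueue clock back. The same clamp in isolation (timestamps invented):

    #include <stdio.h>
    #include <stdint.h>

    /* Advance *clock by (now - *clock), ignoring backwards jumps. */
    static void clock_advance(uint64_t *clock, uint64_t now)
    {
        int64_t delta = (int64_t)(now - *clock);

        if (delta < 0)           /* clock source went backwards: skip the update */
            return;
        *clock += (uint64_t)delta;
    }

    int main(void)
    {
        uint64_t clock = 1000;

        clock_advance(&clock, 1500);   /* forward reading is applied    */
        clock_advance(&clock, 1400);   /* backwards reading is ignored  */
        printf("clock = %llu\n", (unsigned long long)clock);   /* 1500  */
        return 0;
    }
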
@@ -243,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
243 char buf[64]; 245 char buf[64];
244 char *cmp; 246 char *cmp;
245 int i; 247 int i;
248 struct inode *inode;
246 249
247 if (cnt > 63) 250 if (cnt > 63)
248 cnt = 63; 251 cnt = 63;
@@ -253,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
253 buf[cnt] = 0; 256 buf[cnt] = 0;
254 cmp = strstrip(buf); 257 cmp = strstrip(buf);
255 258
259 /* Ensure the static_key remains in a consistent state */
260 inode = file_inode(filp);
261 mutex_lock(&inode->i_mutex);
256 i = sched_feat_set(cmp); 262 i = sched_feat_set(cmp);
263 mutex_unlock(&inode->i_mutex);
257 if (i == __SCHED_FEAT_NR) 264 if (i == __SCHED_FEAT_NR)
258 return -EINVAL; 265 return -EINVAL;
259 266
@@ -587,30 +594,31 @@ static bool set_nr_if_polling(struct task_struct *p)
587#endif 594#endif
588 595
589/* 596/*
590 * resched_task - mark a task 'to be rescheduled now'. 597 * resched_curr - mark rq's current task 'to be rescheduled now'.
591 * 598 *
592 * On UP this means the setting of the need_resched flag, on SMP it 599 * On UP this means the setting of the need_resched flag, on SMP it
593 * might also involve a cross-CPU call to trigger the scheduler on 600 * might also involve a cross-CPU call to trigger the scheduler on
594 * the target CPU. 601 * the target CPU.
595 */ 602 */
596void resched_task(struct task_struct *p) 603void resched_curr(struct rq *rq)
597{ 604{
605 struct task_struct *curr = rq->curr;
598 int cpu; 606 int cpu;
599 607
600 lockdep_assert_held(&task_rq(p)->lock); 608 lockdep_assert_held(&rq->lock);
601 609
602 if (test_tsk_need_resched(p)) 610 if (test_tsk_need_resched(curr))
603 return; 611 return;
604 612
605 cpu = task_cpu(p); 613 cpu = cpu_of(rq);
606 614
607 if (cpu == smp_processor_id()) { 615 if (cpu == smp_processor_id()) {
608 set_tsk_need_resched(p); 616 set_tsk_need_resched(curr);
609 set_preempt_need_resched(); 617 set_preempt_need_resched();
610 return; 618 return;
611 } 619 }
612 620
613 if (set_nr_and_not_polling(p)) 621 if (set_nr_and_not_polling(curr))
614 smp_send_reschedule(cpu); 622 smp_send_reschedule(cpu);
615 else 623 else
616 trace_sched_wake_idle_without_ipi(cpu); 624 trace_sched_wake_idle_without_ipi(cpu);
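
resched_curr() operates on the runqueue the caller already holds: it fetches rq->curr itself, bails out if the need-resched flag is already set, and otherwise marks the task (kicking a remote, non-polling CPU with an IPI). A stripped-down mock of that shape, using toy structs rather than the kernel's:

    #include <stdio.h>
    #include <stdbool.h>

    struct task { bool need_resched; };
    struct rq   { struct task *curr; int cpu; };

    /* Sketch of the resched_curr() shape: act on rq->curr, not a caller-supplied task. */
    static void resched_curr(struct rq *rq)
    {
        struct task *curr = rq->curr;

        if (curr->need_resched)        /* already marked: nothing to do              */
            return;
        curr->need_resched = true;     /* on SMP, a remote rq->cpu would get an IPI  */
    }

    int main(void)
    {
        struct task t  = { .need_resched = false };
        struct rq   rq = { .curr = &t, .cpu = 0 };

        resched_curr(&rq);
        printf("need_resched = %d\n", t.need_resched);
        return 0;
    }
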
@@ -623,7 +631,7 @@ void resched_cpu(int cpu)
623 631
624 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 632 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
625 return; 633 return;
626 resched_task(cpu_curr(cpu)); 634 resched_curr(rq);
627 raw_spin_unlock_irqrestore(&rq->lock, flags); 635 raw_spin_unlock_irqrestore(&rq->lock, flags);
628} 636}
629 637
@@ -684,10 +692,16 @@ static void wake_up_idle_cpu(int cpu)
684 692
685static bool wake_up_full_nohz_cpu(int cpu) 693static bool wake_up_full_nohz_cpu(int cpu)
686{ 694{
695 /*
696 * We just need the target to call irq_exit() and re-evaluate
697 * the next tick. The nohz full kick at least implies that.
698 * If needed we can still optimize that later with an
699 * empty IRQ.
700 */
687 if (tick_nohz_full_cpu(cpu)) { 701 if (tick_nohz_full_cpu(cpu)) {
688 if (cpu != smp_processor_id() || 702 if (cpu != smp_processor_id() ||
689 tick_nohz_tick_stopped()) 703 tick_nohz_tick_stopped())
690 smp_send_reschedule(cpu); 704 tick_nohz_full_kick_cpu(cpu);
691 return true; 705 return true;
692 } 706 }
693 707
@@ -730,18 +744,15 @@ static inline bool got_nohz_idle_kick(void)
730#ifdef CONFIG_NO_HZ_FULL 744#ifdef CONFIG_NO_HZ_FULL
731bool sched_can_stop_tick(void) 745bool sched_can_stop_tick(void)
732{ 746{
733 struct rq *rq; 747 /*
734 748 * More than one running task need preemption.
735 rq = this_rq(); 749 * nr_running update is assumed to be visible
736 750 * after IPI is sent from wakers.
737 /* Make sure rq->nr_running update is visible after the IPI */ 751 */
738 smp_rmb(); 752 if (this_rq()->nr_running > 1)
739 753 return false;
740 /* More than one running task need preemption */
741 if (rq->nr_running > 1)
742 return false;
743 754
744 return true; 755 return true;
745} 756}
746#endif /* CONFIG_NO_HZ_FULL */ 757#endif /* CONFIG_NO_HZ_FULL */
747 758
@@ -1022,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1022 if (class == rq->curr->sched_class) 1033 if (class == rq->curr->sched_class)
1023 break; 1034 break;
1024 if (class == p->sched_class) { 1035 if (class == p->sched_class) {
1025 resched_task(rq->curr); 1036 resched_curr(rq);
1026 break; 1037 break;
1027 } 1038 }
1028 } 1039 }
@@ -1568,9 +1579,7 @@ void scheduler_ipi(void)
1568 */ 1579 */
1569 preempt_fold_need_resched(); 1580 preempt_fold_need_resched();
1570 1581
1571 if (llist_empty(&this_rq()->wake_list) 1582 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1572 && !tick_nohz_full_cpu(smp_processor_id())
1573 && !got_nohz_idle_kick())
1574 return; 1583 return;
1575 1584
1576 /* 1585 /*
@@ -1587,7 +1596,6 @@ void scheduler_ipi(void)
1587 * somewhat pessimize the simple resched case. 1596 * somewhat pessimize the simple resched case.
1588 */ 1597 */
1589 irq_enter(); 1598 irq_enter();
1590 tick_nohz_full_check();
1591 sched_ttwu_pending(); 1599 sched_ttwu_pending();
1592 1600
1593 /* 1601 /*
@@ -2431,7 +2439,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2431{ 2439{
2432 u64 ns = 0; 2440 u64 ns = 0;
2433 2441
2434 if (task_current(rq, p)) { 2442 /*
2443 * Must be ->curr _and_ ->on_rq. If dequeued, we would
2444 * project cycles that may never be accounted to this
2445 * thread, breaking clock_gettime().
2446 */
2447 if (task_current(rq, p) && p->on_rq) {
2435 update_rq_clock(rq); 2448 update_rq_clock(rq);
2436 ns = rq_clock_task(rq) - p->se.exec_start; 2449 ns = rq_clock_task(rq) - p->se.exec_start;
2437 if ((s64)ns < 0) 2450 if ((s64)ns < 0)
@@ -2474,8 +2487,10 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2474 * If we race with it leaving cpu, we'll take a lock. So we're correct. 2487 * If we race with it leaving cpu, we'll take a lock. So we're correct.
2475 * If we race with it entering cpu, unaccounted time is 0. This is 2488 * If we race with it entering cpu, unaccounted time is 0. This is
2476 * indistinguishable from the read occurring a few cycles earlier. 2489 * indistinguishable from the read occurring a few cycles earlier.
2490 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
2491 * been accounted, so we're correct here as well.
2477 */ 2492 */
2478 if (!p->on_cpu) 2493 if (!p->on_cpu || !p->on_rq)
2479 return p->se.sum_exec_runtime; 2494 return p->se.sum_exec_runtime;
2480#endif 2495#endif
2481 2496
@@ -2971,7 +2986,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2971 } 2986 }
2972 2987
2973 trace_sched_pi_setprio(p, prio); 2988 trace_sched_pi_setprio(p, prio);
2974 p->pi_top_task = rt_mutex_get_top_task(p);
2975 oldprio = p->prio; 2989 oldprio = p->prio;
2976 prev_class = p->sched_class; 2990 prev_class = p->sched_class;
2977 on_rq = p->on_rq; 2991 on_rq = p->on_rq;
@@ -2991,8 +3005,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2991 * running task 3005 * running task
2992 */ 3006 */
2993 if (dl_prio(prio)) { 3007 if (dl_prio(prio)) {
2994 if (!dl_prio(p->normal_prio) || (p->pi_top_task && 3008 struct task_struct *pi_task = rt_mutex_get_top_task(p);
2995 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { 3009 if (!dl_prio(p->normal_prio) ||
3010 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
2996 p->dl.dl_boosted = 1; 3011 p->dl.dl_boosted = 1;
2997 p->dl.dl_throttled = 0; 3012 p->dl.dl_throttled = 0;
2998 enqueue_flag = ENQUEUE_REPLENISH; 3013 enqueue_flag = ENQUEUE_REPLENISH;
@@ -3064,7 +3079,7 @@ void set_user_nice(struct task_struct *p, long nice)
3064 * lowered its priority, then reschedule its CPU: 3079 * lowered its priority, then reschedule its CPU:
3065 */ 3080 */
3066 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3081 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3067 resched_task(rq->curr); 3082 resched_curr(rq);
3068 } 3083 }
3069out_unlock: 3084out_unlock:
3070 task_rq_unlock(rq, p, &flags); 3085 task_rq_unlock(rq, p, &flags);
@@ -3203,12 +3218,18 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3203 dl_se->dl_yielded = 0; 3218 dl_se->dl_yielded = 0;
3204} 3219}
3205 3220
3221/*
3222 * sched_setparam() passes in -1 for its policy, to let the functions
3223 * it calls know not to change it.
3224 */
3225#define SETPARAM_POLICY -1
3226
3206static void __setscheduler_params(struct task_struct *p, 3227static void __setscheduler_params(struct task_struct *p,
3207 const struct sched_attr *attr) 3228 const struct sched_attr *attr)
3208{ 3229{
3209 int policy = attr->sched_policy; 3230 int policy = attr->sched_policy;
3210 3231
3211 if (policy == -1) /* setparam */ 3232 if (policy == SETPARAM_POLICY)
3212 policy = p->policy; 3233 policy = p->policy;
3213 3234
3214 p->policy = policy; 3235 p->policy = policy;
@@ -3557,10 +3578,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
3557 .sched_nice = PRIO_TO_NICE(p->static_prio), 3578 .sched_nice = PRIO_TO_NICE(p->static_prio),
3558 }; 3579 };
3559 3580
3560 /* 3581 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
3561 * Fixup the legacy SCHED_RESET_ON_FORK hack 3582 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
3562 */
3563 if (policy & SCHED_RESET_ON_FORK) {
3564 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3583 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3565 policy &= ~SCHED_RESET_ON_FORK; 3584 policy &= ~SCHED_RESET_ON_FORK;
3566 attr.sched_policy = policy; 3585 attr.sched_policy = policy;
@@ -3730,7 +3749,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3730 */ 3749 */
3731SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3750SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3732{ 3751{
3733 return do_sched_setscheduler(pid, -1, param); 3752 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
3734} 3753}
3735 3754
3736/** 3755/**
@@ -4147,7 +4166,6 @@ static void __cond_resched(void)
4147 4166
4148int __sched _cond_resched(void) 4167int __sched _cond_resched(void)
4149{ 4168{
4150 rcu_cond_resched();
4151 if (should_resched()) { 4169 if (should_resched()) {
4152 __cond_resched(); 4170 __cond_resched();
4153 return 1; 4171 return 1;
@@ -4166,18 +4184,15 @@ EXPORT_SYMBOL(_cond_resched);
4166 */ 4184 */
4167int __cond_resched_lock(spinlock_t *lock) 4185int __cond_resched_lock(spinlock_t *lock)
4168{ 4186{
4169 bool need_rcu_resched = rcu_should_resched();
4170 int resched = should_resched(); 4187 int resched = should_resched();
4171 int ret = 0; 4188 int ret = 0;
4172 4189
4173 lockdep_assert_held(lock); 4190 lockdep_assert_held(lock);
4174 4191
4175 if (spin_needbreak(lock) || resched || need_rcu_resched) { 4192 if (spin_needbreak(lock) || resched) {
4176 spin_unlock(lock); 4193 spin_unlock(lock);
4177 if (resched) 4194 if (resched)
4178 __cond_resched(); 4195 __cond_resched();
4179 else if (unlikely(need_rcu_resched))
4180 rcu_resched();
4181 else 4196 else
4182 cpu_relax(); 4197 cpu_relax();
4183 ret = 1; 4198 ret = 1;
@@ -4191,7 +4206,6 @@ int __sched __cond_resched_softirq(void)
4191{ 4206{
4192 BUG_ON(!in_softirq()); 4207 BUG_ON(!in_softirq());
4193 4208
4194 rcu_cond_resched(); /* BH disabled OK, just recording QSes. */
4195 if (should_resched()) { 4209 if (should_resched()) {
4196 local_bh_enable(); 4210 local_bh_enable();
4197 __cond_resched(); 4211 __cond_resched();
@@ -4290,7 +4304,7 @@ again:
4290 * fairness. 4304 * fairness.
4291 */ 4305 */
4292 if (preempt && rq != p_rq) 4306 if (preempt && rq != p_rq)
4293 resched_task(p_rq->curr); 4307 resched_curr(p_rq);
4294 } 4308 }
4295 4309
4296out_unlock: 4310out_unlock:
@@ -6470,6 +6484,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6470 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6484 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6471 child->parent = sd; 6485 child->parent = sd;
6472 sd->child = child; 6486 sd->child = child;
6487
6488 if (!cpumask_subset(sched_domain_span(child),
6489 sched_domain_span(sd))) {
6490 pr_err("BUG: arch topology borken\n");
6491#ifdef CONFIG_SCHED_DEBUG
6492 pr_err(" the %s domain not a subset of the %s domain\n",
6493 child->name, sd->name);
6494#endif
6495 /* Fixup, ensure @sd has at least @child cpus. */
6496 cpumask_or(sched_domain_span(sd),
6497 sched_domain_span(sd),
6498 sched_domain_span(child));
6499 }
6500
6473 } 6501 }
6474 set_domain_attribute(sd, attr); 6502 set_domain_attribute(sd, attr);
6475 6503
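
The added sanity check verifies that every child domain's CPU span is contained in its parent's and, when the architecture topology is broken, widens the parent with cpumask_or() so the domain tree stays self-consistent. With small masks the subset test and fixup reduce to plain bit operations (example masks invented):

    #include <stdio.h>

    int main(void)
    {
        unsigned long child  = 0x0fUL;   /* CPUs 0-3                 */
        unsigned long parent = 0x0cUL;   /* CPUs 2-3 only: broken    */

        if ((child & parent) != child) {              /* !cpumask_subset()  */
            printf("BUG: child span not a subset of parent span\n");
            parent |= child;                          /* cpumask_or() fixup */
        }
        printf("parent span now 0x%lx\n", parent);    /* 0xf */
        return 0;
    }
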
@@ -7097,7 +7125,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7097 __setscheduler(rq, p, &attr); 7125 __setscheduler(rq, p, &attr);
7098 if (on_rq) { 7126 if (on_rq) {
7099 enqueue_task(rq, p, 0); 7127 enqueue_task(rq, p, 0);
7100 resched_task(rq->curr); 7128 resched_curr(rq);
7101 } 7129 }
7102 7130
7103 check_class_changed(rq, p, prev_class, old_prio); 7131 check_class_changed(rq, p, prev_class, old_prio);
@@ -7808,6 +7836,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7808 if (period > max_cfs_quota_period) 7836 if (period > max_cfs_quota_period)
7809 return -EINVAL; 7837 return -EINVAL;
7810 7838
7839 /*
7840 * Prevent race between setting of cfs_rq->runtime_enabled and
7841 * unthrottle_offline_cfs_rqs().
7842 */
7843 get_online_cpus();
7811 mutex_lock(&cfs_constraints_mutex); 7844 mutex_lock(&cfs_constraints_mutex);
7812 ret = __cfs_schedulable(tg, period, quota); 7845 ret = __cfs_schedulable(tg, period, quota);
7813 if (ret) 7846 if (ret)
@@ -7833,7 +7866,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7833 } 7866 }
7834 raw_spin_unlock_irq(&cfs_b->lock); 7867 raw_spin_unlock_irq(&cfs_b->lock);
7835 7868
7836 for_each_possible_cpu(i) { 7869 for_each_online_cpu(i) {
7837 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7870 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7838 struct rq *rq = cfs_rq->rq; 7871 struct rq *rq = cfs_rq->rq;
7839 7872
@@ -7849,6 +7882,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7849 cfs_bandwidth_usage_dec(); 7882 cfs_bandwidth_usage_dec();
7850out_unlock: 7883out_unlock:
7851 mutex_unlock(&cfs_constraints_mutex); 7884 mutex_unlock(&cfs_constraints_mutex);
7885 put_online_cpus();
7852 7886
7853 return ret; 7887 return ret;
7854} 7888}
@@ -8088,7 +8122,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
8088 .can_attach = cpu_cgroup_can_attach, 8122 .can_attach = cpu_cgroup_can_attach,
8089 .attach = cpu_cgroup_attach, 8123 .attach = cpu_cgroup_attach,
8090 .exit = cpu_cgroup_exit, 8124 .exit = cpu_cgroup_exit,
8091 .base_cftypes = cpu_files, 8125 .legacy_cftypes = cpu_files,
8092 .early_init = 1, 8126 .early_init = 1,
8093}; 8127};
8094 8128
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9cf350c94ec4..dd7cbb55bbf2 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -278,6 +278,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
278struct cgroup_subsys cpuacct_cgrp_subsys = { 278struct cgroup_subsys cpuacct_cgrp_subsys = {
279 .css_alloc = cpuacct_css_alloc, 279 .css_alloc = cpuacct_css_alloc,
280 .css_free = cpuacct_css_free, 280 .css_free = cpuacct_css_free,
281 .base_cftypes = files, 281 .legacy_cftypes = files,
282 .early_init = 1, 282 .early_init = 1,
283}; 283};
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fc4f98b1258f..255ce138b652 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -306,7 +306,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
306 * the overrunning entity can't interfere with other entities in the system and 306 * the overrunning entity can't interfere with other entities in the system and
307 * can't make them miss their deadlines. Reasons why this kind of overrun 307 * can't make them miss their deadlines. Reasons why this kind of overrun
308 * could happen are, typically, an entity voluntarily trying to overcome its 308 * could happen are, typically, an entity voluntarily trying to overcome its
309 * runtime, or it just underestimated it during sched_setscheduler_ex(). 309 * runtime, or it just underestimated it during sched_setattr().
310 */ 310 */
311static void replenish_dl_entity(struct sched_dl_entity *dl_se, 311static void replenish_dl_entity(struct sched_dl_entity *dl_se,
312 struct sched_dl_entity *pi_se) 312 struct sched_dl_entity *pi_se)
@@ -535,7 +535,7 @@ again:
535 if (task_has_dl_policy(rq->curr)) 535 if (task_has_dl_policy(rq->curr))
536 check_preempt_curr_dl(rq, p, 0); 536 check_preempt_curr_dl(rq, p, 0);
537 else 537 else
538 resched_task(rq->curr); 538 resched_curr(rq);
539#ifdef CONFIG_SMP 539#ifdef CONFIG_SMP
540 /* 540 /*
541 * Queueing this task back might have overloaded rq, 541 * Queueing this task back might have overloaded rq,
@@ -634,7 +634,7 @@ static void update_curr_dl(struct rq *rq)
634 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); 634 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
635 635
636 if (!is_leftmost(curr, &rq->dl)) 636 if (!is_leftmost(curr, &rq->dl))
637 resched_task(curr); 637 resched_curr(rq);
638 } 638 }
639 639
640 /* 640 /*
@@ -964,7 +964,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
964 cpudl_find(&rq->rd->cpudl, p, NULL) != -1) 964 cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
965 return; 965 return;
966 966
967 resched_task(rq->curr); 967 resched_curr(rq);
968} 968}
969 969
970static int pull_dl_task(struct rq *this_rq); 970static int pull_dl_task(struct rq *this_rq);
@@ -979,7 +979,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
979 int flags) 979 int flags)
980{ 980{
981 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { 981 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
982 resched_task(rq->curr); 982 resched_curr(rq);
983 return; 983 return;
984 } 984 }
985 985
@@ -1333,7 +1333,7 @@ retry:
1333 if (dl_task(rq->curr) && 1333 if (dl_task(rq->curr) &&
1334 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && 1334 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
1335 rq->curr->nr_cpus_allowed > 1) { 1335 rq->curr->nr_cpus_allowed > 1) {
1336 resched_task(rq->curr); 1336 resched_curr(rq);
1337 return 0; 1337 return 0;
1338 } 1338 }
1339 1339
@@ -1373,7 +1373,7 @@ retry:
1373 set_task_cpu(next_task, later_rq->cpu); 1373 set_task_cpu(next_task, later_rq->cpu);
1374 activate_task(later_rq, next_task, 0); 1374 activate_task(later_rq, next_task, 0);
1375 1375
1376 resched_task(later_rq->curr); 1376 resched_curr(later_rq);
1377 1377
1378 double_unlock_balance(rq, later_rq); 1378 double_unlock_balance(rq, later_rq);
1379 1379
@@ -1632,14 +1632,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1632 */ 1632 */
1633 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && 1633 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
1634 rq->curr == p) 1634 rq->curr == p)
1635 resched_task(p); 1635 resched_curr(rq);
1636#else 1636#else
1637 /* 1637 /*
1638 * Again, we don't know if p has a earlier 1638 * Again, we don't know if p has a earlier
1639 * or later deadline, so let's blindly set a 1639 * or later deadline, so let's blindly set a
1640 * (maybe not needed) rescheduling point. 1640 * (maybe not needed) rescheduling point.
1641 */ 1641 */
1642 resched_task(p); 1642 resched_curr(rq);
1643#endif /* CONFIG_SMP */ 1643#endif /* CONFIG_SMP */
1644 } else 1644 } else
1645 switched_to_dl(rq, p); 1645 switched_to_dl(rq, p);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 695f9773bb60..627b3c34b821 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
608 608
609 avg_atom = p->se.sum_exec_runtime; 609 avg_atom = p->se.sum_exec_runtime;
610 if (nr_switches) 610 if (nr_switches)
611 do_div(avg_atom, nr_switches); 611 avg_atom = div64_ul(avg_atom, nr_switches);
612 else 612 else
613 avg_atom = -1LL; 613 avg_atom = -1LL;
614 614
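
do_div() divides a 64-bit value in place by a 32-bit divisor, so a 64-bit nr_switches whose low 32 bits happen to be zero would be truncated to a zero divisor; div64_ul() does a full 64-by-64 division and returns the quotient, which is what the average wants here. A user-space illustration of the truncation hazard (plain C division stands in for the kernel helpers):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t avg_atom    = 123456789ULL;
        uint64_t nr_switches = 1ULL << 32;     /* non-zero, but low 32 bits are 0 */

        uint32_t truncated = (uint32_t)nr_switches;
        printf("32-bit divisor would be %u\n", truncated);   /* 0: divide-by-zero */

        /* Full 64-by-64 division, as div64_ul() provides. */
        printf("avg_atom / nr_switches = %llu\n",
               (unsigned long long)(avg_atom / nr_switches));
        return 0;
    }
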
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fea7d3335e1f..bfa3c86d0d68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1062 if (!cpus) 1062 if (!cpus)
1063 return; 1063 return;
1064 1064
1065 ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
1066 ns->task_capacity = 1065 ns->task_capacity =
1067 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); 1066 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
1068 ns->has_free_capacity = (ns->nr_running < ns->task_capacity); 1067 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
@@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env,
1096 env->best_cpu = env->dst_cpu; 1095 env->best_cpu = env->dst_cpu;
1097} 1096}
1098 1097
1099static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, 1098static bool load_too_imbalanced(long src_load, long dst_load,
1100 long src_load, long dst_load,
1101 struct task_numa_env *env) 1099 struct task_numa_env *env)
1102{ 1100{
1103 long imb, old_imb; 1101 long imb, old_imb;
1102 long orig_src_load, orig_dst_load;
1103 long src_capacity, dst_capacity;
1104
1105 /*
1106 * The load is corrected for the CPU capacity available on each node.
1107 *
1108 * src_load dst_load
1109 * ------------ vs ---------
1110 * src_capacity dst_capacity
1111 */
1112 src_capacity = env->src_stats.compute_capacity;
1113 dst_capacity = env->dst_stats.compute_capacity;
1104 1114
1105 /* We care about the slope of the imbalance, not the direction. */ 1115 /* We care about the slope of the imbalance, not the direction. */
1106 if (dst_load < src_load) 1116 if (dst_load < src_load)
1107 swap(dst_load, src_load); 1117 swap(dst_load, src_load);
1108 1118
1109 /* Is the difference below the threshold? */ 1119 /* Is the difference below the threshold? */
1110 imb = dst_load * 100 - src_load * env->imbalance_pct; 1120 imb = dst_load * src_capacity * 100 -
1121 src_load * dst_capacity * env->imbalance_pct;
1111 if (imb <= 0) 1122 if (imb <= 0)
1112 return false; 1123 return false;
1113 1124
@@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
1115 * The imbalance is above the allowed threshold. 1126 * The imbalance is above the allowed threshold.
1116 * Compare it with the old imbalance. 1127 * Compare it with the old imbalance.
1117 */ 1128 */
1129 orig_src_load = env->src_stats.load;
1130 orig_dst_load = env->dst_stats.load;
1131
1118 if (orig_dst_load < orig_src_load) 1132 if (orig_dst_load < orig_src_load)
1119 swap(orig_dst_load, orig_src_load); 1133 swap(orig_dst_load, orig_src_load);
1120 1134
1121 old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; 1135 old_imb = orig_dst_load * src_capacity * 100 -
1136 orig_src_load * dst_capacity * env->imbalance_pct;
1122 1137
1123 /* Would this change make things worse? */ 1138 /* Would this change make things worse? */
1124 return (imb > old_imb); 1139 return (imb > old_imb);
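
The rewritten test compares the capacity-normalized loads, src_load/src_capacity against dst_load/dst_capacity, without dividing: both sides are cross-multiplied and the imbalance_pct margin folded in. Plugging in some invented numbers shows the mechanics:

    #include <stdio.h>

    int main(void)
    {
        long src_load = 400, dst_load = 600;
        long src_capacity = 1024, dst_capacity = 2048;   /* dst node is beefier */
        long imbalance_pct = 125;                        /* 25% slack           */
        long imb;

        if (dst_load < src_load) {          /* care about the slope, not direction */
            long t = dst_load; dst_load = src_load; src_load = t;
        }

        /* dst_load/dst_capacity vs src_load/src_capacity, cross-multiplied. */
        imb = dst_load * src_capacity * 100 -
              src_load * dst_capacity * imbalance_pct;

        printf("imb = %ld -> %s\n", imb,
               imb <= 0 ? "within threshold" : "too imbalanced");
        return 0;
    }

Here the destination node has twice the capacity, so 600 units of load there is relatively lighter than 400 on the source, and the comparison comes out negative (within threshold).
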
@@ -1136,10 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env,
1136 struct rq *src_rq = cpu_rq(env->src_cpu); 1151 struct rq *src_rq = cpu_rq(env->src_cpu);
1137 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1152 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1138 struct task_struct *cur; 1153 struct task_struct *cur;
1139 long orig_src_load, src_load; 1154 long src_load, dst_load;
1140 long orig_dst_load, dst_load;
1141 long load; 1155 long load;
1142 long imp = (groupimp > 0) ? groupimp : taskimp; 1156 long imp = env->p->numa_group ? groupimp : taskimp;
1157 long moveimp = imp;
1143 1158
1144 rcu_read_lock(); 1159 rcu_read_lock();
1145 cur = ACCESS_ONCE(dst_rq->curr); 1160 cur = ACCESS_ONCE(dst_rq->curr);
@@ -1177,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env,
1177 * itself (not part of a group), use the task weight 1192 * itself (not part of a group), use the task weight
1178 * instead. 1193 * instead.
1179 */ 1194 */
1180 if (env->p->numa_group)
1181 imp = groupimp;
1182 else
1183 imp = taskimp;
1184
1185 if (cur->numa_group) 1195 if (cur->numa_group)
1186 imp += group_weight(cur, env->src_nid) - 1196 imp += group_weight(cur, env->src_nid) -
1187 group_weight(cur, env->dst_nid); 1197 group_weight(cur, env->dst_nid);
@@ -1191,7 +1201,7 @@ static void task_numa_compare(struct task_numa_env *env,
1191 } 1201 }
1192 } 1202 }
1193 1203
1194 if (imp < env->best_imp) 1204 if (imp <= env->best_imp && moveimp <= env->best_imp)
1195 goto unlock; 1205 goto unlock;
1196 1206
1197 if (!cur) { 1207 if (!cur) {
@@ -1204,20 +1214,34 @@ static void task_numa_compare(struct task_numa_env *env,
1204 } 1214 }
1205 1215
1206 /* Balance doesn't matter much if we're running a task per cpu */ 1216 /* Balance doesn't matter much if we're running a task per cpu */
1207 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) 1217 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1218 dst_rq->nr_running == 1)
1208 goto assign; 1219 goto assign;
1209 1220
1210 /* 1221 /*
1211 * In the overloaded case, try and keep the load balanced. 1222 * In the overloaded case, try and keep the load balanced.
1212 */ 1223 */
1213balance: 1224balance:
1214 orig_dst_load = env->dst_stats.load;
1215 orig_src_load = env->src_stats.load;
1216
1217 /* XXX missing capacity terms */
1218 load = task_h_load(env->p); 1225 load = task_h_load(env->p);
1219 dst_load = orig_dst_load + load; 1226 dst_load = env->dst_stats.load + load;
1220 src_load = orig_src_load - load; 1227 src_load = env->src_stats.load - load;
1228
1229 if (moveimp > imp && moveimp > env->best_imp) {
1230 /*
1231 * If the improvement from just moving env->p direction is
1232 * better than swapping tasks around, check if a move is
1233 * possible. Store a slightly smaller score than moveimp,
1234 * so an actually idle CPU will win.
1235 */
1236 if (!load_too_imbalanced(src_load, dst_load, env)) {
1237 imp = moveimp - 1;
1238 cur = NULL;
1239 goto assign;
1240 }
1241 }
1242
1243 if (imp <= env->best_imp)
1244 goto unlock;
1221 1245
1222 if (cur) { 1246 if (cur) {
1223 load = task_h_load(cur); 1247 load = task_h_load(cur);
@@ -1225,8 +1249,7 @@ balance:
1225 src_load += load; 1249 src_load += load;
1226 } 1250 }
1227 1251
1228 if (load_too_imbalanced(orig_src_load, orig_dst_load, 1252 if (load_too_imbalanced(src_load, dst_load, env))
1229 src_load, dst_load, env))
1230 goto unlock; 1253 goto unlock;
1231 1254
1232assign: 1255assign:
@@ -1302,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p)
1302 groupimp = group_weight(p, env.dst_nid) - groupweight; 1325 groupimp = group_weight(p, env.dst_nid) - groupweight;
1303 update_numa_stats(&env.dst_stats, env.dst_nid); 1326 update_numa_stats(&env.dst_stats, env.dst_nid);
1304 1327
1305 /* If the preferred nid has free capacity, try to use it. */ 1328 /* Try to find a spot on the preferred nid. */
1306 if (env.dst_stats.has_free_capacity) 1329 task_numa_find_cpu(&env, taskimp, groupimp);
1307 task_numa_find_cpu(&env, taskimp, groupimp);
1308 1330
1309 /* No space available on the preferred nid. Look elsewhere. */ 1331 /* No space available on the preferred nid. Look elsewhere. */
1310 if (env.best_cpu == -1) { 1332 if (env.best_cpu == -1) {
@@ -1324,10 +1346,6 @@ static int task_numa_migrate(struct task_struct *p)
1324 } 1346 }
1325 } 1347 }
1326 1348
1327 /* No better CPU than the current one was found. */
1328 if (env.best_cpu == -1)
1329 return -EAGAIN;
1330
1331 /* 1349 /*
1332 * If the task is part of a workload that spans multiple NUMA nodes, 1350 * If the task is part of a workload that spans multiple NUMA nodes,
1333 * and is migrating into one of the workload's active nodes, remember 1351 * and is migrating into one of the workload's active nodes, remember
@@ -1336,8 +1354,19 @@ static int task_numa_migrate(struct task_struct *p)
1336 * A task that migrated to a second choice node will be better off 1354 * A task that migrated to a second choice node will be better off
1337 * trying for a better one later. Do not set the preferred node here. 1355 * trying for a better one later. Do not set the preferred node here.
1338 */ 1356 */
1339 if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) 1357 if (p->numa_group) {
1340 sched_setnuma(p, env.dst_nid); 1358 if (env.best_cpu == -1)
1359 nid = env.src_nid;
1360 else
1361 nid = env.dst_nid;
1362
1363 if (node_isset(nid, p->numa_group->active_nodes))
1364 sched_setnuma(p, env.dst_nid);
1365 }
1366
1367 /* No better CPU than the current one was found. */
1368 if (env.best_cpu == -1)
1369 return -EAGAIN;
1341 1370
1342 /* 1371 /*
1343 * Reset the scan period if the task is being rescheduled on an 1372 * Reset the scan period if the task is being rescheduled on an
@@ -1415,12 +1444,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
1415/* 1444/*
1416 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 1445 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1417 * increments. The more local the fault statistics are, the higher the scan 1446 * increments. The more local the fault statistics are, the higher the scan
1418 * period will be for the next scan window. If local/remote ratio is below 1447 * period will be for the next scan window. If local/(local+remote) ratio is
1419 * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the 1448 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1420 * scan period will decrease 1449 * the scan period will decrease. Aim for 70% local accesses.
1421 */ 1450 */
1422#define NUMA_PERIOD_SLOTS 10 1451#define NUMA_PERIOD_SLOTS 10
1423#define NUMA_PERIOD_THRESHOLD 3 1452#define NUMA_PERIOD_THRESHOLD 7
1424 1453
1425/* 1454/*
1426 * Increase the scan period (slow down scanning) if the majority of 1455 * Increase the scan period (slow down scanning) if the majority of
@@ -1595,30 +1624,17 @@ static void task_numa_placement(struct task_struct *p)
1595 1624
1596 if (p->numa_group) { 1625 if (p->numa_group) {
1597 update_numa_active_node_mask(p->numa_group); 1626 update_numa_active_node_mask(p->numa_group);
1598 /*
1599 * If the preferred task and group nids are different,
1600 * iterate over the nodes again to find the best place.
1601 */
1602 if (max_nid != max_group_nid) {
1603 unsigned long weight, max_weight = 0;
1604
1605 for_each_online_node(nid) {
1606 weight = task_weight(p, nid) + group_weight(p, nid);
1607 if (weight > max_weight) {
1608 max_weight = weight;
1609 max_nid = nid;
1610 }
1611 }
1612 }
1613
1614 spin_unlock_irq(group_lock); 1627 spin_unlock_irq(group_lock);
1628 max_nid = max_group_nid;
1615 } 1629 }
1616 1630
1617 /* Preferred node as the node with the most faults */ 1631 if (max_faults) {
1618 if (max_faults && max_nid != p->numa_preferred_nid) { 1632 /* Set the new preferred node */
1619 /* Update the preferred nid and migrate task if possible */ 1633 if (max_nid != p->numa_preferred_nid)
1620 sched_setnuma(p, max_nid); 1634 sched_setnuma(p, max_nid);
1621 numa_migrate_preferred(p); 1635
1636 if (task_node(p) != p->numa_preferred_nid)
1637 numa_migrate_preferred(p);
1622 } 1638 }
1623} 1639}
1624 1640
@@ -2899,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2899 ideal_runtime = sched_slice(cfs_rq, curr); 2915 ideal_runtime = sched_slice(cfs_rq, curr);
2900 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 2916 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2901 if (delta_exec > ideal_runtime) { 2917 if (delta_exec > ideal_runtime) {
2902 resched_task(rq_of(cfs_rq)->curr); 2918 resched_curr(rq_of(cfs_rq));
2903 /* 2919 /*
2904 * The current task ran long enough, ensure it doesn't get 2920 * The current task ran long enough, ensure it doesn't get
2905 * re-elected due to buddy favours. 2921 * re-elected due to buddy favours.
@@ -2923,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2923 return; 2939 return;
2924 2940
2925 if (delta > ideal_runtime) 2941 if (delta > ideal_runtime)
2926 resched_task(rq_of(cfs_rq)->curr); 2942 resched_curr(rq_of(cfs_rq));
2927} 2943}
2928 2944
2929static void 2945static void
@@ -3063,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3063 * validating it and just reschedule. 3079 * validating it and just reschedule.
3064 */ 3080 */
3065 if (queued) { 3081 if (queued) {
3066 resched_task(rq_of(cfs_rq)->curr); 3082 resched_curr(rq_of(cfs_rq));
3067 return; 3083 return;
3068 } 3084 }
3069 /* 3085 /*
@@ -3254,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3254 * hierarchy can be throttled 3270 * hierarchy can be throttled
3255 */ 3271 */
3256 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) 3272 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3257 resched_task(rq_of(cfs_rq)->curr); 3273 resched_curr(rq_of(cfs_rq));
3258} 3274}
3259 3275
3260static __always_inline 3276static __always_inline
@@ -3360,7 +3376,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3360 cfs_rq->throttled = 1; 3376 cfs_rq->throttled = 1;
3361 cfs_rq->throttled_clock = rq_clock(rq); 3377 cfs_rq->throttled_clock = rq_clock(rq);
3362 raw_spin_lock(&cfs_b->lock); 3378 raw_spin_lock(&cfs_b->lock);
3363 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3379 /*
3380 * Add to the _head_ of the list, so that an already-started
3381 * distribute_cfs_runtime will not see us
3382 */
3383 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3364 if (!cfs_b->timer_active) 3384 if (!cfs_b->timer_active)
3365 __start_cfs_bandwidth(cfs_b, false); 3385 __start_cfs_bandwidth(cfs_b, false);
3366 raw_spin_unlock(&cfs_b->lock); 3386 raw_spin_unlock(&cfs_b->lock);
@@ -3410,14 +3430,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3410 3430
3411 /* determine whether we need to wake up potentially idle cpu */ 3431 /* determine whether we need to wake up potentially idle cpu */
3412 if (rq->curr == rq->idle && rq->cfs.nr_running) 3432 if (rq->curr == rq->idle && rq->cfs.nr_running)
3413 resched_task(rq->curr); 3433 resched_curr(rq);
3414} 3434}
3415 3435
3416static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, 3436static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3417 u64 remaining, u64 expires) 3437 u64 remaining, u64 expires)
3418{ 3438{
3419 struct cfs_rq *cfs_rq; 3439 struct cfs_rq *cfs_rq;
3420 u64 runtime = remaining; 3440 u64 runtime;
3441 u64 starting_runtime = remaining;
3421 3442
3422 rcu_read_lock(); 3443 rcu_read_lock();
3423 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, 3444 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -3448,7 +3469,7 @@ next:
3448 } 3469 }
3449 rcu_read_unlock(); 3470 rcu_read_unlock();
3450 3471
3451 return remaining; 3472 return starting_runtime - remaining;
3452} 3473}
3453 3474
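
distribute_cfs_runtime() now reports how much runtime it actually handed out, and the period and slack timers in the hunks that follow charge that amount against the pool with cfs_b->runtime -= min(runtime, cfs_b->runtime), so the limited over-distribution described in the comment there can never underflow the unsigned counter. The saturating subtraction in isolation (values invented):

    #include <stdio.h>
    #include <stdint.h>

    #define min(a, b)  ((a) < (b) ? (a) : (b))

    int main(void)
    {
        uint64_t pool        = 300;   /* cfs_b->runtime            */
        uint64_t distributed = 500;   /* more than the pool holds  */

        pool -= min(distributed, pool);   /* saturates at 0 instead of wrapping */
        printf("remaining runtime = %llu\n", (unsigned long long)pool);
        return 0;
    }
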
3454/* 3475/*
@@ -3494,22 +3515,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3494 /* account preceding periods in which throttling occurred */ 3515 /* account preceding periods in which throttling occurred */
3495 cfs_b->nr_throttled += overrun; 3516 cfs_b->nr_throttled += overrun;
3496 3517
3497 /*
3498 * There are throttled entities so we must first use the new bandwidth
3499 * to unthrottle them before making it generally available. This
3500 * ensures that all existing debts will be paid before a new cfs_rq is
3501 * allowed to run.
3502 */
3503 runtime = cfs_b->runtime;
3504 runtime_expires = cfs_b->runtime_expires; 3518 runtime_expires = cfs_b->runtime_expires;
3505 cfs_b->runtime = 0;
3506 3519
3507 /* 3520 /*
3508 * This check is repeated as we are holding onto the new bandwidth 3521 * This check is repeated as we are holding onto the new bandwidth while
3509 * while we unthrottle. This can potentially race with an unthrottled 3522 * we unthrottle. This can potentially race with an unthrottled group
3510 * group trying to acquire new bandwidth from the global pool. 3523 * trying to acquire new bandwidth from the global pool. This can result
3524 * in us over-using our runtime if it is all used during this loop, but
3525 * only by limited amounts in that extreme case.
3511 */ 3526 */
3512 while (throttled && runtime > 0) { 3527 while (throttled && cfs_b->runtime > 0) {
3528 runtime = cfs_b->runtime;
3513 raw_spin_unlock(&cfs_b->lock); 3529 raw_spin_unlock(&cfs_b->lock);
3514 /* we can't nest cfs_b->lock while distributing bandwidth */ 3530 /* we can't nest cfs_b->lock while distributing bandwidth */
3515 runtime = distribute_cfs_runtime(cfs_b, runtime, 3531 runtime = distribute_cfs_runtime(cfs_b, runtime,
@@ -3517,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3517 raw_spin_lock(&cfs_b->lock); 3533 raw_spin_lock(&cfs_b->lock);
3518 3534
3519 throttled = !list_empty(&cfs_b->throttled_cfs_rq); 3535 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3536
3537 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3520 } 3538 }
3521 3539
3522 /* return (any) remaining runtime */
3523 cfs_b->runtime = runtime;
3524 /* 3540 /*
3525 * While we are ensured activity in the period following an 3541 * While we are ensured activity in the period following an
3526 * unthrottle, this also covers the case in which the new bandwidth is 3542 * unthrottle, this also covers the case in which the new bandwidth is
@@ -3631,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3631 return; 3647 return;
3632 } 3648 }
3633 3649
3634 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { 3650 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
3635 runtime = cfs_b->runtime; 3651 runtime = cfs_b->runtime;
3636 cfs_b->runtime = 0; 3652
3637 }
3638 expires = cfs_b->runtime_expires; 3653 expires = cfs_b->runtime_expires;
3639 raw_spin_unlock(&cfs_b->lock); 3654 raw_spin_unlock(&cfs_b->lock);
3640 3655
@@ -3645,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3645 3660
3646 raw_spin_lock(&cfs_b->lock); 3661 raw_spin_lock(&cfs_b->lock);
3647 if (expires == cfs_b->runtime_expires) 3662 if (expires == cfs_b->runtime_expires)
3648 cfs_b->runtime = runtime; 3663 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3649 raw_spin_unlock(&cfs_b->lock); 3664 raw_spin_unlock(&cfs_b->lock);
3650} 3665}
3651 3666
@@ -3775,6 +3790,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3775 hrtimer_cancel(&cfs_b->slack_timer); 3790 hrtimer_cancel(&cfs_b->slack_timer);
3776} 3791}
3777 3792
3793static void __maybe_unused update_runtime_enabled(struct rq *rq)
3794{
3795 struct cfs_rq *cfs_rq;
3796
3797 for_each_leaf_cfs_rq(rq, cfs_rq) {
3798 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
3799
3800 raw_spin_lock(&cfs_b->lock);
3801 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
3802 raw_spin_unlock(&cfs_b->lock);
3803 }
3804}
3805
3778static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) 3806static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3779{ 3807{
3780 struct cfs_rq *cfs_rq; 3808 struct cfs_rq *cfs_rq;
@@ -3788,6 +3816,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3788 * there's some valid quota amount 3816 * there's some valid quota amount
3789 */ 3817 */
3790 cfs_rq->runtime_remaining = 1; 3818 cfs_rq->runtime_remaining = 1;
3819 /*
3820 * Offline rq is schedulable till cpu is completely disabled
3821 * in take_cpu_down(), so we prevent new cfs throttling here.
3822 */
3823 cfs_rq->runtime_enabled = 0;
3824
3791 if (cfs_rq_throttled(cfs_rq)) 3825 if (cfs_rq_throttled(cfs_rq))
3792 unthrottle_cfs_rq(cfs_rq); 3826 unthrottle_cfs_rq(cfs_rq);
3793 } 3827 }
@@ -3831,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3831 return NULL; 3865 return NULL;
3832} 3866}
3833static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} 3867static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3868static inline void update_runtime_enabled(struct rq *rq) {}
3834static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} 3869static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
3835 3870
3836#endif /* CONFIG_CFS_BANDWIDTH */ 3871#endif /* CONFIG_CFS_BANDWIDTH */
@@ -3854,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3854 3889
3855 if (delta < 0) { 3890 if (delta < 0) {
3856 if (rq->curr == p) 3891 if (rq->curr == p)
3857 resched_task(p); 3892 resched_curr(rq);
3858 return; 3893 return;
3859 } 3894 }
3860 3895
@@ -4723,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
4723 return; 4758 return;
4724 4759
4725preempt: 4760preempt:
4726 resched_task(curr); 4761 resched_curr(rq);
4727 /* 4762 /*
4728 * Only set the backward buddy when the current task is still 4763 * Only set the backward buddy when the current task is still
4729 * on the rq. This can happen when a wakeup gets interleaved 4764 * on the rq. This can happen when a wakeup gets interleaved
@@ -5094,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
5094/* 5129/*
5095 * Is this task likely cache-hot: 5130 * Is this task likely cache-hot:
5096 */ 5131 */
5097static int 5132static int task_hot(struct task_struct *p, struct lb_env *env)
5098task_hot(struct task_struct *p, u64 now)
5099{ 5133{
5100 s64 delta; 5134 s64 delta;
5101 5135
@@ -5108,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now)
5108 /* 5142 /*
5109 * Buddy candidates are cache hot: 5143 * Buddy candidates are cache hot:
5110 */ 5144 */
5111 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 5145 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
5112 (&p->se == cfs_rq_of(&p->se)->next || 5146 (&p->se == cfs_rq_of(&p->se)->next ||
5113 &p->se == cfs_rq_of(&p->se)->last)) 5147 &p->se == cfs_rq_of(&p->se)->last))
5114 return 1; 5148 return 1;
@@ -5118,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now)
5118 if (sysctl_sched_migration_cost == 0) 5152 if (sysctl_sched_migration_cost == 0)
5119 return 0; 5153 return 0;
5120 5154
5121 delta = now - p->se.exec_start; 5155 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
5122 5156
5123 return delta < (s64)sysctl_sched_migration_cost; 5157 return delta < (s64)sysctl_sched_migration_cost;
5124} 5158}
@@ -5272,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5272 * 2) task is cache cold, or 5306 * 2) task is cache cold, or
5273 * 3) too many balance attempts have failed. 5307 * 3) too many balance attempts have failed.
5274 */ 5308 */
5275 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); 5309 tsk_cache_hot = task_hot(p, env);
5276 if (!tsk_cache_hot) 5310 if (!tsk_cache_hot)
5277 tsk_cache_hot = migrate_degrades_locality(p, env); 5311 tsk_cache_hot = migrate_degrades_locality(p, env);
5278 5312
@@ -5864,10 +5898,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
5864 * @load_idx: Load index of sched_domain of this_cpu for load calc. 5898 * @load_idx: Load index of sched_domain of this_cpu for load calc.
5865 * @local_group: Does group contain this_cpu. 5899 * @local_group: Does group contain this_cpu.
5866 * @sgs: variable to hold the statistics for this group. 5900 * @sgs: variable to hold the statistics for this group.
5901 * @overload: Indicate more than one runnable task for any CPU.
5867 */ 5902 */
5868static inline void update_sg_lb_stats(struct lb_env *env, 5903static inline void update_sg_lb_stats(struct lb_env *env,
5869 struct sched_group *group, int load_idx, 5904 struct sched_group *group, int load_idx,
5870 int local_group, struct sg_lb_stats *sgs) 5905 int local_group, struct sg_lb_stats *sgs,
5906 bool *overload)
5871{ 5907{
5872 unsigned long load; 5908 unsigned long load;
5873 int i; 5909 int i;
@@ -5885,6 +5921,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5885 5921
5886 sgs->group_load += load; 5922 sgs->group_load += load;
5887 sgs->sum_nr_running += rq->nr_running; 5923 sgs->sum_nr_running += rq->nr_running;
5924
5925 if (rq->nr_running > 1)
5926 *overload = true;
5927
5888#ifdef CONFIG_NUMA_BALANCING 5928#ifdef CONFIG_NUMA_BALANCING
5889 sgs->nr_numa_running += rq->nr_numa_running; 5929 sgs->nr_numa_running += rq->nr_numa_running;
5890 sgs->nr_preferred_running += rq->nr_preferred_running; 5930 sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5995,6 +6035,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
5995 struct sched_group *sg = env->sd->groups; 6035 struct sched_group *sg = env->sd->groups;
5996 struct sg_lb_stats tmp_sgs; 6036 struct sg_lb_stats tmp_sgs;
5997 int load_idx, prefer_sibling = 0; 6037 int load_idx, prefer_sibling = 0;
6038 bool overload = false;
5998 6039
5999 if (child && child->flags & SD_PREFER_SIBLING) 6040 if (child && child->flags & SD_PREFER_SIBLING)
6000 prefer_sibling = 1; 6041 prefer_sibling = 1;
@@ -6015,7 +6056,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6015 update_group_capacity(env->sd, env->dst_cpu); 6056 update_group_capacity(env->sd, env->dst_cpu);
6016 } 6057 }
6017 6058
6018 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 6059 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6060 &overload);
6019 6061
6020 if (local_group) 6062 if (local_group)
6021 goto next_group; 6063 goto next_group;
@@ -6049,6 +6091,13 @@ next_group:
6049 6091
6050 if (env->sd->flags & SD_NUMA) 6092 if (env->sd->flags & SD_NUMA)
6051 env->fbq_type = fbq_classify_group(&sds->busiest_stat); 6093 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6094
6095 if (!env->sd->parent) {
6096 /* update overload indicator if we are at root domain */
6097 if (env->dst_rq->rd->overload != overload)
6098 env->dst_rq->rd->overload = overload;
6099 }
6100
6052} 6101}
6053 6102
6054/** 6103/**
@@ -6767,7 +6816,8 @@ static int idle_balance(struct rq *this_rq)
6767 */ 6816 */
6768 this_rq->idle_stamp = rq_clock(this_rq); 6817 this_rq->idle_stamp = rq_clock(this_rq);
6769 6818
6770 if (this_rq->avg_idle < sysctl_sched_migration_cost) { 6819 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
6820 !this_rq->rd->overload) {
6771 rcu_read_lock(); 6821 rcu_read_lock();
6772 sd = rcu_dereference_check_sched_domain(this_rq->sd); 6822 sd = rcu_dereference_check_sched_domain(this_rq->sd);
6773 if (sd) 6823 if (sd)
@@ -7325,6 +7375,8 @@ void trigger_load_balance(struct rq *rq)
7325static void rq_online_fair(struct rq *rq) 7375static void rq_online_fair(struct rq *rq)
7326{ 7376{
7327 update_sysctl(); 7377 update_sysctl();
7378
7379 update_runtime_enabled(rq);
7328} 7380}
7329 7381
7330static void rq_offline_fair(struct rq *rq) 7382static void rq_offline_fair(struct rq *rq)
@@ -7398,7 +7450,7 @@ static void task_fork_fair(struct task_struct *p)
7398 * 'current' within the tree based on its new key value. 7450 * 'current' within the tree based on its new key value.
7399 */ 7451 */
7400 swap(curr->vruntime, se->vruntime); 7452 swap(curr->vruntime, se->vruntime);
7401 resched_task(rq->curr); 7453 resched_curr(rq);
7402 } 7454 }
7403 7455
7404 se->vruntime -= cfs_rq->min_vruntime; 7456 se->vruntime -= cfs_rq->min_vruntime;
@@ -7423,7 +7475,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7423 */ 7475 */
7424 if (rq->curr == p) { 7476 if (rq->curr == p) {
7425 if (p->prio > oldprio) 7477 if (p->prio > oldprio)
7426 resched_task(rq->curr); 7478 resched_curr(rq);
7427 } else 7479 } else
7428 check_preempt_curr(rq, p, 0); 7480 check_preempt_curr(rq, p, 0);
7429} 7481}
@@ -7486,7 +7538,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
7486 * if we can still preempt the current task. 7538 * if we can still preempt the current task.
7487 */ 7539 */
7488 if (rq->curr == p) 7540 if (rq->curr == p)
7489 resched_task(rq->curr); 7541 resched_curr(rq);
7490 else 7542 else
7491 check_preempt_curr(rq, p, 0); 7543 check_preempt_curr(rq, p, 0);
7492} 7544}
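
The fair.c hunks above thread a new overload flag from update_sg_lb_stats() through update_sd_lb_stats() up to the root domain, so that idle_balance() can bail out before walking the sched domains when no CPU in the domain has more than one runnable task. A minimal standalone model of that fast path (plain C compiled outside the kernel; the structs are trimmed to the fields used here and the numbers are made up for illustration):

    /* Standalone model of the rd->overload fast path; not kernel code. */
    #include <stdbool.h>
    #include <stdio.h>

    struct root_domain { bool overload; };

    struct rq {
        unsigned int nr_running;
        unsigned long long avg_idle;    /* ns this CPU expects to stay idle */
        struct root_domain *rd;
    };

    static const unsigned long long sysctl_sched_migration_cost = 500000ULL;

    /* Mirrors the widened early-exit test in idle_balance(). */
    static bool skip_newidle_balance(const struct rq *this_rq)
    {
        return this_rq->avg_idle < sysctl_sched_migration_cost ||
               !this_rq->rd->overload;
    }

    int main(void)
    {
        struct root_domain rd = { .overload = false };
        struct rq cpu0 = { .nr_running = 0, .avg_idle = 2000000ULL, .rd = &rd };

        printf("skip=%d\n", skip_newidle_balance(&cpu0));  /* 1: nobody overloaded */

        /* update_sg_lb_stats() saw some CPU with rq->nr_running > 1. */
        rd.overload = true;
        printf("skip=%d\n", skip_newidle_balance(&cpu0));  /* 0: worth pulling work */
        return 0;
    }

Note that both writers in the diff (add_nr_running() and update_sd_lb_stats()) only store to rd->overload when the value actually changes, which keeps an idle-heavy system from repeatedly dirtying the shared root-domain cache line.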
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index cf009fb0bc25..9f1608f99819 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -79,7 +79,7 @@ static void cpuidle_idle_call(void)
79 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 79 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
80 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 80 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
81 int next_state, entered_state; 81 int next_state, entered_state;
82 bool broadcast; 82 unsigned int broadcast;
83 83
84 /* 84 /*
85 * Check if the idle task must be rescheduled. If it is the 85 * Check if the idle task must be rescheduled. If it is the
@@ -135,7 +135,7 @@ use_default:
135 goto exit_idle; 135 goto exit_idle;
136 } 136 }
137 137
138 broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); 138 broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP;
139 139
140 /* 140 /*
141 * Tell the time framework to switch to a broadcast timer 141 * Tell the time framework to switch to a broadcast timer
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 879f2b75266a..67ad4e7f506a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
20 */ 20 */
21static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) 21static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
22{ 22{
23 resched_task(rq->idle); 23 resched_curr(rq);
24} 24}
25 25
26static struct task_struct * 26static struct task_struct *
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a49083192c64..5f6edca4fafd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -463,9 +463,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
463static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 463static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
464{ 464{
465 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 465 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
466 struct rq *rq = rq_of_rt_rq(rt_rq);
466 struct sched_rt_entity *rt_se; 467 struct sched_rt_entity *rt_se;
467 468
468 int cpu = cpu_of(rq_of_rt_rq(rt_rq)); 469 int cpu = cpu_of(rq);
469 470
470 rt_se = rt_rq->tg->rt_se[cpu]; 471 rt_se = rt_rq->tg->rt_se[cpu];
471 472
@@ -476,7 +477,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
476 enqueue_rt_entity(rt_se, false); 477 enqueue_rt_entity(rt_se, false);
477 478
478 if (rt_rq->highest_prio.curr < curr->prio) 479 if (rt_rq->highest_prio.curr < curr->prio)
479 resched_task(curr); 480 resched_curr(rq);
480 } 481 }
481} 482}
482 483
@@ -566,7 +567,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
566 return; 567 return;
567 568
568 enqueue_top_rt_rq(rt_rq); 569 enqueue_top_rt_rq(rt_rq);
569 resched_task(rq->curr); 570 resched_curr(rq);
570} 571}
571 572
572static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 573static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -740,6 +741,9 @@ balanced:
740 rt_rq->rt_throttled = 0; 741 rt_rq->rt_throttled = 0;
741 raw_spin_unlock(&rt_rq->rt_runtime_lock); 742 raw_spin_unlock(&rt_rq->rt_runtime_lock);
742 raw_spin_unlock(&rt_b->rt_runtime_lock); 743 raw_spin_unlock(&rt_b->rt_runtime_lock);
744
745 /* Make rt_rq available for pick_next_task() */
746 sched_rt_rq_enqueue(rt_rq);
743 } 747 }
744} 748}
745 749
@@ -948,7 +952,7 @@ static void update_curr_rt(struct rq *rq)
948 raw_spin_lock(&rt_rq->rt_runtime_lock); 952 raw_spin_lock(&rt_rq->rt_runtime_lock);
949 rt_rq->rt_time += delta_exec; 953 rt_rq->rt_time += delta_exec;
950 if (sched_rt_runtime_exceeded(rt_rq)) 954 if (sched_rt_runtime_exceeded(rt_rq))
951 resched_task(curr); 955 resched_curr(rq);
952 raw_spin_unlock(&rt_rq->rt_runtime_lock); 956 raw_spin_unlock(&rt_rq->rt_runtime_lock);
953 } 957 }
954 } 958 }
@@ -1363,7 +1367,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1363 * to try and push current away: 1367 * to try and push current away:
1364 */ 1368 */
1365 requeue_task_rt(rq, p, 1); 1369 requeue_task_rt(rq, p, 1);
1366 resched_task(rq->curr); 1370 resched_curr(rq);
1367} 1371}
1368 1372
1369#endif /* CONFIG_SMP */ 1373#endif /* CONFIG_SMP */
@@ -1374,7 +1378,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1374static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) 1378static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1375{ 1379{
1376 if (p->prio < rq->curr->prio) { 1380 if (p->prio < rq->curr->prio) {
1377 resched_task(rq->curr); 1381 resched_curr(rq);
1378 return; 1382 return;
1379 } 1383 }
1380 1384
@@ -1690,7 +1694,7 @@ retry:
1690 * just reschedule current. 1694 * just reschedule current.
1691 */ 1695 */
1692 if (unlikely(next_task->prio < rq->curr->prio)) { 1696 if (unlikely(next_task->prio < rq->curr->prio)) {
1693 resched_task(rq->curr); 1697 resched_curr(rq);
1694 return 0; 1698 return 0;
1695 } 1699 }
1696 1700
@@ -1737,7 +1741,7 @@ retry:
1737 activate_task(lowest_rq, next_task, 0); 1741 activate_task(lowest_rq, next_task, 0);
1738 ret = 1; 1742 ret = 1;
1739 1743
1740 resched_task(lowest_rq->curr); 1744 resched_curr(lowest_rq);
1741 1745
1742 double_unlock_balance(rq, lowest_rq); 1746 double_unlock_balance(rq, lowest_rq);
1743 1747
@@ -1936,7 +1940,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1936 return; 1940 return;
1937 1941
1938 if (pull_rt_task(rq)) 1942 if (pull_rt_task(rq))
1939 resched_task(rq->curr); 1943 resched_curr(rq);
1940} 1944}
1941 1945
1942void __init init_sched_rt_class(void) 1946void __init init_sched_rt_class(void)
@@ -1974,7 +1978,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1974 check_resched = 0; 1978 check_resched = 0;
1975#endif /* CONFIG_SMP */ 1979#endif /* CONFIG_SMP */
1976 if (check_resched && p->prio < rq->curr->prio) 1980 if (check_resched && p->prio < rq->curr->prio)
1977 resched_task(rq->curr); 1981 resched_curr(rq);
1978 } 1982 }
1979} 1983}
1980 1984
@@ -2003,11 +2007,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2003 * Only reschedule if p is still on the same runqueue. 2007 * Only reschedule if p is still on the same runqueue.
2004 */ 2008 */
2005 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) 2009 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
2006 resched_task(p); 2010 resched_curr(rq);
2007#else 2011#else
2008 /* For UP simply resched on drop of prio */ 2012 /* For UP simply resched on drop of prio */
2009 if (oldprio < p->prio) 2013 if (oldprio < p->prio)
2010 resched_task(p); 2014 resched_curr(rq);
2011#endif /* CONFIG_SMP */ 2015#endif /* CONFIG_SMP */
2012 } else { 2016 } else {
2013 /* 2017 /*
@@ -2016,7 +2020,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2016 * then reschedule. 2020 * then reschedule.
2017 */ 2021 */
2018 if (p->prio < rq->curr->prio) 2022 if (p->prio < rq->curr->prio)
2019 resched_task(rq->curr); 2023 resched_curr(rq);
2020 } 2024 }
2021} 2025}
2022 2026
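
Every call site in rt.c (and in fair.c above) that used to pass a task to resched_task() now passes the runqueue to resched_curr(); the task being kicked is by definition rq->curr, so the rq alone identifies the target. A toy model of the interface change, with the TIF_NEED_RESCHED and IPI machinery reduced to a single flag:

    /* Toy model of the resched_task() -> resched_curr() interface change. */
    #include <stdbool.h>
    #include <stdio.h>

    struct task_struct { const char *comm; bool need_resched; };
    struct rq { struct task_struct *curr; };

    /* Old shape: the caller had to name the task, almost always rq->curr. */
    static void resched_task(struct task_struct *p)
    {
        p->need_resched = true;
    }

    /* New shape: the runqueue alone identifies the task to kick. */
    static void resched_curr(struct rq *rq)
    {
        rq->curr->need_resched = true;
    }

    int main(void)
    {
        struct task_struct t = { .comm = "rt_task", .need_resched = false };
        struct rq rq = { .curr = &t };

        resched_task(rq.curr);              /* pre-patch call shape */
        t.need_resched = false;
        resched_curr(&rq);                  /* post-patch call shape */
        printf("%s need_resched=%d\n", t.comm, t.need_resched);
        return 0;
    }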
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 31cc02ebc54e..579712f4e9d5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -477,6 +477,9 @@ struct root_domain {
477 cpumask_var_t span; 477 cpumask_var_t span;
478 cpumask_var_t online; 478 cpumask_var_t online;
479 479
480 /* Indicate more than one runnable task for any CPU */
481 bool overload;
482
480 /* 483 /*
481 * The bit corresponding to a CPU gets set here if such CPU has more 484 * The bit corresponding to a CPU gets set here if such CPU has more
482 * than one runnable -deadline task (as it is below for RT tasks). 485 * than one runnable -deadline task (as it is below for RT tasks).
@@ -884,20 +887,10 @@ enum {
884#undef SCHED_FEAT 887#undef SCHED_FEAT
885 888
886#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) 889#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
887static __always_inline bool static_branch__true(struct static_key *key)
888{
889 return static_key_true(key); /* Not out of line branch. */
890}
891
892static __always_inline bool static_branch__false(struct static_key *key)
893{
894 return static_key_false(key); /* Out of line branch. */
895}
896
897#define SCHED_FEAT(name, enabled) \ 890#define SCHED_FEAT(name, enabled) \
898static __always_inline bool static_branch_##name(struct static_key *key) \ 891static __always_inline bool static_branch_##name(struct static_key *key) \
899{ \ 892{ \
900 return static_branch__##enabled(key); \ 893 return static_key_##enabled(key); \
901} 894}
902 895
903#include "features.h" 896#include "features.h"
@@ -1196,7 +1189,7 @@ extern void init_sched_rt_class(void);
1196extern void init_sched_fair_class(void); 1189extern void init_sched_fair_class(void);
1197extern void init_sched_dl_class(void); 1190extern void init_sched_dl_class(void);
1198 1191
1199extern void resched_task(struct task_struct *p); 1192extern void resched_curr(struct rq *rq);
1200extern void resched_cpu(int cpu); 1193extern void resched_cpu(int cpu);
1201 1194
1202extern struct rt_bandwidth def_rt_bandwidth; 1195extern struct rt_bandwidth def_rt_bandwidth;
@@ -1218,15 +1211,26 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
1218 1211
1219 rq->nr_running = prev_nr + count; 1212 rq->nr_running = prev_nr + count;
1220 1213
1221#ifdef CONFIG_NO_HZ_FULL
1222 if (prev_nr < 2 && rq->nr_running >= 2) { 1214 if (prev_nr < 2 && rq->nr_running >= 2) {
1215#ifdef CONFIG_SMP
1216 if (!rq->rd->overload)
1217 rq->rd->overload = true;
1218#endif
1219
1220#ifdef CONFIG_NO_HZ_FULL
1223 if (tick_nohz_full_cpu(rq->cpu)) { 1221 if (tick_nohz_full_cpu(rq->cpu)) {
1224 /* Order rq->nr_running write against the IPI */ 1222 /*
1225 smp_wmb(); 1223 * Tick is needed if more than one task runs on a CPU.
1226 smp_send_reschedule(rq->cpu); 1224 * Send the target an IPI to kick it out of nohz mode.
1225 *
1226 * We assume that IPI implies full memory barrier and the
1227 * new value of rq->nr_running is visible on reception
1228 * from the target.
1229 */
1230 tick_nohz_full_kick_cpu(rq->cpu);
1227 } 1231 }
1228 }
1229#endif 1232#endif
1233 }
1230} 1234}
1231 1235
1232static inline void sub_nr_running(struct rq *rq, unsigned count) 1236static inline void sub_nr_running(struct rq *rq, unsigned count)
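
The sched.h hunk above also drops the static_branch__true()/static_branch__false() indirection: the enabled argument of SCHED_FEAT() is now pasted directly onto static_key_, so each generated wrapper calls static_key_true() or static_key_false() itself. A user-space sketch of the resulting expansion; the jump-label helpers are replaced by trivial stubs here and HRTICK is used only as an example feature name:

    /* User-space sketch of the simplified SCHED_FEAT() expansion; the
     * jump-label helpers are replaced by stubs, not the kernel's versions. */
    #include <stdbool.h>
    #include <stdio.h>

    struct static_key { bool enabled; };

    static inline bool static_key_true(struct static_key *key)  { return key->enabled; }
    static inline bool static_key_false(struct static_key *key) { return key->enabled; }

    /* Post-patch shape: 'enabled' (the literal token true or false) is pasted
     * straight onto static_key_, with no static_branch__true/false layer. */
    #define SCHED_FEAT(name, enabled)                                  \
    static inline bool static_branch_##name(struct static_key *key)   \
    {                                                                  \
        return static_key_##enabled(key);                              \
    }

    SCHED_FEAT(HRTICK, false)   /* generates static_branch_HRTICK() */

    int main(void)
    {
        struct static_key key = { .enabled = true };
        printf("HRTICK branch taken: %d\n", static_branch_HRTICK(&key));
        return 0;
    }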
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 0ffa20ae657b..15cab1a4f84e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -319,14 +319,14 @@ EXPORT_SYMBOL(wake_bit_function);
319 */ 319 */
320int __sched 320int __sched
321__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, 321__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
322 int (*action)(void *), unsigned mode) 322 wait_bit_action_f *action, unsigned mode)
323{ 323{
324 int ret = 0; 324 int ret = 0;
325 325
326 do { 326 do {
327 prepare_to_wait(wq, &q->wait, mode); 327 prepare_to_wait(wq, &q->wait, mode);
328 if (test_bit(q->key.bit_nr, q->key.flags)) 328 if (test_bit(q->key.bit_nr, q->key.flags))
329 ret = (*action)(q->key.flags); 329 ret = (*action)(&q->key);
330 } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); 330 } while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
331 finish_wait(wq, &q->wait); 331 finish_wait(wq, &q->wait);
332 return ret; 332 return ret;
@@ -334,7 +334,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
334EXPORT_SYMBOL(__wait_on_bit); 334EXPORT_SYMBOL(__wait_on_bit);
335 335
336int __sched out_of_line_wait_on_bit(void *word, int bit, 336int __sched out_of_line_wait_on_bit(void *word, int bit,
337 int (*action)(void *), unsigned mode) 337 wait_bit_action_f *action, unsigned mode)
338{ 338{
339 wait_queue_head_t *wq = bit_waitqueue(word, bit); 339 wait_queue_head_t *wq = bit_waitqueue(word, bit);
340 DEFINE_WAIT_BIT(wait, word, bit); 340 DEFINE_WAIT_BIT(wait, word, bit);
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit);
345 345
346int __sched 346int __sched
347__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 347__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
348 int (*action)(void *), unsigned mode) 348 wait_bit_action_f *action, unsigned mode)
349{ 349{
350 do { 350 do {
351 int ret; 351 int ret;
@@ -353,7 +353,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
353 prepare_to_wait_exclusive(wq, &q->wait, mode); 353 prepare_to_wait_exclusive(wq, &q->wait, mode);
354 if (!test_bit(q->key.bit_nr, q->key.flags)) 354 if (!test_bit(q->key.bit_nr, q->key.flags))
355 continue; 355 continue;
356 ret = action(q->key.flags); 356 ret = action(&q->key);
357 if (!ret) 357 if (!ret)
358 continue; 358 continue;
359 abort_exclusive_wait(wq, &q->wait, mode, &q->key); 359 abort_exclusive_wait(wq, &q->wait, mode, &q->key);
@@ -365,7 +365,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
365EXPORT_SYMBOL(__wait_on_bit_lock); 365EXPORT_SYMBOL(__wait_on_bit_lock);
366 366
367int __sched out_of_line_wait_on_bit_lock(void *word, int bit, 367int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
368 int (*action)(void *), unsigned mode) 368 wait_bit_action_f *action, unsigned mode)
369{ 369{
370 wait_queue_head_t *wq = bit_waitqueue(word, bit); 370 wait_queue_head_t *wq = bit_waitqueue(word, bit);
371 DEFINE_WAIT_BIT(wait, word, bit); 371 DEFINE_WAIT_BIT(wait, word, bit);
@@ -502,3 +502,21 @@ void wake_up_atomic_t(atomic_t *p)
502 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); 502 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
503} 503}
504EXPORT_SYMBOL(wake_up_atomic_t); 504EXPORT_SYMBOL(wake_up_atomic_t);
505
506__sched int bit_wait(struct wait_bit_key *word)
507{
508 if (signal_pending_state(current->state, current))
509 return 1;
510 schedule();
511 return 0;
512}
513EXPORT_SYMBOL(bit_wait);
514
515__sched int bit_wait_io(struct wait_bit_key *word)
516{
517 if (signal_pending_state(current->state, current))
518 return 1;
519 io_schedule();
520 return 0;
521}
522EXPORT_SYMBOL(bit_wait_io);
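
The wait.c changes switch the bit-wait action callbacks from int (*)(void *) to wait_bit_action_f, which receives the whole struct wait_bit_key, and export bit_wait()/bit_wait_io() as the stock actions. A sketch of what a caller-supplied action could look like under the new signature, assuming a kernel build context; my_wait_with_timeout(), wait_for_my_flag() and MY_FLAG_BUSY are invented for illustration:

    #include <linux/wait.h>
    #include <linux/sched.h>
    #include <linux/jiffies.h>

    #define MY_FLAG_BUSY 0          /* hypothetical bit number in a flags word */

    /* Custom action: like bit_wait(), but bounded to one second per pass. */
    static int my_wait_with_timeout(struct wait_bit_key *key)
    {
            if (signal_pending_state(current->state, current))
                    return 1;       /* non-zero aborts the wait */
            schedule_timeout(HZ);   /* task state was set by prepare_to_wait() */
            return 0;               /* re-test the bit and maybe loop */
    }

    static int wait_for_my_flag(unsigned long *flags)
    {
            return out_of_line_wait_on_bit(flags, MY_FLAG_BUSY,
                                           my_wait_with_timeout,
                                           TASK_INTERRUPTIBLE);
    }

Because prepare_to_wait() has already put the task in the requested sleep state before the action runs, schedule_timeout() here actually sleeps instead of returning immediately.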
diff --git a/kernel/signal.c b/kernel/signal.c
index a4077e90f19f..40b76e351e64 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1263,6 +1263,10 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1263 struct sighand_struct *sighand; 1263 struct sighand_struct *sighand;
1264 1264
1265 for (;;) { 1265 for (;;) {
1266 /*
1267 * Disable interrupts early to avoid deadlocks.
1268 * See rcu_read_unlock() comment header for details.
1269 */
1266 local_irq_save(*flags); 1270 local_irq_save(*flags);
1267 rcu_read_lock(); 1271 rcu_read_lock();
1268 sighand = rcu_dereference(tsk->sighand); 1272 sighand = rcu_dereference(tsk->sighand);
diff --git a/kernel/smp.c b/kernel/smp.c
index 80c33f8de14f..487653b5844f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008 4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
5 */ 5 */
6#include <linux/irq_work.h>
6#include <linux/rcupdate.h> 7#include <linux/rcupdate.h>
7#include <linux/rculist.h> 8#include <linux/rculist.h>
8#include <linux/kernel.h> 9#include <linux/kernel.h>
@@ -251,6 +252,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
251 csd->func(csd->info); 252 csd->func(csd->info);
252 csd_unlock(csd); 253 csd_unlock(csd);
253 } 254 }
255
256 /*
257 * Handle irq works queued remotely by irq_work_queue_on().
258 * Smp functions above are typically synchronous so they
259 * better run first since some other CPUs may be busy waiting
260 * for them.
261 */
262 irq_work_run();
254} 263}
255 264
256/* 265/*
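
flush_smp_call_function_queue() now also drains irq_work items that were queued from another CPU with irq_work_queue_on(), after the synchronous smp-call entries have run. A short sketch of queueing such a remote irq_work, assuming a kernel build context; my_irq_work_fn(), my_setup() and my_kick() are illustrative names, not existing kernel symbols:

    #include <linux/irq_work.h>
    #include <linux/smp.h>
    #include <linux/printk.h>

    static void my_irq_work_fn(struct irq_work *work)
    {
            /* Runs in IRQ context on the CPU the work was queued on. */
            pr_info("irq_work ran on CPU %d\n", smp_processor_id());
    }

    static struct irq_work my_work;

    static void my_setup(void)
    {
            init_irq_work(&my_work, my_irq_work_fn);
    }

    static void my_kick(int cpu)
    {
            /* Returns false if my_work is still pending from an earlier queue. */
            if (!irq_work_queue_on(&my_work, cpu))
                    pr_info("irq_work already pending\n");
    }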
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 88c9c65a430d..4aec4a457431 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void)
71 71
72 return ret; 72 return ret;
73} 73}
74 74EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
75 75
76static int alarmtimer_rtc_add_device(struct device *dev, 76static int alarmtimer_rtc_add_device(struct device *dev,
77 struct class_interface *class_intf) 77 struct class_interface *class_intf)
@@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
585 struct itimerspec *new_setting, 585 struct itimerspec *new_setting,
586 struct itimerspec *old_setting) 586 struct itimerspec *old_setting)
587{ 587{
588 ktime_t exp;
589
588 if (!rtcdev) 590 if (!rtcdev)
589 return -ENOTSUPP; 591 return -ENOTSUPP;
590 592
593 if (flags & ~TIMER_ABSTIME)
594 return -EINVAL;
595
591 if (old_setting) 596 if (old_setting)
592 alarm_timer_get(timr, old_setting); 597 alarm_timer_get(timr, old_setting);
593 598
@@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
597 602
598 /* start the timer */ 603 /* start the timer */
599 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); 604 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
600 alarm_start(&timr->it.alarm.alarmtimer, 605 exp = timespec_to_ktime(new_setting->it_value);
601 timespec_to_ktime(new_setting->it_value)); 606 /* Convert (if necessary) to absolute time */
607 if (flags != TIMER_ABSTIME) {
608 ktime_t now;
609
610 now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
611 exp = ktime_add(now, exp);
612 }
613
614 alarm_start(&timr->it.alarm.alarmtimer, exp);
602 return 0; 615 return 0;
603} 616}
604 617
@@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
730 if (!alarmtimer_get_rtcdev()) 743 if (!alarmtimer_get_rtcdev())
731 return -ENOTSUPP; 744 return -ENOTSUPP;
732 745
746 if (flags & ~TIMER_ABSTIME)
747 return -EINVAL;
748
733 if (!capable(CAP_WAKE_ALARM)) 749 if (!capable(CAP_WAKE_ALARM))
734 return -EPERM; 750 return -EPERM;
735 751
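
alarm_timer_set() now rejects flag bits other than TIMER_ABSTIME and, for relative timers, converts it_value into an absolute expiry by adding the current time of the alarm base before calling alarm_start(). A standalone model of that conversion, with ktime_t reduced to plain signed nanoseconds and a fixed pretend "now":

    /* Standalone model of the relative->absolute conversion added to
     * alarm_timer_set(); ktime_t is modelled as signed nanoseconds. */
    #include <stdint.h>
    #include <stdio.h>

    #define TIMER_ABSTIME 0x01

    static int64_t base_gettime_ns(void)
    {
        return 1000000000LL;        /* pretend the alarm base reads t = 1 s */
    }

    static int64_t alarm_expiry_ns(int flags, int64_t it_value_ns)
    {
        int64_t exp = it_value_ns;

        if (flags & ~TIMER_ABSTIME)
            return -1;              /* stands in for -EINVAL */
        if (flags != TIMER_ABSTIME)
            exp += base_gettime_ns();   /* relative value: anchor it to "now" */
        return exp;
    }

    int main(void)
    {
        printf("relative 250ms -> expires at %lld ns\n",
               (long long)alarm_expiry_ns(0, 250000000LL));
        printf("absolute 250ms -> expires at %lld ns\n",
               (long long)alarm_expiry_ns(TIMER_ABSTIME, 250000000LL));
        return 0;
    }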
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index ad362c260ef4..9c94c19f1305 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -146,7 +146,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
146{ 146{
147 /* Nothing to do if we already reached the limit */ 147 /* Nothing to do if we already reached the limit */
148 if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { 148 if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
149 printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); 149 printk_deferred(KERN_WARNING
150 "CE: Reprogramming failure. Giving up\n");
150 dev->next_event.tv64 = KTIME_MAX; 151 dev->next_event.tv64 = KTIME_MAX;
151 return -ETIME; 152 return -ETIME;
152 } 153 }
@@ -159,9 +160,10 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
159 if (dev->min_delta_ns > MIN_DELTA_LIMIT) 160 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
160 dev->min_delta_ns = MIN_DELTA_LIMIT; 161 dev->min_delta_ns = MIN_DELTA_LIMIT;
161 162
162 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", 163 printk_deferred(KERN_WARNING
163 dev->name ? dev->name : "?", 164 "CE: %s increased min_delta_ns to %llu nsec\n",
164 (unsigned long long) dev->min_delta_ns); 165 dev->name ? dev->name : "?",
166 (unsigned long long) dev->min_delta_ns);
165 return 0; 167 return 0;
166} 168}
167 169
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 445106d2c729..01d2d15aa662 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -191,7 +191,8 @@ void __init sched_clock_postinit(void)
191 191
192static int sched_clock_suspend(void) 192static int sched_clock_suspend(void)
193{ 193{
194 sched_clock_poll(&sched_clock_timer); 194 update_sched_clock();
195 hrtimer_cancel(&sched_clock_timer);
195 cd.suspended = true; 196 cd.suspended = true;
196 return 0; 197 return 0;
197} 198}
@@ -199,6 +200,7 @@ static int sched_clock_suspend(void)
199static void sched_clock_resume(void) 200static void sched_clock_resume(void)
200{ 201{
201 cd.epoch_cyc = read_sched_clock(); 202 cd.epoch_cyc = read_sched_clock();
203 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
202 cd.suspended = false; 204 cd.suspended = false;
203} 205}
204 206
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6558b7ac112d..99aa6ee3908f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -154,6 +154,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
154 154
155#ifdef CONFIG_NO_HZ_FULL 155#ifdef CONFIG_NO_HZ_FULL
156cpumask_var_t tick_nohz_full_mask; 156cpumask_var_t tick_nohz_full_mask;
157cpumask_var_t housekeeping_mask;
157bool tick_nohz_full_running; 158bool tick_nohz_full_running;
158 159
159static bool can_stop_full_tick(void) 160static bool can_stop_full_tick(void)
@@ -224,13 +225,15 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
224}; 225};
225 226
226/* 227/*
227 * Kick the current CPU if it's full dynticks in order to force it to 228 * Kick the CPU if it's full dynticks in order to force it to
228 * re-evaluate its dependency on the tick and restart it if necessary. 229 * re-evaluate its dependency on the tick and restart it if necessary.
229 */ 230 */
230void tick_nohz_full_kick(void) 231void tick_nohz_full_kick_cpu(int cpu)
231{ 232{
232 if (tick_nohz_full_cpu(smp_processor_id())) 233 if (!tick_nohz_full_cpu(cpu))
233 irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); 234 return;
235
236 irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
234} 237}
235 238
236static void nohz_full_kick_ipi(void *info) 239static void nohz_full_kick_ipi(void *info)
@@ -281,6 +284,7 @@ static int __init tick_nohz_full_setup(char *str)
281 int cpu; 284 int cpu;
282 285
283 alloc_bootmem_cpumask_var(&tick_nohz_full_mask); 286 alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
287 alloc_bootmem_cpumask_var(&housekeeping_mask);
284 if (cpulist_parse(str, tick_nohz_full_mask) < 0) { 288 if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
285 pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); 289 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
286 return 1; 290 return 1;
@@ -291,6 +295,8 @@ static int __init tick_nohz_full_setup(char *str)
291 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); 295 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
292 cpumask_clear_cpu(cpu, tick_nohz_full_mask); 296 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
293 } 297 }
298 cpumask_andnot(housekeeping_mask,
299 cpu_possible_mask, tick_nohz_full_mask);
294 tick_nohz_full_running = true; 300 tick_nohz_full_running = true;
295 301
296 return 1; 302 return 1;
@@ -332,9 +338,15 @@ static int tick_nohz_init_all(void)
332 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); 338 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
333 return err; 339 return err;
334 } 340 }
341 if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
342 pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n");
343 return err;
344 }
335 err = 0; 345 err = 0;
336 cpumask_setall(tick_nohz_full_mask); 346 cpumask_setall(tick_nohz_full_mask);
337 cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); 347 cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
348 cpumask_clear(housekeeping_mask);
349 cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
338 tick_nohz_full_running = true; 350 tick_nohz_full_running = true;
339#endif 351#endif
340 return err; 352 return err;
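
tick-sched.c now maintains housekeeping_mask as the complement of tick_nohz_full_mask within cpu_possible_mask (and, in the nohz_full=all init path, just the boot CPU). A standalone model of that bookkeeping, with cpumasks reduced to 8-bit bitmaps, one bit per CPU:

    /* Standalone model: housekeeping CPUs = possible CPUs not in nohz_full. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int cpu_possible_mask   = 0xff;   /* CPUs 0-7 exist */
        unsigned int tick_nohz_full_mask = 0xf0;   /* booted with nohz_full=4-7 */

        /* cpumask_andnot(housekeeping_mask, cpu_possible_mask, tick_nohz_full_mask) */
        unsigned int housekeeping_mask = cpu_possible_mask & ~tick_nohz_full_mask;

        printf("housekeeping CPUs: 0x%02x\n", housekeeping_mask);  /* 0x0f: CPUs 0-3 */
        return 0;
    }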
diff --git a/kernel/torture.c b/kernel/torture.c
index 40bb511cca48..d600af21f022 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -708,7 +708,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
708 int ret = 0; 708 int ret = 0;
709 709
710 VERBOSE_TOROUT_STRING(m); 710 VERBOSE_TOROUT_STRING(m);
711 *tp = kthread_run(fn, arg, s); 711 *tp = kthread_run(fn, arg, "%s", s);
712 if (IS_ERR(*tp)) { 712 if (IS_ERR(*tp)) {
713 ret = PTR_ERR(*tp); 713 ret = PTR_ERR(*tp);
714 VERBOSE_TOROUT_ERRSTRING(f); 714 VERBOSE_TOROUT_ERRSTRING(f);
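
The torture.c change passes the thread name through a fixed "%s" conversion instead of using it as kthread_run()'s printf-style name format, so a name containing '%' can no longer be interpreted as a format string. The same bug shape, shown standalone with snprintf(), which shares printf semantics:

    /* Standalone illustration of the format-string fix. */
    #include <stdio.h>

    int main(void)
    {
        const char *name = "rcu_torture_%d";    /* caller-supplied thread name */
        char buf[32];

        /* Pre-patch shape (unsafe, left commented out): the name is the format,
         * so "%d" would consume a nonexistent argument.
         * snprintf(buf, sizeof(buf), name);
         */

        /* Post-patch shape: the name is plain data behind a fixed "%s" format. */
        snprintf(buf, sizeof(buf), "%s", name);
        printf("%s\n", buf);
        return 0;
    }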
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d4409356f40d..a5da09c899dd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -29,11 +29,6 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
29 help 29 help
30 See Documentation/trace/ftrace-design.txt 30 See Documentation/trace/ftrace-design.txt
31 31
32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
33 bool
34 help
35 See Documentation/trace/ftrace-design.txt
36
37config HAVE_DYNAMIC_FTRACE 32config HAVE_DYNAMIC_FTRACE
38 bool 33 bool
39 help 34 help
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2611613f14f1..67d6369ddf83 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
28 28
29obj-$(CONFIG_TRACING) += trace.o 29obj-$(CONFIG_TRACING) += trace.o
30obj-$(CONFIG_TRACING) += trace_output.o 30obj-$(CONFIG_TRACING) += trace_output.o
31obj-$(CONFIG_TRACING) += trace_seq.o
31obj-$(CONFIG_TRACING) += trace_stat.o 32obj-$(CONFIG_TRACING) += trace_stat.o
32obj-$(CONFIG_TRACING) += trace_printk.o 33obj-$(CONFIG_TRACING) += trace_printk.o
33obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 34obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5b372e3ed675..1654b12c891a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -80,9 +80,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly = {
80int ftrace_enabled __read_mostly; 80int ftrace_enabled __read_mostly;
81static int last_ftrace_enabled; 81static int last_ftrace_enabled;
82 82
83/* Quick disabling of function tracer. */
84int function_trace_stop __read_mostly;
85
86/* Current function tracing op */ 83/* Current function tracing op */
87struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; 84struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
88/* What to set function_trace_op to */ 85/* What to set function_trace_op to */
@@ -265,12 +262,12 @@ static void update_ftrace_function(void)
265 func = ftrace_ops_list_func; 262 func = ftrace_ops_list_func;
266 } 263 }
267 264
265 update_function_graph_func();
266
268 /* If there's no change, then do nothing more here */ 267 /* If there's no change, then do nothing more here */
269 if (ftrace_trace_function == func) 268 if (ftrace_trace_function == func)
270 return; 269 return;
271 270
272 update_function_graph_func();
273
274 /* 271 /*
275 * If we are using the list function, it doesn't care 272 * If we are using the list function, it doesn't care
276 * about the function_trace_ops. 273 * about the function_trace_ops.
@@ -1042,6 +1039,8 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1042 1039
1043#ifdef CONFIG_DYNAMIC_FTRACE 1040#ifdef CONFIG_DYNAMIC_FTRACE
1044 1041
1042static struct ftrace_ops *removed_ops;
1043
1045#ifndef CONFIG_FTRACE_MCOUNT_RECORD 1044#ifndef CONFIG_FTRACE_MCOUNT_RECORD
1046# error Dynamic ftrace depends on MCOUNT_RECORD 1045# error Dynamic ftrace depends on MCOUNT_RECORD
1047#endif 1046#endif
@@ -1304,25 +1303,15 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1304 struct ftrace_hash *new_hash; 1303 struct ftrace_hash *new_hash;
1305 int size = src->count; 1304 int size = src->count;
1306 int bits = 0; 1305 int bits = 0;
1307 int ret;
1308 int i; 1306 int i;
1309 1307
1310 /* 1308 /*
1311 * Remove the current set, update the hash and add
1312 * them back.
1313 */
1314 ftrace_hash_rec_disable(ops, enable);
1315
1316 /*
1317 * If the new source is empty, just free dst and assign it 1309 * If the new source is empty, just free dst and assign it
1318 * the empty_hash. 1310 * the empty_hash.
1319 */ 1311 */
1320 if (!src->count) { 1312 if (!src->count) {
1321 free_ftrace_hash_rcu(*dst); 1313 new_hash = EMPTY_HASH;
1322 rcu_assign_pointer(*dst, EMPTY_HASH); 1314 goto update;
1323 /* still need to update the function records */
1324 ret = 0;
1325 goto out;
1326 } 1315 }
1327 1316
1328 /* 1317 /*
@@ -1335,10 +1324,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1335 if (bits > FTRACE_HASH_MAX_BITS) 1324 if (bits > FTRACE_HASH_MAX_BITS)
1336 bits = FTRACE_HASH_MAX_BITS; 1325 bits = FTRACE_HASH_MAX_BITS;
1337 1326
1338 ret = -ENOMEM;
1339 new_hash = alloc_ftrace_hash(bits); 1327 new_hash = alloc_ftrace_hash(bits);
1340 if (!new_hash) 1328 if (!new_hash)
1341 goto out; 1329 return -ENOMEM;
1342 1330
1343 size = 1 << src->size_bits; 1331 size = 1 << src->size_bits;
1344 for (i = 0; i < size; i++) { 1332 for (i = 0; i < size; i++) {
@@ -1349,20 +1337,20 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1349 } 1337 }
1350 } 1338 }
1351 1339
1340update:
1341 /*
1342 * Remove the current set, update the hash and add
1343 * them back.
1344 */
1345 ftrace_hash_rec_disable(ops, enable);
1346
1352 old_hash = *dst; 1347 old_hash = *dst;
1353 rcu_assign_pointer(*dst, new_hash); 1348 rcu_assign_pointer(*dst, new_hash);
1354 free_ftrace_hash_rcu(old_hash); 1349 free_ftrace_hash_rcu(old_hash);
1355 1350
1356 ret = 0;
1357 out:
1358 /*
1359 * Enable regardless of ret:
1360 * On success, we enable the new hash.
1361 * On failure, we re-enable the original hash.
1362 */
1363 ftrace_hash_rec_enable(ops, enable); 1351 ftrace_hash_rec_enable(ops, enable);
1364 1352
1365 return ret; 1353 return 0;
1366} 1354}
1367 1355
1368/* 1356/*
@@ -1492,6 +1480,53 @@ int ftrace_text_reserved(const void *start, const void *end)
1492 return (int)!!ret; 1480 return (int)!!ret;
1493} 1481}
1494 1482
1483/* Test if ops registered to this rec needs regs */
1484static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
1485{
1486 struct ftrace_ops *ops;
1487 bool keep_regs = false;
1488
1489 for (ops = ftrace_ops_list;
1490 ops != &ftrace_list_end; ops = ops->next) {
1491 /* pass rec in as regs to have non-NULL val */
1492 if (ftrace_ops_test(ops, rec->ip, rec)) {
1493 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
1494 keep_regs = true;
1495 break;
1496 }
1497 }
1498 }
1499
1500 return keep_regs;
1501}
1502
1503static void ftrace_remove_tramp(struct ftrace_ops *ops,
1504 struct dyn_ftrace *rec)
1505{
1506 struct ftrace_func_entry *entry;
1507
1508 entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip);
1509 if (!entry)
1510 return;
1511
1512 /*
1513 * The tramp_hash entry will be removed at time
1514 * of update.
1515 */
1516 ops->nr_trampolines--;
1517 rec->flags &= ~FTRACE_FL_TRAMP;
1518}
1519
1520static void ftrace_clear_tramps(struct dyn_ftrace *rec)
1521{
1522 struct ftrace_ops *op;
1523
1524 do_for_each_ftrace_op(op, ftrace_ops_list) {
1525 if (op->nr_trampolines)
1526 ftrace_remove_tramp(op, rec);
1527 } while_for_each_ftrace_op(op);
1528}
1529
1495static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1530static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1496 int filter_hash, 1531 int filter_hash,
1497 bool inc) 1532 bool inc)
@@ -1572,8 +1607,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1572 1607
1573 if (inc) { 1608 if (inc) {
1574 rec->flags++; 1609 rec->flags++;
1575 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) 1610 if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX))
1576 return; 1611 return;
1612
1613 /*
1614 * If there's only a single callback registered to a
1615 * function, and the ops has a trampoline registered
1616 * for it, then we can call it directly.
1617 */
1618 if (ftrace_rec_count(rec) == 1 && ops->trampoline) {
1619 rec->flags |= FTRACE_FL_TRAMP;
1620 ops->nr_trampolines++;
1621 } else {
1622 /*
1623 * If we are adding another function callback
1624 * to this function, and the previous had a
1625 * trampoline used, then we need to go back to
1626 * the default trampoline.
1627 */
1628 rec->flags &= ~FTRACE_FL_TRAMP;
1629
1630 /* remove trampolines from any ops for this rec */
1631 ftrace_clear_tramps(rec);
1632 }
1633
1577 /* 1634 /*
1578 * If any ops wants regs saved for this function 1635 * If any ops wants regs saved for this function
1579 * then all ops will get saved regs. 1636 * then all ops will get saved regs.
@@ -1581,9 +1638,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1581 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) 1638 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
1582 rec->flags |= FTRACE_FL_REGS; 1639 rec->flags |= FTRACE_FL_REGS;
1583 } else { 1640 } else {
1584 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) 1641 if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0))
1585 return; 1642 return;
1586 rec->flags--; 1643 rec->flags--;
1644
1645 if (ops->trampoline && !ftrace_rec_count(rec))
1646 ftrace_remove_tramp(ops, rec);
1647
1648 /*
1649 * If the rec had REGS enabled and the ops that is
1650 * being removed had REGS set, then see if there is
1651 * still any ops for this record that wants regs.
1652 * If not, we can stop recording them.
1653 */
1654 if (ftrace_rec_count(rec) > 0 &&
1655 rec->flags & FTRACE_FL_REGS &&
1656 ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
1657 if (!test_rec_ops_needs_regs(rec))
1658 rec->flags &= ~FTRACE_FL_REGS;
1659 }
1660
1661 /*
1662 * flags will be cleared in ftrace_check_record()
1663 * if rec count is zero.
1664 */
1587 } 1665 }
1588 count++; 1666 count++;
1589 /* Shortcut, if we handled all records, we are done. */ 1667 /* Shortcut, if we handled all records, we are done. */
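
The hunks above stop open-coding rec->flags & ~FTRACE_FL_MASK and use ftrace_rec_count() instead: the low bits of dyn_ftrace.flags count how many ftrace_ops reference the record, while the high bits carry state such as ENABLED, REGS and the new TRAMP/TRAMP_EN pair. A standalone model of that packing; the bit positions are illustrative, not the kernel's exact layout:

    /* Standalone model of the dyn_ftrace.flags split behind ftrace_rec_count():
     * low bits = number of attached ftrace_ops, high bits = state flags.
     * Bit positions here are illustrative only. */
    #include <stdio.h>

    #define FL_ENABLED  (1UL << 31)
    #define FL_REGS     (1UL << 30)
    #define FL_TRAMP    (1UL << 29)
    #define FL_MASK     (FL_ENABLED | FL_REGS | FL_TRAMP)

    struct rec_model { unsigned long flags; };

    static unsigned long rec_count(const struct rec_model *rec)
    {
        return rec->flags & ~FL_MASK;       /* same shape as ftrace_rec_count() */
    }

    int main(void)
    {
        struct rec_model rec = { .flags = 0 };

        rec.flags++;                        /* first ops attaches... */
        rec.flags |= FL_TRAMP;              /* ...and may use its own trampoline */

        rec.flags++;                        /* second ops attaches */
        rec.flags &= ~FL_TRAMP;             /* fall back to the shared list func */

        printf("count=%lu tramp=%d\n", rec_count(&rec), !!(rec.flags & FL_TRAMP));
        return 0;
    }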
@@ -1668,17 +1746,23 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1668 * If we are disabling calls, then disable all records that 1746 * If we are disabling calls, then disable all records that
1669 * are enabled. 1747 * are enabled.
1670 */ 1748 */
1671 if (enable && (rec->flags & ~FTRACE_FL_MASK)) 1749 if (enable && ftrace_rec_count(rec))
1672 flag = FTRACE_FL_ENABLED; 1750 flag = FTRACE_FL_ENABLED;
1673 1751
1674 /* 1752 /*
1675 * If enabling and the REGS flag does not match the REGS_EN, then 1753 * If enabling and the REGS flag does not match the REGS_EN, or
1676 * do not ignore this record. Set flags to fail the compare against 1754 * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore
1677 * ENABLED. 1755 * this record. Set flags to fail the compare against ENABLED.
1678 */ 1756 */
1679 if (flag && 1757 if (flag) {
1680 (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) 1758 if (!(rec->flags & FTRACE_FL_REGS) !=
1681 flag |= FTRACE_FL_REGS; 1759 !(rec->flags & FTRACE_FL_REGS_EN))
1760 flag |= FTRACE_FL_REGS;
1761
1762 if (!(rec->flags & FTRACE_FL_TRAMP) !=
1763 !(rec->flags & FTRACE_FL_TRAMP_EN))
1764 flag |= FTRACE_FL_TRAMP;
1765 }
1682 1766
1683 /* If the state of this record hasn't changed, then do nothing */ 1767 /* If the state of this record hasn't changed, then do nothing */
1684 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1768 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
@@ -1696,6 +1780,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1696 else 1780 else
1697 rec->flags &= ~FTRACE_FL_REGS_EN; 1781 rec->flags &= ~FTRACE_FL_REGS_EN;
1698 } 1782 }
1783 if (flag & FTRACE_FL_TRAMP) {
1784 if (rec->flags & FTRACE_FL_TRAMP)
1785 rec->flags |= FTRACE_FL_TRAMP_EN;
1786 else
1787 rec->flags &= ~FTRACE_FL_TRAMP_EN;
1788 }
1699 } 1789 }
1700 1790
1701 /* 1791 /*
@@ -1704,7 +1794,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1704 * Otherwise, 1794 * Otherwise,
1705 * return UPDATE_MODIFY_CALL to tell the caller to convert 1795 * return UPDATE_MODIFY_CALL to tell the caller to convert
1706 * from the save regs, to a non-save regs function or 1796 * from the save regs, to a non-save regs function or
1707 * vice versa. 1797 * vice versa, or from a trampoline call.
1708 */ 1798 */
1709 if (flag & FTRACE_FL_ENABLED) 1799 if (flag & FTRACE_FL_ENABLED)
1710 return FTRACE_UPDATE_MAKE_CALL; 1800 return FTRACE_UPDATE_MAKE_CALL;
@@ -1714,7 +1804,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1714 1804
1715 if (update) { 1805 if (update) {
1716 /* If there's no more users, clear all flags */ 1806 /* If there's no more users, clear all flags */
1717 if (!(rec->flags & ~FTRACE_FL_MASK)) 1807 if (!ftrace_rec_count(rec))
1718 rec->flags = 0; 1808 rec->flags = 0;
1719 else 1809 else
1720 /* Just disable the record (keep REGS state) */ 1810 /* Just disable the record (keep REGS state) */
@@ -1751,6 +1841,43 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1751 return ftrace_check_record(rec, enable, 0); 1841 return ftrace_check_record(rec, enable, 0);
1752} 1842}
1753 1843
1844static struct ftrace_ops *
1845ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
1846{
1847 struct ftrace_ops *op;
1848
1849 /* Removed ops need to be tested first */
1850 if (removed_ops && removed_ops->tramp_hash) {
1851 if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip))
1852 return removed_ops;
1853 }
1854
1855 do_for_each_ftrace_op(op, ftrace_ops_list) {
1856 if (!op->tramp_hash)
1857 continue;
1858
1859 if (ftrace_lookup_ip(op->tramp_hash, rec->ip))
1860 return op;
1861
1862 } while_for_each_ftrace_op(op);
1863
1864 return NULL;
1865}
1866
1867static struct ftrace_ops *
1868ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
1869{
1870 struct ftrace_ops *op;
1871
1872 do_for_each_ftrace_op(op, ftrace_ops_list) {
1873 /* pass rec in as regs to have non-NULL val */
1874 if (ftrace_ops_test(op, rec->ip, rec))
1875 return op;
1876 } while_for_each_ftrace_op(op);
1877
1878 return NULL;
1879}
1880
1754/** 1881/**
1755 * ftrace_get_addr_new - Get the call address to set to 1882 * ftrace_get_addr_new - Get the call address to set to
1756 * @rec: The ftrace record descriptor 1883 * @rec: The ftrace record descriptor
@@ -1763,6 +1890,20 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1763 */ 1890 */
1764unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) 1891unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
1765{ 1892{
1893 struct ftrace_ops *ops;
1894
1895 /* Trampolines take precedence over regs */
1896 if (rec->flags & FTRACE_FL_TRAMP) {
1897 ops = ftrace_find_tramp_ops_new(rec);
1898 if (FTRACE_WARN_ON(!ops || !ops->trampoline)) {
1899 pr_warning("Bad trampoline accounting at: %p (%pS)\n",
1900 (void *)rec->ip, (void *)rec->ip);
1901 /* Ftrace is shutting down, return anything */
1902 return (unsigned long)FTRACE_ADDR;
1903 }
1904 return ops->trampoline;
1905 }
1906
1766 if (rec->flags & FTRACE_FL_REGS) 1907 if (rec->flags & FTRACE_FL_REGS)
1767 return (unsigned long)FTRACE_REGS_ADDR; 1908 return (unsigned long)FTRACE_REGS_ADDR;
1768 else 1909 else
@@ -1781,6 +1922,20 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
1781 */ 1922 */
1782unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) 1923unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
1783{ 1924{
1925 struct ftrace_ops *ops;
1926
1927 /* Trampolines take precedence over regs */
1928 if (rec->flags & FTRACE_FL_TRAMP_EN) {
1929 ops = ftrace_find_tramp_ops_curr(rec);
1930 if (FTRACE_WARN_ON(!ops)) {
1931 pr_warning("Bad trampoline accounting at: %p (%pS)\n",
1932 (void *)rec->ip, (void *)rec->ip);
1933 /* Ftrace is shutting down, return anything */
1934 return (unsigned long)FTRACE_ADDR;
1935 }
1936 return ops->trampoline;
1937 }
1938
1784 if (rec->flags & FTRACE_FL_REGS_EN) 1939 if (rec->flags & FTRACE_FL_REGS_EN)
1785 return (unsigned long)FTRACE_REGS_ADDR; 1940 return (unsigned long)FTRACE_REGS_ADDR;
1786 else 1941 else
@@ -2023,6 +2178,89 @@ void __weak arch_ftrace_update_code(int command)
2023 ftrace_run_stop_machine(command); 2178 ftrace_run_stop_machine(command);
2024} 2179}
2025 2180
2181static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops)
2182{
2183 struct ftrace_page *pg;
2184 struct dyn_ftrace *rec;
2185 int size, bits;
2186 int ret;
2187
2188 size = ops->nr_trampolines;
2189 bits = 0;
2190 /*
2191 * Make the hash size about 1/2 the # found
2192 */
2193 for (size /= 2; size; size >>= 1)
2194 bits++;
2195
2196 ops->tramp_hash = alloc_ftrace_hash(bits);
2197 /*
2198 * TODO: a failed allocation is going to screw up
2199 * the accounting of what needs to be modified
2200 * and not. For now, we kill ftrace if we fail
2201 * to allocate here. But there are ways around this,
2202 * but that will take a little more work.
2203 */
2204 if (!ops->tramp_hash)
2205 return -ENOMEM;
2206
2207 do_for_each_ftrace_rec(pg, rec) {
2208 if (ftrace_rec_count(rec) == 1 &&
2209 ftrace_ops_test(ops, rec->ip, rec)) {
2210
2211 /*
2212 * If another ops adds to a rec, the rec will
2213 * lose its trampoline and never get it back
2214 * until all ops are off of it.
2215 */
2216 if (!(rec->flags & FTRACE_FL_TRAMP))
2217 continue;
2218
2219 /* This record had better have a trampoline */
2220 if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN)))
2221 return -1;
2222
2223 ret = add_hash_entry(ops->tramp_hash, rec->ip);
2224 if (ret < 0)
2225 return ret;
2226 }
2227 } while_for_each_ftrace_rec();
2228
2229 /* The number of recs in the hash must match nr_trampolines */
2230 FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines);
2231
2232 return 0;
2233}
2234
2235static int ftrace_save_tramp_hashes(void)
2236{
2237 struct ftrace_ops *op;
2238 int ret;
2239
2240 /*
2241 * Now that any trampoline is being used, we need to save the
2242 * hashes for the ops that have them. This allows the mapping
2243 * back from the record to the ops that has the trampoline to
2244 * know what code is being replaced. Modifying code must always
2245 * verify what it is changing.
2246 */
2247 do_for_each_ftrace_op(op, ftrace_ops_list) {
2248
2249 /* The tramp_hash is recreated each time. */
2250 free_ftrace_hash(op->tramp_hash);
2251 op->tramp_hash = NULL;
2252
2253 if (op->nr_trampolines) {
2254 ret = ftrace_save_ops_tramp_hash(op);
2255 if (ret)
2256 return ret;
2257 }
2258
2259 } while_for_each_ftrace_op(op);
2260
2261 return 0;
2262}
2263
2026static void ftrace_run_update_code(int command) 2264static void ftrace_run_update_code(int command)
2027{ 2265{
2028 int ret; 2266 int ret;
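
ftrace_save_ops_tramp_hash() above sizes the per-ops trampoline hash at roughly half the number of trampolines: it halves nr_trampolines once and then counts how many right shifts it takes to reach zero. A standalone check of that loop, with illustrative counts:

    /* Standalone check of the hash-sizing loop in ftrace_save_ops_tramp_hash(). */
    #include <stdio.h>

    static int tramp_hash_bits(int nr_trampolines)
    {
        int size = nr_trampolines;
        int bits = 0;

        for (size /= 2; size; size >>= 1)
            bits++;
        return bits;
    }

    int main(void)
    {
        /* 100 -> 50, 25, 12, 6, 3, 1: six shifts, so 6 bits and 64 buckets. */
        printf("100 trampolines -> %d bits\n", tramp_hash_bits(100));
        printf("  1 trampoline  -> %d bits\n", tramp_hash_bits(1));   /* 0 bits */
        return 0;
    }

So 100 trampolines get a 2^6 = 64 bucket hash, about half the entry count, matching the "about 1/2 the # found" comment in the hunk.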
@@ -2031,11 +2269,6 @@ static void ftrace_run_update_code(int command)
2031 FTRACE_WARN_ON(ret); 2269 FTRACE_WARN_ON(ret);
2032 if (ret) 2270 if (ret)
2033 return; 2271 return;
2034 /*
2035 * Do not call function tracer while we update the code.
2036 * We are in stop machine.
2037 */
2038 function_trace_stop++;
2039 2272
2040 /* 2273 /*
2041 * By default we use stop_machine() to modify the code. 2274 * By default we use stop_machine() to modify the code.
@@ -2045,15 +2278,15 @@ static void ftrace_run_update_code(int command)
2045 */ 2278 */
2046 arch_ftrace_update_code(command); 2279 arch_ftrace_update_code(command);
2047 2280
2048 function_trace_stop--;
2049
2050 ret = ftrace_arch_code_modify_post_process(); 2281 ret = ftrace_arch_code_modify_post_process();
2051 FTRACE_WARN_ON(ret); 2282 FTRACE_WARN_ON(ret);
2283
2284 ret = ftrace_save_tramp_hashes();
2285 FTRACE_WARN_ON(ret);
2052} 2286}
2053 2287
2054static ftrace_func_t saved_ftrace_func; 2288static ftrace_func_t saved_ftrace_func;
2055static int ftrace_start_up; 2289static int ftrace_start_up;
2056static int global_start_up;
2057 2290
2058static void control_ops_free(struct ftrace_ops *ops) 2291static void control_ops_free(struct ftrace_ops *ops)
2059{ 2292{
@@ -2117,8 +2350,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2117 2350
2118 ftrace_hash_rec_disable(ops, 1); 2351 ftrace_hash_rec_disable(ops, 1);
2119 2352
2120 if (!global_start_up) 2353 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2121 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2122 2354
2123 command |= FTRACE_UPDATE_CALLS; 2355 command |= FTRACE_UPDATE_CALLS;
2124 2356
@@ -2139,8 +2371,16 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2139 return 0; 2371 return 0;
2140 } 2372 }
2141 2373
2374 /*
2375 * If the ops uses a trampoline, then it needs to be
2376 * tested first on update.
2377 */
2378 removed_ops = ops;
2379
2142 ftrace_run_update_code(command); 2380 ftrace_run_update_code(command);
2143 2381
2382 removed_ops = NULL;
2383
2144 /* 2384 /*
2145 * Dynamic ops may be freed, we must make sure that all 2385 * Dynamic ops may be freed, we must make sure that all
2146 * callers are done before leaving this function. 2386 * callers are done before leaving this function.
@@ -2398,7 +2638,8 @@ ftrace_allocate_pages(unsigned long num_to_init)
2398 return start_pg; 2638 return start_pg;
2399 2639
2400 free_pages: 2640 free_pages:
2401 while (start_pg) { 2641 pg = start_pg;
2642 while (pg) {
2402 order = get_count_order(pg->size / ENTRIES_PER_PAGE); 2643 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
2403 free_pages((unsigned long)pg->records, order); 2644 free_pages((unsigned long)pg->records, order);
2404 start_pg = pg->next; 2645 start_pg = pg->next;
@@ -2595,8 +2836,10 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2595 * off, we can short cut and just print out that all 2836 * off, we can short cut and just print out that all
2596 * functions are enabled. 2837 * functions are enabled.
2597 */ 2838 */
2598 if (iter->flags & FTRACE_ITER_FILTER && 2839 if ((iter->flags & FTRACE_ITER_FILTER &&
2599 ftrace_hash_empty(ops->filter_hash)) { 2840 ftrace_hash_empty(ops->filter_hash)) ||
2841 (iter->flags & FTRACE_ITER_NOTRACE &&
2842 ftrace_hash_empty(ops->notrace_hash))) {
2600 if (*pos > 0) 2843 if (*pos > 0)
2601 return t_hash_start(m, pos); 2844 return t_hash_start(m, pos);
2602 iter->flags |= FTRACE_ITER_PRINTALL; 2845 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2641,7 +2884,10 @@ static int t_show(struct seq_file *m, void *v)
2641 return t_hash_show(m, iter); 2884 return t_hash_show(m, iter);
2642 2885
2643 if (iter->flags & FTRACE_ITER_PRINTALL) { 2886 if (iter->flags & FTRACE_ITER_PRINTALL) {
2644 seq_printf(m, "#### all functions enabled ####\n"); 2887 if (iter->flags & FTRACE_ITER_NOTRACE)
2888 seq_printf(m, "#### no functions disabled ####\n");
2889 else
2890 seq_printf(m, "#### all functions enabled ####\n");
2645 return 0; 2891 return 0;
2646 } 2892 }
2647 2893
@@ -2651,10 +2897,22 @@ static int t_show(struct seq_file *m, void *v)
2651 return 0; 2897 return 0;
2652 2898
2653 seq_printf(m, "%ps", (void *)rec->ip); 2899 seq_printf(m, "%ps", (void *)rec->ip);
2654 if (iter->flags & FTRACE_ITER_ENABLED) 2900 if (iter->flags & FTRACE_ITER_ENABLED) {
2655 seq_printf(m, " (%ld)%s", 2901 seq_printf(m, " (%ld)%s",
2656 rec->flags & ~FTRACE_FL_MASK, 2902 ftrace_rec_count(rec),
2657 rec->flags & FTRACE_FL_REGS ? " R" : ""); 2903 rec->flags & FTRACE_FL_REGS ? " R" : " ");
2904 if (rec->flags & FTRACE_FL_TRAMP_EN) {
2905 struct ftrace_ops *ops;
2906
2907 ops = ftrace_find_tramp_ops_curr(rec);
2908 if (ops && ops->trampoline)
2909 seq_printf(m, "\ttramp: %pS",
2910 (void *)ops->trampoline);
2911 else
2912 seq_printf(m, "\ttramp: ERROR!");
2913 }
2914 }
2915
2658 seq_printf(m, "\n"); 2916 seq_printf(m, "\n");
2659 2917
2660 return 0; 2918 return 0;
@@ -2702,13 +2960,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
2702 return iter ? 0 : -ENOMEM; 2960 return iter ? 0 : -ENOMEM;
2703} 2961}
2704 2962
2705static void ftrace_filter_reset(struct ftrace_hash *hash)
2706{
2707 mutex_lock(&ftrace_lock);
2708 ftrace_hash_clear(hash);
2709 mutex_unlock(&ftrace_lock);
2710}
2711
2712/** 2963/**
2713 * ftrace_regex_open - initialize function tracer filter files 2964 * ftrace_regex_open - initialize function tracer filter files
2714 * @ops: The ftrace_ops that hold the hash filters 2965 * @ops: The ftrace_ops that hold the hash filters
@@ -2758,7 +3009,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2758 hash = ops->filter_hash; 3009 hash = ops->filter_hash;
2759 3010
2760 if (file->f_mode & FMODE_WRITE) { 3011 if (file->f_mode & FMODE_WRITE) {
2761 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); 3012 const int size_bits = FTRACE_HASH_DEFAULT_BITS;
3013
3014 if (file->f_flags & O_TRUNC)
3015 iter->hash = alloc_ftrace_hash(size_bits);
3016 else
3017 iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
3018
2762 if (!iter->hash) { 3019 if (!iter->hash) {
2763 trace_parser_put(&iter->parser); 3020 trace_parser_put(&iter->parser);
2764 kfree(iter); 3021 kfree(iter);
@@ -2767,10 +3024,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2767 } 3024 }
2768 } 3025 }
2769 3026
2770 if ((file->f_mode & FMODE_WRITE) &&
2771 (file->f_flags & O_TRUNC))
2772 ftrace_filter_reset(iter->hash);
2773
2774 if (file->f_mode & FMODE_READ) { 3027 if (file->f_mode & FMODE_READ) {
2775 iter->pg = ftrace_pages_start; 3028 iter->pg = ftrace_pages_start;
2776 3029
@@ -3471,14 +3724,16 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3471 else 3724 else
3472 orig_hash = &ops->notrace_hash; 3725 orig_hash = &ops->notrace_hash;
3473 3726
3474 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); 3727 if (reset)
3728 hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
3729 else
3730 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
3731
3475 if (!hash) { 3732 if (!hash) {
3476 ret = -ENOMEM; 3733 ret = -ENOMEM;
3477 goto out_regex_unlock; 3734 goto out_regex_unlock;
3478 } 3735 }
3479 3736
3480 if (reset)
3481 ftrace_filter_reset(hash);
3482 if (buf && !ftrace_match_records(hash, buf, len)) { 3737 if (buf && !ftrace_match_records(hash, buf, len)) {
3483 ret = -EINVAL; 3738 ret = -EINVAL;
3484 goto out_regex_unlock; 3739 goto out_regex_unlock;
@@ -3630,6 +3885,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
3630 3885
3631#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3886#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3632static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 3887static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
3888static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
3633static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); 3889static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
3634 3890
3635static int __init set_graph_function(char *str) 3891static int __init set_graph_function(char *str)
@@ -3639,16 +3895,29 @@ static int __init set_graph_function(char *str)
3639} 3895}
3640__setup("ftrace_graph_filter=", set_graph_function); 3896__setup("ftrace_graph_filter=", set_graph_function);
3641 3897
3642static void __init set_ftrace_early_graph(char *buf) 3898static int __init set_graph_notrace_function(char *str)
3899{
3900 strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
3901 return 1;
3902}
3903__setup("ftrace_graph_notrace=", set_graph_notrace_function);
3904
3905static void __init set_ftrace_early_graph(char *buf, int enable)
3643{ 3906{
3644 int ret; 3907 int ret;
3645 char *func; 3908 char *func;
3909 unsigned long *table = ftrace_graph_funcs;
3910 int *count = &ftrace_graph_count;
3911
3912 if (!enable) {
3913 table = ftrace_graph_notrace_funcs;
3914 count = &ftrace_graph_notrace_count;
3915 }
3646 3916
3647 while (buf) { 3917 while (buf) {
3648 func = strsep(&buf, ","); 3918 func = strsep(&buf, ",");
3649 /* we allow only one expression at a time */ 3919 /* we allow only one expression at a time */
3650 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 3920 ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func);
3651 FTRACE_GRAPH_MAX_FUNCS, func);
3652 if (ret) 3921 if (ret)
3653 printk(KERN_DEBUG "ftrace: function %s not " 3922 printk(KERN_DEBUG "ftrace: function %s not "
3654 "traceable\n", func); 3923 "traceable\n", func);
@@ -3677,7 +3946,9 @@ static void __init set_ftrace_early_filters(void)
3677 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); 3946 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
3678#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3947#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3679 if (ftrace_graph_buf[0]) 3948 if (ftrace_graph_buf[0])
3680 set_ftrace_early_graph(ftrace_graph_buf); 3949 set_ftrace_early_graph(ftrace_graph_buf, 1);
3950 if (ftrace_graph_notrace_buf[0])
3951 set_ftrace_early_graph(ftrace_graph_notrace_buf, 0);
3681#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3952#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3682} 3953}
3683 3954
@@ -3819,7 +4090,12 @@ static int g_show(struct seq_file *m, void *v)
3819 return 0; 4090 return 0;
3820 4091
3821 if (ptr == (unsigned long *)1) { 4092 if (ptr == (unsigned long *)1) {
3822 seq_printf(m, "#### all functions enabled ####\n"); 4093 struct ftrace_graph_data *fgd = m->private;
4094
4095 if (fgd->table == ftrace_graph_funcs)
4096 seq_printf(m, "#### all functions enabled ####\n");
4097 else
4098 seq_printf(m, "#### no functions disabled ####\n");
3823 return 0; 4099 return 0;
3824 } 4100 }
3825 4101
@@ -4447,9 +4723,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4447 struct ftrace_ops *op; 4723 struct ftrace_ops *op;
4448 int bit; 4724 int bit;
4449 4725
4450 if (function_trace_stop)
4451 return;
4452
4453 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); 4726 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
4454 if (bit < 0) 4727 if (bit < 0)
4455 return; 4728 return;
@@ -4461,9 +4734,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4461 preempt_disable_notrace(); 4734 preempt_disable_notrace();
4462 do_for_each_ftrace_op(op, ftrace_ops_list) { 4735 do_for_each_ftrace_op(op, ftrace_ops_list) {
4463 if (ftrace_ops_test(op, ip, regs)) { 4736 if (ftrace_ops_test(op, ip, regs)) {
4464 if (WARN_ON(!op->func)) { 4737 if (FTRACE_WARN_ON(!op->func)) {
4465 function_trace_stop = 1; 4738 pr_warn("op=%p %pS\n", op, op);
4466 printk("op=%p %pS\n", op, op);
4467 goto out; 4739 goto out;
4468 } 4740 }
4469 op->func(ip, parent_ip, op, regs); 4741 op->func(ip, parent_ip, op, regs);
@@ -5084,6 +5356,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5084 /* Function graph doesn't use the .func field of global_ops */ 5356 /* Function graph doesn't use the .func field of global_ops */
5085 global_ops.flags |= FTRACE_OPS_FL_STUB; 5357 global_ops.flags |= FTRACE_OPS_FL_STUB;
5086 5358
5359#ifdef CONFIG_DYNAMIC_FTRACE
5360 /* Optimize function graph calling (if implemented by arch) */
5361 if (FTRACE_GRAPH_TRAMP_ADDR != 0)
5362 global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR;
5363#endif
5364
5087 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); 5365 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
5088 5366
5089out: 5367out:
@@ -5104,6 +5382,10 @@ void unregister_ftrace_graph(void)
5104 __ftrace_graph_entry = ftrace_graph_entry_stub; 5382 __ftrace_graph_entry = ftrace_graph_entry_stub;
5105 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); 5383 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
5106 global_ops.flags &= ~FTRACE_OPS_FL_STUB; 5384 global_ops.flags &= ~FTRACE_OPS_FL_STUB;
5385#ifdef CONFIG_DYNAMIC_FTRACE
5386 if (FTRACE_GRAPH_TRAMP_ADDR != 0)
5387 global_ops.trampoline = 0;
5388#endif
5107 unregister_pm_notifier(&ftrace_suspend_notifier); 5389 unregister_pm_notifier(&ftrace_suspend_notifier);
5108 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5390 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5109 5391
@@ -5183,9 +5465,4 @@ void ftrace_graph_exit_task(struct task_struct *t)
5183 5465
5184 kfree(ret_stack); 5466 kfree(ret_stack);
5185} 5467}
5186
5187void ftrace_graph_stop(void)
5188{
5189 ftrace_stop();
5190}
5191#endif 5468#endif
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7c56c3d06943..925f629658d6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -616,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
616 struct ring_buffer_per_cpu *cpu_buffer; 616 struct ring_buffer_per_cpu *cpu_buffer;
617 struct rb_irq_work *work; 617 struct rb_irq_work *work;
618 618
619 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
620 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
621 return POLLIN | POLLRDNORM;
622
623 if (cpu == RING_BUFFER_ALL_CPUS) 619 if (cpu == RING_BUFFER_ALL_CPUS)
624 work = &buffer->irq_work; 620 work = &buffer->irq_work;
625 else { 621 else {
@@ -1693,22 +1689,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1693 if (!cpu_buffer->nr_pages_to_update) 1689 if (!cpu_buffer->nr_pages_to_update)
1694 continue; 1690 continue;
1695 1691
1696 /* The update must run on the CPU that is being updated. */ 1692 /* Can't run something on an offline CPU. */
1697 preempt_disable(); 1693 if (!cpu_online(cpu)) {
1698 if (cpu == smp_processor_id() || !cpu_online(cpu)) {
1699 rb_update_pages(cpu_buffer); 1694 rb_update_pages(cpu_buffer);
1700 cpu_buffer->nr_pages_to_update = 0; 1695 cpu_buffer->nr_pages_to_update = 0;
1701 } else { 1696 } else {
1702 /*
1703 * Can not disable preemption for schedule_work_on()
1704 * on PREEMPT_RT.
1705 */
1706 preempt_enable();
1707 schedule_work_on(cpu, 1697 schedule_work_on(cpu,
1708 &cpu_buffer->update_pages_work); 1698 &cpu_buffer->update_pages_work);
1709 preempt_disable();
1710 } 1699 }
1711 preempt_enable();
1712 } 1700 }
1713 1701
1714 /* wait for all the updates to complete */ 1702 /* wait for all the updates to complete */
@@ -1746,22 +1734,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1746 1734
1747 get_online_cpus(); 1735 get_online_cpus();
1748 1736
1749 preempt_disable(); 1737 /* Can't run something on an offline CPU. */
1750 /* The update must run on the CPU that is being updated. */ 1738 if (!cpu_online(cpu_id))
1751 if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
1752 rb_update_pages(cpu_buffer); 1739 rb_update_pages(cpu_buffer);
1753 else { 1740 else {
1754 /*
1755 * Can not disable preemption for schedule_work_on()
1756 * on PREEMPT_RT.
1757 */
1758 preempt_enable();
1759 schedule_work_on(cpu_id, 1741 schedule_work_on(cpu_id,
1760 &cpu_buffer->update_pages_work); 1742 &cpu_buffer->update_pages_work);
1761 wait_for_completion(&cpu_buffer->update_done); 1743 wait_for_completion(&cpu_buffer->update_done);
1762 preempt_disable();
1763 } 1744 }
1764 preempt_enable();
1765 1745
1766 cpu_buffer->nr_pages_to_update = 0; 1746 cpu_buffer->nr_pages_to_update = 0;
1767 put_online_cpus(); 1747 put_online_cpus();
@@ -3779,7 +3759,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3779 if (rb_per_cpu_empty(cpu_buffer)) 3759 if (rb_per_cpu_empty(cpu_buffer))
3780 return NULL; 3760 return NULL;
3781 3761
3782 if (iter->head >= local_read(&iter->head_page->page->commit)) { 3762 if (iter->head >= rb_page_size(iter->head_page)) {
3783 rb_inc_iter(iter); 3763 rb_inc_iter(iter);
3784 goto again; 3764 goto again;
3785 } 3765 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 84e2b45c0934..8a528392b1f4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -466,6 +466,12 @@ int __trace_puts(unsigned long ip, const char *str, int size)
466 struct print_entry *entry; 466 struct print_entry *entry;
467 unsigned long irq_flags; 467 unsigned long irq_flags;
468 int alloc; 468 int alloc;
469 int pc;
470
471 if (!(trace_flags & TRACE_ITER_PRINTK))
472 return 0;
473
474 pc = preempt_count();
469 475
470 if (unlikely(tracing_selftest_running || tracing_disabled)) 476 if (unlikely(tracing_selftest_running || tracing_disabled))
471 return 0; 477 return 0;
@@ -475,7 +481,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
475 local_save_flags(irq_flags); 481 local_save_flags(irq_flags);
476 buffer = global_trace.trace_buffer.buffer; 482 buffer = global_trace.trace_buffer.buffer;
477 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, 483 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
478 irq_flags, preempt_count()); 484 irq_flags, pc);
479 if (!event) 485 if (!event)
480 return 0; 486 return 0;
481 487
@@ -492,6 +498,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
492 entry->buf[size] = '\0'; 498 entry->buf[size] = '\0';
493 499
494 __buffer_unlock_commit(buffer, event); 500 __buffer_unlock_commit(buffer, event);
501 ftrace_trace_stack(buffer, irq_flags, 4, pc);
495 502
496 return size; 503 return size;
497} 504}
@@ -509,6 +516,12 @@ int __trace_bputs(unsigned long ip, const char *str)
509 struct bputs_entry *entry; 516 struct bputs_entry *entry;
510 unsigned long irq_flags; 517 unsigned long irq_flags;
511 int size = sizeof(struct bputs_entry); 518 int size = sizeof(struct bputs_entry);
519 int pc;
520
521 if (!(trace_flags & TRACE_ITER_PRINTK))
522 return 0;
523
524 pc = preempt_count();
512 525
513 if (unlikely(tracing_selftest_running || tracing_disabled)) 526 if (unlikely(tracing_selftest_running || tracing_disabled))
514 return 0; 527 return 0;
@@ -516,7 +529,7 @@ int __trace_bputs(unsigned long ip, const char *str)
516 local_save_flags(irq_flags); 529 local_save_flags(irq_flags);
517 buffer = global_trace.trace_buffer.buffer; 530 buffer = global_trace.trace_buffer.buffer;
518 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, 531 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
519 irq_flags, preempt_count()); 532 irq_flags, pc);
520 if (!event) 533 if (!event)
521 return 0; 534 return 0;
522 535
@@ -525,6 +538,7 @@ int __trace_bputs(unsigned long ip, const char *str)
525 entry->str = str; 538 entry->str = str;
526 539
527 __buffer_unlock_commit(buffer, event); 540 __buffer_unlock_commit(buffer, event);
541 ftrace_trace_stack(buffer, irq_flags, 4, pc);
528 542
529 return 1; 543 return 1;
530} 544}
@@ -809,7 +823,7 @@ static struct {
809 { trace_clock_local, "local", 1 }, 823 { trace_clock_local, "local", 1 },
810 { trace_clock_global, "global", 1 }, 824 { trace_clock_global, "global", 1 },
811 { trace_clock_counter, "counter", 0 }, 825 { trace_clock_counter, "counter", 0 },
812 { trace_clock_jiffies, "uptime", 1 }, 826 { trace_clock_jiffies, "uptime", 0 },
813 { trace_clock, "perf", 1 }, 827 { trace_clock, "perf", 1 },
814 { ktime_get_mono_fast_ns, "mono", 1 }, 828 { ktime_get_mono_fast_ns, "mono", 1 },
815 ARCH_TRACE_CLOCKS 829 ARCH_TRACE_CLOCKS
@@ -924,30 +938,6 @@ out:
924 return ret; 938 return ret;
925} 939}
926 940
927ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
928{
929 int len;
930 int ret;
931
932 if (!cnt)
933 return 0;
934
935 if (s->len <= s->readpos)
936 return -EBUSY;
937
938 len = s->len - s->readpos;
939 if (cnt > len)
940 cnt = len;
941 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
942 if (ret == cnt)
943 return -EFAULT;
944
945 cnt -= ret;
946
947 s->readpos += cnt;
948 return cnt;
949}
950
951static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) 941static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
952{ 942{
953 int len; 943 int len;
@@ -3686,6 +3676,7 @@ static const char readme_msg[] =
3686#endif 3676#endif
3687#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3677#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3688 " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" 3678 " set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
3679 " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n"
3689 " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" 3680 " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
3690#endif 3681#endif
3691#ifdef CONFIG_TRACER_SNAPSHOT 3682#ifdef CONFIG_TRACER_SNAPSHOT
@@ -4225,10 +4216,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
4225} 4216}
4226 4217
4227static ssize_t 4218static ssize_t
4228tracing_max_lat_read(struct file *filp, char __user *ubuf, 4219tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
4229 size_t cnt, loff_t *ppos) 4220 size_t cnt, loff_t *ppos)
4230{ 4221{
4231 unsigned long *ptr = filp->private_data;
4232 char buf[64]; 4222 char buf[64];
4233 int r; 4223 int r;
4234 4224
@@ -4240,10 +4230,9 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf,
4240} 4230}
4241 4231
4242static ssize_t 4232static ssize_t
4243tracing_max_lat_write(struct file *filp, const char __user *ubuf, 4233tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
4244 size_t cnt, loff_t *ppos) 4234 size_t cnt, loff_t *ppos)
4245{ 4235{
4246 unsigned long *ptr = filp->private_data;
4247 unsigned long val; 4236 unsigned long val;
4248 int ret; 4237 int ret;
4249 4238
@@ -4256,6 +4245,52 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
4256 return cnt; 4245 return cnt;
4257} 4246}
4258 4247
4248static ssize_t
4249tracing_thresh_read(struct file *filp, char __user *ubuf,
4250 size_t cnt, loff_t *ppos)
4251{
4252 return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos);
4253}
4254
4255static ssize_t
4256tracing_thresh_write(struct file *filp, const char __user *ubuf,
4257 size_t cnt, loff_t *ppos)
4258{
4259 struct trace_array *tr = filp->private_data;
4260 int ret;
4261
4262 mutex_lock(&trace_types_lock);
4263 ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos);
4264 if (ret < 0)
4265 goto out;
4266
4267 if (tr->current_trace->update_thresh) {
4268 ret = tr->current_trace->update_thresh(tr);
4269 if (ret < 0)
4270 goto out;
4271 }
4272
4273 ret = cnt;
4274out:
4275 mutex_unlock(&trace_types_lock);
4276
4277 return ret;
4278}
4279
4280static ssize_t
4281tracing_max_lat_read(struct file *filp, char __user *ubuf,
4282 size_t cnt, loff_t *ppos)
4283{
4284 return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos);
4285}
4286
4287static ssize_t
4288tracing_max_lat_write(struct file *filp, const char __user *ubuf,
4289 size_t cnt, loff_t *ppos)
4290{
4291 return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos);
4292}
4293
4259static int tracing_open_pipe(struct inode *inode, struct file *filp) 4294static int tracing_open_pipe(struct inode *inode, struct file *filp)
4260{ 4295{
4261 struct trace_array *tr = inode->i_private; 4296 struct trace_array *tr = inode->i_private;
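The hunk above pulls the old tracing_max_lat_read()/write() bodies into tracing_nsecs_read()/tracing_nsecs_write() helpers that operate on a caller-supplied pointer, then rebuilds tracing_max_lat_* and the new tracing_thresh_* as thin wrappers; the thresh writer additionally calls the tracer's update_thresh() hook under trace_types_lock. A small userspace sketch of that "one helper, many wrappers" shape, with invented names and no locking:

#include <stdio.h>
#include <stdlib.h>

static unsigned long tracing_thresh;
static unsigned long max_latency;

/* Shared helper: parse a decimal string into the value behind *ptr. */
static int nsecs_write(unsigned long *ptr, const char *buf)
{
        char *end;
        unsigned long val = strtoul(buf, &end, 10);

        if (end == buf)
                return -1;
        *ptr = val;
        return 0;
}

/* Thin wrappers bind the helper to one variable each, like the   */
/* tracing_thresh_write()/tracing_max_lat_write() pair above.     */
static int thresh_write(const char *buf)  { return nsecs_write(&tracing_thresh, buf); }
static int max_lat_write(const char *buf) { return nsecs_write(&max_latency, buf); }

int main(void)
{
        thresh_write("100000");
        max_lat_write("250000");
        printf("thresh=%lu max_lat=%lu\n", tracing_thresh, max_latency);
        return 0;
}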
@@ -5157,6 +5192,13 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp)
5157#endif /* CONFIG_TRACER_SNAPSHOT */ 5192#endif /* CONFIG_TRACER_SNAPSHOT */
5158 5193
5159 5194
5195static const struct file_operations tracing_thresh_fops = {
5196 .open = tracing_open_generic,
5197 .read = tracing_thresh_read,
5198 .write = tracing_thresh_write,
5199 .llseek = generic_file_llseek,
5200};
5201
5160static const struct file_operations tracing_max_lat_fops = { 5202static const struct file_operations tracing_max_lat_fops = {
5161 .open = tracing_open_generic, 5203 .open = tracing_open_generic,
5162 .read = tracing_max_lat_read, 5204 .read = tracing_max_lat_read,
@@ -6094,10 +6136,8 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
6094 if (!topts) 6136 if (!topts)
6095 return; 6137 return;
6096 6138
6097 for (cnt = 0; topts[cnt].opt; cnt++) { 6139 for (cnt = 0; topts[cnt].opt; cnt++)
6098 if (topts[cnt].entry) 6140 debugfs_remove(topts[cnt].entry);
6099 debugfs_remove(topts[cnt].entry);
6100 }
6101 6141
6102 kfree(topts); 6142 kfree(topts);
6103} 6143}
@@ -6520,7 +6560,7 @@ static __init int tracer_init_debugfs(void)
6520 init_tracer_debugfs(&global_trace, d_tracer); 6560 init_tracer_debugfs(&global_trace, d_tracer);
6521 6561
6522 trace_create_file("tracing_thresh", 0644, d_tracer, 6562 trace_create_file("tracing_thresh", 0644, d_tracer,
6523 &tracing_thresh, &tracing_max_lat_fops); 6563 &global_trace, &tracing_thresh_fops);
6524 6564
6525 trace_create_file("README", 0444, d_tracer, 6565 trace_create_file("README", 0444, d_tracer,
6526 NULL, &tracing_readme_fops); 6566 NULL, &tracing_readme_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9258f5a815db..385391fb1d3b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -339,6 +339,7 @@ struct tracer_flags {
339 * @reset: called when one switches to another tracer 339 * @reset: called when one switches to another tracer
340 * @start: called when tracing is unpaused (echo 1 > tracing_enabled) 340 * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
341 * @stop: called when tracing is paused (echo 0 > tracing_enabled) 341 * @stop: called when tracing is paused (echo 0 > tracing_enabled)
342 * @update_thresh: called when tracing_thresh is updated
342 * @open: called when the trace file is opened 343 * @open: called when the trace file is opened
343 * @pipe_open: called when the trace_pipe file is opened 344 * @pipe_open: called when the trace_pipe file is opened
344 * @close: called when the trace file is released 345 * @close: called when the trace file is released
@@ -357,6 +358,7 @@ struct tracer {
357 void (*reset)(struct trace_array *tr); 358 void (*reset)(struct trace_array *tr);
358 void (*start)(struct trace_array *tr); 359 void (*start)(struct trace_array *tr);
359 void (*stop)(struct trace_array *tr); 360 void (*stop)(struct trace_array *tr);
361 int (*update_thresh)(struct trace_array *tr);
360 void (*open)(struct trace_iterator *iter); 362 void (*open)(struct trace_iterator *iter);
361 void (*pipe_open)(struct trace_iterator *iter); 363 void (*pipe_open)(struct trace_iterator *iter);
362 void (*close)(struct trace_iterator *iter); 364 void (*close)(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 26dc348332b7..57b67b1f24d1 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -59,13 +59,14 @@ u64 notrace trace_clock(void)
59 59
60/* 60/*
61 * trace_jiffy_clock(): Simply use jiffies as a clock counter. 61 * trace_jiffy_clock(): Simply use jiffies as a clock counter.
62 * Note that this use of jiffies_64 is not completely safe on
63 * 32-bit systems. But the window is tiny, and the effect if
64 * we are affected is that we will have an obviously bogus
65 * timestamp on a trace event - i.e. not life threatening.
62 */ 66 */
63u64 notrace trace_clock_jiffies(void) 67u64 notrace trace_clock_jiffies(void)
64{ 68{
65 u64 jiffy = jiffies - INITIAL_JIFFIES; 69 return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
66
67 /* Return nsecs */
68 return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
69} 70}
70 71
71/* 72/*
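With the rewrite above, trace_clock_jiffies() reports clock_t ticks via jiffies_64_to_clock_t() instead of synthesizing nanoseconds from jiffies, which is why the "uptime" entry in the trace.c clock table hunk above has its in_ns flag flipped to 0. A quick sketch of the unit change; the HZ and USER_HZ values are assumed for illustration only, since both are configuration dependent:

#include <stdio.h>

/* Assumed values for illustration; real kernels vary. */
#define HZ      1000ULL
#define USER_HZ  100ULL

int main(void)
{
        unsigned long long jiffies_delta = 2500;        /* 2.5 s at HZ=1000 */

        /* Old behaviour: scale jiffies up to nanoseconds. */
        unsigned long long ns = jiffies_delta * (1000000000ULL / HZ);

        /* New behaviour: USER_HZ ticks, like jiffies_64_to_clock_t(). */
        unsigned long long ticks = jiffies_delta * USER_HZ / HZ;

        printf("old: %llu ns, new: %llu clock_t ticks\n", ns, ticks);
        return 0;
}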
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 5d12bb407b44..4b9c114ee9de 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -30,6 +30,18 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
30 return ret; 30 return ret;
31 } 31 }
32 32
33 /*
34 * We checked and allowed creation of the parent,
35 * so allow children without checking.
36 */
37 if (p_event->parent)
38 return 0;
39
40 /*
41 * It's ok to check current process (owner) permissions in here,
42 * because code below is called only via perf_event_open syscall.
43 */
44
33 /* The ftrace function trace is allowed only for root. */ 45 /* The ftrace function trace is allowed only for root. */
34 if (ftrace_event_is_function(tp_event)) { 46 if (ftrace_event_is_function(tp_event)) {
35 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) 47 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f99e0b3bca8c..ef06ce7e9cf8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,6 +8,8 @@
8 * 8 *
9 */ 9 */
10 10
11#define pr_fmt(fmt) fmt
12
11#include <linux/workqueue.h> 13#include <linux/workqueue.h>
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
13#include <linux/kthread.h> 15#include <linux/kthread.h>
@@ -470,6 +472,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
470 472
471 list_del(&file->list); 473 list_del(&file->list);
472 remove_subsystem(file->system); 474 remove_subsystem(file->system);
475 free_event_filter(file->filter);
473 kmem_cache_free(file_cachep, file); 476 kmem_cache_free(file_cachep, file);
474} 477}
475 478
@@ -1490,7 +1493,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1490 1493
1491 dir->entry = debugfs_create_dir(name, parent); 1494 dir->entry = debugfs_create_dir(name, parent);
1492 if (!dir->entry) { 1495 if (!dir->entry) {
1493 pr_warning("Failed to create system directory %s\n", name); 1496 pr_warn("Failed to create system directory %s\n", name);
1494 __put_system(system); 1497 __put_system(system);
1495 goto out_free; 1498 goto out_free;
1496 } 1499 }
@@ -1506,7 +1509,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1506 if (!entry) { 1509 if (!entry) {
1507 kfree(system->filter); 1510 kfree(system->filter);
1508 system->filter = NULL; 1511 system->filter = NULL;
1509 pr_warning("Could not create debugfs '%s/filter' entry\n", name); 1512 pr_warn("Could not create debugfs '%s/filter' entry\n", name);
1510 } 1513 }
1511 1514
1512 trace_create_file("enable", 0644, dir->entry, dir, 1515 trace_create_file("enable", 0644, dir->entry, dir,
@@ -1521,8 +1524,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1521 out_fail: 1524 out_fail:
1522 /* Only print this message if failed on memory allocation */ 1525 /* Only print this message if failed on memory allocation */
1523 if (!dir || !system) 1526 if (!dir || !system)
1524 pr_warning("No memory to create event subsystem %s\n", 1527 pr_warn("No memory to create event subsystem %s\n", name);
1525 name);
1526 return NULL; 1528 return NULL;
1527} 1529}
1528 1530
@@ -1550,8 +1552,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1550 name = ftrace_event_name(call); 1552 name = ftrace_event_name(call);
1551 file->dir = debugfs_create_dir(name, d_events); 1553 file->dir = debugfs_create_dir(name, d_events);
1552 if (!file->dir) { 1554 if (!file->dir) {
1553 pr_warning("Could not create debugfs '%s' directory\n", 1555 pr_warn("Could not create debugfs '%s' directory\n", name);
1554 name);
1555 return -1; 1556 return -1;
1556 } 1557 }
1557 1558
@@ -1574,8 +1575,8 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1574 if (list_empty(head)) { 1575 if (list_empty(head)) {
1575 ret = call->class->define_fields(call); 1576 ret = call->class->define_fields(call);
1576 if (ret < 0) { 1577 if (ret < 0) {
1577 pr_warning("Could not initialize trace point" 1578 pr_warn("Could not initialize trace point events/%s\n",
1578 " events/%s\n", name); 1579 name);
1579 return -1; 1580 return -1;
1580 } 1581 }
1581 } 1582 }
@@ -1620,7 +1621,6 @@ static void event_remove(struct ftrace_event_call *call)
1620 if (file->event_call != call) 1621 if (file->event_call != call)
1621 continue; 1622 continue;
1622 ftrace_event_enable_disable(file, 0); 1623 ftrace_event_enable_disable(file, 0);
1623 destroy_preds(file);
1624 /* 1624 /*
1625 * The do_for_each_event_file() is 1625 * The do_for_each_event_file() is
1626 * a double loop. After finding the call for this 1626 * a double loop. After finding the call for this
@@ -1648,8 +1648,7 @@ static int event_init(struct ftrace_event_call *call)
1648 if (call->class->raw_init) { 1648 if (call->class->raw_init) {
1649 ret = call->class->raw_init(call); 1649 ret = call->class->raw_init(call);
1650 if (ret < 0 && ret != -ENOSYS) 1650 if (ret < 0 && ret != -ENOSYS)
1651 pr_warn("Could not initialize trace events/%s\n", 1651 pr_warn("Could not initialize trace events/%s\n", name);
1652 name);
1653 } 1652 }
1654 1653
1655 return ret; 1654 return ret;
@@ -1748,7 +1747,8 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
1748{ 1747{
1749 event_remove(call); 1748 event_remove(call);
1750 trace_destroy_fields(call); 1749 trace_destroy_fields(call);
1751 destroy_call_preds(call); 1750 free_event_filter(call->filter);
1751 call->filter = NULL;
1752} 1752}
1753 1753
1754static int probe_remove_event_call(struct ftrace_event_call *call) 1754static int probe_remove_event_call(struct ftrace_event_call *call)
@@ -1894,8 +1894,8 @@ __trace_add_event_dirs(struct trace_array *tr)
1894 list_for_each_entry(call, &ftrace_events, list) { 1894 list_for_each_entry(call, &ftrace_events, list) {
1895 ret = __trace_add_new_event(call, tr); 1895 ret = __trace_add_new_event(call, tr);
1896 if (ret < 0) 1896 if (ret < 0)
1897 pr_warning("Could not create directory for event %s\n", 1897 pr_warn("Could not create directory for event %s\n",
1898 ftrace_event_name(call)); 1898 ftrace_event_name(call));
1899 } 1899 }
1900} 1900}
1901 1901
@@ -2207,8 +2207,8 @@ __trace_early_add_event_dirs(struct trace_array *tr)
2207 list_for_each_entry(file, &tr->events, list) { 2207 list_for_each_entry(file, &tr->events, list) {
2208 ret = event_create_dir(tr->event_dir, file); 2208 ret = event_create_dir(tr->event_dir, file);
2209 if (ret < 0) 2209 if (ret < 0)
2210 pr_warning("Could not create directory for event %s\n", 2210 pr_warn("Could not create directory for event %s\n",
2211 ftrace_event_name(file->event_call)); 2211 ftrace_event_name(file->event_call));
2212 } 2212 }
2213} 2213}
2214 2214
@@ -2231,8 +2231,8 @@ __trace_early_add_events(struct trace_array *tr)
2231 2231
2232 ret = __trace_early_add_new_event(call, tr); 2232 ret = __trace_early_add_new_event(call, tr);
2233 if (ret < 0) 2233 if (ret < 0)
2234 pr_warning("Could not create early event %s\n", 2234 pr_warn("Could not create early event %s\n",
2235 ftrace_event_name(call)); 2235 ftrace_event_name(call));
2236 } 2236 }
2237} 2237}
2238 2238
@@ -2279,13 +2279,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
2279 entry = debugfs_create_file("set_event", 0644, parent, 2279 entry = debugfs_create_file("set_event", 0644, parent,
2280 tr, &ftrace_set_event_fops); 2280 tr, &ftrace_set_event_fops);
2281 if (!entry) { 2281 if (!entry) {
2282 pr_warning("Could not create debugfs 'set_event' entry\n"); 2282 pr_warn("Could not create debugfs 'set_event' entry\n");
2283 return -ENOMEM; 2283 return -ENOMEM;
2284 } 2284 }
2285 2285
2286 d_events = debugfs_create_dir("events", parent); 2286 d_events = debugfs_create_dir("events", parent);
2287 if (!d_events) { 2287 if (!d_events) {
2288 pr_warning("Could not create debugfs 'events' directory\n"); 2288 pr_warn("Could not create debugfs 'events' directory\n");
2289 return -ENOMEM; 2289 return -ENOMEM;
2290 } 2290 }
2291 2291
@@ -2461,11 +2461,10 @@ static __init int event_trace_init(void)
2461 entry = debugfs_create_file("available_events", 0444, d_tracer, 2461 entry = debugfs_create_file("available_events", 0444, d_tracer,
2462 tr, &ftrace_avail_fops); 2462 tr, &ftrace_avail_fops);
2463 if (!entry) 2463 if (!entry)
2464 pr_warning("Could not create debugfs " 2464 pr_warn("Could not create debugfs 'available_events' entry\n");
2465 "'available_events' entry\n");
2466 2465
2467 if (trace_define_common_fields()) 2466 if (trace_define_common_fields())
2468 pr_warning("tracing: Failed to allocate common fields"); 2467 pr_warn("tracing: Failed to allocate common fields");
2469 2468
2470 ret = early_event_add_tracer(d_tracer, tr); 2469 ret = early_event_add_tracer(d_tracer, tr);
2471 if (ret) 2470 if (ret)
@@ -2474,7 +2473,7 @@ static __init int event_trace_init(void)
2474#ifdef CONFIG_MODULES 2473#ifdef CONFIG_MODULES
2475 ret = register_module_notifier(&trace_module_nb); 2474 ret = register_module_notifier(&trace_module_nb);
2476 if (ret) 2475 if (ret)
2477 pr_warning("Failed to register trace events module notifier\n"); 2476 pr_warn("Failed to register trace events module notifier\n");
2478#endif 2477#endif
2479 return 0; 2478 return 0;
2480} 2479}
@@ -2578,7 +2577,7 @@ static __init void event_trace_self_tests(void)
2578 * it and the self test should not be on. 2577 * it and the self test should not be on.
2579 */ 2578 */
2580 if (file->flags & FTRACE_EVENT_FL_ENABLED) { 2579 if (file->flags & FTRACE_EVENT_FL_ENABLED) {
2581 pr_warning("Enabled event during self test!\n"); 2580 pr_warn("Enabled event during self test!\n");
2582 WARN_ON_ONCE(1); 2581 WARN_ON_ONCE(1);
2583 continue; 2582 continue;
2584 } 2583 }
@@ -2606,8 +2605,8 @@ static __init void event_trace_self_tests(void)
2606 2605
2607 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); 2606 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
2608 if (WARN_ON_ONCE(ret)) { 2607 if (WARN_ON_ONCE(ret)) {
2609 pr_warning("error enabling system %s\n", 2608 pr_warn("error enabling system %s\n",
2610 system->name); 2609 system->name);
2611 continue; 2610 continue;
2612 } 2611 }
2613 2612
@@ -2615,8 +2614,8 @@ static __init void event_trace_self_tests(void)
2615 2614
2616 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); 2615 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
2617 if (WARN_ON_ONCE(ret)) { 2616 if (WARN_ON_ONCE(ret)) {
2618 pr_warning("error disabling system %s\n", 2617 pr_warn("error disabling system %s\n",
2619 system->name); 2618 system->name);
2620 continue; 2619 continue;
2621 } 2620 }
2622 2621
@@ -2630,7 +2629,7 @@ static __init void event_trace_self_tests(void)
2630 2629
2631 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); 2630 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
2632 if (WARN_ON_ONCE(ret)) { 2631 if (WARN_ON_ONCE(ret)) {
2633 pr_warning("error enabling all events\n"); 2632 pr_warn("error enabling all events\n");
2634 return; 2633 return;
2635 } 2634 }
2636 2635
@@ -2639,7 +2638,7 @@ static __init void event_trace_self_tests(void)
2639 /* reset sysname */ 2638 /* reset sysname */
2640 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); 2639 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
2641 if (WARN_ON_ONCE(ret)) { 2640 if (WARN_ON_ONCE(ret)) {
2642 pr_warning("error disabling all events\n"); 2641 pr_warn("error disabling all events\n");
2643 return; 2642 return;
2644 } 2643 }
2645 2644
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8a8631926a07..7a8c1528e141 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -774,17 +774,12 @@ static void __free_preds(struct event_filter *filter)
774 filter->n_preds = 0; 774 filter->n_preds = 0;
775} 775}
776 776
777static void call_filter_disable(struct ftrace_event_call *call)
778{
779 call->flags &= ~TRACE_EVENT_FL_FILTERED;
780}
781
782static void filter_disable(struct ftrace_event_file *file) 777static void filter_disable(struct ftrace_event_file *file)
783{ 778{
784 struct ftrace_event_call *call = file->event_call; 779 struct ftrace_event_call *call = file->event_call;
785 780
786 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) 781 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
787 call_filter_disable(call); 782 call->flags &= ~TRACE_EVENT_FL_FILTERED;
788 else 783 else
789 file->flags &= ~FTRACE_EVENT_FL_FILTERED; 784 file->flags &= ~FTRACE_EVENT_FL_FILTERED;
790} 785}
@@ -804,32 +799,6 @@ void free_event_filter(struct event_filter *filter)
804 __free_filter(filter); 799 __free_filter(filter);
805} 800}
806 801
807void destroy_call_preds(struct ftrace_event_call *call)
808{
809 __free_filter(call->filter);
810 call->filter = NULL;
811}
812
813static void destroy_file_preds(struct ftrace_event_file *file)
814{
815 __free_filter(file->filter);
816 file->filter = NULL;
817}
818
819/*
820 * Called when destroying the ftrace_event_file.
821 * The file is being freed, so we do not need to worry about
822 * the file being currently used. This is for module code removing
823 * the tracepoints from within it.
824 */
825void destroy_preds(struct ftrace_event_file *file)
826{
827 if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
828 destroy_call_preds(file->event_call);
829 else
830 destroy_file_preds(file);
831}
832
833static struct event_filter *__alloc_filter(void) 802static struct event_filter *__alloc_filter(void)
834{ 803{
835 struct event_filter *filter; 804 struct event_filter *filter;
@@ -873,17 +842,14 @@ static inline void __remove_filter(struct ftrace_event_file *file)
873 remove_filter_string(file->filter); 842 remove_filter_string(file->filter);
874} 843}
875 844
876static void filter_free_subsystem_preds(struct event_subsystem *system, 845static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
877 struct trace_array *tr) 846 struct trace_array *tr)
878{ 847{
879 struct ftrace_event_file *file; 848 struct ftrace_event_file *file;
880 struct ftrace_event_call *call;
881 849
882 list_for_each_entry(file, &tr->events, list) { 850 list_for_each_entry(file, &tr->events, list) {
883 call = file->event_call; 851 if (file->system != dir)
884 if (strcmp(call->class->system, system->name) != 0)
885 continue; 852 continue;
886
887 __remove_filter(file); 853 __remove_filter(file);
888 } 854 }
889} 855}
@@ -901,15 +867,13 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file)
901 } 867 }
902} 868}
903 869
904static void filter_free_subsystem_filters(struct event_subsystem *system, 870static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir,
905 struct trace_array *tr) 871 struct trace_array *tr)
906{ 872{
907 struct ftrace_event_file *file; 873 struct ftrace_event_file *file;
908 struct ftrace_event_call *call;
909 874
910 list_for_each_entry(file, &tr->events, list) { 875 list_for_each_entry(file, &tr->events, list) {
911 call = file->event_call; 876 if (file->system != dir)
912 if (strcmp(call->class->system, system->name) != 0)
913 continue; 877 continue;
914 __free_subsystem_filter(file); 878 __free_subsystem_filter(file);
915 } 879 }
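The filter hunks above stop matching event files by system name (a strcmp() against system->name) and instead compare the file's subsystem-dir pointer with the directory being operated on. The sketch below uses invented structures to show why the pointer test is the stricter match when two directories could carry the same system name; whether that situation occurs in practice is not something these hunks state:

#include <stdio.h>
#include <string.h>

struct subsystem_dir { const char *name; };

struct event_file {
        const char *system_name;
        const struct subsystem_dir *system;   /* directory this file hangs under */
};

int main(void)
{
        struct subsystem_dir a = { "sched" }, b = { "sched" };
        struct event_file files[] = {
                { "sched", &a },
                { "sched", &b },        /* same name, different directory */
        };

        for (unsigned int i = 0; i < 2; i++) {
                int by_name = strcmp(files[i].system_name, a.name) == 0;
                int by_dir  = files[i].system == &a;
                printf("file %u: by_name=%d by_dir=%d\n", i, by_name, by_dir);
        }
        return 0;
}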
@@ -1582,7 +1546,6 @@ static int fold_pred_tree(struct event_filter *filter,
1582static int replace_preds(struct ftrace_event_call *call, 1546static int replace_preds(struct ftrace_event_call *call,
1583 struct event_filter *filter, 1547 struct event_filter *filter,
1584 struct filter_parse_state *ps, 1548 struct filter_parse_state *ps,
1585 char *filter_string,
1586 bool dry_run) 1549 bool dry_run)
1587{ 1550{
1588 char *operand1 = NULL, *operand2 = NULL; 1551 char *operand1 = NULL, *operand2 = NULL;
@@ -1755,13 +1718,12 @@ struct filter_list {
1755 struct event_filter *filter; 1718 struct event_filter *filter;
1756}; 1719};
1757 1720
1758static int replace_system_preds(struct event_subsystem *system, 1721static int replace_system_preds(struct ftrace_subsystem_dir *dir,
1759 struct trace_array *tr, 1722 struct trace_array *tr,
1760 struct filter_parse_state *ps, 1723 struct filter_parse_state *ps,
1761 char *filter_string) 1724 char *filter_string)
1762{ 1725{
1763 struct ftrace_event_file *file; 1726 struct ftrace_event_file *file;
1764 struct ftrace_event_call *call;
1765 struct filter_list *filter_item; 1727 struct filter_list *filter_item;
1766 struct filter_list *tmp; 1728 struct filter_list *tmp;
1767 LIST_HEAD(filter_list); 1729 LIST_HEAD(filter_list);
@@ -1769,15 +1731,14 @@ static int replace_system_preds(struct event_subsystem *system,
1769 int err; 1731 int err;
1770 1732
1771 list_for_each_entry(file, &tr->events, list) { 1733 list_for_each_entry(file, &tr->events, list) {
1772 call = file->event_call; 1734 if (file->system != dir)
1773 if (strcmp(call->class->system, system->name) != 0)
1774 continue; 1735 continue;
1775 1736
1776 /* 1737 /*
1777 * Try to see if the filter can be applied 1738 * Try to see if the filter can be applied
1778 * (filter arg is ignored on dry_run) 1739 * (filter arg is ignored on dry_run)
1779 */ 1740 */
1780 err = replace_preds(call, NULL, ps, filter_string, true); 1741 err = replace_preds(file->event_call, NULL, ps, true);
1781 if (err) 1742 if (err)
1782 event_set_no_set_filter_flag(file); 1743 event_set_no_set_filter_flag(file);
1783 else 1744 else
@@ -1787,9 +1748,7 @@ static int replace_system_preds(struct event_subsystem *system,
1787 list_for_each_entry(file, &tr->events, list) { 1748 list_for_each_entry(file, &tr->events, list) {
1788 struct event_filter *filter; 1749 struct event_filter *filter;
1789 1750
1790 call = file->event_call; 1751 if (file->system != dir)
1791
1792 if (strcmp(call->class->system, system->name) != 0)
1793 continue; 1752 continue;
1794 1753
1795 if (event_no_set_filter_flag(file)) 1754 if (event_no_set_filter_flag(file))
@@ -1811,7 +1770,7 @@ static int replace_system_preds(struct event_subsystem *system,
1811 if (err) 1770 if (err)
1812 goto fail_mem; 1771 goto fail_mem;
1813 1772
1814 err = replace_preds(call, filter, ps, filter_string, false); 1773 err = replace_preds(file->event_call, filter, ps, false);
1815 if (err) { 1774 if (err) {
1816 filter_disable(file); 1775 filter_disable(file);
1817 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1776 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
@@ -1933,7 +1892,7 @@ static int create_filter(struct ftrace_event_call *call,
1933 1892
1934 err = create_filter_start(filter_str, set_str, &ps, &filter); 1893 err = create_filter_start(filter_str, set_str, &ps, &filter);
1935 if (!err) { 1894 if (!err) {
1936 err = replace_preds(call, filter, ps, filter_str, false); 1895 err = replace_preds(call, filter, ps, false);
1937 if (err && set_str) 1896 if (err && set_str)
1938 append_filter_err(ps, filter); 1897 append_filter_err(ps, filter);
1939 } 1898 }
@@ -1959,7 +1918,7 @@ int create_event_filter(struct ftrace_event_call *call,
1959 * Identical to create_filter() except that it creates a subsystem filter 1918 * Identical to create_filter() except that it creates a subsystem filter
1960 * and always remembers @filter_str. 1919 * and always remembers @filter_str.
1961 */ 1920 */
1962static int create_system_filter(struct event_subsystem *system, 1921static int create_system_filter(struct ftrace_subsystem_dir *dir,
1963 struct trace_array *tr, 1922 struct trace_array *tr,
1964 char *filter_str, struct event_filter **filterp) 1923 char *filter_str, struct event_filter **filterp)
1965{ 1924{
@@ -1969,7 +1928,7 @@ static int create_system_filter(struct event_subsystem *system,
1969 1928
1970 err = create_filter_start(filter_str, true, &ps, &filter); 1929 err = create_filter_start(filter_str, true, &ps, &filter);
1971 if (!err) { 1930 if (!err) {
1972 err = replace_system_preds(system, tr, ps, filter_str); 1931 err = replace_system_preds(dir, tr, ps, filter_str);
1973 if (!err) { 1932 if (!err) {
1974 /* System filters just show a default message */ 1933 /* System filters just show a default message */
1975 kfree(filter->filter_string); 1934 kfree(filter->filter_string);
@@ -2053,18 +2012,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
2053 } 2012 }
2054 2013
2055 if (!strcmp(strstrip(filter_string), "0")) { 2014 if (!strcmp(strstrip(filter_string), "0")) {
2056 filter_free_subsystem_preds(system, tr); 2015 filter_free_subsystem_preds(dir, tr);
2057 remove_filter_string(system->filter); 2016 remove_filter_string(system->filter);
2058 filter = system->filter; 2017 filter = system->filter;
2059 system->filter = NULL; 2018 system->filter = NULL;
2060 /* Ensure all filters are no longer used */ 2019 /* Ensure all filters are no longer used */
2061 synchronize_sched(); 2020 synchronize_sched();
2062 filter_free_subsystem_filters(system, tr); 2021 filter_free_subsystem_filters(dir, tr);
2063 __free_filter(filter); 2022 __free_filter(filter);
2064 goto out_unlock; 2023 goto out_unlock;
2065 } 2024 }
2066 2025
2067 err = create_system_filter(system, tr, filter_string, &filter); 2026 err = create_system_filter(dir, tr, filter_string, &filter);
2068 if (filter) { 2027 if (filter) {
2069 /* 2028 /*
2070 * No event actually uses the system filter 2029 * No event actually uses the system filter
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4de3e57f723c..f0a0c982cde3 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,6 +15,33 @@
15#include "trace.h" 15#include "trace.h"
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18static bool kill_ftrace_graph;
19
20/**
21 * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
22 *
23 * ftrace_graph_stop() is called when a severe error is detected in
24 * the function graph tracing. This function is called by the critical
25 * paths of function graph to keep those paths from doing any more harm.
26 */
27bool ftrace_graph_is_dead(void)
28{
29 return kill_ftrace_graph;
30}
31
32/**
33 * ftrace_graph_stop - set to permanently disable function graph tracing
34 *
35 * In case of an error in function graph tracing, this is called
36 * to try to keep function graph tracing from causing any more harm.
37 * Usually this is pretty severe and this is called to try to at least
38 * get a warning out to the user.
39 */
40void ftrace_graph_stop(void)
41{
42 kill_ftrace_graph = true;
43}
44
18/* When set, irq functions will be ignored */ 45/* When set, irq functions will be ignored */
19static int ftrace_graph_skip_irqs; 46static int ftrace_graph_skip_irqs;
20 47
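ftrace_graph_is_dead() and ftrace_graph_stop() above form a one-way kill switch: a cheap flag test at the top of the hot path (see the ftrace_push_return_trace() hunk below) and a setter that fatal-error paths call once; this replaces the ftrace_graph_stop() that the ftrace.c hunk earlier in this diff removed. The same shape as a standalone sketch, with invented names for the hot path and the error condition:

#include <stdbool.h>
#include <stdio.h>

static bool tracer_dead;

static bool tracer_is_dead(void) { return tracer_dead; }

/* Called once from an error path and never cleared, like ftrace_graph_stop(). */
static void tracer_stop(void)
{
        tracer_dead = true;
        fprintf(stderr, "tracer: fatal error, disabling\n");
}

static int hot_path(int value)
{
        if (tracer_is_dead())           /* mirrors the -EBUSY bail-out below */
                return -1;
        if (value < 0) {                /* stand-in for "severe error detected" */
                tracer_stop();
                return -1;
        }
        printf("traced %d\n", value);
        return 0;
}

int main(void)
{
        hot_path(1);
        hot_path(-1);   /* trips the kill switch */
        hot_path(2);    /* now refused */
        return 0;
}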
@@ -92,6 +119,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
92 unsigned long long calltime; 119 unsigned long long calltime;
93 int index; 120 int index;
94 121
122 if (unlikely(ftrace_graph_is_dead()))
123 return -EBUSY;
124
95 if (!current->ret_stack) 125 if (!current->ret_stack)
96 return -EBUSY; 126 return -EBUSY;
97 127
@@ -323,7 +353,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
323 return ret; 353 return ret;
324} 354}
325 355
326int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) 356static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
327{ 357{
328 if (tracing_thresh) 358 if (tracing_thresh)
329 return 1; 359 return 1;
@@ -412,7 +442,7 @@ void set_graph_array(struct trace_array *tr)
412 smp_mb(); 442 smp_mb();
413} 443}
414 444
415void trace_graph_thresh_return(struct ftrace_graph_ret *trace) 445static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
416{ 446{
417 if (tracing_thresh && 447 if (tracing_thresh &&
418 (trace->rettime - trace->calltime < tracing_thresh)) 448 (trace->rettime - trace->calltime < tracing_thresh))
@@ -445,6 +475,12 @@ static void graph_trace_reset(struct trace_array *tr)
445 unregister_ftrace_graph(); 475 unregister_ftrace_graph();
446} 476}
447 477
478static int graph_trace_update_thresh(struct trace_array *tr)
479{
480 graph_trace_reset(tr);
481 return graph_trace_init(tr);
482}
483
448static int max_bytes_for_cpu; 484static int max_bytes_for_cpu;
449 485
450static enum print_line_t 486static enum print_line_t
@@ -1399,7 +1435,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1399 seq_printf(s, " | | | |\n"); 1435 seq_printf(s, " | | | |\n");
1400} 1436}
1401 1437
1402void print_graph_headers(struct seq_file *s) 1438static void print_graph_headers(struct seq_file *s)
1403{ 1439{
1404 print_graph_headers_flags(s, tracer_flags.val); 1440 print_graph_headers_flags(s, tracer_flags.val);
1405} 1441}
@@ -1495,6 +1531,7 @@ static struct trace_event graph_trace_ret_event = {
1495 1531
1496static struct tracer graph_trace __tracer_data = { 1532static struct tracer graph_trace __tracer_data = {
1497 .name = "function_graph", 1533 .name = "function_graph",
1534 .update_thresh = graph_trace_update_thresh,
1498 .open = graph_trace_open, 1535 .open = graph_trace_open,
1499 .pipe_open = graph_trace_open, 1536 .pipe_open = graph_trace_open,
1500 .close = graph_trace_close, 1537 .close = graph_trace_close,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index f3dad80c20b2..c6977d5a9b12 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -20,23 +20,6 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
20 20
21static int next_event_type = __TRACE_LAST_TYPE + 1; 21static int next_event_type = __TRACE_LAST_TYPE + 1;
22 22
23int trace_print_seq(struct seq_file *m, struct trace_seq *s)
24{
25 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
26 int ret;
27
28 ret = seq_write(m, s->buffer, len);
29
30 /*
31 * Only reset this buffer if we successfully wrote to the
32 * seq_file buffer.
33 */
34 if (!ret)
35 trace_seq_init(s);
36
37 return ret;
38}
39
40enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) 23enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
41{ 24{
42 struct trace_seq *s = &iter->seq; 25 struct trace_seq *s = &iter->seq;
@@ -85,257 +68,6 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
85 return TRACE_TYPE_HANDLED; 68 return TRACE_TYPE_HANDLED;
86} 69}
87 70
88/**
89 * trace_seq_printf - sequence printing of trace information
90 * @s: trace sequence descriptor
91 * @fmt: printf format string
92 *
93 * It returns 0 if the trace oversizes the buffer's free
94 * space, 1 otherwise.
95 *
96 * The tracer may use either sequence operations or its own
97 * copy to user routines. To simplify formating of a trace
98 * trace_seq_printf is used to store strings into a special
99 * buffer (@s). Then the output may be either used by
100 * the sequencer or pulled into another buffer.
101 */
102int
103trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
104{
105 int len = (PAGE_SIZE - 1) - s->len;
106 va_list ap;
107 int ret;
108
109 if (s->full || !len)
110 return 0;
111
112 va_start(ap, fmt);
113 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
114 va_end(ap);
115
116 /* If we can't write it all, don't bother writing anything */
117 if (ret >= len) {
118 s->full = 1;
119 return 0;
120 }
121
122 s->len += ret;
123
124 return 1;
125}
126EXPORT_SYMBOL_GPL(trace_seq_printf);
127
128/**
129 * trace_seq_bitmask - put a list of longs as a bitmask print output
130 * @s: trace sequence descriptor
131 * @maskp: points to an array of unsigned longs that represent a bitmask
132 * @nmaskbits: The number of bits that are valid in @maskp
133 *
134 * It returns 0 if the trace oversizes the buffer's free
135 * space, 1 otherwise.
136 *
137 * Writes a ASCII representation of a bitmask string into @s.
138 */
139int
140trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
141 int nmaskbits)
142{
143 int len = (PAGE_SIZE - 1) - s->len;
144 int ret;
145
146 if (s->full || !len)
147 return 0;
148
149 ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
150 s->len += ret;
151
152 return 1;
153}
154EXPORT_SYMBOL_GPL(trace_seq_bitmask);
155
156/**
157 * trace_seq_vprintf - sequence printing of trace information
158 * @s: trace sequence descriptor
159 * @fmt: printf format string
160 *
161 * The tracer may use either sequence operations or its own
162 * copy to user routines. To simplify formating of a trace
163 * trace_seq_printf is used to store strings into a special
164 * buffer (@s). Then the output may be either used by
165 * the sequencer or pulled into another buffer.
166 */
167int
168trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
169{
170 int len = (PAGE_SIZE - 1) - s->len;
171 int ret;
172
173 if (s->full || !len)
174 return 0;
175
176 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
177
178 /* If we can't write it all, don't bother writing anything */
179 if (ret >= len) {
180 s->full = 1;
181 return 0;
182 }
183
184 s->len += ret;
185
186 return len;
187}
188EXPORT_SYMBOL_GPL(trace_seq_vprintf);
189
190int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
191{
192 int len = (PAGE_SIZE - 1) - s->len;
193 int ret;
194
195 if (s->full || !len)
196 return 0;
197
198 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
199
200 /* If we can't write it all, don't bother writing anything */
201 if (ret >= len) {
202 s->full = 1;
203 return 0;
204 }
205
206 s->len += ret;
207
208 return len;
209}
210
211/**
212 * trace_seq_puts - trace sequence printing of simple string
213 * @s: trace sequence descriptor
214 * @str: simple string to record
215 *
216 * The tracer may use either the sequence operations or its own
217 * copy to user routines. This function records a simple string
218 * into a special buffer (@s) for later retrieval by a sequencer
219 * or other mechanism.
220 */
221int trace_seq_puts(struct trace_seq *s, const char *str)
222{
223 int len = strlen(str);
224
225 if (s->full)
226 return 0;
227
228 if (len > ((PAGE_SIZE - 1) - s->len)) {
229 s->full = 1;
230 return 0;
231 }
232
233 memcpy(s->buffer + s->len, str, len);
234 s->len += len;
235
236 return len;
237}
238
239int trace_seq_putc(struct trace_seq *s, unsigned char c)
240{
241 if (s->full)
242 return 0;
243
244 if (s->len >= (PAGE_SIZE - 1)) {
245 s->full = 1;
246 return 0;
247 }
248
249 s->buffer[s->len++] = c;
250
251 return 1;
252}
253EXPORT_SYMBOL(trace_seq_putc);
254
255int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
256{
257 if (s->full)
258 return 0;
259
260 if (len > ((PAGE_SIZE - 1) - s->len)) {
261 s->full = 1;
262 return 0;
263 }
264
265 memcpy(s->buffer + s->len, mem, len);
266 s->len += len;
267
268 return len;
269}
270
271int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
272{
273 unsigned char hex[HEX_CHARS];
274 const unsigned char *data = mem;
275 int i, j;
276
277 if (s->full)
278 return 0;
279
280#ifdef __BIG_ENDIAN
281 for (i = 0, j = 0; i < len; i++) {
282#else
283 for (i = len-1, j = 0; i >= 0; i--) {
284#endif
285 hex[j++] = hex_asc_hi(data[i]);
286 hex[j++] = hex_asc_lo(data[i]);
287 }
288 hex[j++] = ' ';
289
290 return trace_seq_putmem(s, hex, j);
291}
292
293void *trace_seq_reserve(struct trace_seq *s, size_t len)
294{
295 void *ret;
296
297 if (s->full)
298 return NULL;
299
300 if (len > ((PAGE_SIZE - 1) - s->len)) {
301 s->full = 1;
302 return NULL;
303 }
304
305 ret = s->buffer + s->len;
306 s->len += len;
307
308 return ret;
309}
310
311int trace_seq_path(struct trace_seq *s, const struct path *path)
312{
313 unsigned char *p;
314
315 if (s->full)
316 return 0;
317
318 if (s->len >= (PAGE_SIZE - 1)) {
319 s->full = 1;
320 return 0;
321 }
322
323 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
324 if (!IS_ERR(p)) {
325 p = mangle_path(s->buffer + s->len, p, "\n");
326 if (p) {
327 s->len = p - s->buffer;
328 return 1;
329 }
330 } else {
331 s->buffer[s->len++] = '?';
332 return 1;
333 }
334
335 s->full = 1;
336 return 0;
337}
338
339const char * 71const char *
340ftrace_print_flags_seq(struct trace_seq *p, const char *delim, 72ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
341 unsigned long flags, 73 unsigned long flags,
@@ -343,7 +75,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
343{ 75{
344 unsigned long mask; 76 unsigned long mask;
345 const char *str; 77 const char *str;
346 const char *ret = p->buffer + p->len; 78 const char *ret = trace_seq_buffer_ptr(p);
347 int i, first = 1; 79 int i, first = 1;
348 80
349 for (i = 0; flag_array[i].name && flags; i++) { 81 for (i = 0; flag_array[i].name && flags; i++) {
@@ -379,7 +111,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
379 const struct trace_print_flags *symbol_array) 111 const struct trace_print_flags *symbol_array)
380{ 112{
381 int i; 113 int i;
382 const char *ret = p->buffer + p->len; 114 const char *ret = trace_seq_buffer_ptr(p);
383 115
384 for (i = 0; symbol_array[i].name; i++) { 116 for (i = 0; symbol_array[i].name; i++) {
385 117
@@ -390,7 +122,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
390 break; 122 break;
391 } 123 }
392 124
393 if (ret == (const char *)(p->buffer + p->len)) 125 if (ret == (const char *)(trace_seq_buffer_ptr(p)))
394 trace_seq_printf(p, "0x%lx", val); 126 trace_seq_printf(p, "0x%lx", val);
395 127
396 trace_seq_putc(p, 0); 128 trace_seq_putc(p, 0);
@@ -405,7 +137,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
405 const struct trace_print_flags_u64 *symbol_array) 137 const struct trace_print_flags_u64 *symbol_array)
406{ 138{
407 int i; 139 int i;
408 const char *ret = p->buffer + p->len; 140 const char *ret = trace_seq_buffer_ptr(p);
409 141
410 for (i = 0; symbol_array[i].name; i++) { 142 for (i = 0; symbol_array[i].name; i++) {
411 143
@@ -416,7 +148,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
416 break; 148 break;
417 } 149 }
418 150
419 if (ret == (const char *)(p->buffer + p->len)) 151 if (ret == (const char *)(trace_seq_buffer_ptr(p)))
420 trace_seq_printf(p, "0x%llx", val); 152 trace_seq_printf(p, "0x%llx", val);
421 153
422 trace_seq_putc(p, 0); 154 trace_seq_putc(p, 0);
@@ -430,7 +162,7 @@ const char *
430ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, 162ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
431 unsigned int bitmask_size) 163 unsigned int bitmask_size)
432{ 164{
433 const char *ret = p->buffer + p->len; 165 const char *ret = trace_seq_buffer_ptr(p);
434 166
435 trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); 167 trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
436 trace_seq_putc(p, 0); 168 trace_seq_putc(p, 0);
@@ -443,7 +175,7 @@ const char *
443ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) 175ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
444{ 176{
445 int i; 177 int i;
446 const char *ret = p->buffer + p->len; 178 const char *ret = trace_seq_buffer_ptr(p);
447 179
448 for (i = 0; i < buf_len; i++) 180 for (i = 0; i < buf_len; i++)
449 trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); 181 trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
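Each hunk above swaps the open-coded p->buffer + p->len for trace_seq_buffer_ptr(p). The helper itself is introduced outside this file (presumably alongside the trace_seq declarations); consistent with the expression it replaces, it would look something like the sketch below, which uses a minimal stand-in structure and is not the kernel definition:

#include <stdio.h>

/* Minimal stand-in for struct trace_seq, just enough for the helper. */
struct trace_seq {
        unsigned char buffer[4096];
        unsigned int len;
};

/* Plausible shape of trace_seq_buffer_ptr(), inferred from the        */
/* open-coded "p->buffer + p->len" it replaces; not the kernel header. */
static inline unsigned char *trace_seq_buffer_ptr(struct trace_seq *s)
{
        return s->buffer + s->len;
}

int main(void)
{
        struct trace_seq s = { .buffer = "abc", .len = 3 };

        /* The callers above remember this pointer before appending more, */
        /* so they can tell later whether anything was actually written.  */
        printf("next write goes at offset %ld\n",
               (long)(trace_seq_buffer_ptr(&s) - s.buffer));
        return 0;
}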
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 127a9d8c8357..80b25b585a70 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,9 +35,6 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
35extern int __unregister_ftrace_event(struct trace_event *event); 35extern int __unregister_ftrace_event(struct trace_event *event);
36extern struct rw_semaphore trace_event_sem; 36extern struct rw_semaphore trace_event_sem;
37 37
38#define MAX_MEMHEX_BYTES 8
39#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
40
41#define SEQ_PUT_FIELD_RET(s, x) \ 38#define SEQ_PUT_FIELD_RET(s, x) \
42do { \ 39do { \
43 if (!trace_seq_putmem(s, &(x), sizeof(x))) \ 40 if (!trace_seq_putmem(s, &(x), sizeof(x))) \
@@ -46,7 +43,6 @@ do { \
46 43
47#define SEQ_PUT_HEX_FIELD_RET(s, x) \ 44#define SEQ_PUT_HEX_FIELD_RET(s, x) \
48do { \ 45do { \
49 BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
50 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ 46 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
51 return TRACE_TYPE_PARTIAL_LINE; \ 47 return TRACE_TYPE_PARTIAL_LINE; \
52} while (0) 48} while (0)
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
new file mode 100644
index 000000000000..1f24ed99dca2
--- /dev/null
+++ b/kernel/trace/trace_seq.c
@@ -0,0 +1,428 @@
1/*
2 * trace_seq.c
3 *
4 * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
5 *
6 * The trace_seq is a handy tool that allows you to pass a descriptor around
7 * to a buffer that other functions can write to. It is similar to the
8 * seq_file functionality but has some differences.
9 *
10 * To use it, the trace_seq must be initialized with trace_seq_init().
11 * This will set up the counters within the descriptor. You can call
12 * trace_seq_init() more than once to reset the trace_seq to start
13 * from scratch.
14 *
15 * The buffer size is currently PAGE_SIZE, although it may become dynamic
16 * in the future.
17 *
 18 * A write to the buffer will either succeed or fail. That is, unlike
 19 * sprintf(), there will not be a partial write (well, it may write into
 20 * the buffer but it won't update the pointers). This allows users to
21 * try to write something into the trace_seq buffer and if it fails
22 * they can flush it and try again.
23 *
24 */
25#include <linux/uaccess.h>
26#include <linux/seq_file.h>
27#include <linux/trace_seq.h>
28
29/* How much buffer is left on the trace_seq? */
30#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len)
31
32/* How much buffer is written? */
33#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1))
34
35/**
36 * trace_print_seq - move the contents of trace_seq into a seq_file
37 * @m: the seq_file descriptor that is the destination
38 * @s: the trace_seq descriptor that is the source.
39 *
 40 * Returns 0 on success and non-zero on error. If the write to the
 41 * seq_file succeeds, the trace_seq is reset; otherwise it is left
 42 * untouched so the caller can try again.
43 */
44int trace_print_seq(struct seq_file *m, struct trace_seq *s)
45{
46 unsigned int len = TRACE_SEQ_BUF_USED(s);
47 int ret;
48
49 ret = seq_write(m, s->buffer, len);
50
51 /*
52 * Only reset this buffer if we successfully wrote to the
53 * seq_file buffer. This lets the caller try again or
54 * do something else with the contents.
55 */
56 if (!ret)
57 trace_seq_init(s);
58
59 return ret;
60}
61
62/**
63 * trace_seq_printf - sequence printing of trace information
64 * @s: trace sequence descriptor
65 * @fmt: printf format string
66 *
67 * The tracer may use either sequence operations or its own
 68 * copy to user routines. To simplify formatting of a trace,
69 * trace_seq_printf() is used to store strings into a special
70 * buffer (@s). Then the output may be either used by
71 * the sequencer or pulled into another buffer.
72 *
 73 * Returns 1 if we successfully wrote all the contents to
 74 * the buffer.
 75 * Returns 0 if the length to write is bigger than the
76 * reserved buffer space. In this case, nothing gets written.
77 */
78int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
79{
80 unsigned int len = TRACE_SEQ_BUF_LEFT(s);
81 va_list ap;
82 int ret;
83
84 if (s->full || !len)
85 return 0;
86
87 va_start(ap, fmt);
88 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
89 va_end(ap);
90
91 /* If we can't write it all, don't bother writing anything */
92 if (ret >= len) {
93 s->full = 1;
94 return 0;
95 }
96
97 s->len += ret;
98
99 return 1;
100}
101EXPORT_SYMBOL_GPL(trace_seq_printf);
102
103/**
104 * trace_seq_bitmask - write a bitmask array in its ASCII representation
105 * @s: trace sequence descriptor
106 * @maskp: points to an array of unsigned longs that represent a bitmask
107 * @nmaskbits: The number of bits that are valid in @maskp
108 *
 109 * Writes an ASCII representation of a bitmask string into @s.
110 *
 111 * Returns 1 if we successfully wrote all the contents to
 112 * the buffer.
 113 * Returns 0 if the length to write is bigger than the
114 * reserved buffer space. In this case, nothing gets written.
115 */
116int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
117 int nmaskbits)
118{
119 unsigned int len = TRACE_SEQ_BUF_LEFT(s);
120 int ret;
121
122 if (s->full || !len)
123 return 0;
124
 125	ret = bitmap_scnprintf(s->buffer + s->len, len, maskp, nmaskbits);
126 s->len += ret;
127
128 return 1;
129}
130EXPORT_SYMBOL_GPL(trace_seq_bitmask);
131
132/**
133 * trace_seq_vprintf - sequence printing of trace information
134 * @s: trace sequence descriptor
135 * @fmt: printf format string
136 *
137 * The tracer may use either sequence operations or its own
 138 * copy to user routines. To simplify formatting of a trace,
139 * trace_seq_printf is used to store strings into a special
140 * buffer (@s). Then the output may be either used by
141 * the sequencer or pulled into another buffer.
142 *
143 * Returns how much it wrote to the buffer.
144 */
145int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
146{
147 unsigned int len = TRACE_SEQ_BUF_LEFT(s);
148 int ret;
149
150 if (s->full || !len)
151 return 0;
152
153 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
154
155 /* If we can't write it all, don't bother writing anything */
156 if (ret >= len) {
157 s->full = 1;
158 return 0;
159 }
160
161 s->len += ret;
162
163 return len;
164}
165EXPORT_SYMBOL_GPL(trace_seq_vprintf);
166
167/**
168 * trace_seq_bprintf - Write the printf string from binary arguments
169 * @s: trace sequence descriptor
170 * @fmt: The format string for the @binary arguments
171 * @binary: The binary arguments for @fmt.
172 *
173 * When recording in a fast path, a printf may be recorded with just
174 * saving the format and the arguments as they were passed to the
175 * function, instead of wasting cycles converting the arguments into
176 * ASCII characters. Instead, the arguments are saved in a 32 bit
177 * word array that is defined by the format string constraints.
178 *
179 * This function will take the format and the binary array and finish
180 * the conversion into the ASCII string within the buffer.
181 *
182 * Returns how much it wrote to the buffer.
183 */
184int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
185{
186 unsigned int len = TRACE_SEQ_BUF_LEFT(s);
187 int ret;
188
189 if (s->full || !len)
190 return 0;
191
192 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
193
194 /* If we can't write it all, don't bother writing anything */
195 if (ret >= len) {
196 s->full = 1;
197 return 0;
198 }
199
200 s->len += ret;
201
202 return len;
203}
204EXPORT_SYMBOL_GPL(trace_seq_bprintf);
205
206/**
207 * trace_seq_puts - trace sequence printing of simple string
208 * @s: trace sequence descriptor
209 * @str: simple string to record
210 *
211 * The tracer may use either the sequence operations or its own
212 * copy to user routines. This function records a simple string
213 * into a special buffer (@s) for later retrieval by a sequencer
214 * or other mechanism.
215 *
216 * Returns how much it wrote to the buffer.
217 */
218int trace_seq_puts(struct trace_seq *s, const char *str)
219{
220 unsigned int len = strlen(str);
221
222 if (s->full)
223 return 0;
224
225 if (len > TRACE_SEQ_BUF_LEFT(s)) {
226 s->full = 1;
227 return 0;
228 }
229
230 memcpy(s->buffer + s->len, str, len);
231 s->len += len;
232
233 return len;
234}
235EXPORT_SYMBOL_GPL(trace_seq_puts);
236
237/**
238 * trace_seq_putc - trace sequence printing of simple character
239 * @s: trace sequence descriptor
240 * @c: simple character to record
241 *
242 * The tracer may use either the sequence operations or its own
 243 * copy to user routines. This function records a simple character
244 * into a special buffer (@s) for later retrieval by a sequencer
245 * or other mechanism.
246 *
247 * Returns how much it wrote to the buffer.
248 */
249int trace_seq_putc(struct trace_seq *s, unsigned char c)
250{
251 if (s->full)
252 return 0;
253
254 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
255 s->full = 1;
256 return 0;
257 }
258
259 s->buffer[s->len++] = c;
260
261 return 1;
262}
263EXPORT_SYMBOL_GPL(trace_seq_putc);
264
265/**
266 * trace_seq_putmem - write raw data into the trace_seq buffer
267 * @s: trace sequence descriptor
268 * @mem: The raw memory to copy into the buffer
269 * @len: The length of the raw memory to copy (in bytes)
270 *
271 * There may be cases where raw memory needs to be written into the
272 * buffer and a strcpy() would not work. Using this function allows
273 * for such cases.
274 *
275 * Returns how much it wrote to the buffer.
276 */
277int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
278{
279 if (s->full)
280 return 0;
281
282 if (len > TRACE_SEQ_BUF_LEFT(s)) {
283 s->full = 1;
284 return 0;
285 }
286
287 memcpy(s->buffer + s->len, mem, len);
288 s->len += len;
289
290 return len;
291}
292EXPORT_SYMBOL_GPL(trace_seq_putmem);
293
294#define MAX_MEMHEX_BYTES 8U
295#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
296
297/**
298 * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
299 * @s: trace sequence descriptor
 300 * @mem: The raw memory whose hex ASCII representation is to be written
301 * @len: The length of the raw memory to copy (in bytes)
302 *
 303 * This is similar to trace_seq_putmem() except that instead of just
 304 * copying the raw memory into the buffer, it writes the hex ASCII
 305 * representation of that memory.
306 *
307 * Returns how much it wrote to the buffer.
308 */
309int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
310 unsigned int len)
311{
312 unsigned char hex[HEX_CHARS];
313 const unsigned char *data = mem;
314 unsigned int start_len;
315 int i, j;
316 int cnt = 0;
317
318 if (s->full)
319 return 0;
320
321 while (len) {
 322		start_len = min(len, MAX_MEMHEX_BYTES);
323#ifdef __BIG_ENDIAN
324 for (i = 0, j = 0; i < start_len; i++) {
325#else
326 for (i = start_len-1, j = 0; i >= 0; i--) {
327#endif
328 hex[j++] = hex_asc_hi(data[i]);
329 hex[j++] = hex_asc_lo(data[i]);
330 }
331 if (WARN_ON_ONCE(j == 0 || j/2 > len))
332 break;
333
334 /* j increments twice per loop */
335 len -= j / 2;
336 hex[j++] = ' ';
337
338 cnt += trace_seq_putmem(s, hex, j);
339 }
340 return cnt;
341}
342EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
343
344/**
345 * trace_seq_path - copy a path into the sequence buffer
346 * @s: trace sequence descriptor
347 * @path: path to write into the sequence buffer.
348 *
349 * Write a path name into the sequence buffer.
350 *
 351 * Returns 1 if we successfully wrote all the contents to
 352 * the buffer.
 353 * Returns 0 if the length to write is bigger than the
354 * reserved buffer space. In this case, nothing gets written.
355 */
356int trace_seq_path(struct trace_seq *s, const struct path *path)
357{
358 unsigned char *p;
359
360 if (s->full)
361 return 0;
362
363 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
364 s->full = 1;
365 return 0;
366 }
367
368 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
369 if (!IS_ERR(p)) {
370 p = mangle_path(s->buffer + s->len, p, "\n");
371 if (p) {
372 s->len = p - s->buffer;
373 return 1;
374 }
375 } else {
376 s->buffer[s->len++] = '?';
377 return 1;
378 }
379
380 s->full = 1;
381 return 0;
382}
383EXPORT_SYMBOL_GPL(trace_seq_path);
384
385/**
 386 * trace_seq_to_user - copy the sequence buffer to user space
387 * @s: trace sequence descriptor
388 * @ubuf: The userspace memory location to copy to
389 * @cnt: The amount to copy
390 *
391 * Copies the sequence buffer into the userspace memory pointed to
392 * by @ubuf. It starts from the last read position (@s->readpos)
 393 * and writes up to @cnt characters or until it reaches the end of
 394 * the content in the buffer (@s->len), whichever comes first.
395 *
 396 * On success, it returns a positive number: the number of bytes
397 * it copied.
398 *
 399 * On failure, it returns -EBUSY if all of the content in the
 400 * sequence has already been read, which includes the case where there
 401 * is nothing in the sequence (@s->len == @s->readpos).
402 *
403 * Returns -EFAULT if the copy to userspace fails.
404 */
405int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
406{
407 int len;
408 int ret;
409
410 if (!cnt)
411 return 0;
412
413 if (s->len <= s->readpos)
414 return -EBUSY;
415
416 len = s->len - s->readpos;
417 if (cnt > len)
418 cnt = len;
419 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
420 if (ret == cnt)
421 return -EFAULT;
422
423 cnt -= ret;
424
425 s->readpos += cnt;
426 return cnt;
427}
428EXPORT_SYMBOL_GPL(trace_seq_to_user);
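
The file above only defines the write/flush primitives; a minimal usage sketch (not part of the patch) shows how they fit together. It assumes the trace_seq_init() helper from include/linux/trace_seq.h and an ordinary seq_file show handler; the function name and format strings are hypothetical:

	#include <linux/seq_file.h>
	#include <linux/trace_seq.h>

	/* Sketch: build up a line in a trace_seq, then flush it into a seq_file. */
	static int example_show(struct seq_file *m, void *v)
	{
		struct trace_seq s;

		trace_seq_init(&s);			/* reset len/readpos and clear the full flag */
		trace_seq_printf(&s, "%s: %d ", "example", 42);
		trace_seq_puts(&s, "(writes are all-or-nothing)");
		trace_seq_putc(&s, '\n');

		if (s.full)				/* the buffer overflowed; nothing partial was written */
			trace_seq_init(&s);		/* drop the contents and start over */

		return trace_print_seq(m, &s);		/* copies into @m and resets @s on success */
	}
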
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 3c9b97e6b1f4..33ff6a24b802 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -265,7 +265,6 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
265 if (is_ret) 265 if (is_ret)
266 tu->consumer.ret_handler = uretprobe_dispatcher; 266 tu->consumer.ret_handler = uretprobe_dispatcher;
267 init_trace_uprobe_filter(&tu->filter); 267 init_trace_uprobe_filter(&tu->filter);
268 tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;
269 return tu; 268 return tu;
270 269
271error: 270error:
@@ -1292,7 +1291,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
1292 kfree(call->print_fmt); 1291 kfree(call->print_fmt);
1293 return -ENODEV; 1292 return -ENODEV;
1294 } 1293 }
1295 call->flags = 0; 1294
1296 call->class->reg = trace_uprobe_register; 1295 call->class->reg = trace_uprobe_register;
1297 call->data = tu; 1296 call->data = tu;
1298 ret = trace_add_event_call(call); 1297 ret = trace_add_event_call(call);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 35974ac69600..5dbe22aa3efd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -265,7 +265,6 @@ struct workqueue_struct {
265 265
266static struct kmem_cache *pwq_cache; 266static struct kmem_cache *pwq_cache;
267 267
268static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
269static cpumask_var_t *wq_numa_possible_cpumask; 268static cpumask_var_t *wq_numa_possible_cpumask;
270 /* possible CPUs of each node */ 269 /* possible CPUs of each node */
271 270
@@ -758,13 +757,6 @@ static bool too_many_workers(struct worker_pool *pool)
758 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ 757 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
759 int nr_busy = pool->nr_workers - nr_idle; 758 int nr_busy = pool->nr_workers - nr_idle;
760 759
761 /*
762 * nr_idle and idle_list may disagree if idle rebinding is in
763 * progress. Never return %true if idle_list is empty.
764 */
765 if (list_empty(&pool->idle_list))
766 return false;
767
768 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 760 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
769} 761}
770 762
@@ -850,7 +842,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
850 pool = worker->pool; 842 pool = worker->pool;
851 843
852 /* this can only happen on the local cpu */ 844 /* this can only happen on the local cpu */
853 if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) 845 if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
854 return NULL; 846 return NULL;
855 847
856 /* 848 /*
@@ -874,35 +866,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
874 * worker_set_flags - set worker flags and adjust nr_running accordingly 866 * worker_set_flags - set worker flags and adjust nr_running accordingly
875 * @worker: self 867 * @worker: self
876 * @flags: flags to set 868 * @flags: flags to set
877 * @wakeup: wakeup an idle worker if necessary
878 * 869 *
879 * Set @flags in @worker->flags and adjust nr_running accordingly. If 870 * Set @flags in @worker->flags and adjust nr_running accordingly.
880 * nr_running becomes zero and @wakeup is %true, an idle worker is
881 * woken up.
882 * 871 *
883 * CONTEXT: 872 * CONTEXT:
884 * spin_lock_irq(pool->lock) 873 * spin_lock_irq(pool->lock)
885 */ 874 */
886static inline void worker_set_flags(struct worker *worker, unsigned int flags, 875static inline void worker_set_flags(struct worker *worker, unsigned int flags)
887 bool wakeup)
888{ 876{
889 struct worker_pool *pool = worker->pool; 877 struct worker_pool *pool = worker->pool;
890 878
891 WARN_ON_ONCE(worker->task != current); 879 WARN_ON_ONCE(worker->task != current);
892 880
893 /* 881 /* If transitioning into NOT_RUNNING, adjust nr_running. */
894 * If transitioning into NOT_RUNNING, adjust nr_running and
895 * wake up an idle worker as necessary if requested by
896 * @wakeup.
897 */
898 if ((flags & WORKER_NOT_RUNNING) && 882 if ((flags & WORKER_NOT_RUNNING) &&
899 !(worker->flags & WORKER_NOT_RUNNING)) { 883 !(worker->flags & WORKER_NOT_RUNNING)) {
900 if (wakeup) { 884 atomic_dec(&pool->nr_running);
901 if (atomic_dec_and_test(&pool->nr_running) &&
902 !list_empty(&pool->worklist))
903 wake_up_worker(pool);
904 } else
905 atomic_dec(&pool->nr_running);
906 } 885 }
907 886
908 worker->flags |= flags; 887 worker->flags |= flags;
@@ -1232,7 +1211,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1232 pwq_activate_delayed_work(work); 1211 pwq_activate_delayed_work(work);
1233 1212
1234 list_del_init(&work->entry); 1213 list_del_init(&work->entry);
1235 pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work)); 1214 pwq_dec_nr_in_flight(pwq, get_work_color(work));
1236 1215
1237 /* work->data points to pwq iff queued, point to pool */ 1216 /* work->data points to pwq iff queued, point to pool */
1238 set_work_pool_and_keep_pending(work, pool->id); 1217 set_work_pool_and_keep_pending(work, pool->id);
@@ -1560,7 +1539,7 @@ static void worker_enter_idle(struct worker *worker)
1560 (worker->hentry.next || worker->hentry.pprev))) 1539 (worker->hentry.next || worker->hentry.pprev)))
1561 return; 1540 return;
1562 1541
1563 /* can't use worker_set_flags(), also called from start_worker() */ 1542 /* can't use worker_set_flags(), also called from create_worker() */
1564 worker->flags |= WORKER_IDLE; 1543 worker->flags |= WORKER_IDLE;
1565 pool->nr_idle++; 1544 pool->nr_idle++;
1566 worker->last_active = jiffies; 1545 worker->last_active = jiffies;
@@ -1602,11 +1581,11 @@ static void worker_leave_idle(struct worker *worker)
1602 list_del_init(&worker->entry); 1581 list_del_init(&worker->entry);
1603} 1582}
1604 1583
1605static struct worker *alloc_worker(void) 1584static struct worker *alloc_worker(int node)
1606{ 1585{
1607 struct worker *worker; 1586 struct worker *worker;
1608 1587
1609 worker = kzalloc(sizeof(*worker), GFP_KERNEL); 1588 worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
1610 if (worker) { 1589 if (worker) {
1611 INIT_LIST_HEAD(&worker->entry); 1590 INIT_LIST_HEAD(&worker->entry);
1612 INIT_LIST_HEAD(&worker->scheduled); 1591 INIT_LIST_HEAD(&worker->scheduled);
@@ -1670,6 +1649,9 @@ static void worker_detach_from_pool(struct worker *worker,
1670 detach_completion = pool->detach_completion; 1649 detach_completion = pool->detach_completion;
1671 mutex_unlock(&pool->attach_mutex); 1650 mutex_unlock(&pool->attach_mutex);
1672 1651
1652 /* clear leftover flags without pool->lock after it is detached */
1653 worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
1654
1673 if (detach_completion) 1655 if (detach_completion)
1674 complete(detach_completion); 1656 complete(detach_completion);
1675} 1657}
@@ -1678,8 +1660,7 @@ static void worker_detach_from_pool(struct worker *worker,
1678 * create_worker - create a new workqueue worker 1660 * create_worker - create a new workqueue worker
1679 * @pool: pool the new worker will belong to 1661 * @pool: pool the new worker will belong to
1680 * 1662 *
1681 * Create a new worker which is attached to @pool. The new worker must be 1663 * Create and start a new worker which is attached to @pool.
1682 * started by start_worker().
1683 * 1664 *
1684 * CONTEXT: 1665 * CONTEXT:
1685 * Might sleep. Does GFP_KERNEL allocations. 1666 * Might sleep. Does GFP_KERNEL allocations.
@@ -1698,7 +1679,7 @@ static struct worker *create_worker(struct worker_pool *pool)
1698 if (id < 0) 1679 if (id < 0)
1699 goto fail; 1680 goto fail;
1700 1681
1701 worker = alloc_worker(); 1682 worker = alloc_worker(pool->node);
1702 if (!worker) 1683 if (!worker)
1703 goto fail; 1684 goto fail;
1704 1685
@@ -1724,6 +1705,13 @@ static struct worker *create_worker(struct worker_pool *pool)
1724 /* successful, attach the worker to the pool */ 1705 /* successful, attach the worker to the pool */
1725 worker_attach_to_pool(worker, pool); 1706 worker_attach_to_pool(worker, pool);
1726 1707
1708 /* start the newly created worker */
1709 spin_lock_irq(&pool->lock);
1710 worker->pool->nr_workers++;
1711 worker_enter_idle(worker);
1712 wake_up_process(worker->task);
1713 spin_unlock_irq(&pool->lock);
1714
1727 return worker; 1715 return worker;
1728 1716
1729fail: 1717fail:
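
Taken with the removal of start_worker() and create_and_start_worker() in the next hunk, worker bring-up collapses into create_worker() itself. A condensed sketch of the resulting flow, paraphrasing the hunks above rather than quoting the kernel source:

	/* Sketch: create_worker() now allocates, attaches and starts the worker in one call. */
	worker = alloc_worker(pool->node);	/* NUMA-aware allocation */
	if (!worker)
		goto fail;
	/* ... assign id, create the task, set nice level ... */
	worker_attach_to_pool(worker, pool);

	spin_lock_irq(&pool->lock);		/* start immediately; no separate start_worker() step */
	worker->pool->nr_workers++;
	worker_enter_idle(worker);
	wake_up_process(worker->task);
	spin_unlock_irq(&pool->lock);

	return worker;				/* callers now just test for NULL, e.g. BUG_ON(!create_worker(pool)) */
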
@@ -1734,44 +1722,6 @@ fail:
1734} 1722}
1735 1723
1736/** 1724/**
1737 * start_worker - start a newly created worker
1738 * @worker: worker to start
1739 *
1740 * Make the pool aware of @worker and start it.
1741 *
1742 * CONTEXT:
1743 * spin_lock_irq(pool->lock).
1744 */
1745static void start_worker(struct worker *worker)
1746{
1747 worker->pool->nr_workers++;
1748 worker_enter_idle(worker);
1749 wake_up_process(worker->task);
1750}
1751
1752/**
1753 * create_and_start_worker - create and start a worker for a pool
1754 * @pool: the target pool
1755 *
1756 * Grab the managership of @pool and create and start a new worker for it.
1757 *
1758 * Return: 0 on success. A negative error code otherwise.
1759 */
1760static int create_and_start_worker(struct worker_pool *pool)
1761{
1762 struct worker *worker;
1763
1764 worker = create_worker(pool);
1765 if (worker) {
1766 spin_lock_irq(&pool->lock);
1767 start_worker(worker);
1768 spin_unlock_irq(&pool->lock);
1769 }
1770
1771 return worker ? 0 : -ENOMEM;
1772}
1773
1774/**
1775 * destroy_worker - destroy a workqueue worker 1725 * destroy_worker - destroy a workqueue worker
1776 * @worker: worker to be destroyed 1726 * @worker: worker to be destroyed
1777 * 1727 *
@@ -1909,23 +1859,10 @@ restart:
1909 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1859 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1910 1860
1911 while (true) { 1861 while (true) {
1912 struct worker *worker; 1862 if (create_worker(pool) || !need_to_create_worker(pool))
1913
1914 worker = create_worker(pool);
1915 if (worker) {
1916 del_timer_sync(&pool->mayday_timer);
1917 spin_lock_irq(&pool->lock);
1918 start_worker(worker);
1919 if (WARN_ON_ONCE(need_to_create_worker(pool)))
1920 goto restart;
1921 return true;
1922 }
1923
1924 if (!need_to_create_worker(pool))
1925 break; 1863 break;
1926 1864
1927 __set_current_state(TASK_INTERRUPTIBLE); 1865 schedule_timeout_interruptible(CREATE_COOLDOWN);
1928 schedule_timeout(CREATE_COOLDOWN);
1929 1866
1930 if (!need_to_create_worker(pool)) 1867 if (!need_to_create_worker(pool))
1931 break; 1868 break;
@@ -1933,6 +1870,11 @@ restart:
1933 1870
1934 del_timer_sync(&pool->mayday_timer); 1871 del_timer_sync(&pool->mayday_timer);
1935 spin_lock_irq(&pool->lock); 1872 spin_lock_irq(&pool->lock);
1873 /*
1874 * This is necessary even after a new worker was just successfully
1875 * created as @pool->lock was dropped and the new worker might have
1876 * already become busy.
1877 */
1936 if (need_to_create_worker(pool)) 1878 if (need_to_create_worker(pool))
1937 goto restart; 1879 goto restart;
1938 return true; 1880 return true;
@@ -2020,13 +1962,8 @@ __acquires(&pool->lock)
2020 1962
2021 lockdep_copy_map(&lockdep_map, &work->lockdep_map); 1963 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
2022#endif 1964#endif
2023 /* 1965 /* ensure we're on the correct CPU */
2024 * Ensure we're on the correct CPU. DISASSOCIATED test is 1966 WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
2025 * necessary to avoid spurious warnings from rescuers servicing the
2026 * unbound or a disassociated pool.
2027 */
2028 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2029 !(pool->flags & POOL_DISASSOCIATED) &&
2030 raw_smp_processor_id() != pool->cpu); 1967 raw_smp_processor_id() != pool->cpu);
2031 1968
2032 /* 1969 /*
@@ -2052,17 +1989,22 @@ __acquires(&pool->lock)
2052 list_del_init(&work->entry); 1989 list_del_init(&work->entry);
2053 1990
2054 /* 1991 /*
2055 * CPU intensive works don't participate in concurrency 1992 * CPU intensive works don't participate in concurrency management.
2056 * management. They're the scheduler's responsibility. 1993 * They're the scheduler's responsibility. This takes @worker out
1994 * of concurrency management and the next code block will chain
1995 * execution of the pending work items.
2057 */ 1996 */
2058 if (unlikely(cpu_intensive)) 1997 if (unlikely(cpu_intensive))
2059 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 1998 worker_set_flags(worker, WORKER_CPU_INTENSIVE);
2060 1999
2061 /* 2000 /*
2062 * Unbound pool isn't concurrency managed and work items should be 2001 * Wake up another worker if necessary. The condition is always
2063 * executed ASAP. Wake up another worker if necessary. 2002 * false for normal per-cpu workers since nr_running would always
2003 * be >= 1 at this point. This is used to chain execution of the
2004 * pending work items for WORKER_NOT_RUNNING workers such as the
2005 * UNBOUND and CPU_INTENSIVE ones.
2064 */ 2006 */
2065 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) 2007 if (need_more_worker(pool))
2066 wake_up_worker(pool); 2008 wake_up_worker(pool);
2067 2009
2068 /* 2010 /*
@@ -2218,7 +2160,7 @@ recheck:
2218 } 2160 }
2219 } while (keep_working(pool)); 2161 } while (keep_working(pool));
2220 2162
2221 worker_set_flags(worker, WORKER_PREP, false); 2163 worker_set_flags(worker, WORKER_PREP);
2222sleep: 2164sleep:
2223 /* 2165 /*
2224 * pool->lock is held and there's no work to process and no need to 2166 * pool->lock is held and there's no work to process and no need to
@@ -2311,29 +2253,27 @@ repeat:
2311 move_linked_works(work, scheduled, &n); 2253 move_linked_works(work, scheduled, &n);
2312 2254
2313 process_scheduled_works(rescuer); 2255 process_scheduled_works(rescuer);
2314 spin_unlock_irq(&pool->lock);
2315
2316 worker_detach_from_pool(rescuer, pool);
2317
2318 spin_lock_irq(&pool->lock);
2319 2256
2320 /* 2257 /*
2321 * Put the reference grabbed by send_mayday(). @pool won't 2258 * Put the reference grabbed by send_mayday(). @pool won't
2322 * go away while we're holding its lock. 2259 * go away while we're still attached to it.
2323 */ 2260 */
2324 put_pwq(pwq); 2261 put_pwq(pwq);
2325 2262
2326 /* 2263 /*
2327 * Leave this pool. If keep_working() is %true, notify a 2264 * Leave this pool. If need_more_worker() is %true, notify a
2328 * regular worker; otherwise, we end up with 0 concurrency 2265 * regular worker; otherwise, we end up with 0 concurrency
2329 * and stalling the execution. 2266 * and stalling the execution.
2330 */ 2267 */
2331 if (keep_working(pool)) 2268 if (need_more_worker(pool))
2332 wake_up_worker(pool); 2269 wake_up_worker(pool);
2333 2270
2334 rescuer->pool = NULL; 2271 rescuer->pool = NULL;
2335 spin_unlock(&pool->lock); 2272 spin_unlock_irq(&pool->lock);
2336 spin_lock(&wq_mayday_lock); 2273
2274 worker_detach_from_pool(rescuer, pool);
2275
2276 spin_lock_irq(&wq_mayday_lock);
2337 } 2277 }
2338 2278
2339 spin_unlock_irq(&wq_mayday_lock); 2279 spin_unlock_irq(&wq_mayday_lock);
@@ -3458,7 +3398,7 @@ static void put_unbound_pool(struct worker_pool *pool)
3458 return; 3398 return;
3459 3399
3460 /* sanity checks */ 3400 /* sanity checks */
3461 if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || 3401 if (WARN_ON(!(pool->cpu < 0)) ||
3462 WARN_ON(!list_empty(&pool->worklist))) 3402 WARN_ON(!list_empty(&pool->worklist)))
3463 return; 3403 return;
3464 3404
@@ -3524,7 +3464,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3524 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { 3464 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
3525 if (wqattrs_equal(pool->attrs, attrs)) { 3465 if (wqattrs_equal(pool->attrs, attrs)) {
3526 pool->refcnt++; 3466 pool->refcnt++;
3527 goto out_unlock; 3467 return pool;
3528 } 3468 }
3529 } 3469 }
3530 3470
@@ -3557,12 +3497,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3557 goto fail; 3497 goto fail;
3558 3498
3559 /* create and start the initial worker */ 3499 /* create and start the initial worker */
3560 if (create_and_start_worker(pool) < 0) 3500 if (!create_worker(pool))
3561 goto fail; 3501 goto fail;
3562 3502
3563 /* install */ 3503 /* install */
3564 hash_add(unbound_pool_hash, &pool->hash_node, hash); 3504 hash_add(unbound_pool_hash, &pool->hash_node, hash);
3565out_unlock: 3505
3566 return pool; 3506 return pool;
3567fail: 3507fail:
3568 if (pool) 3508 if (pool)
@@ -3591,11 +3531,6 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
3591 if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) 3531 if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
3592 return; 3532 return;
3593 3533
3594 /*
3595 * Unlink @pwq. Synchronization against wq->mutex isn't strictly
3596 * necessary on release but do it anyway. It's easier to verify
3597 * and consistent with the linking path.
3598 */
3599 mutex_lock(&wq->mutex); 3534 mutex_lock(&wq->mutex);
3600 list_del_rcu(&pwq->pwqs_node); 3535 list_del_rcu(&pwq->pwqs_node);
3601 is_last = list_empty(&wq->pwqs); 3536 is_last = list_empty(&wq->pwqs);
@@ -3692,10 +3627,7 @@ static void link_pwq(struct pool_workqueue *pwq)
3692 if (!list_empty(&pwq->pwqs_node)) 3627 if (!list_empty(&pwq->pwqs_node))
3693 return; 3628 return;
3694 3629
3695 /* 3630 /* set the matching work_color */
3696 * Set the matching work_color. This is synchronized with
3697 * wq->mutex to avoid confusing flush_workqueue().
3698 */
3699 pwq->work_color = wq->work_color; 3631 pwq->work_color = wq->work_color;
3700 3632
3701 /* sync max_active to the current setting */ 3633 /* sync max_active to the current setting */
@@ -3832,7 +3764,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
3832 if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) 3764 if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
3833 return -EINVAL; 3765 return -EINVAL;
3834 3766
3835 pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL); 3767 pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
3836 new_attrs = alloc_workqueue_attrs(GFP_KERNEL); 3768 new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3837 tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); 3769 tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3838 if (!pwq_tbl || !new_attrs || !tmp_attrs) 3770 if (!pwq_tbl || !new_attrs || !tmp_attrs)
@@ -4080,7 +4012,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4080 4012
4081 /* allocate wq and format name */ 4013 /* allocate wq and format name */
4082 if (flags & WQ_UNBOUND) 4014 if (flags & WQ_UNBOUND)
4083 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); 4015 tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
4084 4016
4085 wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); 4017 wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
4086 if (!wq) 4018 if (!wq)
@@ -4122,7 +4054,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4122 if (flags & WQ_MEM_RECLAIM) { 4054 if (flags & WQ_MEM_RECLAIM) {
4123 struct worker *rescuer; 4055 struct worker *rescuer;
4124 4056
4125 rescuer = alloc_worker(); 4057 rescuer = alloc_worker(NUMA_NO_NODE);
4126 if (!rescuer) 4058 if (!rescuer)
4127 goto err_destroy; 4059 goto err_destroy;
4128 4060
@@ -4470,8 +4402,6 @@ static void wq_unbind_fn(struct work_struct *work)
4470 struct worker *worker; 4402 struct worker *worker;
4471 4403
4472 for_each_cpu_worker_pool(pool, cpu) { 4404 for_each_cpu_worker_pool(pool, cpu) {
4473 WARN_ON_ONCE(cpu != smp_processor_id());
4474
4475 mutex_lock(&pool->attach_mutex); 4405 mutex_lock(&pool->attach_mutex);
4476 spin_lock_irq(&pool->lock); 4406 spin_lock_irq(&pool->lock);
4477 4407
@@ -4543,6 +4473,7 @@ static void rebind_workers(struct worker_pool *pool)
4543 pool->attrs->cpumask) < 0); 4473 pool->attrs->cpumask) < 0);
4544 4474
4545 spin_lock_irq(&pool->lock); 4475 spin_lock_irq(&pool->lock);
4476 pool->flags &= ~POOL_DISASSOCIATED;
4546 4477
4547 for_each_pool_worker(worker, pool) { 4478 for_each_pool_worker(worker, pool) {
4548 unsigned int worker_flags = worker->flags; 4479 unsigned int worker_flags = worker->flags;
@@ -4632,7 +4563,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4632 for_each_cpu_worker_pool(pool, cpu) { 4563 for_each_cpu_worker_pool(pool, cpu) {
4633 if (pool->nr_workers) 4564 if (pool->nr_workers)
4634 continue; 4565 continue;
4635 if (create_and_start_worker(pool) < 0) 4566 if (!create_worker(pool))
4636 return NOTIFY_BAD; 4567 return NOTIFY_BAD;
4637 } 4568 }
4638 break; 4569 break;
@@ -4644,15 +4575,10 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4644 for_each_pool(pool, pi) { 4575 for_each_pool(pool, pi) {
4645 mutex_lock(&pool->attach_mutex); 4576 mutex_lock(&pool->attach_mutex);
4646 4577
4647 if (pool->cpu == cpu) { 4578 if (pool->cpu == cpu)
4648 spin_lock_irq(&pool->lock);
4649 pool->flags &= ~POOL_DISASSOCIATED;
4650 spin_unlock_irq(&pool->lock);
4651
4652 rebind_workers(pool); 4579 rebind_workers(pool);
4653 } else if (pool->cpu < 0) { 4580 else if (pool->cpu < 0)
4654 restore_unbound_workers_cpumask(pool, cpu); 4581 restore_unbound_workers_cpumask(pool, cpu);
4655 }
4656 4582
4657 mutex_unlock(&pool->attach_mutex); 4583 mutex_unlock(&pool->attach_mutex);
4658 } 4584 }
@@ -4856,10 +4782,6 @@ static void __init wq_numa_init(void)
4856 cpumask_var_t *tbl; 4782 cpumask_var_t *tbl;
4857 int node, cpu; 4783 int node, cpu;
4858 4784
4859 /* determine NUMA pwq table len - highest node id + 1 */
4860 for_each_node(node)
4861 wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
4862
4863 if (num_possible_nodes() <= 1) 4785 if (num_possible_nodes() <= 1)
4864 return; 4786 return;
4865 4787
@@ -4876,7 +4798,7 @@ static void __init wq_numa_init(void)
4876 * available. Build one from cpu_to_node() which should have been 4798 * available. Build one from cpu_to_node() which should have been
4877 * fully initialized by now. 4799 * fully initialized by now.
4878 */ 4800 */
4879 tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL); 4801 tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
4880 BUG_ON(!tbl); 4802 BUG_ON(!tbl);
4881 4803
4882 for_each_node(node) 4804 for_each_node(node)
@@ -4936,7 +4858,7 @@ static int __init init_workqueues(void)
4936 4858
4937 for_each_cpu_worker_pool(pool, cpu) { 4859 for_each_cpu_worker_pool(pool, cpu) {
4938 pool->flags &= ~POOL_DISASSOCIATED; 4860 pool->flags &= ~POOL_DISASSOCIATED;
4939 BUG_ON(create_and_start_worker(pool) < 0); 4861 BUG_ON(!create_worker(pool));
4940 } 4862 }
4941 } 4863 }
4942 4864